119 lines
3.6 KiB
TypeScript
119 lines
3.6 KiB
TypeScript
import fs from 'fs';
import path from 'path';
import { parse, HTMLElement } from 'node-html-parser';
|
|
|
|
/**
 * A single news article extracted from a saved HTML page.
 */
interface Article {
  /** Headline text of the article, with surrounding whitespace trimmed. */
  title: string;
  /** URL of the article as produced by `parseArticles` (relative links are made absolute). */
  link: string;
}
|
|
|
|
/** Origin that relative article links are resolved against. */
const CNN_ORIGIN = 'https://www.cnn.com';

/** Selector for the list items that wrap one article card each. */
const CARD_SELECTOR = 'li[data-component-name="card"]';

/** Selector for the element holding a card's headline text. */
const HEADLINE_SELECTOR = '.container__headline-text';

/**
 * Turn a raw `href` / `data-open-link` value into an absolute http(s) URL.
 *
 * - Links that already start with `http://` or `https://` are returned
 *   unchanged (apart from trimming surrounding whitespace).
 * - Everything else is resolved against {@link CNN_ORIGIN} with the standard
 *   `URL` resolver, so `/a`, `a` and `//host/a` are all handled correctly
 *   instead of being glued onto the origin by string concatenation.
 * - Values that do not resolve to an http(s) URL (`mailto:`, `javascript:`,
 *   unparseable input) yield an empty string so the caller skips them.
 *
 * @param rawLink - Attribute value taken from the page; may be empty
 * @returns Absolute http(s) URL, or `''` when there is no usable link
 */
function resolveArticleLink(rawLink: string): string {
  const link = rawLink.trim();
  if (!link) return '';

  // Keep absolute links byte-for-byte so existing output does not change.
  if (/^https?:\/\//i.test(link)) return link;

  try {
    const resolved = new URL(link, CNN_ORIGIN);
    const isWebLink = resolved.protocol === 'http:' || resolved.protocol === 'https:';
    return isWebLink ? resolved.href : '';
  } catch {
    // `new URL` throws on input it cannot parse; treat that as "no link".
    return '';
  }
}

/**
 * Parse an HTML file and return the articles found in it.
 *
 * Articles are collected in two passes:
 * 1. every card (`li[data-component-name="card"]`), using its
 *    `data-open-link` attribute or, failing that, its first `a[href]`;
 * 2. every `a.container__link` anchor whose link has not been seen yet and
 *    that sits inside a card, using that card's headline as the title.
 *
 * Entries without a title or link are skipped and links are de-duplicated.
 * Any error (unreadable file, parser failure) is logged with `console.error`
 * and an empty array is returned instead of throwing.
 *
 * @param htmlFilePath - Path to HTML file
 * @returns Array of articles with title and link
 */
function parseArticles(htmlFilePath: string): Article[] {
  try {
    const htmlContent = fs.readFileSync(htmlFilePath, 'utf-8');
    const root = parse(htmlContent);

    const articles: Article[] = [];
    // Links already emitted, so the same article is never listed twice.
    const seenLinks = new Set<string>();

    const addArticle = (title: string, link: string): void => {
      if (!title || !link || seenLinks.has(link)) return;
      seenLinks.add(link);
      articles.push({ title, link });
    };

    // Pass 1: one article per card element.
    for (const card of root.querySelectorAll(CARD_SELECTOR)) {
      const titleElement = card.querySelector(HEADLINE_SELECTOR);
      const title = titleElement ? titleElement.text.trim() : '';

      // `||` (not `??`) on purpose: an empty attribute must fall through
      // to the anchor lookup just like a missing one.
      let rawLink = card.getAttribute('data-open-link') || '';
      if (!rawLink) {
        const anchor = card.querySelector('a[href]');
        rawLink = anchor ? anchor.getAttribute('href') || '' : '';
      }

      addArticle(title, resolveArticleLink(rawLink));
    }

    // Pass 2: container links that pass 1 did not already cover.
    for (const anchor of root.querySelectorAll('a.container__link')) {
      const link = resolveArticleLink(
        anchor.getAttribute('href') || anchor.getAttribute('data-open-link') || '',
      );
      if (!link || seenLinks.has(link)) continue;

      // The headline lives on the enclosing card, not on the anchor itself.
      const parentCard = anchor.closest(CARD_SELECTOR);
      if (!parentCard) continue;

      const titleElement = parentCard.querySelector(HEADLINE_SELECTOR);
      const title = titleElement ? titleElement.text.trim() : '';
      addArticle(title, link);
    }

    return articles;
  } catch (error) {
    console.error('Error parsing HTML:', error);
    return [];
  }
}
|
|
|
|
/**
 * Build the lines of the console preview for a list of articles.
 *
 * Each of the first `limit` articles contributes two lines (numbered title,
 * then link). A trailing "... and N more articles" line is added only when
 * there really are more than `limit` articles, so short lists never report a
 * zero or negative remainder.
 *
 * @param articles - Articles to preview
 * @param limit - Maximum number of articles to list
 * @returns Strings to print, one `console.log` call each
 */
function formatArticlePreview(
  articles: readonly { title: string; link: string }[],
  limit: number,
): string[] {
  const lines: string[] = [];

  articles.slice(0, limit).forEach((article, index) => {
    lines.push(`${index + 1}. ${article.title}`);
    lines.push(` ${article.link}\n`);
  });

  const remaining = articles.length - limit;
  if (remaining > 0) {
    lines.push(`\n... and ${remaining} more articles`);
  }

  return lines;
}

/**
 * Main function for testing: parse a saved page, print a preview of the
 * first 10 articles and write the full list to a JSON file.
 *
 * @param htmlFilePath - HTML file to parse
 * @param outputPath - JSON file to write; its directory is created if missing
 */
function main(
  htmlFilePath = 'source-pages/world.html',
  outputPath = 'data/articles.json',
): void {
  const articles = parseArticles(htmlFilePath);

  console.log(`Found ${articles.length} articles:\n`);

  // Print first 10 articles for demo
  for (const line of formatArticlePreview(articles, 10)) {
    console.log(line);
  }

  // Export to JSON file; create the target directory first so a fresh
  // checkout without it does not fail with ENOENT.
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, JSON.stringify(articles, null, 2));
  console.log(`\nAll articles saved to ${outputPath}`);
}
|
|
|
|
// Run main function
|
|
// NOTE(review): this call is unconditional, so it also runs when another
// module imports this file for the `parseArticles` export below — confirm
// that running the demo on import is intended.
main();
|
|
|
|
// Export function for use in other modules
|
|
export { parseArticles };
|