import fs from 'fs';
import path from 'path';
import { parse, HTMLElement } from 'node-html-parser';

/** One article extracted from the page: headline text plus an absolute URL. */
interface Article {
  title: string;
  link: string;
}

/** Origin against which relative links found in the markup are resolved. */
const BASE_URL = 'https://www.cnn.com';

/** Selector for the list items that wrap each article card. */
const CARD_SELECTOR = 'li[data-component-name="card"]';

/** Selector for the headline text inside a card. */
const HEADLINE_SELECTOR = '.container__headline-text';

/**
 * Turn a raw `href` / `data-open-link` value into an absolute http(s) URL.
 *
 * Links that already carry an http(s) scheme are returned untouched. Anything
 * else is resolved against {@link BASE_URL} with the standard `URL` parser, so
 * root-relative (`/a/b`), path-relative (`a/b`) and protocol-relative
 * (`//host/a`) forms all produce a well-formed address.
 *
 * @param rawLink - Attribute value as read from the markup.
 * @returns The absolute URL, or `''` when the value is empty, cannot be parsed,
 *          or resolves to a non-http(s) scheme such as `javascript:` or `mailto:`.
 */
function toAbsoluteUrl(rawLink: string): string {
  const link = rawLink.trim();
  if (!link) return '';

  // Already absolute: keep the author's exact spelling instead of re-serialising it.
  if (/^https?:\/\//i.test(link)) return link;

  try {
    const resolved = new URL(link, BASE_URL);
    const isWebLink = resolved.protocol === 'http:' || resolved.protocol === 'https:';
    return isWebLink ? resolved.href : '';
  } catch {
    // `new URL` throws on input it cannot parse; treat that as "no usable link".
    return '';
  }
}

/**
 * Read the trimmed headline text of a card element.
 *
 * @param card - Element expected to contain a headline node.
 * @returns The headline text, or `''` when the card has no headline node.
 */
function extractTitle(card: HTMLElement): string {
  return card.querySelector(HEADLINE_SELECTOR)?.text.trim() ?? '';
}

/**
 * Parse an HTML file and return the articles found in it.
 *
 * Two passes are made over the document:
 *  1. every card `li`, taking its link from `data-open-link` or, failing that,
 *     from the first anchor with an `href` inside the card;
 *  2. every `a.container__link`, to pick up links pass 1 did not record, using
 *     the headline of the enclosing card as the title.
 *
 * An article is recorded only when it has both a title and a link, and each
 * link is recorded at most once (first occurrence wins).
 *
 * @param htmlFilePath - Path to the HTML file.
 * @returns Articles with title and absolute link. If reading or parsing throws,
 *          the error is logged and an empty array is returned.
 */
function parseArticles(htmlFilePath: string): Article[] {
  try {
    const htmlContent = fs.readFileSync(htmlFilePath, 'utf-8');
    const root = parse(htmlContent);

    const articles: Article[] = [];
    // Links already emitted, used to avoid duplicates across both passes.
    const seenLinks = new Set<string>();

    const addArticle = (title: string, link: string): void => {
      if (!title || !link || seenLinks.has(link)) return;
      seenLinks.add(link);
      articles.push({ title, link });
    };

    // Pass 1: card list items.
    for (const card of root.querySelectorAll(CARD_SELECTOR)) {
      // `||` rather than `??` on purpose: an empty attribute value must fall
      // through to the next candidate.
      const rawLink =
        card.getAttribute('data-open-link') ||
        card.querySelector('a[href]')?.getAttribute('href') ||
        '';
      addArticle(extractTitle(card), toAbsoluteUrl(rawLink));
    }

    // Pass 2: container links whose target was not recorded by pass 1.
    for (const anchor of root.querySelectorAll('a.container__link')) {
      const rawLink =
        anchor.getAttribute('href') || anchor.getAttribute('data-open-link') || '';
      const link = toAbsoluteUrl(rawLink);
      if (!link || seenLinks.has(link)) continue;

      // The title lives on the enclosing card; anchors outside a card are skipped.
      const card = anchor.closest(CARD_SELECTOR);
      if (card) {
        addArticle(extractTitle(card), link);
      }
    }

    return articles;
  } catch (error) {
    console.error('Error parsing HTML:', error);
    return [];
  }
}

/**
 * Main function for testing: parses the sample page, prints a short preview
 * and writes every article to a JSON file.
 */
function main(): void {
  const htmlFilePath = 'source-pages/world.html';
  const outputPath = 'data/articles.json';
  const previewCount = 10;

  const articles = parseArticles(htmlFilePath);

  console.log(`Found ${articles.length} articles:\n`);

  // Print the first few articles for demo purposes.
  articles.slice(0, previewCount).forEach((article, index) => {
    console.log(`${index + 1}. ${article.title}`);
    console.log(` ${article.link}\n`);
  });

  // Only mention a remainder when there is one; otherwise the count would be
  // zero or negative.
  const remaining = articles.length - previewCount;
  if (remaining > 0) {
    console.log(`\n... and ${remaining} more articles`);
  }

  // Export to JSON file, creating the target directory first so the write does
  // not fail on a fresh checkout.
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, JSON.stringify(articles, null, 2));

  console.log('\nAll articles saved to articles.json');
}

// Run main function.
// NOTE(review): this call executes on every load of the module, including when
// it is imported only for `parseArticles` — consider an entry-point guard.
main();

// Export function for use in other modules
export { parseArticles };