import fs from 'fs';
import { parse, HTMLElement } from 'node-html-parser';

/**
 * Fields extracted from one article detail page.
 * A field that cannot be located in the page is left as an empty string.
 */
interface ArticleDetail {
  title: string;
  link: string;
  image: string;
  content: string;
  publishDate: string;
  author: string;
}

/**
 * Trimmed text of the first element matching `selector`, or '' when nothing matches.
 */
function firstText(root: HTMLElement, selector: string): string {
  const element = root.querySelector(selector);
  return element ? element.text.trim() : '';
}

/**
 * Value of `attribute` on the first element matching `selector`.
 * Returns '' when nothing matches or the attribute is missing/empty.
 */
function firstAttribute(root: HTMLElement, selector: string, attribute: string): string {
  return root.querySelector(selector)?.getAttribute(attribute) || '';
}

/**
 * Build the article body text as paragraphs joined by blank lines.
 *
 * Preferred source: `<p>` elements inside the first matching article container,
 * keeping only paragraphs longer than 20 characters that contain neither
 * 'CNN' nor '©' (presumably to drop site boilerplate - note this also drops
 * genuine paragraphs that mention either string).
 *
 * Fallback when that yields nothing: the first 10 `<p>` elements anywhere in
 * the document whose trimmed text is longer than 50 characters.
 */
function extractContent(root: HTMLElement): string {
  const keptParagraphs: string[] = [];
  const articleBody = root.querySelector('.article__content, .article-body, .l-container');

  if (articleBody) {
    for (const paragraph of articleBody.querySelectorAll('p.paragraph, p')) {
      const text = paragraph.text.trim();
      if (text.length > 20 && !text.includes('CNN') && !text.includes('©')) {
        keptParagraphs.push(text);
      }
    }
  }

  if (keptParagraphs.length > 0) {
    return keptParagraphs.join('\n\n');
  }

  return root
    .querySelectorAll('p')
    .map(paragraph => paragraph.text.trim())
    .filter(text => text.length > 50)
    .slice(0, 10)
    .join('\n\n');
}

/**
 * Parse article detail page and extract information.
 *
 * Reads the file synchronously as UTF-8 and parses it with node-html-parser.
 * Any error raised while reading or parsing is logged via `console.error`
 * and reported to the caller as `null` rather than thrown.
 *
 * @param htmlFilePath - Path to HTML file
 * @returns Article details object, or `null` if reading/parsing failed
 */
function parseArticleDetail(htmlFilePath: string): ArticleDetail | null {
  try {
    const htmlContent = fs.readFileSync(htmlFilePath, 'utf-8');
    const root = parse(htmlContent);

    // 1. Title
    const title = firstText(root, '.headline__text');

    // 2. Link: canonical URL first, Open Graph URL as fallback
    const link =
      firstAttribute(root, 'link[rel="canonical"]', 'href') ||
      firstAttribute(root, 'meta[property="og:url"]', 'content');

    // 3. Featured image: Open Graph image first, then the first in-page image
    //    (`src`, or `data-src` when `src` is missing/empty)
    let image = firstAttribute(root, 'meta[property="og:image"]', 'content');
    if (!image) {
      const firstImage = root.querySelector('.article__content img, .image__picture img, picture img');
      if (firstImage) {
        image = firstImage.getAttribute('src') || firstImage.getAttribute('data-src') || '';
      }
    }

    // 4. Publish date: `datetime` attribute first, visible timestamp text as fallback
    const publishDate =
      firstAttribute(root, 'time[datetime]', 'datetime') ||
      firstText(root, '.timestamp__time-since');

    // 5. Author
    const author = firstText(root, '.byline__name');

    // 6. Content
    const content = extractContent(root);

    return { title, link, image, content, publishDate, author };
  } catch (error) {
    console.error('Error parsing article detail:', error);
    return null;
  }
}

/**
 * Parse multiple article detail pages.
 *
 * Files that fail to parse, or whose parsed title is empty, are skipped,
 * so the result may be shorter than `htmlFiles`.
 *
 * @param htmlFiles - Paths of the HTML files to parse, processed in order
 * @returns Successfully parsed articles that have a non-empty title
 */
function parseMultipleArticles(htmlFiles: string[]): ArticleDetail[] {
  const articles: ArticleDetail[] = [];
  for (const filePath of htmlFiles) {
    const article = parseArticleDetail(filePath);
    if (article && article.title) {
      articles.push(article);
    }
  }
  return articles;
}

/**
 * Main function for testing.
 *
 * Parses one hard-coded sample page, prints the extracted fields, and writes
 * them as pretty-printed JSON to `data/article-detail.json`.
 */
function main(): void {
  const htmlFilePath = 'source-pages/china-people-cafe-change-name-intl-hnk.html';

  // Create data folder if not exists. `recursive: true` makes this a no-op
  // when the directory is already there, so no separate existence check is needed.
  fs.mkdirSync('data', { recursive: true });

  console.log('Parsing article detail...\n');
  const article = parseArticleDetail(htmlFilePath);

  if (article) {
    console.log('=== ARTICLE DETAILS ===\n');
    console.log(`Title: ${article.title}\n`);
    console.log(`Link: ${article.link}\n`);
    console.log(`Image: ${article.image}\n`);
    console.log(`Author: ${article.author}\n`);
    console.log(`Publish Date: ${article.publishDate}\n`);
    console.log(`Content Preview:\n${article.content.substring(0, 500)}...\n`);
    console.log(`\nTotal content length: ${article.content.length} characters`);

    fs.writeFileSync('data/article-detail.json', JSON.stringify(article, null, 2), 'utf-8');
    console.log('\nArticle details saved to data/article-detail.json');
  } else {
    console.log('Failed to parse article');
  }
}

// Run main.
// NOTE(review): this call executes whenever the module is evaluated, including
// when another module imports the exports below - confirm that is intended.
main();

// Export for other modules
export { parseArticleDetail, parseMultipleArticles };