// imnews.crawl/get-page-details.ts
// 2025-11-12 15:31:38 +07:00
//
// 139 lines
// 4.4 KiB
// TypeScript

import fs from 'fs';
import { parse, HTMLElement } from 'node-html-parser';
/**
 * Fields extracted from one article detail page.
 * parseArticleDetail starts every field as '' and leaves it empty when the
 * page has no matching element.
 */
interface ArticleDetail {
/** Trimmed text of the `.headline__text` element. */
title: string;
/** Canonical URL, or the `og:url` meta content when no canonical link is present. */
link: string;
/** `og:image` meta content, or the `src` / `data-src` of the first matching article image. */
image: string;
/** Article paragraphs joined with blank lines (`\n\n`). */
content: string;
/** `datetime` attribute of the first `<time>` element, or the `.timestamp__time-since` text. */
publishDate: string;
/** Trimmed text of the `.byline__name` element. */
author: string;
}
/** Root node type returned by node-html-parser's `parse`, derived so the helpers track the library's own typing. */
type ParsedRoot = ReturnType<typeof parse>;

/** Trimmed text of the first element matching `selector`, or '' when nothing matches. */
function firstText(root: ParsedRoot, selector: string): string {
  return root.querySelector(selector)?.text.trim() ?? '';
}

/** Value of `attribute` on the first element matching `selector`, or '' when missing or empty. */
function firstAttribute(root: ParsedRoot, selector: string, attribute: string): string {
  // `||` rather than `??` on purpose: an empty attribute value is treated as missing
  // so that callers can chain fallbacks with `||`.
  return root.querySelector(selector)?.getAttribute(attribute) || '';
}

/** Article URL: canonical link first, `og:url` meta as fallback. */
function extractLink(root: ParsedRoot): string {
  return (
    firstAttribute(root, 'link[rel="canonical"]', 'href') ||
    firstAttribute(root, 'meta[property="og:url"]', 'content')
  );
}

/** Featured image URL: `og:image` meta first, then the first image found in the article markup. */
function extractImage(root: ParsedRoot): string {
  const ogImage = firstAttribute(root, 'meta[property="og:image"]', 'content');
  if (ogImage) return ogImage;

  const inlineImage = root.querySelector('.article__content img, .image__picture img, picture img');
  return inlineImage?.getAttribute('src') || inlineImage?.getAttribute('data-src') || '';
}

/** Publish date: `datetime` attribute of the first `<time>`, else the visible timestamp text. */
function extractPublishDate(root: ParsedRoot): string {
  return (
    firstAttribute(root, 'time[datetime]', 'datetime') ||
    firstText(root, '.timestamp__time-since')
  );
}

/** Article body text: filtered paragraphs of the article container, joined with blank lines. */
function extractContent(root: ParsedRoot): string {
  const body = root.querySelector('.article__content, .article-body, .l-container');
  const bodyParagraphs = (body?.querySelectorAll('p.paragraph, p') ?? [])
    .map(p => p.text.trim())
    // Keep paragraphs longer than 20 characters that contain neither 'CNN' nor '©'.
    // NOTE(review): the 'CNN' check also drops ordinary paragraphs that merely
    // mention CNN (e.g. a quote attributed to it) - confirm this is intended.
    .filter(text => text.length > 20 && !text.includes('CNN') && !text.includes('©'));

  if (bodyParagraphs.length > 0) return bodyParagraphs.join('\n\n');

  // Nothing usable in the article container: fall back to the first 10
  // paragraphs anywhere on the page that are longer than 50 characters.
  return root
    .querySelectorAll('p')
    .map(p => p.text.trim())
    .filter(text => text.length > 50)
    .slice(0, 10)
    .join('\n\n');
}

/**
 * Parse article detail page and extract information
 * @param htmlFilePath - Path to HTML file
 * @returns Article details object; fields that cannot be found are left as ''.
 *          Returns null when the file cannot be read or parsing throws (the
 *          error is logged together with the file path).
 */
function parseArticleDetail(htmlFilePath: string): ArticleDetail | null {
  try {
    const root = parse(fs.readFileSync(htmlFilePath, 'utf-8'));
    // Key order matches the ArticleDetail declaration so serialized output keeps the same layout.
    return {
      title: firstText(root, '.headline__text'),
      link: extractLink(root),
      image: extractImage(root),
      content: extractContent(root),
      publishDate: extractPublishDate(root),
      author: firstText(root, '.byline__name'),
    };
  } catch (error) {
    // Include the path: when many files are parsed in a batch, the message
    // alone would not identify which one failed.
    console.error(`Error parsing article detail (${htmlFilePath}):`, error);
    return null;
  }
}
/**
 * Parse several article detail pages in order.
 * Files that fail to parse, or whose parsed title is empty, are left out of the result.
 */
function parseMultipleArticles(htmlFiles: string[]): ArticleDetail[] {
  return htmlFiles
    .map(path => parseArticleDetail(path))
    .filter((detail): detail is ArticleDetail => Boolean(detail && detail.title));
}
/**
 * Main function for testing.
 * Parses one hard-coded sample page, prints the extracted fields to the
 * console and writes them as JSON to data/article-detail.json.
 */
function main(): void {
  const htmlFilePath = 'source-pages/china-people-cafe-change-name-intl-hnk.html';
  const outputDir = 'data';
  const outputPath = 'data/article-detail.json';
  const previewLength = 500;

  // Create data folder if not exists. `recursive: true` makes the call a no-op
  // when the folder is already there, so no separate (racy) existence check is needed.
  fs.mkdirSync(outputDir, { recursive: true });

  console.log('Parsing article detail...\n');
  const article = parseArticleDetail(htmlFilePath);
  if (!article) {
    console.log('Failed to parse article');
    return;
  }

  // Only append the ellipsis when the content was actually cut off.
  const preview =
    article.content.length > previewLength
      ? `${article.content.substring(0, previewLength)}...`
      : article.content;

  console.log('=== ARTICLE DETAILS ===\n');
  console.log(`Title: ${article.title}\n`);
  console.log(`Link: ${article.link}\n`);
  console.log(`Image: ${article.image}\n`);
  console.log(`Author: ${article.author}\n`);
  console.log(`Publish Date: ${article.publishDate}\n`);
  console.log(`Content Preview:\n${preview}\n`);
  console.log(`\nTotal content length: ${article.content.length} characters`);

  fs.writeFileSync(outputPath, JSON.stringify(article, null, 2), 'utf-8');
  console.log('\nArticle details saved to data/article-detail.json');
}
// Run main
// Note: this call is unconditional, so main() also executes (parsing the sample
// page and writing data/article-detail.json) whenever another module imports this file.
main();
// Export for other modules
export { parseArticleDetail, parseMultipleArticles };