backup source local
This commit is contained in:
parent
3d0aef852f
commit
abba78815f
7 changed files with 25491 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
package-lock.json
|
||||||
|
node_modules
|
||||||
|
data/*
|
||||||
119
get-list-pages.ts
Normal file
119
get-list-pages.ts
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
import fs from 'fs';
|
||||||
|
import { parse, HTMLElement } from 'node-html-parser';
|
||||||
|
|
||||||
|
/**
 * One entry extracted from a listing page.
 */
interface Article {
  /** Headline text of the article, trimmed of surrounding whitespace. */
  title: string;
  /** Absolute URL of the article. */
  link: string;
}
|
||||||
|
|
||||||
|
/**
 * Parse a saved listing page and return the articles it links to.
 *
 * Two passes are made over the document:
 *  1. every `li[data-component-name="card"]` element, taking the link from
 *     its `data-open-link` attribute or, failing that, its first `a[href]`;
 *  2. every `a.container__link` anchor that sits inside such a card, to pick
 *     up links the first pass did not record.
 *
 * Links are normalised to absolute URLs and de-duplicated; entries without
 * both a headline and a usable link are skipped.
 *
 * @param htmlFilePath - Path to the HTML file to read.
 * @returns Articles in document order. An empty array is returned (and the
 *          error logged) if the file cannot be read or parsed.
 */
function parseArticles(htmlFilePath: string): Article[] {
  const cardSelector = 'li[data-component-name="card"]';
  const headlineSelector = '.container__headline-text';

  try {
    const htmlContent = fs.readFileSync(htmlFilePath, 'utf-8');
    const root = parse(htmlContent);

    const articles: Article[] = [];
    // Links already emitted, so the second pass cannot add duplicates.
    const seenLinks = new Set<string>();

    const record = (title: string, link: string): void => {
      if (!title || !link || seenLinks.has(link)) return;
      seenLinks.add(link);
      articles.push({ title, link });
    };

    // Pass 1: card elements.
    for (const card of root.querySelectorAll(cardSelector)) {
      const rawLink =
        card.getAttribute('data-open-link') ||
        card.querySelector('a[href]')?.getAttribute('href') ||
        '';
      const title = card.querySelector(headlineSelector)?.text.trim() ?? '';
      record(title, toAbsoluteArticleUrl(rawLink));
    }

    // Pass 2: container links; the headline is looked up on the enclosing card.
    for (const anchor of root.querySelectorAll('a.container__link')) {
      const rawLink =
        anchor.getAttribute('href') || anchor.getAttribute('data-open-link') || '';
      const link = toAbsoluteArticleUrl(rawLink);

      // Skip early so the ancestor lookup is only done for new links.
      if (!link || seenLinks.has(link)) continue;

      const card = anchor.closest(cardSelector);
      if (!card) continue;

      const title = card.querySelector(headlineSelector)?.text.trim() ?? '';
      record(title, link);
    }

    return articles;
  } catch (error) {
    // Best-effort: callers get an empty list rather than an exception.
    console.error('Error parsing HTML:', error);
    return [];
  }
}

/**
 * Turn a link taken from the page into an absolute http(s) URL.
 *
 * Links that already start with `http://` or `https://` are returned
 * unchanged. Anything else is resolved against `https://www.cnn.com` with the
 * URL parser, so protocol-relative (`//host/path`) and slash-less relative
 * (`world/story`) links resolve correctly instead of being glued onto the
 * domain. Links that resolve to a non-http(s) scheme (e.g. `mailto:`) or that
 * cannot be parsed yield an empty string.
 */
function toAbsoluteArticleUrl(rawLink: string): string {
  const link = rawLink.trim();
  if (!link) return '';
  if (/^https?:\/\//i.test(link)) return link;

  try {
    const resolved = new URL(link, 'https://www.cnn.com');
    const isWebUrl = resolved.protocol === 'http:' || resolved.protocol === 'https:';
    return isWebUrl ? resolved.href : '';
  } catch {
    return '';
  }
}
|
||||||
|
|
||||||
|
/**
 * Main function for testing: parses the saved listing page, prints a short
 * preview to the console and writes the full result to `data/articles.json`.
 */
function main(): void {
  const htmlFilePath = 'source-pages/world.html';
  const outputDir = 'data';
  const outputPath = `${outputDir}/articles.json`;
  const previewCount = 10;

  const articles = parseArticles(htmlFilePath);

  console.log(`Found ${articles.length} articles:\n`);

  // Print only the first few articles as a preview.
  articles.slice(0, previewCount).forEach((article, index) => {
    console.log(`${index + 1}. ${article.title}`);
    console.log(` ${article.link}\n`);
  });

  // Only mention a remainder when there is one; otherwise the count would be
  // zero or negative.
  const remaining = articles.length - previewCount;
  if (remaining > 0) {
    console.log(`\n... and ${remaining} more articles`);
  }

  // The output directory may not exist yet; create it before writing.
  fs.mkdirSync(outputDir, { recursive: true });
  fs.writeFileSync(outputPath, JSON.stringify(articles, null, 2), 'utf-8');
  console.log(`\nAll articles saved to ${outputPath}`);
}
|
||||||
|
|
||||||
|
// Run main function
|
||||||
|
main();
|
||||||
|
|
||||||
|
// Export function for use in other modules
|
||||||
|
export { parseArticles };
|
||||||
139
get-page-details.ts
Normal file
139
get-page-details.ts
Normal file
|
|
@ -0,0 +1,139 @@
|
||||||
|
import fs from 'fs';
|
||||||
|
import { parse, HTMLElement } from 'node-html-parser';
|
||||||
|
|
||||||
|
/**
 * Fields extracted from a single article page. A field is an empty string
 * when the corresponding element was not found.
 */
interface ArticleDetail {
  /** Headline text. */
  title: string;
  /** URL of the article as declared by the page itself. */
  link: string;
  /** URL of the featured image. */
  image: string;
  /** Body text, paragraphs separated by a blank line. */
  content: string;
  /** Publication timestamp as found in the page. */
  publishDate: string;
  /** Author name from the byline. */
  author: string;
}
|
||||||
|
|
||||||
|
/**
 * Parse a saved article page and extract its details.
 *
 * Each field is read from a primary location with a fallback:
 *  - title: `.headline__text`
 *  - link: `<link rel="canonical">`, else `og:url`
 *  - image: `og:image`, else the first image found in the article area
 *  - publishDate: `time[datetime]`, else the text of `.timestamp__time-since`
 *  - author: `.byline__name`
 *  - content: paragraphs of the article body, else the first ten long
 *    paragraphs anywhere in the page
 *
 * Fallback selectors are tried one at a time in priority order, so a more
 * generic container earlier in the document cannot win over a more specific
 * one.
 *
 * @param htmlFilePath - Path to the HTML file to read.
 * @returns The extracted details (missing fields are empty strings), or
 *          `null` if the file cannot be read or parsed (the error is logged).
 */
function parseArticleDetail(htmlFilePath: string): ArticleDetail | null {
  try {
    const root = parse(fs.readFileSync(htmlFilePath, 'utf-8'));

    // Small readers that collapse "element missing" and "value missing" to ''.
    const attributeOf = (selector: string, name: string): string =>
      root.querySelector(selector)?.getAttribute(name) || '';
    const textOf = (selector: string): string =>
      root.querySelector(selector)?.text.trim() || '';

    // 1. Title
    const title = textOf('.headline__text');

    // 2. Link
    const link =
      attributeOf('link[rel="canonical"]', 'href') ||
      attributeOf('meta[property="og:url"]', 'content');

    // 3. Featured image
    let image = attributeOf('meta[property="og:image"]', 'content');
    if (!image) {
      const firstImage = queryFirst(root, [
        '.article__content img',
        '.image__picture img',
        'picture img',
      ]);
      image = firstImage?.getAttribute('src') || firstImage?.getAttribute('data-src') || '';
    }

    // 4. Publish date
    const publishDate =
      attributeOf('time[datetime]', 'datetime') || textOf('.timestamp__time-since');

    // 5. Author
    const author = textOf('.byline__name');

    // 6. Content
    const articleBody = queryFirst(root, ['.article__content', '.article-body', '.l-container']);
    const bodyParagraphs: HTMLElement[] = articleBody ? articleBody.querySelectorAll('p') : [];
    // NOTE(review): the 'CNN' / '©' checks look intended to drop boilerplate,
    // but they also drop any body paragraph that mentions CNN — confirm this
    // is the desired trade-off.
    const keptParagraphs = bodyParagraphs
      .map(p => p.text.trim())
      .filter(text => text.length > 20 && !text.includes('CNN') && !text.includes('©'));

    let content: string;
    if (keptParagraphs.length > 0) {
      content = keptParagraphs.join('\n\n');
    } else {
      // Nothing usable in the article body: fall back to the first ten
      // reasonably long paragraphs anywhere in the document.
      content = root
        .querySelectorAll('p')
        .map(p => p.text.trim())
        .filter(text => text.length > 50)
        .slice(0, 10)
        .join('\n\n');
    }

    return { title, link, image, content, publishDate, author };
  } catch (error) {
    // Best-effort: callers get null rather than an exception.
    console.error('Error parsing article detail:', error);
    return null;
  }
}

/**
 * Return the first element matched by the first selector in `selectors` that
 * matches anything under `root`, or `null` if none match.
 */
function queryFirst(root: HTMLElement, selectors: string[]): HTMLElement | null {
  for (const selector of selectors) {
    const match = root.querySelector(selector);
    if (match) return match;
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Parse several article detail pages.
 *
 * @param htmlFiles - Paths of the HTML files to parse, in order.
 * @returns Details for every file that parsed successfully and has a
 *          non-empty title; other files are left out.
 */
function parseMultipleArticles(htmlFiles: string[]): ArticleDetail[] {
  return htmlFiles
    .map(filePath => parseArticleDetail(filePath))
    .filter((detail): detail is ArticleDetail => detail !== null && detail.title !== '');
}
|
||||||
|
|
||||||
|
/**
 * Main function for testing: parses one saved article page, prints a summary
 * to the console and writes the result to `data/article-detail.json`.
 */
function main(): void {
  const htmlFilePath = 'source-pages/china-people-cafe-change-name-intl-hnk.html';

  // Make sure the output folder is there before anything is written.
  const dataDirExists = fs.existsSync('data');
  if (!dataDirExists) {
    fs.mkdirSync('data');
  }

  console.log('Parsing article detail...\n');
  const detail = parseArticleDetail(htmlFilePath);

  if (!detail) {
    console.log('Failed to parse article');
    return;
  }

  const preview = detail.content.substring(0, 500);

  console.log('=== ARTICLE DETAILS ===\n');
  console.log(`Title: ${detail.title}\n`);
  console.log(`Link: ${detail.link}\n`);
  console.log(`Image: ${detail.image}\n`);
  console.log(`Author: ${detail.author}\n`);
  console.log(`Publish Date: ${detail.publishDate}\n`);
  console.log(`Content Preview:\n${preview}...\n`);
  console.log(`\nTotal content length: ${detail.content.length} characters`);

  const serialized = JSON.stringify(detail, null, 2);
  fs.writeFileSync('data/article-detail.json', serialized, 'utf-8');
  console.log('\nArticle details saved to data/article-detail.json');
}
|
||||||
|
|
||||||
|
// Run main
|
||||||
|
main();
|
||||||
|
|
||||||
|
// Export for other modules
|
||||||
|
export { parseArticleDetail, parseMultipleArticles };
|
||||||
22
package.json
Normal file
22
package.json
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
{
|
||||||
|
"name": "crawl-ttr-test",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "dist/parseArticles.js",
|
||||||
|
"scripts": {
|
||||||
|
"dev-get-list": "ts-node-dev --respawn --transpile-only get-list-pages.ts",
|
||||||
|
"dev-get-page-details": "ts-node-dev --respawn --transpile-only get-page-details.ts"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"type": "commonjs",
|
||||||
|
"dependencies": {
|
||||||
|
"node-html-parser": "^7.0.1"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^24.10.1",
|
||||||
|
"ts-node-dev": "^2.0.0",
|
||||||
|
"typescript": "^5.9.3"
|
||||||
|
}
|
||||||
|
}
|
||||||
9672
source-pages/china-people-cafe-change-name-intl-hnk.html
Normal file
9672
source-pages/china-people-cafe-change-name-intl-hnk.html
Normal file
File diff suppressed because one or more lines are too long
15524
source-pages/world.html
Normal file
15524
source-pages/world.html
Normal file
File diff suppressed because one or more lines are too long
12
tsconfig.json
Normal file
12
tsconfig.json
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2020",
|
||||||
|
"module": "CommonJS",
|
||||||
|
"moduleResolution": "Node",
|
||||||
|
"strict": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"forceConsistentCasingInFileNames": true,
|
||||||
|
"outDir": "dist"
|
||||||
|
},
|
||||||
|
"include": ["*.ts"]
|
||||||
|
}
|
||||||
Loading…
Reference in a new issue