diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e7871ec
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+package-lock.json
+node_modules
+data/*
\ No newline at end of file
diff --git a/get-list-pages.ts b/get-list-pages.ts
new file mode 100644
--- /dev/null
+++ b/get-list-pages.ts
@@ -0,0 +1,127 @@
+import fs from 'fs';
+import { parse, HTMLElement } from 'node-html-parser';
+
+/** A headline/link pair extracted from a CNN section (list) page. */
+export interface Article {
+  title: string;
+  link: string;
+}
+
+/** Origin used to resolve site-relative links found in the markup. */
+const BASE_URL = 'https://www.cnn.com';
+
+/** Card element that wraps each article teaser on the list page. */
+const CARD_SELECTOR = 'li[data-component-name="card"]';
+
+/** Element holding the headline text inside a card. */
+const HEADLINE_SELECTOR = '.container__headline-text';
+
+/**
+ * Turn a link taken from the markup into an absolute http(s) URL.
+ * @param rawLink - Attribute value as found in the HTML
+ * @returns Absolute URL, or '' when the value is empty, malformed or not http(s)
+ */
+function toAbsoluteUrl(rawLink: string): string {
+  const link = rawLink.trim();
+  if (!link) return '';
+
+  // Already absolute: keep it untouched so de-duplication keys stay stable.
+  if (/^https?:\/\//i.test(link)) return link;
+
+  try {
+    // URL resolution also copes with "world/x" and "//edition.cnn.com/x",
+    // which plain string concatenation would turn into broken addresses.
+    const resolved = new URL(link, BASE_URL);
+    return /^https?:$/.test(resolved.protocol) ? resolved.href : '';
+  } catch {
+    return '';
+  }
+}
+
+/** Read the trimmed headline text of a card ('' when the card has none). */
+function headlineOf(card: HTMLElement): string {
+  return card.querySelector(HEADLINE_SELECTOR)?.text.trim() ?? '';
+}
+
+/**
+ * Parse a saved list page and return the articles it links to.
+ *
+ * Read or parse failures are logged and reported as an empty array.
+ * @param htmlFilePath - Path to HTML file
+ * @returns Unique articles (by link) in the order they are first found
+ */
+function parseArticles(htmlFilePath: string): Article[] {
+  try {
+    const root = parse(fs.readFileSync(htmlFilePath, 'utf-8'));
+
+    const articles: Article[] = [];
+    // Links already emitted, so a story is listed once.
+    const seenLinks = new Set<string>();
+
+    const addArticle = (title: string, link: string): void => {
+      if (!title || !link || seenLinks.has(link)) return;
+      seenLinks.add(link);
+      articles.push({ title, link });
+    };
+
+    // Pass 1: cards. The link is on the card itself or on an inner anchor.
+    for (const card of root.querySelectorAll(CARD_SELECTOR)) {
+      const rawLink =
+        card.getAttribute('data-open-link') ||
+        card.querySelector('a[href]')?.getAttribute('href') ||
+        '';
+      addArticle(headlineOf(card), toAbsoluteUrl(rawLink));
+    }
+
+    // Pass 2: container links whose own URL was not collected by pass 1.
+    for (const anchor of root.querySelectorAll('a.container__link')) {
+      const rawLink =
+        anchor.getAttribute('href') || anchor.getAttribute('data-open-link') || '';
+      const card = anchor.closest(CARD_SELECTOR);
+      if (card) addArticle(headlineOf(card), toAbsoluteUrl(rawLink));
+    }
+
+    return articles;
+  } catch (error) {
+    console.error('Error parsing HTML:', error);
+    return [];
+  }
+}
+
+/**
+ * Demo entry point: parse the saved world page, print a preview and
+ * write every article to data/articles.json.
+ */
+function main(): void {
+  const htmlFilePath = 'source-pages/world.html';
+  const outputDir = 'data';
+  const outputPath = `${outputDir}/articles.json`;
+  const previewCount = 10;
+
+  const articles = parseArticles(htmlFilePath);
+
+  console.log(`Found ${articles.length} articles:\n`);
+
+  // Print the first few articles for demo
+  articles.slice(0, previewCount).forEach((article, index) => {
+    console.log(`${index + 1}. ${article.title}`);
+    console.log(`   ${article.link}\n`);
+  });
+
+  // Mention a remainder only when there is one; otherwise the count is negative.
+  const remaining = articles.length - previewCount;
+  if (remaining > 0) {
+    console.log(`\n... and ${remaining} more articles`);
+  }
+
+  // Everything under data/ is git-ignored, so the directory may not exist yet.
+  fs.mkdirSync(outputDir, { recursive: true });
+  fs.writeFileSync(outputPath, JSON.stringify(articles, null, 2), 'utf-8');
+  console.log(`\nAll articles saved to ${outputPath}`);
+}
+
+// Run main function (this also runs when the module is imported)
+main();
+
+// Export function for use in other modules
+export { parseArticles };
diff --git a/get-page-details.ts b/get-page-details.ts
new file mode 100644
--- /dev/null
+++ b/get-page-details.ts
@@ -0,0 +1,148 @@
+import fs from 'fs';
+import { parse, HTMLElement } from 'node-html-parser';
+
+/** Fields extracted from one article page; a value that is not found is ''. */
+export interface ArticleDetail {
+  title: string;
+  link: string;
+  image: string;
+  content: string;
+  publishDate: string;
+  author: string;
+}
+
+/** Article body containers, most specific first. */
+const BODY_SELECTORS = ['.article__content', '.article-body', '.l-container'];
+
+/** In-page image fallbacks used when og:image is absent, most specific first. */
+const IMAGE_SELECTORS = ['.article__content img', '.image__picture img', 'picture img'];
+
+/**
+ * Return the first element found when trying `selectors` one at a time.
+ *
+ * A single comma-separated selector returns the first match in document
+ * order, so a generic container near the top of the page would win over the
+ * real article body; trying the selectors in turn keeps the priority explicit.
+ */
+function firstMatch(root: HTMLElement, selectors: readonly string[]): HTMLElement | null {
+  for (const selector of selectors) {
+    const found = root.querySelector(selector);
+    if (found) return found;
+  }
+  return null;
+}
+
+/** Trimmed text of the first element matching `selector` ('' when absent). */
+function textOf(root: HTMLElement, selector: string): string {
+  return root.querySelector(selector)?.text.trim() ?? '';
+}
+
+/** Attribute of the first element matching `selector` ('' when absent). */
+function attrOf(root: HTMLElement, selector: string, name: string): string {
+  return root.querySelector(selector)?.getAttribute(name) ?? '';
+}
+
+/**
+ * Collect the article text as paragraphs separated by blank lines.
+ */
+function extractContent(root: HTMLElement): string {
+  const body = firstMatch(root, BODY_SELECTORS);
+  const paragraphs = (body?.querySelectorAll('p') ?? [])
+    .map(p => p.text.trim())
+    // NOTE(review): the 'CNN' check also drops genuine sentences that mention
+    // CNN; kept as-is because the intended boilerplate is not visible here.
+    .filter(text => text.length > 20 && !text.includes('CNN') && !text.includes('©'));
+  if (paragraphs.length > 0) return paragraphs.join('\n\n');
+
+  // Fallback: no usable body found, take the first long paragraphs of the page.
+  return root
+    .querySelectorAll('p')
+    .map(p => p.text.trim())
+    .filter(text => text.length > 50)
+    .slice(0, 10)
+    .join('\n\n');
+}
+
+/**
+ * Parse article detail page and extract information
+ *
+ * Read or parse failures are logged and reported as null.
+ * @param htmlFilePath - Path to HTML file
+ * @returns Article details object, or null on failure
+ */
+function parseArticleDetail(htmlFilePath: string): ArticleDetail | null {
+  try {
+    const root = parse(fs.readFileSync(htmlFilePath, 'utf-8'));
+
+    // Featured image fallback, looked up only when og:image is missing.
+    const inlineImage = (): string => {
+      const img = firstMatch(root, IMAGE_SELECTORS);
+      return img?.getAttribute('src') || img?.getAttribute('data-src') || '';
+    };
+
+    return {
+      title: textOf(root, '.headline__text'),
+      // Canonical URL first, Open Graph URL as fallback.
+      link:
+        attrOf(root, 'link[rel="canonical"]', 'href') ||
+        attrOf(root, 'meta[property="og:url"]', 'content'),
+      image: attrOf(root, 'meta[property="og:image"]', 'content') || inlineImage(),
+      content: extractContent(root),
+      // Machine-readable timestamp first, visible timestamp text as fallback.
+      publishDate:
+        attrOf(root, 'time[datetime]', 'datetime') ||
+        textOf(root, '.timestamp__time-since'),
+      author: textOf(root, '.byline__name'),
+    };
+  } catch (error) {
+    console.error('Error parsing article detail:', error);
+    return null;
+  }
+}
+
+/**
+ * Parse multiple article detail pages
+ * @param htmlFiles - Paths to HTML files
+ * @returns Details of the pages that parsed and have a title, in input order
+ */
+function parseMultipleArticles(htmlFiles: string[]): ArticleDetail[] {
+  return htmlFiles
+    .map(filePath => parseArticleDetail(filePath))
+    .filter((article): article is ArticleDetail => article !== null && article.title !== '');
+}
+
+/**
+ * Demo entry point: parse the saved sample article, print a summary and
+ * write the result to data/article-detail.json.
+ */
+function main(): void {
+  const htmlFilePath = 'source-pages/china-people-cafe-change-name-intl-hnk.html';
+
+  // Create data folder if not exists
+  fs.mkdirSync('data', { recursive: true });
+
+  console.log('Parsing article detail...\n');
+  const article = parseArticleDetail(htmlFilePath);
+
+  if (article) {
+    console.log('=== ARTICLE DETAILS ===\n');
+    console.log(`Title: ${article.title}\n`);
+    console.log(`Link: ${article.link}\n`);
+    console.log(`Image: ${article.image}\n`);
+    console.log(`Author: ${article.author}\n`);
+    console.log(`Publish Date: ${article.publishDate}\n`);
+    console.log(`Content Preview:\n${article.content.substring(0, 500)}...\n`);
+    console.log(`\nTotal content length: ${article.content.length} characters`);
+
+    fs.writeFileSync('data/article-detail.json', JSON.stringify(article, null, 2), 'utf-8');
+    console.log('\nArticle details saved to data/article-detail.json');
+  } else {
+    console.log('Failed to parse article');
+  }
+}
+
+// Run main (this also runs when the module is imported)
+main();
+
+// Export for other modules
+export { parseArticleDetail, parseMultipleArticles };
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..f8f5865
--- /dev/null
+++ b/package.json
@@ -0,0 +1,22 @@
+{
+  "name": "crawl-ttr-test",
+  "version": "1.0.0",
+  "description": "",
+  "main": "dist/parseArticles.js",
+  "scripts": {
+    "dev-get-list": "ts-node-dev --respawn --transpile-only get-list-pages.ts",
+    "dev-get-page-details": "ts-node-dev --respawn --transpile-only get-page-details.ts"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "type": "commonjs",
+  "dependencies": {
+    "node-html-parser": "^7.0.1"
+  },
+  "devDependencies": {
+    "@types/node": "^24.10.1",
+    "ts-node-dev": "^2.0.0",
+    "typescript": "^5.9.3"
+  }
+}
diff --git a/source-pages/china-people-cafe-change-name-intl-hnk.html b/source-pages/china-people-cafe-change-name-intl-hnk.html
new file mode 100644
index 0000000..0483ec0
--- /dev/null
+++ b/source-pages/china-people-cafe-change-name-intl-hnk.html
@@ -0,0 +1,9672 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A Chinese coffee shop called itself People’s Cafe. That name got it roasted. 
| CNN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + +
+
+
+ + +
+
+ +
+ +
+
+
+
+ + + +
+
+ + + World + + China + + +
+ +
+
+
+
+
+
+

+ A Chinese coffee shop called itself People’s Cafe. That name got it roasted. +

+ +
+ +
+ +
+ +
+
+
+
+ +
+
+
+
+ +
+ People's Cafe in Harbin +
+ + + + +
+ +
+ + + + +
+
+
+
+
+
+
+ + +
+
+
+ + + + + +
+
+ + +
+
+
+ + +
+
+ +
+
+ + Hong Kong +  —  + +
+ +

+ A Chinese coffee chain has apologized and changed its name after state media roasted it for misusing a word revered by the nation’s communist government. +

+ +

+ People’s Cafe decorates most of its branches in bright red, often with a star on their storefront, evoking a very Chinese Communist Party aesthetic – as well as using a typeface inspired by the calligraphy of China’s first communist leader, Mao Zedong. +

+ +

+ The company issued an apology Saturday, days after state media singled it out for riding on the “gimmick economy.” +

+ +

+ As in many officially communist states, the term “people” – or “renmin,” in Mandarin – is deployed liberally but earnestly by the Chinese government. +

+ +

+ The country’s official name is the People’s Republic of China, with many authoritative institutions also deploying the prefix, including its military, the People’s Liberation Army; its currency, the renminbi; and even the Communist Party mouthpiece that criticized the cafe. +

+ +

+ The term “carries a distinct public character and profound political connotations, embodying specific social sentiments and public interests,” the People’s Daily wrote in an opinion article Thursday, adding that the term “must not be profaned, nor can it be misused.” +

+ + + + + + + +

+ “Marketing can be creative, but it must not cross the bottom line,” it wrote. +

+ +

+ Pictures circulated on Chinese social media show the cafe’s patrons being served drinks sprinkled with “China” on the top. The slogan “Tell China’s story with coffee” adorned the walls and facades of some branches. +

+ +

+ On Saturday, Yao Chao People’s Cafe, which has about 30 outlets across the country, according to state media, issued an apology on social media. +

+ +

+ “We extend our sincerest apologies to every consumer who has been hurt,” it said, adding that after “deep reflection” it had taken “proactive steps” to rectify the situation. +

+ +

+ The chain said it would add “Yao Chao” back to the name of all shops in mainland China to reflect its legally registered moniker, although branches overseas will remain People’s Cafe, as well as in the semi-autonomous cities of Hong Kong and Macao. +

+ +

+ The chain also faced a chiding on social media. “Not a very smart move,” one user wrote on X-like platform Weibo. +

+ +

+ Another wrote: “Isn’t that in breach of the law? Shouldn’t it be suspended and reorganized?” +

+ +

+ The chain’s rapid expansion in recent years – alongside bigger local coffee makers such as Luckin Coffee – highlights rising demand for a brew in the nation of 1.4 billion people, traditionally dominated by tea. +

+ +

+ The market has also become increasingly cut-throat with cheaper options from local makers squeezing foreign chains such as Starbucks, once a symbol of status and Western influence in a more affluent China. +

+ +

+ Last week, the Seattle-founded coffee giant announced that it would sell the controlling stake of its more than 8,000 outlets in China to a local investment firm. +

+ +
+
+
+
+
+
+
+ + +
+
+
+ + + + + +
+
+ + +
+
+
+ + +
+
+ +
+ +
+
+
+
+ +
+ +
+
+
+
+ +
+ +
+
+ +
+
+
+
+
+
+ +
+ +
+
+ +
+ +
+ + + +
+
+
+
+
+ +
+ +
+ +
+ +
+
+ +
+ +
+ +
+
+ +
+ +
+ +
+
+
+
+
+ + +
+
+ +
+ +
+
+
+ + + diff --git a/source-pages/world.html b/source-pages/world.html new file mode 100644 index 0000000..20dd8b9 --- /dev/null +++ b/source-pages/world.html @@ -0,0 +1,15524 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + World news - breaking news, video, headlines and opinion | CNN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+
+
+
+ +
+ +
+
+
+ +
+
+
+ + +
+
+ +
+ +
+
+
+
+
+ + +
+
+
+
+ +
+
+
+
+

+ World +

+ +
+
+ +
+
+ +
+
+
+
+
+
+
+
+
+
+ +
+
+ + +
+ + + + + + +
+
+
+ +
+
+ +
+
+ + +

Video

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +
+ +
+
+
+ +
+
+ +
+
+ + +

Around the world

+
+ + + + + + +
+
+
+ +
+ +
+
+
+ +
+
+ +
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+ +
+
+ +
+
+ + +

Featured

+
+ + + + + + + + + + + + +
+
+ +
+
+
+ +
+
+ +
+
+ + +

More of the latest stories

+
+ + + + + + +
+
+
+
+
+ +
+
+ +
+
+ +
+ +
+
+
+ +
+
+ +
+
+ + +

Something Extra

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+ +
+
+
+
+
+ +
+ +
+
+ +
+
+
+ +
+
+
+
+
+
+
+ + +
+
+ +
+ +
+
+
+ + + diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..4d39c66 --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,12 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "CommonJS", + "moduleResolution": "Node", + "strict": true, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "outDir": "dist" + }, + "include": ["*.ts"] +}