Skip to content

Commit 016e46e

Browse files
committed
Add code sample and exercise for web scraping
1 parent e7532a7 commit 016e46e

7 files changed

Lines changed: 1558 additions & 0 deletions

File tree

Lesson03/exercise_005/index.js

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
const http = require('https');
2+
const JSDOM = require('jsdom').JSDOM;
3+
const url = require('url');
4+
5+
// Medium topic slugs to scrape; each slug is appended to
// https://medium.com/topic/ to form the URL of the page to download.
const topics = [
  'artificial-intelligence',
  'data-science',
  'javascript',
  'programming',
  'software-engineering',
];
12+
13+
/**
 * Downloads a page with a GET request and hands its complete body to `callback`.
 *
 * Failures are reported on stderr and `callback` is NOT invoked for them:
 * a status code other than 200, an error on the request, or an error on the
 * response stream.
 *
 * @param {string|URL} urlToDownload - Address of the page to fetch.
 * @param {function(string): void} callback - Called once, after the whole
 *   body has been received, with the body decoded as UTF-8.
 */
function downloadPage(urlToDownload, callback) {
  const reportFailure = (error) => {
    console.error('Error while downloading page %s.', urlToDownload);
    console.error('Cause: %s', error.message);
  };

  // `get` ends the request itself, so no explicit request.end() is needed.
  const request = http.get(urlToDownload, (response) => {
    if (response.statusCode !== 200) {
      console.error('Error while downloading page %s.', urlToDownload);
      console.error('Response was: %s %s', response.statusCode, response.statusMessage);
      // Discard the unread body so the underlying socket can be released.
      response.resume();
      return;
    }

    // Keep the raw chunks and decode once at the end: decoding chunk by chunk
    // corrupts any multi-byte character that straddles a chunk boundary.
    const chunks = [];
    response.on('data', (chunk) => {
      chunks.push(chunk);
    });
    response.on('error', reportFailure);
    // 'end' fires only after the full body was read, so a connection dropped
    // mid-transfer never delivers a truncated page to the callback.
    response.on('end', () => callback(Buffer.concat(chunks).toString()));
  });

  // Without this listener a DNS/connection failure is an unhandled 'error'
  // event, which terminates the process.
  request.on('error', reportFailure);
}
27+
28+
/**
 * Collects the articles linked from the headings of a page.
 *
 * An anchor inside an <h1> or <h3> is treated as an article title when the
 * path of its URL has exactly two non-empty segments. The description is the
 * text of the first `p a` found in the element that follows the anchor's
 * parent; when that element or link is missing the description is ''.
 *
 * @param {Document} document - DOM document to search (anything exposing
 *   `querySelectorAll` works).
 * @returns {Object<string, {description: string, link: string, title: string}>}
 *   Articles keyed by title. A later anchor with the same title replaces an
 *   earlier one. `link` is the path portion of the anchor's URL.
 */
function findArticles(document) {
  const articles = {};

  Array.from(document.querySelectorAll('h1 a, h3 a')).forEach((el) => {
    // url.parse yields a null pathname for empty or fragment-only hrefs.
    const pathname = url.parse(el.href).pathname || '';
    const segments = pathname.split('/').filter((s) => s.trim() !== '');
    if (segments.length !== 2) {
      return;
    }

    // nextElementSibling skips whitespace/text nodes, which have no
    // querySelector and would otherwise make this lookup throw.
    const sibling = el.parentNode.nextElementSibling;
    const descriptionLink = sibling ? sibling.querySelector('p a') : null;

    articles[el.text] = {
      description: descriptionLink ? descriptionLink.text : '',
      link: pathname,
      title: el.text,
    };
  });

  return articles;
}
45+
46+
/**
 * Prints one article to stdout: a separator line, then the title, the
 * description and the absolute medium.com URL, each on its own line.
 *
 * @param {{title: string, description: string, link: string}} article -
 *   Article to print; `link` is a path that is appended to https://medium.com.
 */
function printArticle(article) {
  console.log('-----');
  console.log(` ${article.title}`);
  console.log(` ${article.description}`);
  console.log(` https://medium.com${article.link}`);
}
52+
53+
// Download every topic page and print the articles found on it. Each
// callback runs when its own download completes, so the order in which
// topics are printed is not fixed.
topics.forEach((topic) => {
  downloadPage(`https://medium.com/topic/${topic}`, (content) => {
    try {
      const articles = findArticles(new JSDOM(content).window.document);
      Object.values(articles).forEach(printArticle);
    } catch (error) {
      // This runs inside an asynchronous callback: letting the exception
      // escape would terminate the process while other topics are still
      // downloading, so report it and carry on.
      console.error('Error while processing topic %s: %s', topic, error.message);
    }
  });
});

0 commit comments

Comments
 (0)