Skip to content

Commit 57f8293

Browse files
committed
Parse additional metadata from URLs.
1 parent b137eab commit 57f8293

12 files changed

Lines changed: 1691 additions & 286 deletions

File tree

aiohttp_aiofiles_tutorial/__init__.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,27 +10,25 @@
1010
from logger import LOGGER
1111

1212
from .data import urls
13-
from .loops import inspect_event_loop
1413
from .tasks import create_tasks
1514

1615

1716
async def init_script():
    """
    Initiate script by preparing an output file prior to executing tasks.

    Writes the CSV header row, then hands the open async file handle to the
    fetcher tasks so each task can append one parsed-metadata row per URL.
    """
    # NOTE(review): assumes `timer` aliases time.perf_counter() — confirm the
    # import, since elapsed time below is computed with time.perf_counter().
    start_time = timer()
    async with aiofiles.open(EXPORT_FILE, mode="w+") as outfile:
        # Header must match the field order produced by parse_html_page_data.
        await outfile.write("title,description,primary_tag,url,published_at\n")
        await execute_fetcher_tasks(outfile)
        # Fix: removed redundant `await outfile.close()` — the `async with`
        # context manager already closes the file on exit.
    LOGGER.success(f"Executed {__name__} in {time.perf_counter() - start_time:0.2f} seconds.")
2524

2625

27-
async def execute_fetcher_tasks(outfile: AsyncIOFile):
    """
    Spawn an async HTTP client session and run all URL-fetching tasks to completion.

    :param AsyncIOFile outfile: Open async file handle that each task writes rows to.
    """
    async with ClientSession(headers=HTML_HEADERS) as session:
        pending = await create_tasks(session, urls, outfile)
        await asyncio.gather(*pending)

aiohttp_aiofiles_tutorial/data/urls.csv

Lines changed: 173 additions & 253 deletions
Large diffs are not rendered by default.

aiohttp_aiofiles_tutorial/fetcher.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ async def fetch_url_and_save_title(
2020
"""
2121
try:
2222
async with session.get(url) as resp:
23+
if resp.status != 200:
24+
pass
2325
html = await resp.text()
2426
await write_to_outfile(html, url, outfile, total_count, i)
2527
except InvalidURL as e:

aiohttp_aiofiles_tutorial/loops.py

Lines changed: 0 additions & 12 deletions
This file was deleted.

aiohttp_aiofiles_tutorial/parser.py (new file)

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from bs4 import BeautifulSoup

from logger import LOGGER


async def parse_html_page_data(html: str, url: str) -> str:
    """
    Extract page metadata (title, description, primary tag, publish date)
    from the raw HTML of a fetched URL and format it as a CSV row.

    Commas and double-quotes are stripped from free-text fields so they do
    not break the comma-delimited output format.

    :param str html: Raw HTML source of a given fetched URL.
    :param str url: URL associated with the extracted HTML.

    :returns: str comma-separated row of metadata, or None when parsing fails.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.string.replace(",", "")
        # Fix: select_one() returns None for a missing <meta> tag; guard each
        # lookup instead of crashing with AttributeError on `.get()`.
        description_meta = soup.head.select_one("meta[name=description]")
        description = (
            description_meta.get("content", "").replace(",", "").replace('"', "")
            if description_meta is not None
            else ""
        )
        tag_meta = soup.head.select_one("meta[property='article:tag']")
        primary_tag = tag_meta.get("content", "") if tag_meta is not None else ""
        published_meta = soup.head.select_one("meta[property='article:published_time']")
        published_at = (
            published_meta.get("content", "") if published_meta is not None else ""
        )
        return f"{title}, {description}, {primary_tag}, {url}, {published_at}"
    # Fix: merged the duplicate `except ValueError` / `except Exception`
    # handlers — both logged the identical message.
    except Exception as e:
        LOGGER.error(f"Parsing failed for {url}: {e}")

aiohttp_aiofiles_tutorial/tests/__init__.py

Whitespace-only changes.

aiohttp_aiofiles_tutorial/tests/resources/intro_to_asyncio.html

Lines changed: 170 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Test metadata parser accuracy with local HTML file."""
2+
import asyncio
3+
4+
import aiofiles
5+
import pytest
6+
from aiohttp_aiofiles_tutorial.parser import parse_html_page_data
7+
from config import BASE_DIR
8+
9+
10+
@pytest.fixture
11+
async def sample_page_metadata():
12+
"""Expected metadata to be returned from parsing `intro_to_asyncio.html`"""
13+
title = "Intro to Asynchronous Python with Asyncio"
14+
description = "Execute multiple tasks concurrently in Python with Asyncio: Python's built-in async library."
15+
tag = "Python"
16+
url = "https://hackersandslackers.com/intro-to-asyncio-concurrency/"
17+
published_at = "2022-01-04T07:37:00.000-05:00"
18+
return ", ".join([title, description, tag, url, published_at]) + ","
19+
20+
21+
@pytest.mark.asyncio
22+
async def test_parse_html_page_data(sample_page_metadata):
23+
"""Verify HTML parser outputs expected values"""
24+
test_file = f"{BASE_DIR}/aiohttp_aiofiles_tutorial/tests/resources/intro_to_asyncio.html"
25+
async with aiofiles.open(test_file, mode="r") as file:
26+
html = await file.read()
27+
url = "https://hackersandslackers.com/intro-to-asyncio-concurrency/"
28+
metadata = await parse_html_page_data(html, url)
29+
assert metadata == sample_page_metadata
Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
"""Parse data from fetched URL and write to file asynchronously."""
22
from aiofiles.threadpool.text import AsyncTextIOWrapper as AsyncIOFile
3-
from bs4 import BeautifulSoup
43
from logger import LOGGER
54

5+
from .parser import parse_html_page_data
6+
67

78
async def write_to_outfile(html: str, url: str, outfile: AsyncIOFile, total_count: int, i: int):
89
"""
@@ -15,21 +16,8 @@ async def write_to_outfile(html: str, url: str, outfile: AsyncIOFile, total_coun
1516
:param int i: Current iteration of URL out of total URLs.
1617
"""
1718
try:
18-
page_title = await get_html_page_title(html, url)
19+
page_title = await parse_html_page_data(html, url)
1920
await outfile.write(f"{page_title}\n")
2021
LOGGER.info(f"Fetched URL {i} of {total_count}: {page_title}")
2122
except Exception as e:
2223
LOGGER.error(f"Unexpected error while writing page title: {e}")
23-
24-
25-
async def get_html_page_title(html: str, url: str) -> str:
26-
"""
27-
Extract page title from raw HTML of fetched URL; return a title/url pair.
28-
29-
:param str html: Raw HTML source of a given fetched URL.
30-
:param str url: URL associated with the extracted HTML.
31-
32-
:returns: str
33-
"""
34-
soup = BeautifulSoup(html, "html.parser")
35-
return f"{soup.title.string.replace(',', '')}, {url},"

config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
CSV_FILEPATH = f"{BASE_DIR}/aiohttp_aiofiles_tutorial/data/urls.csv"
99

1010
# Filepath of asynchronously generated CSV containing page data.
11-
EXPORT_FILE = f"{BASE_DIR}/export/hackerspages.csv"
11+
EXPORT_FILE = f"{BASE_DIR}/export/hackers_pages_metadata.csv"
1212

1313
# Headers to be passed to async HTTP client session.
1414
HTML_HEADERS = {

0 commit comments

Comments
 (0)