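Naming scheme adjustments (parse_html_page_data -> parse_html_page_metadata, page_title -> page_metadata); additional exception handler for ParserRejectedMarkup.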

toddbirchard · toddbirchard · commit 4db4b15f7042 · 2022-01-16T23:20:54.000-05:00
diff --git a/aiohttp_aiofiles_tutorial/__init__.py b/aiohttp_aiofiles_tutorial/__init__.py
@@ -20,7 +20,9 @@ async def init_script():
         await outfile.write("title,description,primary_tag,url,published_at\n")
         await execute_fetcher_tasks(outfile)
         await outfile.close()
-    LOGGER.success(f"Executed {__name__} in {time.perf_counter() - start_time:0.2f} seconds.")
+    LOGGER.success(
+        f"Executed {__name__} in {time.perf_counter() - start_time:0.2f} seconds."
+    )
 
 
 async def execute_fetcher_tasks(outfile: AsyncIOFile):
diff --git a/aiohttp_aiofiles_tutorial/parser.py b/aiohttp_aiofiles_tutorial/parser.py
@@ -1,11 +1,12 @@
 """Parse metadata from raw HTML."""
 from bs4 import BeautifulSoup
+from bs4.builder import ParserRejectedMarkup
 from logger import LOGGER
 
 
-async def parse_html_page_data(html: str, url: str) -> str:
+async def parse_html_page_metadata(html: str, url: str) -> str:
     """
-    Extract page title from raw HTML of fetched URL; return a title
+    Extract page metadata from raw HTML into a CSV row.
 
     :param str html: Raw HTML source of a given fetched URL.
     :param str url: URL associated with the extracted HTML.
@@ -33,6 +34,8 @@ async def parse_html_page_data(html: str, url: str) -> str:
         if primary_tag is None:
             primary_tag = ""
         return f"{title}, {description}, {primary_tag}, {url}, {published_at}"
+    except ParserRejectedMarkup as e:
+        LOGGER.error(f"Failed to parse invalid html for {url}: {e}")
     except ValueError as e:
         LOGGER.error(f"ValueError occurred when parsing html for {url}: {e}")
     except Exception as e:
diff --git a/aiohttp_aiofiles_tutorial/tests/test_parser.py b/aiohttp_aiofiles_tutorial/tests/test_parser.py
@@ -4,7 +4,7 @@
 import pytest_asyncio
 from config import BASE_DIR
 
-from aiohttp_aiofiles_tutorial.parser import parse_html_page_data
+from aiohttp_aiofiles_tutorial.parser import parse_html_page_metadata
 
 
 @pytest_asyncio.fixture
@@ -25,6 +25,6 @@ async def test_parse_html_page_data(sample_page_metadata):
     async with aiofiles.open(test_file, mode="r") as file:
         html = await file.read()
         url = "https://hackersandslackers.com/intro-to-asyncio-concurrency/"
-        metadata = await parse_html_page_data(html, url)
+        metadata = await parse_html_page_metadata(html, url)
         assert metadata == sample_page_metadata
         await file.close()
diff --git a/aiohttp_aiofiles_tutorial/writer.py b/aiohttp_aiofiles_tutorial/writer.py
@@ -2,7 +2,7 @@
 from aiofiles.threadpool.text import AsyncTextIOWrapper as AsyncIOFile
 from logger import LOGGER
 
-from .parser import parse_html_page_data
+from .parser import parse_html_page_metadata
 
 
 async def write_to_outfile(
@@ -18,8 +18,8 @@ async def write_to_outfile(
     :param int i: Current iteration of URL out of total URLs.
     """
     try:
-        page_title = await parse_html_page_data(html, url)
-        await outfile.write(f"{page_title}\n")
-        LOGGER.info(f"Fetched URL {i} of {total_count}: {page_title}")
+        page_metadata = await parse_html_page_metadata(html, url)
+        await outfile.write(f"{page_metadata}\n")
+        LOGGER.info(f"Fetched URL {i} of {total_count}: {page_metadata}")
     except Exception as e:
         LOGGER.error(f"Unexpected error while writing page title: {e}")
diff --git a/poetry.lock b/poetry.lock