Skip to content

Commit 57f8293

Browse files
committed
Parse additional metadata from URLs.
1 parent b137eab commit 57f8293

12 files changed

Lines changed: 1691 additions & 286 deletions

File tree

aiohttp_aiofiles_tutorial/__init__.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,27 +10,25 @@
1010
from logger import LOGGER
1111

1212
from .data import urls
13-
from .loops import inspect_event_loop
1413
from .tasks import create_tasks
1514

1615

1716
async def init_script():
    """
    Initiate script by preparing an output file prior to executing tasks.

    Writes the CSV header row, then hands the open async file handle to the
    fetcher tasks so each task can append one parsed-metadata row per URL.
    """
    # NOTE(review): assumes `timer` aliases time.perf_counter() — confirm the
    # import, since elapsed time below is computed with time.perf_counter().
    start_time = timer()
    async with aiofiles.open(EXPORT_FILE, mode="w+") as outfile:
        # Header must match the field order produced by parse_html_page_data.
        await outfile.write("title,description,primary_tag,url,published_at\n")
        await execute_fetcher_tasks(outfile)
        # Fix: removed redundant `await outfile.close()` — the `async with`
        # context manager already closes the file on exit.
    LOGGER.success(f"Executed {__name__} in {time.perf_counter() - start_time:0.2f} seconds.")
2524

2625

27-
async def execute_fetcher_tasks(outfile: AsyncIOFile):
    """
    Spawn an async HTTP client session and run all URL-fetching tasks to completion.

    :param AsyncIOFile outfile: Open async file handle that each task writes rows to.
    """
    async with ClientSession(headers=HTML_HEADERS) as session:
        pending = await create_tasks(session, urls, outfile)
        await asyncio.gather(*pending)

aiohttp_aiofiles_tutorial/data/urls.csv

Lines changed: 173 additions & 253 deletions
Large diffs are not rendered by default.

aiohttp_aiofiles_tutorial/fetcher.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ async def fetch_url_and_save_title(
2020
"""
2121
try:
2222
async with session.get(url) as resp:
23+
if resp.status != 200:
24+
pass
2325
html = await resp.text()
2426
await write_to_outfile(html, url, outfile, total_count, i)
2527
except InvalidURL as e:

aiohttp_aiofiles_tutorial/loops.py

Lines changed: 0 additions & 12 deletions
This file was deleted.

aiohttp_aiofiles_tutorial/parser.py (new file)

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from bs4 import BeautifulSoup

from logger import LOGGER


async def parse_html_page_data(html: str, url: str) -> str:
    """
    Extract page metadata (title, description, primary tag, publish date)
    from the raw HTML of a fetched URL and format it as a CSV row.

    Commas and double-quotes are stripped from free-text fields so they do
    not break the comma-delimited output format.

    :param str html: Raw HTML source of a given fetched URL.
    :param str url: URL associated with the extracted HTML.

    :returns: str comma-separated row of metadata, or None when parsing fails.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.string.replace(",", "")
        # Fix: select_one() returns None for a missing <meta> tag; guard each
        # lookup instead of crashing with AttributeError on `.get()`.
        description_meta = soup.head.select_one("meta[name=description]")
        description = (
            description_meta.get("content", "").replace(",", "").replace('"', "")
            if description_meta is not None
            else ""
        )
        tag_meta = soup.head.select_one("meta[property='article:tag']")
        primary_tag = tag_meta.get("content", "") if tag_meta is not None else ""
        published_meta = soup.head.select_one("meta[property='article:published_time']")
        published_at = (
            published_meta.get("content", "") if published_meta is not None else ""
        )
        return f"{title}, {description}, {primary_tag}, {url}, {published_at}"
    # Fix: merged the duplicate `except ValueError` / `except Exception`
    # handlers — both logged the identical message.
    except Exception as e:
        LOGGER.error(f"Parsing failed for {url}: {e}")

aiohttp_aiofiles_tutorial/tests/__init__.py

Whitespace-only changes.

aiohttp_aiofiles_tutorial/tests/resources/intro_to_asyncio.html

Lines changed: 170 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Test metadata parser accuracy with local HTML file."""
2+
import asyncio
3+
4+
import aiofiles
5+
import pytest
6+
from aiohttp_aiofiles_tutorial.parser import parse_html_page_data
7+
from config import BASE_DIR
8+
9+
10+
@pytest.fixture
11+
async def sample_page_metadata():
12+
"""Expected metadata to be returned from parsing `intro_to_asyncio.html`"""
13+
title = "Intro to Asynchronous Python with Asyncio"
14+
description = "Execute multiple tasks concurrently in Python with Asyncio: Python's built-in async library."
15+
tag = "Python"
16+
url = "https://hackersandslackers.com/intro-to-asyncio-concurrency/"
17+
published_at = "2022-01-04T07:37:00.000-05:00"
18+
return ", ".join([title, description, tag, url, published_at]) + ","
19+
20+
21+
@pytest.mark.asyncio
22+
async def test_parse_html_page_data(sample_page_metadata):
23+
"""Verify HTML parser outputs expected values"""
24+
test_file = f"{BASE_DIR}/aiohttp_aiofiles_tutorial/tests/resources/intro_to_asyncio.html"
25+
async with aiofiles.open(test_file, mode="r") as file:
26+
html = await file.read()
27+
url = "https://hackersandslackers.com/intro-to-asyncio-concurrency/"
28+
metadata = await parse_html_page_data(html, url)
29+
assert metadata == sample_page_metadata
Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
"""Parse data from fetched URL and write to file asynchronously."""
22
from aiofiles.threadpool.text import AsyncTextIOWrapper as AsyncIOFile
3-
from bs4 import BeautifulSoup
43
from logger import LOGGER
54

5+
from .parser import parse_html_page_data
6+
67

78
async def write_to_outfile(html: str, url: str, outfile: AsyncIOFile, total_count: int, i: int):
89
"""
@@ -15,21 +16,8 @@ async def write_to_outfile(html: str, url: str, outfile: AsyncIOFile, total_coun
1516
:param int i: Current iteration of URL out of total URLs.
1617
"""
1718
try:
18-
page_title = await get_html_page_title(html, url)
19+
page_title = await parse_html_page_data(html, url)
1920
await outfile.write(f"{page_title}\n")
2021
LOGGER.info(f"Fetched URL {i} of {total_count}: {page_title}")
2122
except Exception as e:
2223
LOGGER.error(f"Unexpected error while writing page title: {e}")
23-
24-
25-
async def get_html_page_title(html: str, url: str) -> str:
26-
"""
27-
Extract page title from raw HTML of fetched URL; return a title/url pair.
28-
29-
:param str html: Raw HTML source of a given fetched URL.
30-
:param str url: URL associated with the extracted HTML.
31-
32-
:returns: str
33-
"""
34-
soup = BeautifulSoup(html, "html.parser")
35-
return f"{soup.title.string.replace(',', '')}, {url},"

config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
CSV_FILEPATH = f"{BASE_DIR}/aiohttp_aiofiles_tutorial/data/urls.csv"
99

1010
# Filepath of asynchronously generated CSV containing page data.
11-
EXPORT_FILE = f"{BASE_DIR}/export/hackerspages.csv"
11+
EXPORT_FILE = f"{BASE_DIR}/export/hackers_pages_metadata.csv"
1212

1313
# Headers to be passed to async HTTP client session.
1414
HTML_HEADERS = {

0 commit comments

Comments
 (0)