From f7e7f3fc64ea41269dc2b3433bf51fc5cdaba061 Mon Sep 17 00:00:00 2001
From: Neethu Elizabeth Simon <neethu.elizabethsimon@arm.com>
Date: Mon, 29 Jun 2026 11:10:30 -0700
Subject: [PATCH 1/2] feat: Support video transcript chunking in the Arm
 Knowledge Base

---
 embedding-generation/README.md                |  21 ++-
 embedding-generation/generate-chunks.py       |  69 ++++++-
 .../tests/test_generate_chunks.py             | 176 +++++++++++++++++-
 3 files changed, 254 insertions(+), 12 deletions(-)

diff --git a/embedding-generation/README.md b/embedding-generation/README.md
index f66fa59..9e6cf4e 100644
--- a/embedding-generation/README.md
+++ b/embedding-generation/README.md
@@ -27,12 +27,29 @@ The Dockerfile:
 Add one row to `vector-db-sources.csv` for each document:
 
 ```csv
-Site Name,License Type,Display Name,URL,Keywords
-Example Docs,CC4.0,Example Arm Guide,https://example.com/arm-guide,arm; migration; linux
+Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL
+Example Docs,CC4.0,Example Arm Guide,https://example.com/arm-guide,arm; migration; linux,
 ```
 
 Use clear keywords that users might include in questions. The `URL` is also what retrieval eval uses for expected matches.
 
+### Transcript-backed sources
+
+Some sources (for example, edX course videos) do not have directly chunkable text
+at their primary `URL`. For these, populate the optional `Transcript Source URL`
+column with a link to a plain-text or markdown transcript (such as a GitHub
+`.../blob/...` file). When `Transcript Source URL` is set, `generate-chunks.py`
+fetches and chunks the transcript instead of the primary `URL`, but keeps the
+primary `URL` as the user-facing link returned by retrieval:
+
+```csv
+Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL
+Educational Course,All rights reserved,Example Video,https://courses.edx.org/videos/...,arm; ai; inference,https://github.com/arm-education/.../M1KV1.txt
+```
+
+Leave the column empty for sources that are chunked from their primary `URL`.
+
+
 ## Test Locally
 
 Install dependencies once:
diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index 973bc0e..a09e6ef 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -148,13 +148,14 @@ def load_existing_sources(csv_file):
                     'license_type': row.get('License Type', ''),
                     'display_name': row.get('Display Name', ''),
                     'url': url,
-                    'keywords': row.get('Keywords', '')
+                    'keywords': row.get('Keywords', ''),
+                    'transcript_source_url': (row.get('Transcript Source URL', '') or '').strip()
                 })
     
     print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'")
 
 
-def register_source(site_name, license_type, display_name, url, keywords):
+def register_source(site_name, license_type, display_name, url, keywords, transcript_source_url=''):
     """
     Register a new source URL. If the URL already exists, skip it.
     Returns True if the source was added, False if it was a duplicate.
@@ -173,7 +174,8 @@ def register_source(site_name, license_type, display_name, url, keywords):
         'license_type': license_type,
         'display_name': display_name,
         'url': url,
-        'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords)
+        'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords),
+        'transcript_source_url': (transcript_source_url or '').strip()
     }
 
     # Keep discovered sources grouped with their existing site section instead of
@@ -198,14 +200,15 @@ def save_sources_csv(csv_file):
     """
     with open(csv_file, 'w', newline='', encoding='utf-8') as file:
         writer = csv.writer(file)
-        writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords'])
+        writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords', 'Transcript Source URL'])
         for source in all_sources:
             writer.writerow([
                 source['site_name'],
                 source['license_type'],
                 source['display_name'],
                 source['url'],
-                source['keywords']
+                source['keywords'],
+                source.get('transcript_source_url', '')
             ])
     
     print(f"Saved {len(all_sources)} sources to '{csv_file}'")
@@ -728,6 +731,7 @@ def readInCSV(csv_file):
         'source_names': [],
         'site_names': [],
         'license_types': [],
+        'transcript_urls': [],
     }
     
     if not os.path.exists(csv_file):
@@ -741,6 +745,7 @@ def readInCSV(csv_file):
             csv_dict['source_names'].append(row.get('Display Name', ''))
             csv_dict['site_names'].append(row.get('Site Name', ''))
             csv_dict['license_types'].append(row.get('License Type', ''))
+            csv_dict['transcript_urls'].append((row.get('Transcript Source URL', '') or '').strip())
     
     return csv_dict, len(csv_dict['urls'])
 
@@ -877,7 +882,56 @@ def parse_keywords(keywords_value, title=""):
     return keywords
 
 
-def create_chunks_for_source(source_url, source_name, doc_type, keywords_value):
+def create_transcript_chunks(source_url, transcript_url, source_name, doc_type, keywords_value):
+    """Chunk a transcript document on behalf of a primary source.
+
+    Some sources (for example, edX course videos) have no directly chunkable text
+    at their primary URL. When a "Transcript Source URL" is provided in the CSV,
+    we fetch and chunk that transcript instead, but keep the primary ``source_url``
+    as the user-facing link so retrieval still points users at the original content.
+    """
+    normalized_source_url = normalize_source_url(source_url)
+    fetch_url = source_to_fetch_url(normalize_source_url(transcript_url))
+    response = fetch_with_logging(fetch_url)
+    if response is None:
+        print('transcript not valid, ', fetch_url)
+        return []
+
+    keywords = parse_keywords(keywords_value, source_name)
+    parsed_document = parse_document_content(
+        source_url=normalized_source_url,
+        resolved_url=response.url,
+        response_content=response.content,
+        content_type=response.headers.get("content-type", ""),
+        fallback_title=source_name,
+    )
+
+    chunks = []
+    for payload in chunk_parsed_document(parsed_document, doc_type=doc_type or "Transcript", keywords=keywords):
+        chunks.append(
+            createChunk(
+                text_snippet=payload["content"],
+                WEBSITE_url=normalized_source_url,
+                keywords=keywords,
+                title=payload["title"] or source_name,
+                heading=payload["heading"],
+                heading_path=payload["heading_path"],
+                doc_type=payload["doc_type"],
+                product=payload["product"],
+                version=payload["version"],
+                resolved_url=response.url,
+                content_type=payload["content_type"],
+            )
+        )
+    return chunks
+
+
+def create_chunks_for_source(source_url, source_name, doc_type, keywords_value, transcript_url=""):
+    # When a transcript source is provided, chunk the transcript text instead of
+    # fetching the primary URL (which may be a video player or other non-text page),
+    # while keeping the primary URL as the user-facing link for retrieval.
+    if transcript_url and transcript_url.strip():
+        return create_transcript_chunks(source_url, transcript_url, source_name, doc_type, keywords_value)
     if doc_type == "Ecosystem Dashboard":
         return create_ecosystem_dashboard_chunk(source_url, source_name, keywords_value)
     if is_arm_developer_documentation_url(source_url):
@@ -1104,8 +1158,9 @@ def main():
         source_name = csv_dict['source_names'][i]
         doc_type = csv_dict['site_names'][i]
         keywords_value = csv_dict['focus'][i]
+        transcript_url = csv_dict['transcript_urls'][i]
 
-        for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value):
+        for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value, transcript_url):
             chunkSaveAndTrack(url, chunk)
 
     # Save updated sources CSV with all discovered sources
diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py
index 9812634..95cfb70 100644
--- a/embedding-generation/tests/test_generate_chunks.py
+++ b/embedding-generation/tests/test_generate_chunks.py
@@ -422,9 +422,59 @@ def test_save_sources_csv(self, gc, tmp_path):
             reader = csv.reader(f)
             rows = list(reader)
         
-        assert rows[0] == ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']
-        assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2']
-        assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3']
+        assert rows[0] == ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords', 'Transcript Source URL']
+        assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2', '']
+        assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3', '']
+
+    def test_save_sources_csv_preserves_transcript_url(self, gc, tmp_path):
+        """Transcript Source URL must be written back to the CSV."""
+        gc.all_sources = [
+            {
+                'site_name': 'Educational Course',
+                'license_type': 'All rights reserved',
+                'display_name': 'Example Video',
+                'url': 'https://courses.edx.org/videos/example',
+                'keywords': 'arm; ai',
+                'transcript_source_url': 'https://github.com/arm-education/repo/blob/main/M1KV1.txt'
+            }
+        ]
+
+        csv_file = tmp_path / "output.csv"
+        gc.save_sources_csv(str(csv_file))
+
+        with open(csv_file, 'r') as f:
+            rows = list(csv.reader(f))
+
+        assert rows[0][-1] == 'Transcript Source URL'
+        assert rows[1] == [
+            'Educational Course',
+            'All rights reserved',
+            'Example Video',
+            'https://courses.edx.org/videos/example',
+            'arm; ai',
+            'https://github.com/arm-education/repo/blob/main/M1KV1.txt',
+        ]
+
+    def test_load_existing_sources_preserves_transcript_url(self, gc, tmp_path):
+        """Loading a CSV with a transcript column should retain it through a save round-trip."""
+        csv_file = tmp_path / "sources.csv"
+        csv_file.write_text(
+            "Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL\n"
+            "Educational Course,All rights reserved,Example Video,https://courses.edx.org/videos/example,arm; ai,"
+            "https://github.com/arm-education/repo/blob/main/M1KV1.txt\n"
+        )
+
+        gc.load_existing_sources(str(csv_file))
+
+        assert gc.all_sources[0]['transcript_source_url'] == (
+            "https://github.com/arm-education/repo/blob/main/M1KV1.txt"
+        )
+
+        gc.save_sources_csv(str(csv_file))
+
+        with open(csv_file, 'r') as f:
+            rows = list(csv.reader(f))
+        assert rows[1][-1] == "https://github.com/arm-education/repo/blob/main/M1KV1.txt"
 
     def test_load_and_save_roundtrip(self, gc, tmp_path):
         """Test that loading and saving preserves data."""
@@ -634,6 +684,37 @@ def test_read_csv_basic(self, gc, tmp_path):
         assert csv_dict['site_names'] == ['Site1', 'Site2']
         assert csv_dict['license_types'] == ['MIT', 'Apache']
 
+    def test_read_csv_transcript_urls(self, gc, tmp_path):
+        """Test that the optional Transcript Source URL column is read."""
+        csv_file = tmp_path / "transcript.csv"
+        csv_file.write_text(
+            "Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL\n"
+            "Educational Course,All rights reserved,Video1,https://courses.edx.org/v/1,key1,"
+            "https://github.com/arm-education/repo/blob/main/M1KV1.txt\n"
+            "Site2,Apache,Display2,https://example.com/2,key2,\n"
+        )
+
+        csv_dict, length = gc.readInCSV(str(csv_file))
+
+        assert length == 2
+        assert csv_dict['transcript_urls'] == [
+            "https://github.com/arm-education/repo/blob/main/M1KV1.txt",
+            "",
+        ]
+
+    def test_read_csv_transcript_missing_column(self, gc, tmp_path):
+        """A CSV without the transcript column should yield empty transcript URLs."""
+        csv_file = tmp_path / "no_transcript.csv"
+        csv_file.write_text(
+            "Site Name,License Type,Display Name,URL,Keywords\n"
+            "Site1,MIT,Display1,https://example.com/1,key1\n"
+        )
+
+        csv_dict, length = gc.readInCSV(str(csv_file))
+
+        assert length == 1
+        assert csv_dict['transcript_urls'] == [""]
+
     def test_read_csv_empty(self, gc, tmp_path):
         """Test reading an empty CSV (header only)."""
         csv_file = tmp_path / "empty.csv"
@@ -870,6 +951,95 @@ def fake_fetch(url):
         assert "target CPU for Cortex-A builds" in chunks[1].content
 
 
+class TestCreateTranscriptChunks:
+    """Tests for transcript-backed sources (Transcript Source URL column)."""
+
+    def _transcript_response(self, url, text):
+        return SimpleNamespace(
+            url=url,
+            content=text.encode("utf-8"),
+            headers={"content-type": "text/plain"},
+        )
+
+    def test_transcript_used_instead_of_primary_url(self, gc, monkeypatch):
+        """When a transcript URL is set, content is fetched from the transcript."""
+        source_url = "https://courses.edx.org/videos/block-v1:example+type@video+block@abc"
+        transcript_url = "https://github.com/arm-education/repo/blob/main/M1KV1.txt"
+        raw_transcript_url = (
+            "https://raw.githubusercontent.com/arm-education/repo/main/M1KV1.txt"
+        )
+        transcript_text = (
+            "Arm processors deliver strong energy efficiency for AI inference workloads. "
+            * 60
+        )
+
+        fetched_urls = []
+
+        def fake_fetch(url):
+            fetched_urls.append(url)
+            return self._transcript_response(raw_transcript_url, transcript_text)
+
+        monkeypatch.setattr(gc, "fetch_with_logging", fake_fetch)
+
+        chunks = gc.create_chunks_for_source(
+            source_url=source_url,
+            source_name="Energy efficiency for AI inference",
+            doc_type="Educational Course",
+            keywords_value="arm; ai; inference",
+            transcript_url=transcript_url,
+        )
+
+        # The GitHub blob URL must be resolved to a raw fetch URL.
+        assert fetched_urls == [raw_transcript_url]
+        assert len(chunks) >= 1
+        # User-facing link stays the primary source URL, not the transcript.
+        assert all(chunk.url == source_url for chunk in chunks)
+        assert all(chunk.doc_type == "Educational Course" for chunk in chunks)
+        assert "energy efficiency" in chunks[0].content.lower()
+
+    def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch):
+        """A failed transcript fetch should return no chunks."""
+        monkeypatch.setattr(gc, "fetch_with_logging", lambda url: None)
+
+        chunks = gc.create_chunks_for_source(
+            source_url="https://courses.edx.org/videos/example",
+            source_name="Example",
+            doc_type="Educational Course",
+            keywords_value="arm",
+            transcript_url="https://github.com/arm-education/repo/blob/main/missing.txt",
+        )
+
+        assert chunks == []
+
+    def test_blank_transcript_falls_back_to_primary_url(self, gc, monkeypatch):
+        """A blank transcript URL should not trigger transcript chunking."""
+        captured = {}
+
+        def fake_generic(source_url, source_name, doc_type, keywords_value):
+            captured["called"] = True
+            return ["sentinel"]
+
+        # create_chunks_for_source dispatches to the generic path via fetch; here we
+        # short-circuit by patching fetch to confirm the transcript branch is skipped.
+        def fake_fetch(url):
+            captured["fetched"] = url
+            return None
+
+        monkeypatch.setattr(gc, "fetch_with_logging", fake_fetch)
+
+        chunks = gc.create_chunks_for_source(
+            source_url="https://learn.arm.com/learning-paths/example/",
+            source_name="Example",
+            doc_type="Learning Paths",
+            keywords_value="arm",
+            transcript_url="   ",
+        )
+
+        # Transcript branch skipped -> generic path attempted a fetch of the primary URL.
+        assert "fetched" in captured
+        assert chunks == []
+
+
 class TestCreateRetrySession:
     """Tests for create_retry_session function."""
 

From 26da406f6406a68178235f24c05f4d55836f9671 Mon Sep 17 00:00:00 2001
From: Neethu Elizabeth Simon <neethu.elizabethsimon@arm.com>
Date: Mon, 29 Jun 2026 13:47:02 -0700
Subject: [PATCH 2/2] fix: address Copilot review comments

---
 embedding-generation/generate-chunks.py          |  9 ++++++---
 .../tests/test_generate_chunks.py                | 16 ++++++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index a09e6ef..9939f08 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -894,17 +894,20 @@ def create_transcript_chunks(source_url, transcript_url, source_name, doc_type,
     fetch_url = source_to_fetch_url(normalize_source_url(transcript_url))
     response = fetch_with_logging(fetch_url)
     if response is None:
-        print('transcript not valid, ', fetch_url)
+        print(f"[TRANSCRIPT FETCH FAILED] for {normalized_source_url} (transcript: {transcript_url}): {fetch_url}")
         return []
 
     keywords = parse_keywords(keywords_value, source_name)
     parsed_document = parse_document_content(
-        source_url=normalized_source_url,
+        source_url=response.url,
         resolved_url=response.url,
         response_content=response.content,
         content_type=response.headers.get("content-type", ""),
         fallback_title=source_name,
     )
+    # Parsing above used the transcript URL as the base for resolving any relative
+    # links in the transcript; now switch to the primary URL as the user-facing link.
+    parsed_document.source_url = normalized_source_url
 
     chunks = []
     for payload in chunk_parsed_document(parsed_document, doc_type=doc_type or "Transcript", keywords=keywords):
@@ -941,7 +944,7 @@ def create_chunks_for_source(source_url, source_name, doc_type, keywords_value,
     fetch_url = source_to_fetch_url(normalized_source_url)
     response = fetch_with_logging(fetch_url)
     if response is None:
-        print('not valid, ', fetch_url)
+        print(f"Fetch failed for {normalized_source_url}: {fetch_url}")
         return []
 
     sources_to_parse = [(normalized_source_url, response)]
diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py
index 95cfb70..0ada0ce 100644
--- a/embedding-generation/tests/test_generate_chunks.py
+++ b/embedding-generation/tests/test_generate_chunks.py
@@ -997,8 +997,8 @@ def fake_fetch(url):
         assert all(chunk.doc_type == "Educational Course" for chunk in chunks)
         assert "energy efficiency" in chunks[0].content.lower()
 
-    def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch):
-        """A failed transcript fetch should return no chunks."""
+    def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch, capsys):
+        """A failed transcript fetch should return no chunks and log both URLs."""
         monkeypatch.setattr(gc, "fetch_with_logging", lambda url: None)
 
         chunks = gc.create_chunks_for_source(
@@ -1011,14 +1011,18 @@ def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch):
 
         assert chunks == []
 
+        captured = capsys.readouterr()
+        assert "TRANSCRIPT FETCH FAILED" in captured.out
+        # Both the primary source URL and the transcript URL aid batch troubleshooting.
+        assert "https://courses.edx.org/videos/example" in captured.out
+        assert "https://github.com/arm-education/repo/blob/main/missing.txt" in captured.out
+        # The resolved raw fetch URL should also be logged.
+        assert "raw.githubusercontent.com/arm-education/repo/main/missing.txt" in captured.out
+
     def test_blank_transcript_falls_back_to_primary_url(self, gc, monkeypatch):
         """A blank transcript URL should not trigger transcript chunking."""
         captured = {}
 
-        def fake_generic(source_url, source_name, doc_type, keywords_value):
-            captured["called"] = True
-            return ["sentinel"]
-
         # create_chunks_for_source dispatches to the generic path via fetch; here we
         # short-circuit by patching fetch to confirm the transcript branch is skipped.
         def fake_fetch(url):