From f7e7f3fc64ea41269dc2b3433bf51fc5cdaba061 Mon Sep 17 00:00:00 2001 From: Neethu Elizabeth Simon Date: Mon, 29 Jun 2026 11:10:30 -0700 Subject: [PATCH 1/2] feat: Support video transcript chunking in the Arm Knowledge Base --- embedding-generation/README.md | 21 ++- embedding-generation/generate-chunks.py | 69 ++++++- .../tests/test_generate_chunks.py | 176 +++++++++++++++++- 3 files changed, 254 insertions(+), 12 deletions(-) diff --git a/embedding-generation/README.md b/embedding-generation/README.md index f66fa59..9e6cf4e 100644 --- a/embedding-generation/README.md +++ b/embedding-generation/README.md @@ -27,12 +27,29 @@ The Dockerfile: Add one row to `vector-db-sources.csv` for each document: ```csv -Site Name,License Type,Display Name,URL,Keywords -Example Docs,CC4.0,Example Arm Guide,https://example.com/arm-guide,arm; migration; linux +Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL +Example Docs,CC4.0,Example Arm Guide,https://example.com/arm-guide,arm; migration; linux, ``` Use clear keywords that users might include in questions. The `URL` is also what retrieval eval uses for expected matches. +### Transcript-backed sources + +Some sources (for example edX course videos) do not have directly chunkable text +at their primary `URL`. For these, populate the optional `Transcript Source URL` +column with a link to a plain-text or markdown transcript (such as a GitHub +`.../blob/...` file). When `Transcript Source URL` is set, `generate-chunks.py` +fetches and chunks the transcript instead of the primary `URL`, but keeps the +primary `URL` as the user-facing link returned by retrieval: + +```csv +Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL +Educational Course,All rights reserved,Example Video,https://courses.edx.org/videos/...,arm; ai; inference,https://github.com/arm-education/.../M1KV1.txt +``` + +Leave the column empty for sources that are chunked from their primary `URL`. + + ## Test Locally Install dependencies once: diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index 973bc0e..a09e6ef 100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -148,13 +148,14 @@ def load_existing_sources(csv_file): 'license_type': row.get('License Type', ''), 'display_name': row.get('Display Name', ''), 'url': url, - 'keywords': row.get('Keywords', '') + 'keywords': row.get('Keywords', ''), + 'transcript_source_url': (row.get('Transcript Source URL', '') or '').strip() }) print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'") -def register_source(site_name, license_type, display_name, url, keywords): +def register_source(site_name, license_type, display_name, url, keywords, transcript_source_url=''): """ Register a new source URL. If the URL already exists, skip it. Returns True if the source was added, False if it was a duplicate. @@ -173,7 +174,8 @@ def register_source(site_name, license_type, display_name, url, keywords): 'license_type': license_type, 'display_name': display_name, 'url': url, - 'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords) + 'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords), + 'transcript_source_url': (transcript_source_url or '').strip() } # Keep discovered sources grouped with their existing site section instead of @@ -198,14 +200,15 @@ def save_sources_csv(csv_file): """ with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) - writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']) + writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords', 'Transcript Source URL']) for source in all_sources: writer.writerow([ source['site_name'], source['license_type'], source['display_name'], source['url'], - source['keywords'] + source['keywords'], + source.get('transcript_source_url', '') ]) print(f"Saved {len(all_sources)} sources to '{csv_file}'") @@ -728,6 +731,7 @@ def readInCSV(csv_file): 'source_names': [], 'site_names': [], 'license_types': [], + 'transcript_urls': [], } if not os.path.exists(csv_file): @@ -741,6 +745,7 @@ def readInCSV(csv_file): csv_dict['source_names'].append(row.get('Display Name', '')) csv_dict['site_names'].append(row.get('Site Name', '')) csv_dict['license_types'].append(row.get('License Type', '')) + csv_dict['transcript_urls'].append((row.get('Transcript Source URL', '') or '').strip()) return csv_dict, len(csv_dict['urls']) @@ -877,7 +882,56 @@ def parse_keywords(keywords_value, title=""): return keywords -def create_chunks_for_source(source_url, source_name, doc_type, keywords_value): +def create_transcript_chunks(source_url, transcript_url, source_name, doc_type, keywords_value): + """Chunk a transcript document on behalf of a primary source. + + Some sources (for example edX course videos) have no directly chunkable text + at their primary URL. When a "Transcript Source URL" is provided in the CSV, + we fetch and chunk that transcript instead, but keep the primary ``source_url`` + as the user-facing link so retrieval still points users at the original content. + """ + normalized_source_url = normalize_source_url(source_url) + fetch_url = source_to_fetch_url(normalize_source_url(transcript_url)) + response = fetch_with_logging(fetch_url) + if response is None: + print('transcript not valid, ', fetch_url) + return [] + + keywords = parse_keywords(keywords_value, source_name) + parsed_document = parse_document_content( + source_url=normalized_source_url, + resolved_url=response.url, + response_content=response.content, + content_type=response.headers.get("content-type", ""), + fallback_title=source_name, + ) + + chunks = [] + for payload in chunk_parsed_document(parsed_document, doc_type=doc_type or "Transcript", keywords=keywords): + chunks.append( + createChunk( + text_snippet=payload["content"], + WEBSITE_url=normalized_source_url, + keywords=keywords, + title=payload["title"] or source_name, + heading=payload["heading"], + heading_path=payload["heading_path"], + doc_type=payload["doc_type"], + product=payload["product"], + version=payload["version"], + resolved_url=response.url, + content_type=payload["content_type"], + ) + ) + return chunks + + +def create_chunks_for_source(source_url, source_name, doc_type, keywords_value, transcript_url=""): + # When a transcript source is provided, chunk the transcript text instead of + # fetching the primary URL (which may be a video player or other non-text page), + # while keeping the primary URL as the user-facing link for retrieval. + if transcript_url and transcript_url.strip(): + return create_transcript_chunks(source_url, transcript_url, source_name, doc_type, keywords_value) if doc_type == "Ecosystem Dashboard": return create_ecosystem_dashboard_chunk(source_url, source_name, keywords_value) if is_arm_developer_documentation_url(source_url): @@ -1104,8 +1158,9 @@ def main(): source_name = csv_dict['source_names'][i] doc_type = csv_dict['site_names'][i] keywords_value = csv_dict['focus'][i] + transcript_url = csv_dict['transcript_urls'][i] - for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value): + for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value, transcript_url): chunkSaveAndTrack(url, chunk) # Save updated sources CSV with all discovered sources diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py index 9812634..95cfb70 100644 --- a/embedding-generation/tests/test_generate_chunks.py +++ b/embedding-generation/tests/test_generate_chunks.py @@ -422,9 +422,59 @@ def test_save_sources_csv(self, gc, tmp_path): reader = csv.reader(f) rows = list(reader) - assert rows[0] == ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords'] - assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2'] - assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3'] + assert rows[0] == ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords', 'Transcript Source URL'] + assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2', ''] + assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3', ''] + + def test_save_sources_csv_preserves_transcript_url(self, gc, tmp_path): + """Transcript Source URL must be written back to the CSV.""" + gc.all_sources = [ + { + 'site_name': 'Educational Course', + 'license_type': 'All rights reserved', + 'display_name': 'Example Video', + 'url': 'https://courses.edx.org/videos/example', + 'keywords': 'arm; ai', + 'transcript_source_url': 'https://github.com/arm-education/repo/blob/main/M1KV1.txt' + } + ] + + csv_file = tmp_path / "output.csv" + gc.save_sources_csv(str(csv_file)) + + with open(csv_file, 'r') as f: + rows = list(csv.reader(f)) + + assert rows[0][-1] == 'Transcript Source URL' + assert rows[1] == [ + 'Educational Course', + 'All rights reserved', + 'Example Video', + 'https://courses.edx.org/videos/example', + 'arm; ai', + 'https://github.com/arm-education/repo/blob/main/M1KV1.txt', + ] + + def test_load_existing_sources_preserves_transcript_url(self, gc, tmp_path): + """Loading a CSV with a transcript column should retain it through a save round-trip.""" + csv_file = tmp_path / "sources.csv" + csv_file.write_text( + "Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL\n" + "Educational Course,All rights reserved,Example Video,https://courses.edx.org/videos/example,arm; ai," + "https://github.com/arm-education/repo/blob/main/M1KV1.txt\n" + ) + + gc.load_existing_sources(str(csv_file)) + + assert gc.all_sources[0]['transcript_source_url'] == ( + "https://github.com/arm-education/repo/blob/main/M1KV1.txt" + ) + + gc.save_sources_csv(str(csv_file)) + + with open(csv_file, 'r') as f: + rows = list(csv.reader(f)) + assert rows[1][-1] == "https://github.com/arm-education/repo/blob/main/M1KV1.txt" def test_load_and_save_roundtrip(self, gc, tmp_path): """Test that loading and saving preserves data.""" @@ -634,6 +684,37 @@ def test_read_csv_basic(self, gc, tmp_path): assert csv_dict['site_names'] == ['Site1', 'Site2'] assert csv_dict['license_types'] == ['MIT', 'Apache'] + def test_read_csv_transcript_urls(self, gc, tmp_path): + """Test that the optional Transcript Source URL column is read.""" + csv_file = tmp_path / "transcript.csv" + csv_file.write_text( + "Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL\n" + "Educational Course,All rights reserved,Video1,https://courses.edx.org/v/1,key1," + "https://github.com/arm-education/repo/blob/main/M1KV1.txt\n" + "Site2,Apache,Display2,https://example.com/2,key2,\n" + ) + + csv_dict, length = gc.readInCSV(str(csv_file)) + + assert length == 2 + assert csv_dict['transcript_urls'] == [ + "https://github.com/arm-education/repo/blob/main/M1KV1.txt", + "", + ] + + def test_read_csv_transcript_missing_column(self, gc, tmp_path): + """A CSV without the transcript column should yield empty transcript URLs.""" + csv_file = tmp_path / "no_transcript.csv" + csv_file.write_text( + "Site Name,License Type,Display Name,URL,Keywords\n" + "Site1,MIT,Display1,https://example.com/1,key1\n" + ) + + csv_dict, length = gc.readInCSV(str(csv_file)) + + assert length == 1 + assert csv_dict['transcript_urls'] == [""] + def test_read_csv_empty(self, gc, tmp_path): """Test reading an empty CSV (header only).""" csv_file = tmp_path / "empty.csv" @@ -870,6 +951,95 @@ def fake_fetch(url): assert "target CPU for Cortex-A builds" in chunks[1].content +class TestCreateTranscriptChunks: + """Tests for transcript-backed sources (Transcript Source URL column).""" + + def _transcript_response(self, url, text): + return SimpleNamespace( + url=url, + content=text.encode("utf-8"), + headers={"content-type": "text/plain"}, + ) + + def test_transcript_used_instead_of_primary_url(self, gc, monkeypatch): + """When a transcript URL is set, content is fetched from the transcript.""" + source_url = "https://courses.edx.org/videos/block-v1:example+type@video+block@abc" + transcript_url = "https://github.com/arm-education/repo/blob/main/M1KV1.txt" + raw_transcript_url = ( + "https://raw.githubusercontent.com/arm-education/repo/main/M1KV1.txt" + ) + transcript_text = ( + "Arm processors deliver strong energy efficiency for AI inference workloads. " + * 60 + ) + + fetched_urls = [] + + def fake_fetch(url): + fetched_urls.append(url) + return self._transcript_response(raw_transcript_url, transcript_text) + + monkeypatch.setattr(gc, "fetch_with_logging", fake_fetch) + + chunks = gc.create_chunks_for_source( + source_url=source_url, + source_name="Energy efficiency for AI inference", + doc_type="Educational Course", + keywords_value="arm; ai; inference", + transcript_url=transcript_url, + ) + + # The GitHub blob URL must be resolved to a raw fetch URL. + assert fetched_urls == [raw_transcript_url] + assert len(chunks) >= 1 + # User-facing link stays the primary source URL, not the transcript. + assert all(chunk.url == source_url for chunk in chunks) + assert all(chunk.doc_type == "Educational Course" for chunk in chunks) + assert "energy efficiency" in chunks[0].content.lower() + + def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch): + """A failed transcript fetch should return no chunks.""" + monkeypatch.setattr(gc, "fetch_with_logging", lambda url: None) + + chunks = gc.create_chunks_for_source( + source_url="https://courses.edx.org/videos/example", + source_name="Example", + doc_type="Educational Course", + keywords_value="arm", + transcript_url="https://github.com/arm-education/repo/blob/main/missing.txt", + ) + + assert chunks == [] + + def test_blank_transcript_falls_back_to_primary_url(self, gc, monkeypatch): + """A blank transcript URL should not trigger transcript chunking.""" + captured = {} + + def fake_generic(source_url, source_name, doc_type, keywords_value): + captured["called"] = True + return ["sentinel"] + + # create_chunks_for_source dispatches to the generic path via fetch; here we + # short-circuit by patching fetch to confirm the transcript branch is skipped. + def fake_fetch(url): + captured["fetched"] = url + return None + + monkeypatch.setattr(gc, "fetch_with_logging", fake_fetch) + + chunks = gc.create_chunks_for_source( + source_url="https://learn.arm.com/learning-paths/example/", + source_name="Example", + doc_type="Learning Paths", + keywords_value="arm", + transcript_url=" ", + ) + + # Transcript branch skipped -> generic path attempted a fetch of the primary URL. + assert "fetched" in captured + assert chunks == [] + + class TestCreateRetrySession: """Tests for create_retry_session function.""" From 26da406f6406a68178235f24c05f4d55836f9671 Mon Sep 17 00:00:00 2001 From: Neethu Elizabeth Simon Date: Mon, 29 Jun 2026 13:47:02 -0700 Subject: [PATCH 2/2] fix: copilot comments --- embedding-generation/generate-chunks.py | 9 ++++++--- .../tests/test_generate_chunks.py | 16 ++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index a09e6ef..9939f08 100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -894,17 +894,20 @@ def create_transcript_chunks(source_url, transcript_url, source_name, doc_type, fetch_url = source_to_fetch_url(normalize_source_url(transcript_url)) response = fetch_with_logging(fetch_url) if response is None: - print('transcript not valid, ', fetch_url) + print(f"[TRANSCRIPT FETCH FAILED] for {normalized_source_url} (transcript: {transcript_url}): {fetch_url}") return [] keywords = parse_keywords(keywords_value, source_name) parsed_document = parse_document_content( - source_url=normalized_source_url, + source_url=response.url, resolved_url=response.url, response_content=response.content, content_type=response.headers.get("content-type", ""), fallback_title=source_name, ) + # Keep the primary URL as the user-facing link while still using the transcript + # URL as the base for resolving any relative links inside the transcript content. + parsed_document.source_url = normalized_source_url chunks = [] for payload in chunk_parsed_document(parsed_document, doc_type=doc_type or "Transcript", keywords=keywords): @@ -941,7 +944,7 @@ def create_chunks_for_source(source_url, source_name, doc_type, keywords_value, fetch_url = source_to_fetch_url(normalized_source_url) response = fetch_with_logging(fetch_url) if response is None: - print('not valid, ', fetch_url) + print(f"Fetch failed for {normalized_source_url}: {fetch_url}") return [] sources_to_parse = [(normalized_source_url, response)] diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py index 95cfb70..0ada0ce 100644 --- a/embedding-generation/tests/test_generate_chunks.py +++ b/embedding-generation/tests/test_generate_chunks.py @@ -997,8 +997,8 @@ def fake_fetch(url): assert all(chunk.doc_type == "Educational Course" for chunk in chunks) assert "energy efficiency" in chunks[0].content.lower() - def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch): - """A failed transcript fetch should return no chunks.""" + def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch, capsys): + """A failed transcript fetch should return no chunks and log both URLs.""" monkeypatch.setattr(gc, "fetch_with_logging", lambda url: None) chunks = gc.create_chunks_for_source( @@ -1011,14 +1011,18 @@ def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch): assert chunks == [] + captured = capsys.readouterr() + assert "TRANSCRIPT FETCH FAILED" in captured.out + # Both the primary source URL and the transcript URL aid batch troubleshooting. + assert "https://courses.edx.org/videos/example" in captured.out + assert "https://github.com/arm-education/repo/blob/main/missing.txt" in captured.out + # The resolved raw fetch URL should also be logged. + assert "raw.githubusercontent.com/arm-education/repo/main/missing.txt" in captured.out + def test_blank_transcript_falls_back_to_primary_url(self, gc, monkeypatch): """A blank transcript URL should not trigger transcript chunking.""" captured = {} - def fake_generic(source_url, source_name, doc_type, keywords_value): - captured["called"] = True - return ["sentinel"] - # create_chunks_for_source dispatches to the generic path via fetch; here we # short-circuit by patching fetch to confirm the transcript branch is skipped. def fake_fetch(url):