Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions embedding-generation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,29 @@ The Dockerfile:
Add one row to `vector-db-sources.csv` for each document:

```csv
Site Name,License Type,Display Name,URL,Keywords
Example Docs,CC4.0,Example Arm Guide,https://example.com/arm-guide,arm; migration; linux
Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL
Example Docs,CC4.0,Example Arm Guide,https://example.com/arm-guide,arm; migration; linux,
```

Use clear keywords that users might include in questions. The `URL` is also what retrieval eval uses for expected matches.

### Transcript-backed sources

Some sources (for example, edX course videos) do not have directly chunkable text
at their primary `URL`. For these, populate the optional `Transcript Source URL`
column with a link to a plain-text or Markdown transcript (such as a GitHub
`.../blob/...` file). When `Transcript Source URL` is set, `generate-chunks.py`
fetches and chunks the transcript instead of the primary `URL`, but keeps the
primary `URL` as the user-facing link returned by retrieval:

```csv
Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL
Educational Course,All rights reserved,Example Video,https://courses.edx.org/videos/...,arm; ai; inference,https://github.com/arm-education/.../M1KV1.txt
```

Leave the column empty for sources that are chunked from their primary `URL`.


## Test Locally

Install dependencies once:
Expand Down
74 changes: 66 additions & 8 deletions embedding-generation/generate-chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,14 @@ def load_existing_sources(csv_file):
'license_type': row.get('License Type', ''),
'display_name': row.get('Display Name', ''),
'url': url,
'keywords': row.get('Keywords', '')
'keywords': row.get('Keywords', ''),
'transcript_source_url': (row.get('Transcript Source URL', '') or '').strip()
})

print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'")


def register_source(site_name, license_type, display_name, url, keywords):
def register_source(site_name, license_type, display_name, url, keywords, transcript_source_url=''):
"""
Register a new source URL. If the URL already exists, skip it.
Returns True if the source was added, False if it was a duplicate.
Expand All @@ -173,7 +174,8 @@ def register_source(site_name, license_type, display_name, url, keywords):
'license_type': license_type,
'display_name': display_name,
'url': url,
'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords)
'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords),
'transcript_source_url': (transcript_source_url or '').strip()
}

# Keep discovered sources grouped with their existing site section instead of
Expand All @@ -198,14 +200,15 @@ def save_sources_csv(csv_file):
"""
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords'])
writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords', 'Transcript Source URL'])
for source in all_sources:
writer.writerow([
source['site_name'],
source['license_type'],
source['display_name'],
source['url'],
source['keywords']
source['keywords'],
source.get('transcript_source_url', '')
])

print(f"Saved {len(all_sources)} sources to '{csv_file}'")
Expand Down Expand Up @@ -728,6 +731,7 @@ def readInCSV(csv_file):
'source_names': [],
'site_names': [],
'license_types': [],
'transcript_urls': [],
}

if not os.path.exists(csv_file):
Expand All @@ -741,6 +745,7 @@ def readInCSV(csv_file):
csv_dict['source_names'].append(row.get('Display Name', ''))
csv_dict['site_names'].append(row.get('Site Name', ''))
csv_dict['license_types'].append(row.get('License Type', ''))
csv_dict['transcript_urls'].append((row.get('Transcript Source URL', '') or '').strip())

return csv_dict, len(csv_dict['urls'])

Expand Down Expand Up @@ -877,7 +882,59 @@ def parse_keywords(keywords_value, title=""):
return keywords


def create_chunks_for_source(source_url, source_name, doc_type, keywords_value):
def create_transcript_chunks(source_url, transcript_url, source_name, doc_type, keywords_value):
    """Build chunks from a transcript while attributing them to the primary source.

    The transcript referenced by ``transcript_url`` is fetched and chunked, but
    every resulting chunk carries the normalized ``source_url`` as its website
    URL, so the primary source stays the link attached to each chunk.

    Args:
        source_url: Primary URL of the source (attached to every chunk).
        transcript_url: URL of the transcript document to fetch and chunk.
        source_name: Display name; used as the fallback title and passed to
            ``parse_keywords``.
        doc_type: Document type forwarded to the chunker; ``"Transcript"`` is
            used when this value is falsy.
        keywords_value: Raw keywords value handed to ``parse_keywords``.

    Returns:
        A list with one ``createChunk`` result per payload produced by
        ``chunk_parsed_document``, or an empty list when the transcript fetch
        returns ``None``.
    """
    primary_url = normalize_source_url(source_url)
    transcript_fetch_url = source_to_fetch_url(normalize_source_url(transcript_url))

    response = fetch_with_logging(transcript_fetch_url)
    if response is None:
        # Log the primary URL, the transcript URL as given, and the URL that was
        # actually requested so a failed row can be traced end to end.
        print(f"[TRANSCRIPT FETCH FAILED] for {primary_url} (transcript: {transcript_url}): {transcript_fetch_url}")
        return []

    keywords = parse_keywords(keywords_value, source_name)

    # Parse using the transcript's own resolved URL, then re-point the parsed
    # document at the primary URL before chunking.
    document = parse_document_content(
        source_url=response.url,
        resolved_url=response.url,
        response_content=response.content,
        content_type=response.headers.get("content-type", ""),
        fallback_title=source_name,
    )
    document.source_url = primary_url

    payloads = chunk_parsed_document(
        document,
        doc_type=doc_type or "Transcript",
        keywords=keywords,
    )
    return [
        createChunk(
            text_snippet=piece["content"],
            WEBSITE_url=primary_url,
            keywords=keywords,
            title=piece["title"] or source_name,
            heading=piece["heading"],
            heading_path=piece["heading_path"],
            doc_type=piece["doc_type"],
            product=piece["product"],
            version=piece["version"],
            resolved_url=response.url,
            content_type=piece["content_type"],
        )
        for piece in payloads
    ]


def create_chunks_for_source(source_url, source_name, doc_type, keywords_value, transcript_url=""):
# When a transcript source is provided, chunk the transcript text instead of
# fetching the primary URL (which may be a video player or other non-text page),
# while keeping the primary URL as the user-facing link for retrieval.
if transcript_url and transcript_url.strip():
return create_transcript_chunks(source_url, transcript_url, source_name, doc_type, keywords_value)
if doc_type == "Ecosystem Dashboard":
return create_ecosystem_dashboard_chunk(source_url, source_name, keywords_value)
if is_arm_developer_documentation_url(source_url):
Expand All @@ -887,7 +944,7 @@ def create_chunks_for_source(source_url, source_name, doc_type, keywords_value):
fetch_url = source_to_fetch_url(normalized_source_url)
response = fetch_with_logging(fetch_url)
if response is None:
print('not valid, ', fetch_url)
print(f"Fetch failed for {normalized_source_url}: {fetch_url}")
return []

sources_to_parse = [(normalized_source_url, response)]
Expand Down Expand Up @@ -1104,8 +1161,9 @@ def main():
source_name = csv_dict['source_names'][i]
doc_type = csv_dict['site_names'][i]
keywords_value = csv_dict['focus'][i]
transcript_url = csv_dict['transcript_urls'][i]

for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value):
for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value, transcript_url):
chunkSaveAndTrack(url, chunk)

# Save updated sources CSV with all discovered sources
Expand Down
180 changes: 177 additions & 3 deletions embedding-generation/tests/test_generate_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,9 +422,59 @@ def test_save_sources_csv(self, gc, tmp_path):
reader = csv.reader(f)
rows = list(reader)

assert rows[0] == ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']
assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2']
assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3']
assert rows[0] == ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords', 'Transcript Source URL']
assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2', '']
assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3', '']

def test_save_sources_csv_preserves_transcript_url(self, gc, tmp_path):
    """A populated transcript_source_url is written out as the final CSV column."""
    source = dict(
        site_name='Educational Course',
        license_type='All rights reserved',
        display_name='Example Video',
        url='https://courses.edx.org/videos/example',
        keywords='arm; ai',
        transcript_source_url='https://github.com/arm-education/repo/blob/main/M1KV1.txt',
    )
    gc.all_sources = [source]

    output_path = tmp_path / "output.csv"
    gc.save_sources_csv(str(output_path))

    with open(output_path, 'r') as handle:
        written = list(csv.reader(handle))

    # The header gains the new column and the data row carries the transcript link.
    expected_row = [
        'Educational Course',
        'All rights reserved',
        'Example Video',
        'https://courses.edx.org/videos/example',
        'arm; ai',
        'https://github.com/arm-education/repo/blob/main/M1KV1.txt',
    ]
    assert written[0][-1] == 'Transcript Source URL'
    assert written[1] == expected_row

def test_load_existing_sources_preserves_transcript_url(self, gc, tmp_path):
    """The transcript column survives a load followed by a save of the same file."""
    transcript_link = "https://github.com/arm-education/repo/blob/main/M1KV1.txt"
    header_line = "Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL"
    data_line = (
        "Educational Course,All rights reserved,Example Video,"
        "https://courses.edx.org/videos/example,arm; ai," + transcript_link
    )

    sources_path = tmp_path / "sources.csv"
    sources_path.write_text(header_line + "\n" + data_line + "\n")

    # Load: the value must land on the in-memory source record.
    gc.load_existing_sources(str(sources_path))
    assert gc.all_sources[0]['transcript_source_url'] == transcript_link

    # Save: the value must be written back as the last column of the data row.
    gc.save_sources_csv(str(sources_path))
    with open(sources_path, 'r') as handle:
        written = list(csv.reader(handle))
    assert written[1][-1] == transcript_link

def test_load_and_save_roundtrip(self, gc, tmp_path):
"""Test that loading and saving preserves data."""
Expand Down Expand Up @@ -634,6 +684,37 @@ def test_read_csv_basic(self, gc, tmp_path):
assert csv_dict['site_names'] == ['Site1', 'Site2']
assert csv_dict['license_types'] == ['MIT', 'Apache']

def test_read_csv_transcript_urls(self, gc, tmp_path):
    """readInCSV exposes the optional Transcript Source URL column per row."""
    transcript_link = "https://github.com/arm-education/repo/blob/main/M1KV1.txt"
    csv_lines = [
        "Site Name,License Type,Display Name,URL,Keywords,Transcript Source URL",
        "Educational Course,All rights reserved,Video1,https://courses.edx.org/v/1,key1,"
        + transcript_link,
        # Second row leaves the transcript column empty.
        "Site2,Apache,Display2,https://example.com/2,key2,",
    ]
    source_file = tmp_path / "transcript.csv"
    source_file.write_text("".join(line + "\n" for line in csv_lines))

    parsed, row_count = gc.readInCSV(str(source_file))

    assert row_count == 2
    assert parsed['transcript_urls'] == [transcript_link, ""]

def test_read_csv_transcript_missing_column(self, gc, tmp_path):
    """Rows from a CSV lacking the transcript column get an empty transcript URL."""
    legacy_lines = (
        "Site Name,License Type,Display Name,URL,Keywords",
        "Site1,MIT,Display1,https://example.com/1,key1",
    )
    source_file = tmp_path / "no_transcript.csv"
    source_file.write_text("".join(line + "\n" for line in legacy_lines))

    parsed, row_count = gc.readInCSV(str(source_file))

    assert row_count == 1
    assert parsed['transcript_urls'] == [""]

def test_read_csv_empty(self, gc, tmp_path):
"""Test reading an empty CSV (header only)."""
csv_file = tmp_path / "empty.csv"
Expand Down Expand Up @@ -870,6 +951,99 @@ def fake_fetch(url):
assert "target CPU for Cortex-A builds" in chunks[1].content


class TestCreateTranscriptChunks:
    """Covers sources whose text comes from the Transcript Source URL column."""

    def _transcript_response(self, url, text):
        """Return a minimal stand-in for a fetched plain-text response."""
        response = SimpleNamespace()
        response.url = url
        response.content = text.encode("utf-8")
        response.headers = {"content-type": "text/plain"}
        return response

    def test_transcript_used_instead_of_primary_url(self, gc, monkeypatch):
        """With a transcript URL set, the transcript is what gets fetched and chunked."""
        source_url = "https://courses.edx.org/videos/block-v1:example+type@video+block@abc"
        transcript_url = "https://github.com/arm-education/repo/blob/main/M1KV1.txt"
        raw_transcript_url = "https://raw.githubusercontent.com/arm-education/repo/main/M1KV1.txt"
        sentence = "Arm processors deliver strong energy efficiency for AI inference workloads. "
        transcript_text = sentence * 60

        requested = []

        def recording_fetch(url):
            requested.append(url)
            return self._transcript_response(raw_transcript_url, transcript_text)

        monkeypatch.setattr(gc, "fetch_with_logging", recording_fetch)

        chunks = gc.create_chunks_for_source(
            source_url=source_url,
            source_name="Energy efficiency for AI inference",
            doc_type="Educational Course",
            keywords_value="arm; ai; inference",
            transcript_url=transcript_url,
        )

        # Exactly one fetch, and it targets the raw form of the GitHub blob URL.
        assert requested == [raw_transcript_url]
        assert len(chunks) >= 1
        for chunk in chunks:
            # Chunks point users at the primary source, never the transcript.
            assert chunk.url == source_url
            assert chunk.doc_type == "Educational Course"
        assert "energy efficiency" in chunks[0].content.lower()

    def test_transcript_fetch_failure_returns_empty(self, gc, monkeypatch, capsys):
        """When the transcript cannot be fetched, no chunks come back and the URLs are logged."""

        def failing_fetch(url):
            return None

        monkeypatch.setattr(gc, "fetch_with_logging", failing_fetch)

        chunks = gc.create_chunks_for_source(
            source_url="https://courses.edx.org/videos/example",
            source_name="Example",
            doc_type="Educational Course",
            keywords_value="arm",
            transcript_url="https://github.com/arm-education/repo/blob/main/missing.txt",
        )

        assert chunks == []

        logged = capsys.readouterr().out
        expected_fragments = (
            "TRANSCRIPT FETCH FAILED",
            # Primary source URL and transcript URL, for batch troubleshooting.
            "https://courses.edx.org/videos/example",
            "https://github.com/arm-education/repo/blob/main/missing.txt",
            # The resolved raw fetch URL.
            "raw.githubusercontent.com/arm-education/repo/main/missing.txt",
        )
        for fragment in expected_fragments:
            assert fragment in logged

    def test_blank_transcript_falls_back_to_primary_url(self, gc, monkeypatch):
        """A whitespace-only transcript URL must not route through transcript chunking."""
        attempted = []

        # Returning None from the fetch ends the call early; all this test needs
        # to observe is that a fetch was attempted at all.
        def recording_fetch(url):
            attempted.append(url)
            return None

        monkeypatch.setattr(gc, "fetch_with_logging", recording_fetch)

        chunks = gc.create_chunks_for_source(
            source_url="https://learn.arm.com/learning-paths/example/",
            source_name="Example",
            doc_type="Learning Paths",
            keywords_value="arm",
            transcript_url="   ",
        )

        # A recorded fetch shows the call went on to fetch after skipping the transcript branch.
        assert len(attempted) > 0
        assert chunks == []


class TestCreateRetrySession:
"""Tests for create_retry_session function."""

Expand Down
Loading