From d413edb3f861041bf7ae89605af7eaa6fdb5f823 Mon Sep 17 00:00:00 2001 From: Andrew Pickard Date: Mon, 18 May 2026 17:17:53 +0100 Subject: [PATCH 1/6] Added the ability to search developer.arm.com and include the search results as embeddings --- embedding-generation/Dockerfile | 3 +- embedding-generation/generate-chunks.py | 199 +++++++++++++++++++++++- embedding-generation/requirements.txt | 1 + 3 files changed, 200 insertions(+), 3 deletions(-) diff --git a/embedding-generation/Dockerfile b/embedding-generation/Dockerfile index 4909565..7c2dc3e 100644 --- a/embedding-generation/Dockerfile +++ b/embedding-generation/Dockerfile @@ -47,7 +47,8 @@ COPY requirements.txt . COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks # Install Python dependencies (force CPU-only torch) -RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt +RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt && \ + playwright install --with-deps chromium # Pre-download the embedding model so local/offline loads succeed later in the build. RUN python3 -c "from sentence_transformers import SentenceTransformer; import os; SentenceTransformer(os.environ['SENTENCE_TRANSFORMER_MODEL'], cache_folder=os.environ['SENTENCE_TRANSFORMERS_HOME'])" diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index fa29a6a..79d36ba 100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -27,7 +27,11 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from urllib.parse import parse_qs, urlparse +from urllib.parse import parse_qs, urlparse, quote +from dataclasses import dataclass +from typing import Any, Dict, List, Optional +from playwright.async_api import async_playwright +import asyncio from document_chunking import ( arm_service_url_to_developer_url, @@ -526,6 +530,194 @@ def htmlToMarkdown(html_string): ''' +@dataclass +class CapturedSearchRequest: + url: str + method: str + headers: Dict[str, str] + post_data: Optional[str] + response_json: Dict[str, Any] + +async def capture_DeveloperArmComSearch(page_url: str) -> CapturedSearchRequest: + apicount = 0 + def is_search_response(resp) -> bool: + nonlocal apicount + # print("Testing url "+str(resp.request.method.upper())+" "+str(resp.url)) + if "coveo.com/rest/search/v2" in resp.url and "querySuggest" not in resp.url: + apicount += 1 + return ( + resp.request.method.upper() in {"GET", "POST"} + and resp.status == 200 + and apicount == 2 + ) + else: + return False + + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + + async with page.expect_response(is_search_response, timeout=30_000) as response_info: + await page.goto(page_url, wait_until="domcontentloaded") + + response = await response_info.value + data = await response.json() + await browser.close() + + if not(isinstance(data, dict) and "results" in data): + raise RuntimeError("No search API response was captured. ") + else: + return CapturedSearchRequest( + url=response.url, + method=response.request.method, + headers=dict(response.request.headers), + post_data=response.request.post_data, + response_json=data, + ) + +def replay_DeveloperArmComSearch( + captured: CapturedSearchRequest, + query: str, + first_result: int = 0, + number_of_results: int = 48, +) -> Dict[str, Any]: + + def _merge_headers(base_headers: Dict[str, str]) -> Dict[str, str]: + keep = {} + drop = {"host", "content-length", "accept-encoding", "connection", "origin", "referer"} + for k, v in base_headers.items(): + if k.lower() not in drop: + keep[k] = v + keep.setdefault("accept", "application/json, text/plain, */*") + keep.setdefault("content-type", "application/json") + keep.setdefault("user-agent", "Mozilla/5.0") + return keep + + if not captured.post_data: + raise RuntimeError("Captured request had no POST body to replay.") + + body = json.loads(captured.post_data) + body["q"] = query + body["firstResult"] = first_result + body["numberOfResults"] = number_of_results + headers = _merge_headers(captured.headers) + + r = requests.post(captured.url, headers=headers, json=body, timeout=60) + r.raise_for_status() + return r.json() + +def getDeveloperArmComSearchResults(searchterm: str, searchurl: str, maxitems: int = 20000): + + def extract_result(item: Dict[str, Any]) -> Dict[str, Any]: + return { + "title": item.get("title") or item.get("raw", {}).get("title"), + "url": item.get("clickUri") or item.get("uri") or item.get("url"), + "type": item.get("raw", {}).get("navigationhierarchiescontenttype"), + "products": item.get("raw", {}).get("navigationhierarchiesproducts"), + "objecttype": item.get("raw", {}).get("objecttype"), + "keywords": item.get("raw", {}).get("navigationhierarchiestopics") + } + + print('Searching developer.arm.com for "'+searchterm+'"') + captured = asyncio.run(capture_DeveloperArmComSearch(searchurl)) + + all_rows = [] + finished = False + page_size = 48 + start = 0 + while (len(all_rows) < maxitems) and not finished: + payload = replay_DeveloperArmComSearch( + captured, + query=searchterm, + first_result=start, + number_of_results=page_size, + ) + + items = [extract_result(x) for x in payload["results"]] + all_rows.extend(items) + finished = len(payload["results"]) < page_size + start += page_size + print("Found "+str(len(all_rows))+" results") + return all_rows + +def processDeveloperArmCom(url, title, type, keywords, emit_chunks=True): + + def chunkizeLearningPath(url, title, keywords): + if not emit_chunks: + return + + response = fetch_with_logging(url) + if response is None: + return + parsed_document = parse_document_content( + source_url=url, + resolved_url=url, + response_content=response.content, + content_type=response.headers.get("content-type", "text/html"), + fallback_title=title, + ) + chunk_payloads = chunk_parsed_document( + parsed_document, + doc_type=type, + keywords=keywords, + ) + + # 5) Create chunks for each snippet by adding metadata + for payload in chunk_payloads: + chunk = createChunk( + payload["content"], + url, + keywords, + payload["title"], + heading=payload["heading"], + heading_path=payload["heading_path"], + doc_type=payload["doc_type"], + product=payload["product"], + version=payload["version"], + resolved_url=payload["resolved_url"], + content_type=payload["content_type"], + ) + chunkSaveAndTrack(url,chunk) + + + response = http_session.get(url, timeout=60) + soup = BeautifulSoup(response.text, 'html.parser') + + itemtitle = 'Arm '+type+' - '+(blogtitle.get_text() if (blogtitle := soup.find(id='blog-title')) else title) + itemdate = blogdate.get_text() if (blogdate := soup.find(id='blog-date')) else '' + + # Register this learning path as a source + register_source( + site_name='Arm Developer', + license_type='Copyright Arm', + display_name=itemtitle, + url=url, + keywords=keywords + ) + chunkizeLearningPath(url,itemtitle,keywords) + +def createDeveloperArmComChunks(emit_chunks=True): + search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype=" + content_types = [ + "Blog Post" + "Developer Guide", + "Guide", + "Programmer's Guide", + "Migration Guide", + "Getting Started Guide" + ] + + search_url = search_base+",".join([quote(x) for x in content_types])+"&q=" + for searchterm in ["SME"]: + pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm) + for i, page in enumerate(pages): + keywords = list(set( [searchterm] + + [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] + + [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]])) + processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks) + + + def processLearningPath(url, type, emit_chunks=True): github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/production/content" site_link = "https://learn.arm.com" @@ -1080,7 +1272,10 @@ def main(): # b) Ecosystem Dashboard createEcosystemDashboardChunks(emit_chunks=False) - # c) Intrinsics + # c) Developer.Arm.Com + createDeveloperArmComChunks(emit_chunks=False) + + # d) Intrinsics #createIntrinsicsDatabaseChunks() # 1) Get URLs and details from CSV diff --git a/embedding-generation/requirements.txt b/embedding-generation/requirements.txt index f6846d7..884dec2 100644 --- a/embedding-generation/requirements.txt +++ b/embedding-generation/requirements.txt @@ -6,3 +6,4 @@ boto3 sentence-transformers pypdf rank-bm25 +playwright From cc76fb48f186f6513df4dd28c490a3c41c74a7fd Mon Sep 17 00:00:00 2001 From: Andrew Pickard Date: Fri, 29 May 2026 15:39:57 +0100 Subject: [PATCH 2/6] Only include developer.arm.com content that is relevant for SME --- embedding-generation/generate-chunks.py | 60 ++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index 79d36ba..4bc1351 100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -613,6 +613,7 @@ def extract_result(item: Dict[str, Any]) -> Dict[str, Any]: "title": item.get("title") or item.get("raw", {}).get("title"), "url": item.get("clickUri") or item.get("uri") or item.get("url"), "type": item.get("raw", {}).get("navigationhierarchiescontenttype"), + "author": item.get("raw", {}).get("author") or item.get("raw", {}).get("sysauthor"), "products": item.get("raw", {}).get("navigationhierarchiesproducts"), "objecttype": item.get("raw", {}).get("objecttype"), "keywords": item.get("raw", {}).get("navigationhierarchiestopics") @@ -696,25 +697,64 @@ def chunkizeLearningPath(url, title, keywords): ) chunkizeLearningPath(url,itemtitle,keywords) +def item_is_relevant(item) -> bool: + match item["type"]: + case "Guide": + return item["title"] in { + "What is SME/SME2?", + "Overview of SME", + "Assembly code", + "Streaming SVE", + "Load and Store", + "Z registers", + "Real world examples", + "ZA storage", + "Predication" + } + + case "Programmer's Guide": + for pattern in { + r"/SME-Overview/", + r"/CME", + r"/matmul-fp32", + r"/lut-gemv-rm-int8", + r"/matmul-int8", + r"/gemv-cm-int8.+/", + r"/109246/.*/Introduction(\?|/The.+/)", + r"/Introduction-to-CME", + r"/Toolchains-and-model-support/(?!Quick-start)", + r"/Memory-access.(?!Implications)", + r"/Performance-monitoring", + r"/Matrix-Multiply-Unit" + }: + if re.search(pattern, item["url"]): + return True + return False + + case "Blog Post": + if item["author"] in {"Zenon_Xiu","KhalidS"} and item["title"][0:4] == "Part" and "SME" in item["title"]: + return True + if item["author"] == "mweidmann" and item["title"][0:41] == "Introducing the Scalable Matrix Extension": + return True + return False + def createDeveloperArmComChunks(emit_chunks=True): search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype=" content_types = [ - "Blog Post" - "Developer Guide", + "Blog Post", "Guide", - "Programmer's Guide", - "Migration Guide", - "Getting Started Guide" + "Programmer's Guide" ] search_url = search_base+",".join([quote(x) for x in content_types])+"&q=" for searchterm in ["SME"]: pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm) - for i, page in enumerate(pages): - keywords = list(set( [searchterm] + - [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] + - [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]])) - processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks) + for page in pages: + if item_is_relevant(page): + keywords = list(set( [searchterm] + + [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] + + [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]])) + processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks) From 0c1774e955ff5d308458053bd14158f56644129c Mon Sep 17 00:00:00 2001 From: Andrew Pickard Date: Wed, 17 Jun 2026 16:22:44 +0100 Subject: [PATCH 3/6] Moved developer.arm.com functionality into separate file generate-vectors.py with common functions in generate_common.py --- embedding-generation/Dockerfile | 4 +- embedding-generation/generate-chunks.py | 588 +---------------------- embedding-generation/generate-vectors.py | 335 +++++++++++++ embedding-generation/generate_common.py | 362 ++++++++++++++ 4 files changed, 717 insertions(+), 572 deletions(-) create mode 100644 embedding-generation/generate-vectors.py create mode 100644 embedding-generation/generate_common.py diff --git a/embedding-generation/Dockerfile b/embedding-generation/Dockerfile index 7c2dc3e..6daed68 100644 --- a/embedding-generation/Dockerfile +++ b/embedding-generation/Dockerfile @@ -38,6 +38,7 @@ WORKDIR /embedding-data # Copy Python scripts and dependencies COPY generate-chunks.py . +COPY generate_common.py . COPY document_chunking.py . COPY local_vectorstore_creation.py . COPY vector-db-sources.csv . @@ -47,8 +48,7 @@ COPY requirements.txt . COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks # Install Python dependencies (force CPU-only torch) -RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt && \ - playwright install --with-deps chromium +RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt # Pre-download the embedding model so local/offline loads succeed later in the build. RUN python3 -c "from sentence_transformers import SentenceTransformer; import os; SentenceTransformer(os.environ['SENTENCE_TRANSFORMER_MODEL'], cache_folder=os.environ['SENTENCE_TRANSFORMERS_HOME'])" diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index 4bc1351..1ed10d4 100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -17,21 +17,11 @@ import os import re import uuid -import yaml import csv -import datetime -import boto3 -from botocore.exceptions import NoCredentialsError, ClientError from bs4 import BeautifulSoup import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry -from urllib.parse import parse_qs, urlparse, quote -from dataclasses import dataclass -from typing import Any, Dict, List, Optional -from playwright.async_api import async_playwright -import asyncio +from urllib.parse import parse_qs, urlparse from document_chunking import ( arm_service_url_to_developer_url, @@ -45,58 +35,22 @@ source_to_fetch_url, ) +from generate_common import ( + Chunk, + createChunk, + printChunks, + chunkSaveAndTrack, + fetch_with_logging, + register_source, + save_sources_csv, + load_existing_sources, + get_number_of_sources, + ensure_intrinsic_chunks_from_s3, + yaml_dir, + details_file, + http_session +) -# Create a session with retry logic for resilient HTTP requests -def create_retry_session(retries=5, backoff_factor=1, status_forcelist=(500, 502, 503, 504)): - """Create a requests session with automatic retry on failures.""" - session = requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - allowed_methods=["HEAD", "GET", "OPTIONS"] - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session - -# Global session for all HTTP requests -http_session = create_retry_session() - - -def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks', - s3_bucket='arm-github-copilot-extension', - s3_prefix='embedding_data/intrinsic_chunks/'): - """ - Ensure the local 'intrinsic_chunks' folder exists and is populated with files from S3. - If the folder does not exist, create it and download all files from the S3 prefix. - """ - if not os.path.exists(local_folder): - os.makedirs(local_folder, exist_ok=True) - print(f"Created local folder: {local_folder}") - s3 = boto3.client('s3') - try: - paginator = s3.get_paginator('list_objects_v2') - for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix): - for obj in page.get('Contents', []): - key = obj['Key'] - if key.endswith('/'): - continue # skip folders - filename = os.path.basename(key) - local_path = os.path.join(local_folder, filename) - print(f"Downloading {key} to {local_path}") - s3.download_file(s3_bucket, key, local_path) - except NoCredentialsError: - print("AWS credentials not found. Please configure them.") - except ClientError as e: - print(f"S3 ClientError: {e}") - except Exception as e: - print(f"Unexpected error: {e}") - else: - print(f"Folder '{local_folder}' already exists. Skipping S3 download.") ''' To fix: @@ -104,8 +58,6 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks', 2. Learning Path titles must come from index page...send through function along with Graviton. ''' -yaml_dir = os.getenv('YAML_OUTPUT_DIR', 'yaml_data') -details_file = os.getenv('CHUNK_DETAILS_FILE', 'info/chunk_details.csv') chunk_index = 1 @@ -116,158 +68,14 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks', # multi-megabyte HTML document for every source row. ecosystem_dashboard_entries = None -# Global tracking for vector-db-sources.csv -# Set of URLs already in the CSV (for deduplication) -known_source_urls = set() -# List of all source entries (including existing and new) -# Each entry is a dict: {site_name, license_type, display_name, url, keywords} -all_sources = [] # Increase the file size limit, which defaults to '131,072' csv.field_size_limit(10**9) #1,000,000,000 (1 billion), smaller than 64-bit space but avoids 'python overflowerror' -def load_existing_sources(csv_file): - """ - Load existing sources from vector-db-sources.csv into memory. - Populates known_source_urls set and all_sources list. - """ - global known_source_urls, all_sources - known_source_urls = set() - all_sources = [] - - if not os.path.exists(csv_file): - print(f"Sources file '{csv_file}' does not exist. Starting fresh.") - return - - with open(csv_file, 'r', newline='', encoding='utf-8') as file: - reader = csv.DictReader(file) - for row in reader: - url = row.get('URL', '').strip() - if url: - known_source_urls.add(url) - all_sources.append({ - 'site_name': row.get('Site Name', ''), - 'license_type': row.get('License Type', ''), - 'display_name': row.get('Display Name', ''), - 'url': url, - 'keywords': row.get('Keywords', '') - }) - - print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'") - - -def register_source(site_name, license_type, display_name, url, keywords): - """ - Register a new source URL. If the URL already exists, skip it. - Returns True if the source was added, False if it was a duplicate. - """ - global known_source_urls, all_sources - - # Normalize URL for comparison - url = url.strip() - - if url in known_source_urls: - return False - - known_source_urls.add(url) - source_entry = { - 'site_name': site_name, - 'license_type': license_type, - 'display_name': display_name, - 'url': url, - 'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords) - } - - # Keep discovered sources grouped with their existing site section instead of - # appending them to the very end of the CSV and fragmenting that block. - insert_at = None - for index, existing_source in enumerate(all_sources): - if existing_source.get('site_name') == site_name: - insert_at = index + 1 - - if insert_at is None: - all_sources.append(source_entry) - else: - all_sources.insert(insert_at, source_entry) - print(f"[NEW SOURCE] {display_name}: {url}") - return True -def save_sources_csv(csv_file): - """ - Write all sources (existing + new) to vector-db-sources.csv. - """ - with open(csv_file, 'w', newline='', encoding='utf-8') as file: - writer = csv.writer(file) - writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']) - for source in all_sources: - writer.writerow([ - source['site_name'], - source['license_type'], - source['display_name'], - source['url'], - source['keywords'] - ]) - - print(f"Saved {len(all_sources)} sources to '{csv_file}'") - -class Chunk: - def __init__( - self, - title, - url, - uuid, - keywords, - content, - heading="", - heading_path=None, - doc_type="", - product="", - version="", - resolved_url="", - content_type="", - ): - self.title = title - self.url = url - self.uuid = uuid - self.content = content - self.heading = heading - self.heading_path = heading_path or [] - self.doc_type = doc_type - self.product = product - self.version = version - self.resolved_url = resolved_url - self.content_type = content_type - - # Translate keyword list into comma-separated string, and add similar words to keywords. - self.keywords = self.formatKeywords(keywords) - - def formatKeywords(self, keywords): - """Format keywords list into a lowercase, comma-separated string.""" - return ', '.join(k.strip() for k in keywords).lower() - - # Used to dump into a yaml file without difficulty - def toDict(self): - return { - 'title': self.title, - 'url': self.url, - 'uuid': self.uuid, - 'keywords': self.keywords, - 'content': self.content, - 'heading': self.heading, - 'heading_path': self.heading_path, - 'doc_type': self.doc_type, - 'product': self.product, - 'version': self.version, - 'resolved_url': self.resolved_url, - 'content_type': self.content_type, - } - - def __repr__(self): - return f"Chunk(title={self.title}, url={self.url}, uuid={self.uuid}, heading={self.heading})" - def build_ecosystem_dashboard_entries(): """Load and cache package-level snippets from the ecosystem dashboard.""" global ecosystem_dashboard_entries @@ -529,235 +337,6 @@ def htmlToMarkdown(html_string): ''' - -@dataclass -class CapturedSearchRequest: - url: str - method: str - headers: Dict[str, str] - post_data: Optional[str] - response_json: Dict[str, Any] - -async def capture_DeveloperArmComSearch(page_url: str) -> CapturedSearchRequest: - apicount = 0 - def is_search_response(resp) -> bool: - nonlocal apicount - # print("Testing url "+str(resp.request.method.upper())+" "+str(resp.url)) - if "coveo.com/rest/search/v2" in resp.url and "querySuggest" not in resp.url: - apicount += 1 - return ( - resp.request.method.upper() in {"GET", "POST"} - and resp.status == 200 - and apicount == 2 - ) - else: - return False - - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - async with page.expect_response(is_search_response, timeout=30_000) as response_info: - await page.goto(page_url, wait_until="domcontentloaded") - - response = await response_info.value - data = await response.json() - await browser.close() - - if not(isinstance(data, dict) and "results" in data): - raise RuntimeError("No search API response was captured. ") - else: - return CapturedSearchRequest( - url=response.url, - method=response.request.method, - headers=dict(response.request.headers), - post_data=response.request.post_data, - response_json=data, - ) - -def replay_DeveloperArmComSearch( - captured: CapturedSearchRequest, - query: str, - first_result: int = 0, - number_of_results: int = 48, -) -> Dict[str, Any]: - - def _merge_headers(base_headers: Dict[str, str]) -> Dict[str, str]: - keep = {} - drop = {"host", "content-length", "accept-encoding", "connection", "origin", "referer"} - for k, v in base_headers.items(): - if k.lower() not in drop: - keep[k] = v - keep.setdefault("accept", "application/json, text/plain, */*") - keep.setdefault("content-type", "application/json") - keep.setdefault("user-agent", "Mozilla/5.0") - return keep - - if not captured.post_data: - raise RuntimeError("Captured request had no POST body to replay.") - - body = json.loads(captured.post_data) - body["q"] = query - body["firstResult"] = first_result - body["numberOfResults"] = number_of_results - headers = _merge_headers(captured.headers) - - r = requests.post(captured.url, headers=headers, json=body, timeout=60) - r.raise_for_status() - return r.json() - -def getDeveloperArmComSearchResults(searchterm: str, searchurl: str, maxitems: int = 20000): - - def extract_result(item: Dict[str, Any]) -> Dict[str, Any]: - return { - "title": item.get("title") or item.get("raw", {}).get("title"), - "url": item.get("clickUri") or item.get("uri") or item.get("url"), - "type": item.get("raw", {}).get("navigationhierarchiescontenttype"), - "author": item.get("raw", {}).get("author") or item.get("raw", {}).get("sysauthor"), - "products": item.get("raw", {}).get("navigationhierarchiesproducts"), - "objecttype": item.get("raw", {}).get("objecttype"), - "keywords": item.get("raw", {}).get("navigationhierarchiestopics") - } - - print('Searching developer.arm.com for "'+searchterm+'"') - captured = asyncio.run(capture_DeveloperArmComSearch(searchurl)) - - all_rows = [] - finished = False - page_size = 48 - start = 0 - while (len(all_rows) < maxitems) and not finished: - payload = replay_DeveloperArmComSearch( - captured, - query=searchterm, - first_result=start, - number_of_results=page_size, - ) - - items = [extract_result(x) for x in payload["results"]] - all_rows.extend(items) - finished = len(payload["results"]) < page_size - start += page_size - print("Found "+str(len(all_rows))+" results") - return all_rows - -def processDeveloperArmCom(url, title, type, keywords, emit_chunks=True): - - def chunkizeLearningPath(url, title, keywords): - if not emit_chunks: - return - - response = fetch_with_logging(url) - if response is None: - return - parsed_document = parse_document_content( - source_url=url, - resolved_url=url, - response_content=response.content, - content_type=response.headers.get("content-type", "text/html"), - fallback_title=title, - ) - chunk_payloads = chunk_parsed_document( - parsed_document, - doc_type=type, - keywords=keywords, - ) - - # 5) Create chunks for each snippet by adding metadata - for payload in chunk_payloads: - chunk = createChunk( - payload["content"], - url, - keywords, - payload["title"], - heading=payload["heading"], - heading_path=payload["heading_path"], - doc_type=payload["doc_type"], - product=payload["product"], - version=payload["version"], - resolved_url=payload["resolved_url"], - content_type=payload["content_type"], - ) - chunkSaveAndTrack(url,chunk) - - - response = http_session.get(url, timeout=60) - soup = BeautifulSoup(response.text, 'html.parser') - - itemtitle = 'Arm '+type+' - '+(blogtitle.get_text() if (blogtitle := soup.find(id='blog-title')) else title) - itemdate = blogdate.get_text() if (blogdate := soup.find(id='blog-date')) else '' - - # Register this learning path as a source - register_source( - site_name='Arm Developer', - license_type='Copyright Arm', - display_name=itemtitle, - url=url, - keywords=keywords - ) - chunkizeLearningPath(url,itemtitle,keywords) - -def item_is_relevant(item) -> bool: - match item["type"]: - case "Guide": - return item["title"] in { - "What is SME/SME2?", - "Overview of SME", - "Assembly code", - "Streaming SVE", - "Load and Store", - "Z registers", - "Real world examples", - "ZA storage", - "Predication" - } - - case "Programmer's Guide": - for pattern in { - r"/SME-Overview/", - r"/CME", - r"/matmul-fp32", - r"/lut-gemv-rm-int8", - r"/matmul-int8", - r"/gemv-cm-int8.+/", - r"/109246/.*/Introduction(\?|/The.+/)", - r"/Introduction-to-CME", - r"/Toolchains-and-model-support/(?!Quick-start)", - r"/Memory-access.(?!Implications)", - r"/Performance-monitoring", - r"/Matrix-Multiply-Unit" - }: - if re.search(pattern, item["url"]): - return True - return False - - case "Blog Post": - if item["author"] in {"Zenon_Xiu","KhalidS"} and item["title"][0:4] == "Part" and "SME" in item["title"]: - return True - if item["author"] == "mweidmann" and item["title"][0:41] == "Introducing the Scalable Matrix Extension": - return True - return False - -def createDeveloperArmComChunks(emit_chunks=True): - search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype=" - content_types = [ - "Blog Post", - "Guide", - "Programmer's Guide" - ] - - search_url = search_base+",".join([quote(x) for x in content_types])+"&q=" - for searchterm in ["SME"]: - pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm) - for page in pages: - if item_is_relevant(page): - keywords = list(set( [searchterm] + - [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] + - [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]])) - processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks) - - - def processLearningPath(url, type, emit_chunks=True): github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/production/content" site_link = "https://learn.arm.com" @@ -1004,31 +583,6 @@ def URLIsValidCheck(url): return False -def fetch_with_logging(url): - try: - response = http_session.get(url, timeout=60) - response.raise_for_status() - return response - except requests.exceptions.HTTPError as http_err: - print(f"HTTP error occurred: {http_err}") - with open('info/errors.csv', 'a', newline='') as csvfile: - csv_writer = csv.writer(csvfile) - csv_writer.writerow([url, str(http_err)]) - return None - except Exception as err: - print(f"Other error occurred: {err}") - with open('info/errors.csv', 'a', newline='') as csvfile: - csv_writer = csv.writer(csvfile) - csv_writer.writerow([url, str(err)]) - return None - except Exception as err: - print(f"Other error occurred: {err}") - with open('info/errors.csv', 'a', newline='') as csvfile: - csv_writer = csv.writer(csvfile) - csv_writer.writerow([url,str(err)]) - return False - - def obtainMarkdownContentFromGitHubMDFile(gh_url): response = http_session.get(gh_url, timeout=60) response.raise_for_status() # Ensure we got a valid response @@ -1059,48 +613,6 @@ def obtainTextSnippets__Markdown(content, min_words=300, max_words=500, min_fina return [chunk["content"] for chunk in chunks] -def createChunk( - text_snippet, - WEBSITE_url, - keywords, - title, - heading="", - heading_path=None, - doc_type="", - product="", - version="", - resolved_url="", - content_type="", -): - chunk = Chunk( - title = title, - url = WEBSITE_url, - uuid = str(uuid.uuid4()), - keywords = keywords, - content = text_snippet, - heading = heading, - heading_path = heading_path or [], - doc_type = doc_type, - product = product, - version = version, - resolved_url = resolved_url, - content_type = content_type, - ) - - return chunk - - -def printChunks(chunks): - for chunk_dict in chunks: - print('='*100) - print("Title:", chunk_dict['title']) - print("Keywords:", chunk_dict['keywords']) - print("URL:", chunk_dict['url']) - print("Unique ID:", chunk_dict['uuid']) - print("Content:", chunk_dict['content']) - print('='*100) - - def parse_keywords(keywords_value, title=""): keywords = [keyword.strip() for keyword in re.split(r"[;,]", keywords_value or "") if keyword.strip()] if title and title not in keywords: @@ -1154,7 +666,6 @@ def _arm_topic_links(topic): links.extend(_arm_topic_links(child)) return links - def _arm_metadata_keywords(root_data, keywords_value, source_name): keywords = parse_keywords(keywords_value, source_name) for value in root_data.get("keywords", []) + root_data.get("products", []): @@ -1162,7 +673,6 @@ def _arm_metadata_keywords(root_data, keywords_value, source_name): keywords.append(value) return keywords - def create_arm_documentation_chunks(source_url, source_name, doc_type, keywords_value): root_response = fetch_with_logging(source_to_fetch_url(source_url)) if root_response is None: @@ -1209,65 +719,6 @@ def create_arm_documentation_chunks(source_url, source_name, doc_type, keywords_ return chunks -def chunkSaveAndTrack(url,chunk): - - def addNewRow(current_date,chunk_words,chunk_id): - return [url,current_date,chunk_words,'1',chunk_id] - - def addToExistingRow(row,chunk_words,chunk_id): - url = row[0] # same URL - date = row[1] # same date - words = str(int(row[2]) + chunk_words) # update words - chunks = row[3] = str(int(row[3]) + 1) # update number of chunks - ids = row[4]+ f", {chunk_id}" # update chunk IDs - return [url,date,words,chunks,ids] - - - def recordChunk(): - current_date = datetime.date.today().strftime('%Y-%m-%d') - chunk_words = len(chunk.content.split()) - chunk_id = f'chunk_{chunk.uuid}' - - new_rows = [] - - with open(details_file, mode='r', newline='', encoding='utf-8') as file: - csv_reader = csv.reader(file) - try: - headers = next(csv_reader) - new_rows.append(headers) # keep in memory - except StopIteration: - pass - - url_found = False # Track if the URL is found in any row - - # Loop through all the rows after the header - for row in csv_reader: - if row[0] == url: - new_rows.append(addToExistingRow(row, chunk_words, chunk_id)) # Modify and append the row - url_found = True # Mark that the URL was found - else: - new_rows.append(row) # Append the row without modification - - # If the URL was not found, append a new row - if not url_found: - new_rows.append(addNewRow(current_date, chunk_words, chunk_id)) - - - # Overwrite csv with new info - with open(details_file, mode='w', newline='') as file: - csv_writer = csv.writer(file, delimiter=',') - csv_writer.writerows(new_rows) - - # Save chunk - file_name = f"{yaml_dir}/chunk_{chunk.uuid}.yaml" - with open(file_name, 'w') as file: - yaml.dump(chunk.toDict(), file, default_flow_style=False, sort_keys=False) - - # Record chunk - recordChunk() - print(f"{file_name} === {chunk.title}") - - def main(): skip_discovery = os.getenv("SKIP_DISCOVERY", "").lower() in {"1", "true", "yes"} @@ -1312,10 +763,7 @@ def main(): # b) Ecosystem Dashboard createEcosystemDashboardChunks(emit_chunks=False) - # c) Developer.Arm.Com - createDeveloperArmComChunks(emit_chunks=False) - - # d) Intrinsics + # c) Intrinsics #createIntrinsicsDatabaseChunks() # 1) Get URLs and details from CSV @@ -1334,7 +782,7 @@ def main(): # Save updated sources CSV with all discovered sources save_sources_csv(sources_file) print(f"\n=== Source tracking complete ===") - print(f"Total sources in {sources_file}: {len(all_sources)}") + print(f"Total sources in {sources_file}: {get_number_of_sources()}") if __name__ == "__main__": diff --git a/embedding-generation/generate-vectors.py b/embedding-generation/generate-vectors.py new file mode 100644 index 0000000..da9851a --- /dev/null +++ b/embedding-generation/generate-vectors.py @@ -0,0 +1,335 @@ +# Copyright © 2025, Arm Limited and Contributors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os +import re +import csv + +from bs4 import BeautifulSoup +import requests +from urllib.parse import quote +from dataclasses import dataclass +from typing import Any, Dict, Optional +from playwright.async_api import async_playwright +import asyncio + +from document_chunking import ( + chunk_parsed_document, + parse_document_content, +) + +from generate_common import ( + Chunk, + createChunk, + chunkSaveAndTrack, + fetch_with_logging, + register_source, + save_sources_csv, + load_existing_sources, + get_number_of_sources, + ensure_intrinsic_chunks_from_s3, + yaml_dir, + details_file, + http_session +) + + +@dataclass +class CapturedSearchRequest: + url: str + method: str + headers: Dict[str, str] + post_data: Optional[str] + response_json: Dict[str, Any] + +async def capture_DeveloperArmComSearch(page_url: str) -> CapturedSearchRequest: + apicount = 0 + def is_search_response(resp) -> bool: + nonlocal apicount + if "coveo.com/rest/search/v2" in resp.url and "querySuggest" not in resp.url: + apicount += 1 + return ( + resp.request.method.upper() == "POST" + and resp.request.post_data is not None + and resp.status == 200 + and apicount > 1 + ) + else: + return False + + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + try: + async with page.expect_response(is_search_response, timeout=30_000) as response_info: + await page.goto(page_url, wait_until="domcontentloaded") + + response = await response_info.value + data = await response.json() + finally: + await browser.close() + + if not(isinstance(data, dict) and "results" in data): + raise RuntimeError("No search API response was captured. ") + else: + return CapturedSearchRequest( + url=response.url, + method=response.request.method, + headers=dict(response.request.headers), + post_data=response.request.post_data, + response_json=data, + ) + +def replay_DeveloperArmComSearch( + captured: CapturedSearchRequest, + query: str, + first_result: int = 0, + number_of_results: int = 48, +) -> Dict[str, Any]: + + def _merge_headers(base_headers: Dict[str, str]) -> Dict[str, str]: + keep = {} + drop = {"host", "content-length", "accept-encoding", "connection", "origin", "referer", "cookie"} + for k, v in base_headers.items(): + if k.lower() not in drop: + keep[k] = v + keep.setdefault("accept", "application/json, text/plain, */*") + keep.setdefault("content-type", "application/json") + keep.setdefault("user-agent", "Mozilla/5.0") + return keep + + if not captured.post_data: + raise RuntimeError("Captured request had no POST body to replay.") + + body = json.loads(captured.post_data) + body["q"] = query + body["firstResult"] = first_result + body["numberOfResults"] = number_of_results + headers = _merge_headers(captured.headers) + + r = requests.post(captured.url, headers=headers, json=body, timeout=60) + r.raise_for_status() + return r.json() + +def getDeveloperArmComSearchResults(searchterm: str, searchurl: str, maxitems: int = 20000): + + def extract_result(item: Dict[str, Any]) -> Dict[str, Any]: + return { + "title": item.get("title") or item.get("raw", {}).get("title"), + "url": item.get("clickUri") or item.get("uri") or item.get("url"), + "type": item.get("raw", {}).get("navigationhierarchiescontenttype"), + "author": item.get("raw", {}).get("author") or item.get("raw", {}).get("sysauthor"), + "products": item.get("raw", {}).get("navigationhierarchiesproducts"), + "objecttype": item.get("raw", {}).get("objecttype"), + "keywords": item.get("raw", {}).get("navigationhierarchiestopics") + } + + print('Searching developer.arm.com for "'+searchterm+'"') + captured = asyncio.run(capture_DeveloperArmComSearch(searchurl)) + + all_rows = [] + finished = False + page_size = 48 + start = 0 + while (len(all_rows) < maxitems) and not finished: + payload = replay_DeveloperArmComSearch( + captured, + query=searchterm, + first_result=start, + number_of_results=page_size, + ) + + items = [extract_result(x) for x in payload["results"]] + all_rows.extend(items) + finished = len(payload["results"]) < page_size + start += page_size + print("Found "+str(len(all_rows))+" results") + return all_rows + +def processDeveloperArmCom(url, title, type, keywords, emit_chunks=True): + + def chunkizeLearningPath(url, title, keywords): + if not emit_chunks: + return + + response = fetch_with_logging(url) + if response is None: + return + parsed_document = parse_document_content( + source_url=url, + resolved_url=url, + response_content=response.content, + content_type=response.headers.get("content-type", "text/html"), + fallback_title=title, + ) + chunk_payloads = chunk_parsed_document( + parsed_document, + doc_type=type, + keywords=keywords, + ) + + # 5) Create chunks for each snippet by adding metadata + for payload in chunk_payloads: + chunk = createChunk( + payload["content"], + url, + keywords, + payload["title"], + heading=payload["heading"], + heading_path=payload["heading_path"], + doc_type=payload["doc_type"], + product=payload["product"], + version=payload["version"], + resolved_url=payload["resolved_url"], + content_type=payload["content_type"], + ) + chunkSaveAndTrack(url,chunk) + + + response = http_session.get(url, timeout=60) + soup = BeautifulSoup(response.text, 'html.parser') + + itemtitle = 'Arm '+type+' - '+(blogtitle.get_text() if (blogtitle := soup.find(id='blog-title')) else title) + itemdate = blogdate.get_text() if (blogdate := soup.find(id='blog-date')) else '' + + # Register this learning path as a source + register_source( + site_name='Arm Developer', + license_type='Arm Proprietary', + display_name=itemtitle, + url=url, + keywords=keywords + ) + chunkizeLearningPath(url,itemtitle,keywords) + +def item_is_relevant(item) -> bool: + if not item.get("url"): + return False + match item["type"]: + case "Guide": + return item["title"] in { + "What is SME/SME2?", + "Overview of SME", + "Assembly code", + "Streaming SVE", + "Load and Store", + "Z registers", + "Real world examples", + "ZA storage", + "Predication" + } + + case "Programmer's Guide": + for pattern in { + r"/SME-Overview/", + r"/CME", + r"/matmul-fp32", + r"/lut-gemv-rm-int8", + r"/matmul-int8", + r"/gemv-cm-int8.+/", + r"/109246/.*/Introduction(\?|/The.+/)", + r"/Introduction-to-CME", + r"/Toolchains-and-model-support/(?!Quick-start)", + r"/Memory-access.(?!Implications)", + r"/Performance-monitoring", + r"/Matrix-Multiply-Unit" + }: + if item.get("url") and re.search(pattern, item["url"]): + return True + return False + + case "Blog Post": + title = item.get("title") or "" + author = item.get("author") or "" + if author in {"Zenon_Xiu", "KhalidS"} and title.startswith("Part") and "SME" in title: + return True + if author == "mweidmann" and title.startswith("Introducing the Scalable Matrix Extension"): + return True + return False + + case _: + return False + +def createDeveloperArmComChunks(emit_chunks=True): + search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype=" + content_types = [ + "Blog Post", + "Guide", + "Programmer's Guide" + ] + + search_url = search_base+",".join([quote(x) for x in content_types])+"&q=" + for searchterm in ["SME"]: + pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm) + relevant = 0 + for page in pages: + if item_is_relevant(page): + keywords = list(set( [searchterm] + + [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] + + [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]])) + processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks) + relevant += 1 + print("Keeping "+str(relevant)+" relevant items out of "+str(len(pages))) + +def main(): + skip_discovery = os.getenv("SKIP_DISCOVERY", "").lower() in {"1", "true", "yes"} + + # Ensure intrinsic_chunks folder and files from S3 are present + ensure_intrinsic_chunks_from_s3() + + # Argparse inputs + parser = argparse.ArgumentParser( + description="Generates list of Arm documentation sources for vector database ingestion. " + "Discovers developer.arm.com entries, " + "then updates the sources CSV with any new entries found." + ) + parser.add_argument( + "sources_file", + help="Path to vector-db-sources.csv. This file is read for existing sources " + "(to avoid duplicates) and WILL BE OVERWRITTEN with the combined list " + "of existing + newly discovered sources." + ) + args = parser.parse_args() + sources_file = args.sources_file + + # Load existing sources from vector-db-sources.csv (for deduplication) + load_existing_sources(sources_file) + + # 0) Initialize files + os.makedirs(yaml_dir, exist_ok=True) # create if doesn't exist + details_dir = os.path.dirname(details_file) + if details_dir: + os.makedirs(details_dir, exist_ok=True) + for filename in os.listdir(yaml_dir): + if filename.startswith('chunk_') and filename.endswith('.yaml'): + os.remove(os.path.join(yaml_dir, filename)) + with open(details_file, mode='w', newline='') as file: + writer = csv.writer(file) + writer.writerow(['URL','Date', 'Number of Words', 'Number of Chunks','Chunk IDs']) + + # 0) Obtain full database information: + # a) Learning Paths & Install Guides + if not skip_discovery: + # Developer.Arm.Com + createDeveloperArmComChunks(emit_chunks=False) + + # Save updated sources CSV with all discovered sources + save_sources_csv(sources_file) + print(f"\n=== Source tracking complete ===") + print(f"Total sources in {sources_file}: {get_number_of_sources()}") + +if __name__ == "__main__": + main() diff --git a/embedding-generation/generate_common.py b/embedding-generation/generate_common.py new file mode 100644 index 0000000..2e4efb4 --- /dev/null +++ b/embedding-generation/generate_common.py @@ -0,0 +1,362 @@ +# Copyright © 2025, Arm Limited and Contributors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid +import yaml +import csv +import datetime + +import boto3 +from botocore.exceptions import NoCredentialsError, ClientError +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + + +# Create a session with retry logic for resilient HTTP requests +def create_retry_session(retries=5, backoff_factor=1, status_forcelist=(500, 502, 503, 504)): + """Create a requests session with automatic retry on failures.""" + session = requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + allowed_methods=["HEAD", "GET", "OPTIONS"] + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + +# Global session for all HTTP requests +http_session = create_retry_session() + + +def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks', + s3_bucket='arm-github-copilot-extension', + s3_prefix='embedding_data/intrinsic_chunks/'): + """ + Ensure the local 'intrinsic_chunks' folder exists and is populated with files from S3. + If the folder does not exist, create it and download all files from the S3 prefix. + """ + if not os.path.exists(local_folder): + os.makedirs(local_folder, exist_ok=True) + print(f"Created local folder: {local_folder}") + s3 = boto3.client('s3') + try: + paginator = s3.get_paginator('list_objects_v2') + for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix): + for obj in page.get('Contents', []): + key = obj['Key'] + if key.endswith('/'): + continue # skip folders + filename = os.path.basename(key) + local_path = os.path.join(local_folder, filename) + print(f"Downloading {key} to {local_path}") + s3.download_file(s3_bucket, key, local_path) + except NoCredentialsError: + print("AWS credentials not found. Please configure them.") + except ClientError as e: + print(f"S3 ClientError: {e}") + except Exception as e: + print(f"Unexpected error: {e}") + else: + print(f"Folder '{local_folder}' already exists. Skipping S3 download.") + + +yaml_dir = os.getenv('YAML_OUTPUT_DIR', 'yaml_data') +details_file = os.getenv('CHUNK_DETAILS_FILE', 'info/chunk_details.csv') + +# Global tracking for vector-db-sources.csv +# Set of URLs already in the CSV (for deduplication) +known_source_urls = set() +# List of all source entries (including existing and new) +# Each entry is a dict: {site_name, license_type, display_name, url, keywords} +all_sources = [] + + +def get_number_of_sources(): + global all_sources + return len(all_sources) + + +def load_existing_sources(csv_file): + """ + Load existing sources from vector-db-sources.csv into memory. + Populates known_source_urls set and all_sources list. + """ + global known_source_urls, all_sources + known_source_urls = set() + all_sources = [] + + if not os.path.exists(csv_file): + print(f"Sources file '{csv_file}' does not exist. Starting fresh.") + return + + with open(csv_file, 'r', newline='', encoding='utf-8') as file: + reader = csv.DictReader(file) + for row in reader: + url = row.get('URL', '').strip() + if url: + known_source_urls.add(url) + all_sources.append({ + 'site_name': row.get('Site Name', ''), + 'license_type': row.get('License Type', ''), + 'display_name': row.get('Display Name', ''), + 'url': url, + 'keywords': row.get('Keywords', '') + }) + + print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'") + + +def register_source(site_name, license_type, display_name, url, keywords): + """ + Register a new source URL. If the URL already exists, skip it. + Returns True if the source was added, False if it was a duplicate. + """ + global known_source_urls, all_sources + + # Normalize URL for comparison + url = url.strip() + + if url in known_source_urls: + return False + + known_source_urls.add(url) + source_entry = { + 'site_name': site_name, + 'license_type': license_type, + 'display_name': display_name, + 'url': url, + 'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords) + } + + # Keep discovered sources grouped with their existing site section instead of + # appending them to the very end of the CSV and fragmenting that block. + insert_at = None + for index, existing_source in enumerate(all_sources): + if existing_source.get('site_name') == site_name: + insert_at = index + 1 + + if insert_at is None: + all_sources.append(source_entry) + else: + all_sources.insert(insert_at, source_entry) + + print(f"[NEW SOURCE] {display_name}: {url}") + return True + + +def save_sources_csv(csv_file): + """ + Write all sources (existing + new) to vector-db-sources.csv. + """ + with open(csv_file, 'w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']) + for source in all_sources: + writer.writerow([ + source['site_name'], + source['license_type'], + source['display_name'], + source['url'], + source['keywords'] + ]) + + print(f"Saved {len(all_sources)} sources to '{csv_file}'") + +class Chunk: + def __init__( + self, + title, + url, + uuid, + keywords, + content, + heading="", + heading_path=None, + doc_type="", + product="", + version="", + resolved_url="", + content_type="", + ): + self.title = title + self.url = url + self.uuid = uuid + self.content = content + self.heading = heading + self.heading_path = heading_path or [] + self.doc_type = doc_type + self.product = product + self.version = version + self.resolved_url = resolved_url + self.content_type = content_type + + # Translate keyword list into comma-separated string, and add similar words to keywords. + self.keywords = self.formatKeywords(keywords) + + def formatKeywords(self, keywords): + """Format keywords list into a lowercase, comma-separated string.""" + return ', '.join(k.strip() for k in keywords).lower() + + # Used to dump into a yaml file without difficulty + def toDict(self): + return { + 'title': self.title, + 'url': self.url, + 'uuid': self.uuid, + 'keywords': self.keywords, + 'content': self.content, + 'heading': self.heading, + 'heading_path': self.heading_path, + 'doc_type': self.doc_type, + 'product': self.product, + 'version': self.version, + 'resolved_url': self.resolved_url, + 'content_type': self.content_type, + } + + def __repr__(self): + return f"Chunk(title={self.title}, url={self.url}, uuid={self.uuid}, heading={self.heading})" + + +def fetch_with_logging(url): + try: + response = http_session.get(url, timeout=60) + response.raise_for_status() + return response + except requests.exceptions.HTTPError as http_err: + print(f"HTTP error occurred: {http_err}") + with open('info/errors.csv', 'a', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow([url, str(http_err)]) + return None + except Exception as err: + print(f"Other error occurred: {err}") + with open('info/errors.csv', 'a', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow([url, str(err)]) + return None + except Exception as err: + print(f"Other error occurred: {err}") + with open('info/errors.csv', 'a', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow([url,str(err)]) + return False + + +def createChunk( + text_snippet, + WEBSITE_url, + keywords, + title, + heading="", + heading_path=None, + doc_type="", + product="", + version="", + resolved_url="", + content_type="", +): + chunk = Chunk( + title = title, + url = WEBSITE_url, + uuid = str(uuid.uuid4()), + keywords = keywords, + content = text_snippet, + heading = heading, + heading_path = heading_path or [], + doc_type = doc_type, + product = product, + version = version, + resolved_url = resolved_url, + content_type = content_type, + ) + + return chunk + + +def printChunks(chunks): + for chunk_dict in chunks: + print('='*100) + print("Title:", chunk_dict['title']) + print("Keywords:", chunk_dict['keywords']) + print("URL:", chunk_dict['url']) + print("Unique ID:", chunk_dict['uuid']) + print("Content:", chunk_dict['content']) + print('='*100) + + +def chunkSaveAndTrack(url,chunk): + + def addNewRow(current_date,chunk_words,chunk_id): + return [url,current_date,chunk_words,'1',chunk_id] + + def addToExistingRow(row,chunk_words,chunk_id): + url = row[0] # same URL + date = row[1] # same date + words = str(int(row[2]) + chunk_words) # update words + chunks = row[3] = str(int(row[3]) + 1) # update number of chunks + ids = row[4]+ f", {chunk_id}" # update chunk IDs + return [url,date,words,chunks,ids] + + + def recordChunk(): + current_date = datetime.date.today().strftime('%Y-%m-%d') + chunk_words = len(chunk.content.split()) + chunk_id = f'chunk_{chunk.uuid}' + + new_rows = [] + + with open(details_file, mode='r', newline='', encoding='utf-8') as file: + csv_reader = csv.reader(file) + try: + headers = next(csv_reader) + new_rows.append(headers) # keep in memory + except StopIteration: + pass + + url_found = False # Track if the URL is found in any row + + # Loop through all the rows after the header + for row in csv_reader: + if row[0] == url: + new_rows.append(addToExistingRow(row, chunk_words, chunk_id)) # Modify and append the row + url_found = True # Mark that the URL was found + else: + new_rows.append(row) # Append the row without modification + + # If the URL was not found, append a new row + if not url_found: + new_rows.append(addNewRow(current_date, chunk_words, chunk_id)) + + + # Overwrite csv with new info + with open(details_file, mode='w', newline='') as file: + csv_writer = csv.writer(file, delimiter=',') + csv_writer.writerows(new_rows) + + # Save chunk + file_name = f"{yaml_dir}/chunk_{chunk.uuid}.yaml" + with open(file_name, 'w') as file: + yaml.dump(chunk.toDict(), file, default_flow_style=False, sort_keys=False) + + # Record chunk + recordChunk() + print(f"{file_name} === {chunk.title}") From 5d632f5417185763629388d75dab968ea841cb35 Mon Sep 17 00:00:00 2001 From: Andrew Pickard Date: Thu, 18 Jun 2026 17:24:41 +0100 Subject: [PATCH 4/6] Duplicated csv field size limit into generate-common.py --- embedding-generation/generate-chunks.py | 3 --- embedding-generation/generate_common.py | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index 0ef5401..3edc10a 100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -69,14 +69,11 @@ # multi-megabyte HTML document for every source row. ecosystem_dashboard_entries = None - # Increase the file size limit, which defaults to '131,072' csv.field_size_limit(10**9) #1,000,000,000 (1 billion), smaller than 64-bit space but avoids 'python overflowerror' - - def build_ecosystem_dashboard_entries(): """Load and cache package-level snippets from the ecosystem dashboard.""" global ecosystem_dashboard_entries diff --git a/embedding-generation/generate_common.py b/embedding-generation/generate_common.py index 2e4efb4..4c6c417 100644 --- a/embedding-generation/generate_common.py +++ b/embedding-generation/generate_common.py @@ -88,6 +88,9 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks', # Each entry is a dict: {site_name, license_type, display_name, url, keywords} all_sources = [] +# Increase the file size limit, which defaults to '131,072' +csv.field_size_limit(10**9) #1,000,000,000 (1 billion), smaller than 64-bit space but avoids 'python overflowerror' + def get_number_of_sources(): global all_sources From 285523e64c805b011b7136da438148fee4413220 Mon Sep 17 00:00:00 2001 From: Andrew Pickard Date: Thu, 18 Jun 2026 17:38:42 +0100 Subject: [PATCH 5/6] Updated vector-db-sources.csv with developer.arm.com items --- embedding-generation/vector-db-sources.csv | 80 ++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/embedding-generation/vector-db-sources.csv b/embedding-generation/vector-db-sources.csv index 5c5e6ea..2cb715b 100755 --- a/embedding-generation/vector-db-sources.csv +++ b/embedding-generation/vector-db-sources.csv @@ -1840,3 +1840,83 @@ Learning Paths,CC4.0,Learning Path - Deploy multi-network device meshes using De Learning Paths,CC4.0,Learning Path - Implement post-quantum cryptography on Arm Cortex-M4,https://learn.arm.com/learning-paths/embedded-and-microcontrollers/pqc_pqm4/,Security; Linux; macOS; C; Python; GCC; stlink; QEMU Learning Paths,CC4.0,Learning Path - Device-to-Device communication with Device Connect,https://learn.arm.com/learning-paths/embedded-and-microcontrollers/device-connect-d2d/,Libraries; Linux; macOS; Windows; Python Learning Paths,CC4.0,Learning Path - Create and deploy a custom Topo Template,https://learn.arm.com/learning-paths/cross-platform/create-your-own-topo-templates/,Containers and Virtualization; Linux; macOS; Windows; Topo; Docker; SSH +Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME and SME2,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME context save restore,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-context-save-restore?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME ZA storage,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-ZA-storage?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - If SME and SME2 are supported,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2/If-SME-and-SME2-are-supported?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - How to run an SME application,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/How-to-run-an-SME-application?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Guide - What is SME/SME2?,https://developer.arm.com/documentation/109974/0100/What-is-SME-SME2-?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Guide - Overview of SME,https://developer.arm.com/documentation/109974/0100/Overview-of-SME?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME2 multi-vector predication,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2/SME2-multi-vector-predication?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Use multi-vector SME loads for better efficiency,https://developer.arm.com/documentation/110636/0100/Memory-access/Use-multi-vector-SME-loads-for-better-efficiency?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Blog Post - Part 1: Arm Scalable Matrix Extension (SME) Introduction,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction,SME +Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME2 lookup table,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2/SME2-lookup-table?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME2 multi-vector operands,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME2-multi-vector-operands?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Streaming SVE mode,https://developer.arm.com/documentation/109246/0101/SME-Overview/Streaming-SVE-mode?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Compiler options and pragmas,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Compiler-support/Compiler-options-and-pragmas?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Guide - Assembly code,https://developer.arm.com/documentation/109974/0100/Basic-SME-example/Assembly-code?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - ZA array vector access and ZA tile mapping,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-ZA-storage/ZA-array-vector-access-and-ZA-tile-mapping?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Compiler support,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Compiler-support?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_r function overview,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-r-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Controlling the use of streaming mode,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Controlling-the-use-of-streaming-mode?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Debug tools,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Debug-tools?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Managing streaming mode across function boundaries,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Controlling-the-use-of-streaming-mode/Managing-streaming-mode-across-function-boundaries?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function overview,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-l-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Guide - Streaming SVE,https://developer.arm.com/documentation/109974/0100/Streaming-SVE?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Calling conventions,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_opt function overview,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/lut-gemv-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Controlling the use of ZA storage,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Controlling-the-use-of-ZA-storage?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Guide - Load and Store,https://developer.arm.com/documentation/109974/0100/Introduction-to-SME-instructions/Load-and-Store?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt function overview,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/matmul-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Blog Post - Introducing the Scalable Matrix Extension for the Armv9-A Architecture,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/scalable-matrix-extension-armv9-a-architecture,SME; A-Profile +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_fp32: Single precision matrix-by-matrix multiplication,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_int8: 8-bit integer to 32-bit integer matrix-by-matrix multiplication,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_rm_int8: Compressed 8-bit integer to 32-bit integer matrix-by-vector multiplication,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the gemv_cm_int8 algorithm,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/Overview-of-the-gemv-cm-int8-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the matmul_fp32 algorithm,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/Overview-of-the-matmul-fp32-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the lut_gemv_rm_int8 algorithm,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/Overview-of-the-lut-gemv-rm-int8-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Blog Post - Part 2: Arm Scalable Matrix Extension (SME) Instructions,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2,SME +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt function overview,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/matmul-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Preparation for entering and exiting streaming mode,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Preparation-for-entering-and-exiting-streaming-mode?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the matmul_int8 algorithm,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/Overview-of-the-matmul-int8-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,"Arm Blog Post - Part 3: Matrix-matrix multiplication. Neon, SVE, and SME compared",https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/matrix-matrix-multiplication-neon-sve-and-sme-compared,SME +Arm Developer,Arm Proprietary,Arm Guide - Z registers,https://developer.arm.com/documentation/109974/0100/ZA-storage/Z-registers?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Blog Post - Part4: Arm SME2 Introduction,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/part4-arm-sme2-introduction,SME; SVE; SIMD ISAs +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Introduction,https://developer.arm.com/documentation/109246/0101/Introduction?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - gemv_opt function overview,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/gemv-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l code,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/preprocess-l-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_r code,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-r-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Guide - Real world examples,https://developer.arm.com/documentation/109974/0100/Why-are-matrices-used-/Real-world-examples?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function overview,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/preprocess-l-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Streaming SVE mode and ZA storage,https://developer.arm.com/documentation/109246/0101/Introduction/The-Scalable-Matrix-Extensions/Streaming-SVE-mode-and-ZA-storage?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l code,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-l-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Memory access,https://developer.arm.com/documentation/110636/0100/Memory-access?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt code,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/matmul-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt code,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/matmul-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_r function details,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-r-function-details?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Introduction to CME,https://developer.arm.com/documentation/110636/0100/Introduction-to-CME?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Implications for programmers,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Implications-for-programmers?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Avoid access conflicts in 1KB regions,https://developer.arm.com/documentation/110636/0100/Memory-access/Avoid-access-conflicts-in-1KB-regions?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Guide - ZA storage,https://developer.arm.com/documentation/109974/0100/ZA-storage?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Performance monitoring,https://developer.arm.com/documentation/110636/0100/Performance-monitoring?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function details,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-l-function-details?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_opt function details,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/lut-gemv-opt-function-details?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME components,https://developer.arm.com/documentation/110636/0100/Introduction-to-CME/CME-components?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME and the PMU,https://developer.arm.com/documentation/110636/0100/Performance-monitoring/CME-and-the-PMU?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_opt code,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/lut-gemv-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - gemv_opt code,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/gemv-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Guide - Predication,https://developer.arm.com/documentation/109974/0100/Predication?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt function details,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/matmul-opt-function-details?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function details,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/preprocess-l-function-details?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - gemv_opt function details,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/gemv-opt-function-details?lang=en,SME; A-Profile; Software development; Armv9-A +Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME and the SPE,https://developer.arm.com/documentation/110636/0100/Performance-monitoring/CME-and-the-SPE?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Implications for programmers,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution/Implications-for-programmers?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Fast Context Switching Instructions,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Fast-Context-Switching-Instructions?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME system configurations,https://developer.arm.com/documentation/110636/0100/CME-system-configurations?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Example 1: efficient loop,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution/Example-1--efficient-loop?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Example 2: inefficient loop,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution/Example-2--inefficient-loop?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Implications for programmers,https://developer.arm.com/documentation/110636/0100/Matrix-Multiply-Unit/Implications-for-programmers?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Matrix Multiply Unit,https://developer.arm.com/documentation/110636/0100/Matrix-Multiply-Unit?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME instruction execution,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Multi-CME systems,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Multi-CME-systems?lang=en,SME; Software development +Arm Developer,Arm Proprietary,Arm Programmer's Guide - Single-CME systems,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Single-CME-systems?lang=en,SME; Software development From 0f511948543950589258f7667c1a8844d4973299 Mon Sep 17 00:00:00 2001 From: Andrew Pickard Date: Thu, 18 Jun 2026 18:48:49 +0100 Subject: [PATCH 6/6] Fixed unit tests to reflect the functions that have moved from generate-chunks.py into generate_common.py. --- embedding-generation/tests/conftest.py | 21 ++++ .../tests/test_generate_chunks.py | 98 +++++++++---------- 2 files changed, 70 insertions(+), 49 deletions(-) diff --git a/embedding-generation/tests/conftest.py b/embedding-generation/tests/conftest.py index 22f243b..2e2a1e5 100644 --- a/embedding-generation/tests/conftest.py +++ b/embedding-generation/tests/conftest.py @@ -40,9 +40,19 @@ def _load_generate_chunks(): spec.loader.exec_module(module) return module +def _load_generate_common(): + """Load generate_common.py module.""" + spec = importlib.util.spec_from_file_location( + "generate_common", + os.path.join(_PARENT_DIR, "generate_common.py") + ) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module # Load module once at conftest import time _generate_chunks_module = _load_generate_chunks() +_generate_common_module = _load_generate_common() @pytest.fixture @@ -55,3 +65,14 @@ def gc(): # Clean up after test _generate_chunks_module.known_source_urls = set() _generate_chunks_module.all_sources = [] + +@pytest.fixture +def gcom(): + """Provide the generate_common module with reset global state.""" + # Reset global state before each test + _generate_common_module.known_source_urls = set() + _generate_common_module.all_sources = [] + yield _generate_common_module + # Clean up after test + _generate_common_module.known_source_urls = set() + _generate_common_module.all_sources = [] diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py index 9812634..25ddee7 100644 --- a/embedding-generation/tests/test_generate_chunks.py +++ b/embedding-generation/tests/test_generate_chunks.py @@ -259,9 +259,9 @@ class TestSourceTracking: known_source_urls and all_sources before and after each test. """ - def test_register_source_new(self, gc): + def test_register_source_new(self, gcom): """Test registering a new source.""" - result = gc.register_source( + result = gcom.register_source( site_name="Test Site", license_type="MIT", display_name="Test Display", @@ -270,14 +270,14 @@ def test_register_source_new(self, gc): ) assert result is True - assert "https://example.com/test" in gc.known_source_urls - assert len(gc.all_sources) == 1 - assert gc.all_sources[0]['url'] == "https://example.com/test" - assert gc.all_sources[0]['keywords'] == "test; example" + assert "https://example.com/test" in gcom.known_source_urls + assert len(gcom.all_sources) == 1 + assert gcom.all_sources[0]['url'] == "https://example.com/test" + assert gcom.all_sources[0]['keywords'] == "test; example" - def test_register_source_duplicate(self, gc): + def test_register_source_duplicate(self, gcom): """Test that duplicate URLs are rejected.""" - gc.register_source( + gcom.register_source( site_name="Test Site", license_type="MIT", display_name="Test Display", @@ -285,7 +285,7 @@ def test_register_source_duplicate(self, gc): keywords="test" ) - result = gc.register_source( + result = gcom.register_source( site_name="Test Site 2", license_type="Apache", display_name="Different Display", @@ -294,11 +294,11 @@ def test_register_source_duplicate(self, gc): ) assert result is False - assert len(gc.all_sources) == 1 + assert len(gcom.all_sources) == 1 - def test_register_source_inserts_after_matching_site_group(self, gc): + def test_register_source_inserts_after_matching_site_group(self, gcom): """Test that new sources stay grouped with existing sources from the same site.""" - gc.all_sources = [ + gcom.all_sources = [ { 'site_name': 'Google Cloud', 'license_type': 'CC4.0', @@ -328,9 +328,9 @@ def test_register_source_inserts_after_matching_site_group(self, gc): 'keywords': 'a1' }, ] - gc.known_source_urls = {source['url'] for source in gc.all_sources} + gcom.known_source_urls = {source['url'] for source in gcom.all_sources} - result = gc.register_source( + result = gcom.register_source( site_name="Ecosystem Dashboard", license_type="Arm Proprietary", display_name="Dashboard 3", @@ -339,7 +339,7 @@ def test_register_source_inserts_after_matching_site_group(self, gc): ) assert result is True - assert [source['display_name'] for source in gc.all_sources] == [ + assert [source['display_name'] for source in gcom.all_sources] == [ 'Google 1', 'Dashboard 1', 'Dashboard 2', @@ -347,9 +347,9 @@ def test_register_source_inserts_after_matching_site_group(self, gc): 'Graviton 1', ] - def test_register_source_url_normalization(self, gc): + def test_register_source_url_normalization(self, gcom): """Test that URLs are stripped of whitespace.""" - gc.register_source( + gcom.register_source( site_name="Test", license_type="MIT", display_name="Test", @@ -357,11 +357,11 @@ def test_register_source_url_normalization(self, gc): keywords="test" ) - assert "https://example.com/test" in gc.known_source_urls + assert "https://example.com/test" in gcom.known_source_urls - def test_register_source_string_keywords(self, gc): + def test_register_source_string_keywords(self, gcom): """Test that string keywords are preserved as-is.""" - gc.register_source( + gcom.register_source( site_name="Test", license_type="MIT", display_name="Test", @@ -369,16 +369,16 @@ def test_register_source_string_keywords(self, gc): keywords="already; formatted; string" ) - assert gc.all_sources[0]['keywords'] == "already; formatted; string" + assert gcom.all_sources[0]['keywords'] == "already; formatted; string" - def test_load_existing_sources_file_not_exists(self, gc, tmp_path): + def test_load_existing_sources_file_not_exists(self, gcom, tmp_path): """Test loading from non-existent file.""" - gc.load_existing_sources(str(tmp_path / "nonexistent.csv")) + gcom.load_existing_sources(str(tmp_path / "nonexistent.csv")) - assert len(gc.all_sources) == 0 - assert len(gc.known_source_urls) == 0 + assert len(gcom.all_sources) == 0 + assert len(gcom.known_source_urls) == 0 - def test_load_existing_sources(self, gc, tmp_path): + def test_load_existing_sources(self, gcom, tmp_path): """Test loading sources from CSV file.""" csv_file = tmp_path / "sources.csv" csv_file.write_text( @@ -387,17 +387,17 @@ def test_load_existing_sources(self, gc, tmp_path): "Another Site,Apache,Another Display,https://example.com/2,key3\n" ) - gc.load_existing_sources(str(csv_file)) + gcom.load_existing_sources(str(csv_file)) - assert len(gc.all_sources) == 2 - assert "https://example.com/1" in gc.known_source_urls - assert "https://example.com/2" in gc.known_source_urls - assert gc.all_sources[0]['site_name'] == "Test Site" - assert gc.all_sources[1]['display_name'] == "Another Display" + assert len(gcom.all_sources) == 2 + assert "https://example.com/1" in gcom.known_source_urls + assert "https://example.com/2" in gcom.known_source_urls + assert gcom.all_sources[0]['site_name'] == "Test Site" + assert gcom.all_sources[1]['display_name'] == "Another Display" - def test_save_sources_csv(self, gc, tmp_path): + def test_save_sources_csv(self, gcom, tmp_path): """Test saving sources to CSV file.""" - gc.all_sources = [ + gcom.all_sources = [ { 'site_name': 'Site 1', 'license_type': 'MIT', @@ -415,7 +415,7 @@ def test_save_sources_csv(self, gc, tmp_path): ] csv_file = tmp_path / "output.csv" - gc.save_sources_csv(str(csv_file)) + gcom.save_sources_csv(str(csv_file)) # Read and verify with open(csv_file, 'r') as f: @@ -426,7 +426,7 @@ def test_save_sources_csv(self, gc, tmp_path): assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2'] assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3'] - def test_load_and_save_roundtrip(self, gc, tmp_path): + def test_load_and_save_roundtrip(self, gcom, tmp_path): """Test that loading and saving preserves data.""" csv_file = tmp_path / "sources.csv" original_content = ( @@ -436,10 +436,10 @@ def test_load_and_save_roundtrip(self, gc, tmp_path): csv_file.write_text(original_content) # Load - gc.load_existing_sources(str(csv_file)) + gcom.load_existing_sources(str(csv_file)) # Add a new source - gc.register_source( + gcom.register_source( site_name="New Site", license_type="Apache", display_name="New Display", @@ -448,16 +448,16 @@ def test_load_and_save_roundtrip(self, gc, tmp_path): ) # Save - gc.save_sources_csv(str(csv_file)) + gcom.save_sources_csv(str(csv_file)) # Verify - gc.known_source_urls = set() - gc.all_sources = [] - gc.load_existing_sources(str(csv_file)) + gcom.known_source_urls = set() + gcom.all_sources = [] + gcom.load_existing_sources(str(csv_file)) - assert len(gc.all_sources) == 2 - assert "https://example.com/test" in gc.known_source_urls - assert "https://new.example.com" in gc.known_source_urls + assert len(gcom.all_sources) == 2 + assert "https://example.com/test" in gcom.known_source_urls + assert "https://new.example.com" in gcom.known_source_urls class TestGetMarkdownGitHubURLsFromPage: @@ -873,18 +873,18 @@ def fake_fetch(url): class TestCreateRetrySession: """Tests for create_retry_session function.""" - def test_creates_session(self, gc): + def test_creates_session(self, gcom): """Test that a session is created.""" - session = gc.create_retry_session() + session = gcom.create_retry_session() assert session is not None # Check that adapters are mounted assert 'http://' in session.adapters assert 'https://' in session.adapters - def test_custom_retry_settings(self, gc): + def test_custom_retry_settings(self, gcom): """Test session with custom retry settings.""" - session = gc.create_retry_session( + session = gcom.create_retry_session( retries=3, backoff_factor=2, status_forcelist=(500, 503)