From d413edb3f861041bf7ae89605af7eaa6fdb5f823 Mon Sep 17 00:00:00 2001
From: Andrew Pickard <andrew.pickard@arm.com>
Date: Mon, 18 May 2026 17:17:53 +0100
Subject: [PATCH 1/6] Add the ability to search developer.arm.com and include
 the search results as embeddings

---
 embedding-generation/Dockerfile         |   3 +-
 embedding-generation/generate-chunks.py | 199 +++++++++++++++++++++++-
 embedding-generation/requirements.txt   |   1 +
 3 files changed, 200 insertions(+), 3 deletions(-)

diff --git a/embedding-generation/Dockerfile b/embedding-generation/Dockerfile
index 4909565..7c2dc3e 100644
--- a/embedding-generation/Dockerfile
+++ b/embedding-generation/Dockerfile
@@ -47,7 +47,8 @@ COPY requirements.txt .
 COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks
 
 # Install Python dependencies (force CPU-only torch)
-RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt
+RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt && \
+    playwright install --with-deps chromium
 
 # Pre-download the embedding model so local/offline loads succeed later in the build.
 RUN python3 -c "from sentence_transformers import SentenceTransformer; import os; SentenceTransformer(os.environ['SENTENCE_TRANSFORMER_MODEL'], cache_folder=os.environ['SENTENCE_TRANSFORMERS_HOME'])"
diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index fa29a6a..79d36ba 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -27,7 +27,11 @@
 import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
-from urllib.parse import parse_qs, urlparse
+from urllib.parse import parse_qs, urlparse, quote
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+from playwright.async_api import async_playwright
+import asyncio
 
 from document_chunking import (
     arm_service_url_to_developer_url,
@@ -526,6 +530,194 @@ def htmlToMarkdown(html_string):
     '''
 
 
+@dataclass
+class CapturedSearchRequest:
+    url: str
+    method: str
+    headers: Dict[str, str]
+    post_data: Optional[str]
+    response_json: Dict[str, Any]
+
+async def capture_DeveloperArmComSearch(page_url: str) -> CapturedSearchRequest:
+    apicount = 0
+    def is_search_response(resp) -> bool:
+        nonlocal apicount
+        # print("Testing url "+str(resp.request.method.upper())+" "+str(resp.url))
+        if "coveo.com/rest/search/v2" in resp.url and "querySuggest" not in resp.url:
+            apicount += 1
+            return (
+                resp.request.method.upper() in {"GET", "POST"}
+                and resp.status == 200
+                and apicount == 2
+            )
+        else:
+            return False
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+
+        async with page.expect_response(is_search_response, timeout=30_000) as response_info:
+            await page.goto(page_url, wait_until="domcontentloaded")
+
+        response = await response_info.value
+        data = await response.json()
+        await browser.close()
+
+        if not(isinstance(data, dict) and "results" in data):
+            raise RuntimeError("No search API response was captured. ")
+        else:
+            return CapturedSearchRequest(
+                url=response.url,
+                method=response.request.method,
+                headers=dict(response.request.headers),
+                post_data=response.request.post_data,
+                response_json=data,
+            )
+
+def replay_DeveloperArmComSearch(
+    captured: CapturedSearchRequest,
+    query: str,
+    first_result: int = 0,
+    number_of_results: int = 48,
+) -> Dict[str, Any]:
+
+    def _merge_headers(base_headers: Dict[str, str]) -> Dict[str, str]:
+        keep = {}
+        drop = {"host", "content-length", "accept-encoding", "connection", "origin", "referer"}
+        for k, v in base_headers.items():
+            if k.lower() not in drop:
+                keep[k] = v
+        keep.setdefault("accept", "application/json, text/plain, */*")
+        keep.setdefault("content-type", "application/json")
+        keep.setdefault("user-agent", "Mozilla/5.0")
+        return keep
+
+    if not captured.post_data:
+        raise RuntimeError("Captured request had no POST body to replay.")
+
+    body = json.loads(captured.post_data)
+    body["q"] = query
+    body["firstResult"] = first_result
+    body["numberOfResults"] = number_of_results
+    headers = _merge_headers(captured.headers)
+
+    r = requests.post(captured.url, headers=headers, json=body, timeout=60)
+    r.raise_for_status()
+    return r.json()
+
+def getDeveloperArmComSearchResults(searchterm: str, searchurl: str, maxitems: int = 20000):
+
+    def extract_result(item: Dict[str, Any]) -> Dict[str, Any]:
+        return {
+            "title": item.get("title") or item.get("raw", {}).get("title"),
+            "url": item.get("clickUri") or item.get("uri") or item.get("url"),
+            "type": item.get("raw", {}).get("navigationhierarchiescontenttype"),
+            "products": item.get("raw", {}).get("navigationhierarchiesproducts"),
+            "objecttype": item.get("raw", {}).get("objecttype"),
+            "keywords": item.get("raw", {}).get("navigationhierarchiestopics")
+        }
+
+    print('Searching developer.arm.com for "'+searchterm+'"')
+    captured = asyncio.run(capture_DeveloperArmComSearch(searchurl))
+
+    all_rows = []
+    finished = False
+    page_size = 48
+    start = 0
+    while (len(all_rows) < maxitems) and not finished:
+        payload = replay_DeveloperArmComSearch(
+            captured,
+            query=searchterm,
+            first_result=start,
+            number_of_results=page_size,
+        )
+
+        items = [extract_result(x) for x in payload["results"]]
+        all_rows.extend(items)
+        finished = len(payload["results"]) < page_size
+        start += page_size
+    print("Found "+str(len(all_rows))+" results")
+    return all_rows
+
+def processDeveloperArmCom(url, title, type, keywords, emit_chunks=True):
+
+    def chunkizeLearningPath(url, title, keywords):
+        if not emit_chunks:
+            return
+
+        response = fetch_with_logging(url)
+        if response is None:
+            return
+        parsed_document = parse_document_content(
+            source_url=url,
+            resolved_url=url,
+            response_content=response.content,
+            content_type=response.headers.get("content-type", "text/html"),
+            fallback_title=title,
+        )
+        chunk_payloads = chunk_parsed_document(
+            parsed_document,
+            doc_type=type,
+            keywords=keywords,
+        )
+
+        # 5) Create chunks for each snippet by adding metadata
+        for payload in chunk_payloads:
+            chunk = createChunk(
+                payload["content"],
+                url,
+                keywords,
+                payload["title"],
+                heading=payload["heading"],
+                heading_path=payload["heading_path"],
+                doc_type=payload["doc_type"],
+                product=payload["product"],
+                version=payload["version"],
+                resolved_url=payload["resolved_url"],
+                content_type=payload["content_type"],
+            )
+            chunkSaveAndTrack(url,chunk)
+
+
+    response = http_session.get(url, timeout=60)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    itemtitle = 'Arm '+type+' - '+(blogtitle.get_text() if (blogtitle := soup.find(id='blog-title')) else title)
+    itemdate = blogdate.get_text() if (blogdate := soup.find(id='blog-date')) else ''
+
+    # Register this learning path as a source
+    register_source(
+        site_name='Arm Developer',
+        license_type='Copyright Arm',
+        display_name=itemtitle,
+        url=url,
+        keywords=keywords
+    )
+    chunkizeLearningPath(url,itemtitle,keywords)
+
+def createDeveloperArmComChunks(emit_chunks=True):
+    search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype="
+    content_types = [
+        "Blog Post"
+        "Developer Guide",
+        "Guide",
+        "Programmer's Guide",
+        "Migration Guide",
+        "Getting Started Guide"
+    ]
+
+    search_url = search_base+",".join([quote(x) for x in content_types])+"&q="
+    for searchterm in ["SME"]:
+        pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm)
+        for i, page in enumerate(pages):
+            keywords =  list(set( [searchterm] +
+                                  [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] +
+                                  [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]]))
+            processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks)
+
+
+
 def processLearningPath(url, type, emit_chunks=True):
     github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/production/content"
     site_link = "https://learn.arm.com"
@@ -1080,7 +1272,10 @@ def main():
         # b) Ecosystem Dashboard
         createEcosystemDashboardChunks(emit_chunks=False)
 
-    # c) Intrinsics
+        # c) Developer.Arm.Com
+        createDeveloperArmComChunks(emit_chunks=False)
+
+    # d) Intrinsics
     #createIntrinsicsDatabaseChunks()
 
     # 1) Get URLs and details from CSV
diff --git a/embedding-generation/requirements.txt b/embedding-generation/requirements.txt
index f6846d7..884dec2 100644
--- a/embedding-generation/requirements.txt
+++ b/embedding-generation/requirements.txt
@@ -6,3 +6,4 @@ boto3
 sentence-transformers
 pypdf
 rank-bm25
+playwright

From cc76fb48f186f6513df4dd28c490a3c41c74a7fd Mon Sep 17 00:00:00 2001
From: Andrew Pickard <andrew.pickard@arm.com>
Date: Fri, 29 May 2026 15:39:57 +0100
Subject: [PATCH 2/6] Only include developer.arm.com content that is relevant
 for SME

---
 embedding-generation/generate-chunks.py | 60 ++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 10 deletions(-)

diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index 79d36ba..4bc1351 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -613,6 +613,7 @@ def extract_result(item: Dict[str, Any]) -> Dict[str, Any]:
             "title": item.get("title") or item.get("raw", {}).get("title"),
             "url": item.get("clickUri") or item.get("uri") or item.get("url"),
             "type": item.get("raw", {}).get("navigationhierarchiescontenttype"),
+            "author": item.get("raw", {}).get("author") or item.get("raw", {}).get("sysauthor"),
             "products": item.get("raw", {}).get("navigationhierarchiesproducts"),
             "objecttype": item.get("raw", {}).get("objecttype"),
             "keywords": item.get("raw", {}).get("navigationhierarchiestopics")
@@ -696,25 +697,64 @@ def chunkizeLearningPath(url, title, keywords):
     )
     chunkizeLearningPath(url,itemtitle,keywords)
 
+def item_is_relevant(item) -> bool:
+    match item["type"]:
+        case "Guide":
+            return item["title"] in {
+                    "What is SME/SME2?",
+                    "Overview of SME",
+                    "Assembly code",
+                    "Streaming SVE",
+                    "Load and Store",
+                    "Z registers",
+                    "Real world examples",
+                    "ZA storage",
+                    "Predication"
+            }
+
+        case "Programmer's Guide":
+            for pattern in {
+                r"/SME-Overview/",
+                r"/CME",
+                r"/matmul-fp32",
+                r"/lut-gemv-rm-int8",
+                r"/matmul-int8",
+                r"/gemv-cm-int8.+/",
+                r"/109246/.*/Introduction(\?|/The.+/)",
+                r"/Introduction-to-CME",
+                r"/Toolchains-and-model-support/(?!Quick-start)",
+                r"/Memory-access.(?!Implications)",
+                r"/Performance-monitoring",
+                r"/Matrix-Multiply-Unit"
+            }:
+                if re.search(pattern, item["url"]):
+                    return True
+            return False
+
+        case "Blog Post":
+            if item["author"] in {"Zenon_Xiu","KhalidS"} and item["title"][0:4] == "Part" and "SME" in item["title"]:
+                return True
+            if item["author"] == "mweidmann" and item["title"][0:41] == "Introducing the Scalable Matrix Extension":
+                return True
+            return False
+
 def createDeveloperArmComChunks(emit_chunks=True):
     search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype="
     content_types = [
-        "Blog Post"
-        "Developer Guide",
+        "Blog Post",
         "Guide",
-        "Programmer's Guide",
-        "Migration Guide",
-        "Getting Started Guide"
+        "Programmer's Guide"
     ]
 
     search_url = search_base+",".join([quote(x) for x in content_types])+"&q="
     for searchterm in ["SME"]:
         pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm)
-        for i, page in enumerate(pages):
-            keywords =  list(set( [searchterm] +
-                                  [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] +
-                                  [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]]))
-            processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks)
+        for page in pages:
+            if item_is_relevant(page):
+                keywords =  list(set( [searchterm] +
+                                    [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] +
+                                    [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]]))
+                processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks)
 
 
 

From 0c1774e955ff5d308458053bd14158f56644129c Mon Sep 17 00:00:00 2001
From: Andrew Pickard <andrew.pickard@arm.com>
Date: Wed, 17 Jun 2026 16:22:44 +0100
Subject: [PATCH 3/6] Move developer.arm.com functionality into a separate
 file, generate-vectors.py, with common functions in generate_common.py

---
 embedding-generation/Dockerfile          |   4 +-
 embedding-generation/generate-chunks.py  | 588 +----------------------
 embedding-generation/generate-vectors.py | 335 +++++++++++++
 embedding-generation/generate_common.py  | 362 ++++++++++++++
 4 files changed, 717 insertions(+), 572 deletions(-)
 create mode 100644 embedding-generation/generate-vectors.py
 create mode 100644 embedding-generation/generate_common.py

diff --git a/embedding-generation/Dockerfile b/embedding-generation/Dockerfile
index 7c2dc3e..6daed68 100644
--- a/embedding-generation/Dockerfile
+++ b/embedding-generation/Dockerfile
@@ -38,6 +38,7 @@ WORKDIR /embedding-data
 
 # Copy Python scripts and dependencies
 COPY generate-chunks.py .
+COPY generate_common.py .
 COPY document_chunking.py .
 COPY local_vectorstore_creation.py .
 COPY vector-db-sources.csv .
@@ -47,8 +48,7 @@ COPY requirements.txt .
 COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks
 
 # Install Python dependencies (force CPU-only torch)
-RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt && \
-    playwright install --with-deps chromium
+RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt
 
 # Pre-download the embedding model so local/offline loads succeed later in the build.
 RUN python3 -c "from sentence_transformers import SentenceTransformer; import os; SentenceTransformer(os.environ['SENTENCE_TRANSFORMER_MODEL'], cache_folder=os.environ['SENTENCE_TRANSFORMERS_HOME'])"
diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index 4bc1351..1ed10d4 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -17,21 +17,11 @@
 import os
 import re
 import uuid
-import yaml
 import csv
-import datetime
 
-import boto3
-from botocore.exceptions import NoCredentialsError, ClientError
 from bs4 import BeautifulSoup
 import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-from urllib.parse import parse_qs, urlparse, quote
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-from playwright.async_api import async_playwright
-import asyncio
+from urllib.parse import parse_qs, urlparse
 
 from document_chunking import (
     arm_service_url_to_developer_url,
@@ -45,58 +35,22 @@
     source_to_fetch_url,
 )
 
+from generate_common import (
+    Chunk,
+    createChunk,
+    printChunks,
+    chunkSaveAndTrack,
+    fetch_with_logging,
+    register_source,
+    save_sources_csv,
+    load_existing_sources,
+    get_number_of_sources,
+    ensure_intrinsic_chunks_from_s3,
+    yaml_dir,
+    details_file,
+    http_session
+)
 
-# Create a session with retry logic for resilient HTTP requests
-def create_retry_session(retries=5, backoff_factor=1, status_forcelist=(500, 502, 503, 504)):
-    """Create a requests session with automatic retry on failures."""
-    session = requests.Session()
-    retry = Retry(
-        total=retries,
-        read=retries,
-        connect=retries,
-        backoff_factor=backoff_factor,
-        status_forcelist=status_forcelist,
-        allowed_methods=["HEAD", "GET", "OPTIONS"]
-    )
-    adapter = HTTPAdapter(max_retries=retry)
-    session.mount("http://", adapter)
-    session.mount("https://", adapter)
-    return session
-
-# Global session for all HTTP requests
-http_session = create_retry_session()
-
-
-def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
-                                    s3_bucket='arm-github-copilot-extension',
-                                    s3_prefix='embedding_data/intrinsic_chunks/'):
-    """
-    Ensure the local 'intrinsic_chunks' folder exists and is populated with files from S3.
-    If the folder does not exist, create it and download all files from the S3 prefix.
-    """
-    if not os.path.exists(local_folder):
-        os.makedirs(local_folder, exist_ok=True)
-        print(f"Created local folder: {local_folder}")
-        s3 = boto3.client('s3')
-        try:
-            paginator = s3.get_paginator('list_objects_v2')
-            for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):
-                for obj in page.get('Contents', []):
-                    key = obj['Key']
-                    if key.endswith('/'):
-                        continue  # skip folders
-                    filename = os.path.basename(key)
-                    local_path = os.path.join(local_folder, filename)
-                    print(f"Downloading {key} to {local_path}")
-                    s3.download_file(s3_bucket, key, local_path)
-        except NoCredentialsError:
-            print("AWS credentials not found. Please configure them.")
-        except ClientError as e:
-            print(f"S3 ClientError: {e}")
-        except Exception as e:
-            print(f"Unexpected error: {e}")
-    else:
-        print(f"Folder '{local_folder}' already exists. Skipping S3 download.")
 
 '''
 To fix:
@@ -104,8 +58,6 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
 2. Learning Path titles must come from index page...send through function along with Graviton.
 '''
 
-yaml_dir = os.getenv('YAML_OUTPUT_DIR', 'yaml_data')
-details_file = os.getenv('CHUNK_DETAILS_FILE', 'info/chunk_details.csv')
 
 chunk_index = 1
 
@@ -116,158 +68,14 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
 # multi-megabyte HTML document for every source row.
 ecosystem_dashboard_entries = None
 
-# Global tracking for vector-db-sources.csv
-# Set of URLs already in the CSV (for deduplication)
-known_source_urls = set()
-# List of all source entries (including existing and new)
-# Each entry is a dict: {site_name, license_type, display_name, url, keywords}
-all_sources = []
 
 # Increase the file size limit, which defaults to '131,072'
 csv.field_size_limit(10**9) #1,000,000,000 (1 billion), smaller than 64-bit space but avoids 'python overflowerror'
 
 
-def load_existing_sources(csv_file):
-    """
-    Load existing sources from vector-db-sources.csv into memory.
-    Populates known_source_urls set and all_sources list.
-    """
-    global known_source_urls, all_sources
-    known_source_urls = set()
-    all_sources = []
-    
-    if not os.path.exists(csv_file):
-        print(f"Sources file '{csv_file}' does not exist. Starting fresh.")
-        return
-    
-    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
-        reader = csv.DictReader(file)
-        for row in reader:
-            url = row.get('URL', '').strip()
-            if url:
-                known_source_urls.add(url)
-                all_sources.append({
-                    'site_name': row.get('Site Name', ''),
-                    'license_type': row.get('License Type', ''),
-                    'display_name': row.get('Display Name', ''),
-                    'url': url,
-                    'keywords': row.get('Keywords', '')
-                })
-    
-    print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'")
-
-
-def register_source(site_name, license_type, display_name, url, keywords):
-    """
-    Register a new source URL. If the URL already exists, skip it.
-    Returns True if the source was added, False if it was a duplicate.
-    """
-    global known_source_urls, all_sources
-    
-    # Normalize URL for comparison
-    url = url.strip()
-    
-    if url in known_source_urls:
-        return False
-    
-    known_source_urls.add(url)
-    source_entry = {
-        'site_name': site_name,
-        'license_type': license_type,
-        'display_name': display_name,
-        'url': url,
-        'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords)
-    }
-
-    # Keep discovered sources grouped with their existing site section instead of
-    # appending them to the very end of the CSV and fragmenting that block.
-    insert_at = None
-    for index, existing_source in enumerate(all_sources):
-        if existing_source.get('site_name') == site_name:
-            insert_at = index + 1
-
-    if insert_at is None:
-        all_sources.append(source_entry)
-    else:
-        all_sources.insert(insert_at, source_entry)
 
-    print(f"[NEW SOURCE] {display_name}: {url}")
-    return True
 
 
-def save_sources_csv(csv_file):
-    """
-    Write all sources (existing + new) to vector-db-sources.csv.
-    """
-    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
-        writer = csv.writer(file)
-        writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords'])
-        for source in all_sources:
-            writer.writerow([
-                source['site_name'],
-                source['license_type'],
-                source['display_name'],
-                source['url'],
-                source['keywords']
-            ])
-    
-    print(f"Saved {len(all_sources)} sources to '{csv_file}'")
-
-class Chunk:
-    def __init__(
-        self,
-        title,
-        url,
-        uuid,
-        keywords,
-        content,
-        heading="",
-        heading_path=None,
-        doc_type="",
-        product="",
-        version="",
-        resolved_url="",
-        content_type="",
-    ):
-        self.title = title
-        self.url = url
-        self.uuid = uuid
-        self.content = content
-        self.heading = heading
-        self.heading_path = heading_path or []
-        self.doc_type = doc_type
-        self.product = product
-        self.version = version
-        self.resolved_url = resolved_url
-        self.content_type = content_type
-
-        # Translate keyword list into comma-separated string, and add similar words to keywords.
-        self.keywords = self.formatKeywords(keywords)
-
-    def formatKeywords(self, keywords):
-        """Format keywords list into a lowercase, comma-separated string."""
-        return ', '.join(k.strip() for k in keywords).lower()
-
-    # Used to dump into a yaml file without difficulty
-    def toDict(self):
-        return {
-            'title': self.title,
-            'url': self.url,
-            'uuid': self.uuid,
-            'keywords': self.keywords,
-            'content': self.content,
-            'heading': self.heading,
-            'heading_path': self.heading_path,
-            'doc_type': self.doc_type,
-            'product': self.product,
-            'version': self.version,
-            'resolved_url': self.resolved_url,
-            'content_type': self.content_type,
-        }
-
-    def __repr__(self):
-        return f"Chunk(title={self.title}, url={self.url}, uuid={self.uuid}, heading={self.heading})"
-
 def build_ecosystem_dashboard_entries():
     """Load and cache package-level snippets from the ecosystem dashboard."""
     global ecosystem_dashboard_entries
@@ -529,235 +337,6 @@ def htmlToMarkdown(html_string):
         <sudocode>
     '''
 
-
-@dataclass
-class CapturedSearchRequest:
-    url: str
-    method: str
-    headers: Dict[str, str]
-    post_data: Optional[str]
-    response_json: Dict[str, Any]
-
-async def capture_DeveloperArmComSearch(page_url: str) -> CapturedSearchRequest:
-    apicount = 0
-    def is_search_response(resp) -> bool:
-        nonlocal apicount
-        # print("Testing url "+str(resp.request.method.upper())+" "+str(resp.url))
-        if "coveo.com/rest/search/v2" in resp.url and "querySuggest" not in resp.url:
-            apicount += 1
-            return (
-                resp.request.method.upper() in {"GET", "POST"}
-                and resp.status == 200
-                and apicount == 2
-            )
-        else:
-            return False
-
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)
-        page = await browser.new_page()
-
-        async with page.expect_response(is_search_response, timeout=30_000) as response_info:
-            await page.goto(page_url, wait_until="domcontentloaded")
-
-        response = await response_info.value
-        data = await response.json()
-        await browser.close()
-
-        if not(isinstance(data, dict) and "results" in data):
-            raise RuntimeError("No search API response was captured. ")
-        else:
-            return CapturedSearchRequest(
-                url=response.url,
-                method=response.request.method,
-                headers=dict(response.request.headers),
-                post_data=response.request.post_data,
-                response_json=data,
-            )
-
-def replay_DeveloperArmComSearch(
-    captured: CapturedSearchRequest,
-    query: str,
-    first_result: int = 0,
-    number_of_results: int = 48,
-) -> Dict[str, Any]:
-
-    def _merge_headers(base_headers: Dict[str, str]) -> Dict[str, str]:
-        keep = {}
-        drop = {"host", "content-length", "accept-encoding", "connection", "origin", "referer"}
-        for k, v in base_headers.items():
-            if k.lower() not in drop:
-                keep[k] = v
-        keep.setdefault("accept", "application/json, text/plain, */*")
-        keep.setdefault("content-type", "application/json")
-        keep.setdefault("user-agent", "Mozilla/5.0")
-        return keep
-
-    if not captured.post_data:
-        raise RuntimeError("Captured request had no POST body to replay.")
-
-    body = json.loads(captured.post_data)
-    body["q"] = query
-    body["firstResult"] = first_result
-    body["numberOfResults"] = number_of_results
-    headers = _merge_headers(captured.headers)
-
-    r = requests.post(captured.url, headers=headers, json=body, timeout=60)
-    r.raise_for_status()
-    return r.json()
-
-def getDeveloperArmComSearchResults(searchterm: str, searchurl: str, maxitems: int = 20000):
-
-    def extract_result(item: Dict[str, Any]) -> Dict[str, Any]:
-        return {
-            "title": item.get("title") or item.get("raw", {}).get("title"),
-            "url": item.get("clickUri") or item.get("uri") or item.get("url"),
-            "type": item.get("raw", {}).get("navigationhierarchiescontenttype"),
-            "author": item.get("raw", {}).get("author") or item.get("raw", {}).get("sysauthor"),
-            "products": item.get("raw", {}).get("navigationhierarchiesproducts"),
-            "objecttype": item.get("raw", {}).get("objecttype"),
-            "keywords": item.get("raw", {}).get("navigationhierarchiestopics")
-        }
-
-    print('Searching developer.arm.com for "'+searchterm+'"')
-    captured = asyncio.run(capture_DeveloperArmComSearch(searchurl))
-
-    all_rows = []
-    finished = False
-    page_size = 48
-    start = 0
-    while (len(all_rows) < maxitems) and not finished:
-        payload = replay_DeveloperArmComSearch(
-            captured,
-            query=searchterm,
-            first_result=start,
-            number_of_results=page_size,
-        )
-
-        items = [extract_result(x) for x in payload["results"]]
-        all_rows.extend(items)
-        finished = len(payload["results"]) < page_size
-        start += page_size
-    print("Found "+str(len(all_rows))+" results")
-    return all_rows
-
-def processDeveloperArmCom(url, title, type, keywords, emit_chunks=True):
-
-    def chunkizeLearningPath(url, title, keywords):
-        if not emit_chunks:
-            return
-
-        response = fetch_with_logging(url)
-        if response is None:
-            return
-        parsed_document = parse_document_content(
-            source_url=url,
-            resolved_url=url,
-            response_content=response.content,
-            content_type=response.headers.get("content-type", "text/html"),
-            fallback_title=title,
-        )
-        chunk_payloads = chunk_parsed_document(
-            parsed_document,
-            doc_type=type,
-            keywords=keywords,
-        )
-
-        # 5) Create chunks for each snippet by adding metadata
-        for payload in chunk_payloads:
-            chunk = createChunk(
-                payload["content"],
-                url,
-                keywords,
-                payload["title"],
-                heading=payload["heading"],
-                heading_path=payload["heading_path"],
-                doc_type=payload["doc_type"],
-                product=payload["product"],
-                version=payload["version"],
-                resolved_url=payload["resolved_url"],
-                content_type=payload["content_type"],
-            )
-            chunkSaveAndTrack(url,chunk)
-
-
-    response = http_session.get(url, timeout=60)
-    soup = BeautifulSoup(response.text, 'html.parser')
-
-    itemtitle = 'Arm '+type+' - '+(blogtitle.get_text() if (blogtitle := soup.find(id='blog-title')) else title)
-    itemdate = blogdate.get_text() if (blogdate := soup.find(id='blog-date')) else ''
-
-    # Register this learning path as a source
-    register_source(
-        site_name='Arm Developer',
-        license_type='Copyright Arm',
-        display_name=itemtitle,
-        url=url,
-        keywords=keywords
-    )
-    chunkizeLearningPath(url,itemtitle,keywords)
-
-def item_is_relevant(item) -> bool:
-    match item["type"]:
-        case "Guide":
-            return item["title"] in {
-                    "What is SME/SME2?",
-                    "Overview of SME",
-                    "Assembly code",
-                    "Streaming SVE",
-                    "Load and Store",
-                    "Z registers",
-                    "Real world examples",
-                    "ZA storage",
-                    "Predication"
-            }
-
-        case "Programmer's Guide":
-            for pattern in {
-                r"/SME-Overview/",
-                r"/CME",
-                r"/matmul-fp32",
-                r"/lut-gemv-rm-int8",
-                r"/matmul-int8",
-                r"/gemv-cm-int8.+/",
-                r"/109246/.*/Introduction(\?|/The.+/)",
-                r"/Introduction-to-CME",
-                r"/Toolchains-and-model-support/(?!Quick-start)",
-                r"/Memory-access.(?!Implications)",
-                r"/Performance-monitoring",
-                r"/Matrix-Multiply-Unit"
-            }:
-                if re.search(pattern, item["url"]):
-                    return True
-            return False
-
-        case "Blog Post":
-            if item["author"] in {"Zenon_Xiu","KhalidS"} and item["title"][0:4] == "Part" and "SME" in item["title"]:
-                return True
-            if item["author"] == "mweidmann" and item["title"][0:41] == "Introducing the Scalable Matrix Extension":
-                return True
-            return False
-
-def createDeveloperArmComChunks(emit_chunks=True):
-    search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype="
-    content_types = [
-        "Blog Post",
-        "Guide",
-        "Programmer's Guide"
-    ]
-
-    search_url = search_base+",".join([quote(x) for x in content_types])+"&q="
-    for searchterm in ["SME"]:
-        pages = getDeveloperArmComSearchResults(searchterm, search_url+searchterm)
-        for page in pages:
-            if item_is_relevant(page):
-                keywords =  list(set( [searchterm] +
-                                    [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")] +
-                                    [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]]))
-                processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks)
-
-
-
 def processLearningPath(url, type, emit_chunks=True):
     github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/production/content"
     site_link = "https://learn.arm.com"
@@ -1004,31 +583,6 @@ def URLIsValidCheck(url):
         return False
 
 
-def fetch_with_logging(url):
-    try:
-        response = http_session.get(url, timeout=60)
-        response.raise_for_status()
-        return response
-    except requests.exceptions.HTTPError as http_err:
-        print(f"HTTP error occurred: {http_err}")
-        with open('info/errors.csv', 'a', newline='') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerow([url, str(http_err)])
-        return None
-    except Exception as err:
-        print(f"Other error occurred: {err}")
-        with open('info/errors.csv', 'a', newline='') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerow([url, str(err)])
-        return None
-    except Exception as err:
-        print(f"Other error occurred: {err}")
-        with open('info/errors.csv', 'a', newline='') as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerow([url,str(err)])
-        return False
-
-
 def obtainMarkdownContentFromGitHubMDFile(gh_url):
     response = http_session.get(gh_url, timeout=60)
     response.raise_for_status()  # Ensure we got a valid response
@@ -1059,48 +613,6 @@ def obtainTextSnippets__Markdown(content, min_words=300, max_words=500, min_fina
     return [chunk["content"] for chunk in chunks]
 
 
-def createChunk(
-    text_snippet,
-    WEBSITE_url,
-    keywords,
-    title,
-    heading="",
-    heading_path=None,
-    doc_type="",
-    product="",
-    version="",
-    resolved_url="",
-    content_type="",
-):
-    chunk = Chunk(
-        title        = title,
-        url          = WEBSITE_url,
-        uuid         = str(uuid.uuid4()),
-        keywords     = keywords,
-        content      = text_snippet,
-        heading      = heading,
-        heading_path = heading_path or [],
-        doc_type     = doc_type,
-        product      = product,
-        version      = version,
-        resolved_url = resolved_url,
-        content_type = content_type,
-    )
-
-    return chunk
-
-
-def printChunks(chunks):
-    for chunk_dict in chunks:
-        print('='*100)
-        print("Title:", chunk_dict['title'])
-        print("Keywords:", chunk_dict['keywords'])
-        print("URL:", chunk_dict['url'])
-        print("Unique ID:", chunk_dict['uuid'])
-        print("Content:", chunk_dict['content'])
-        print('='*100)
-
-
 def parse_keywords(keywords_value, title=""):
     keywords = [keyword.strip() for keyword in re.split(r"[;,]", keywords_value or "") if keyword.strip()]
     if title and title not in keywords:
@@ -1154,7 +666,6 @@ def _arm_topic_links(topic):
         links.extend(_arm_topic_links(child))
     return links
 
-
 def _arm_metadata_keywords(root_data, keywords_value, source_name):
     keywords = parse_keywords(keywords_value, source_name)
     for value in root_data.get("keywords", []) + root_data.get("products", []):
@@ -1162,7 +673,6 @@ def _arm_metadata_keywords(root_data, keywords_value, source_name):
             keywords.append(value)
     return keywords
 
-
 def create_arm_documentation_chunks(source_url, source_name, doc_type, keywords_value):
     root_response = fetch_with_logging(source_to_fetch_url(source_url))
     if root_response is None:
@@ -1209,65 +719,6 @@ def create_arm_documentation_chunks(source_url, source_name, doc_type, keywords_
     return chunks
 
 
-def chunkSaveAndTrack(url,chunk):
-
-    def addNewRow(current_date,chunk_words,chunk_id):
-        return [url,current_date,chunk_words,'1',chunk_id]
-    
-    def addToExistingRow(row,chunk_words,chunk_id):
-        url = row[0] # same URL
-        date = row[1] # same date
-        words = str(int(row[2]) + chunk_words) # update words
-        chunks = row[3] = str(int(row[3]) + 1) # update number of chunks
-        ids = row[4]+ f", {chunk_id}" # update chunk IDs
-        return [url,date,words,chunks,ids]
-
-
-    def recordChunk():
-        current_date = datetime.date.today().strftime('%Y-%m-%d')
-        chunk_words  = len(chunk.content.split())    
-        chunk_id     = f'chunk_{chunk.uuid}'
-
-        new_rows = []
-
-        with open(details_file, mode='r', newline='', encoding='utf-8') as file:
-            csv_reader = csv.reader(file)
-            try:
-                headers = next(csv_reader)  
-                new_rows.append(headers) # keep in memory
-            except StopIteration:
-                pass
-
-            url_found = False  # Track if the URL is found in any row
-            
-            # Loop through all the rows after the header
-            for row in csv_reader:
-                if row[0] == url:
-                    new_rows.append(addToExistingRow(row, chunk_words, chunk_id))  # Modify and append the row
-                    url_found = True  # Mark that the URL was found
-                else:
-                    new_rows.append(row)  # Append the row without modification
-            
-            # If the URL was not found, append a new row
-            if not url_found:
-                new_rows.append(addNewRow(current_date, chunk_words, chunk_id))
-
-
-        # Overwrite csv with new info
-        with open(details_file, mode='w', newline='') as file:
-            csv_writer = csv.writer(file, delimiter=',')
-            csv_writer.writerows(new_rows) 
-
-    # Save chunk
-    file_name = f"{yaml_dir}/chunk_{chunk.uuid}.yaml"
-    with open(file_name, 'w') as file:
-        yaml.dump(chunk.toDict(), file, default_flow_style=False, sort_keys=False)
-
-    # Record chunk
-    recordChunk()
-    print(f"{file_name} === {chunk.title}")
-
-
 def main():
     skip_discovery = os.getenv("SKIP_DISCOVERY", "").lower() in {"1", "true", "yes"}
 
@@ -1312,10 +763,7 @@ def main():
         # b) Ecosystem Dashboard
         createEcosystemDashboardChunks(emit_chunks=False)
 
-        # c) Developer.Arm.Com
-        createDeveloperArmComChunks(emit_chunks=False)
-
-    # d) Intrinsics
+    # c) Intrinsics
     #createIntrinsicsDatabaseChunks()
 
     # 1) Get URLs and details from CSV
@@ -1334,7 +782,7 @@ def main():
     # Save updated sources CSV with all discovered sources
     save_sources_csv(sources_file)
     print(f"\n=== Source tracking complete ===")
-    print(f"Total sources in {sources_file}: {len(all_sources)}")
+    print(f"Total sources in {sources_file}: {get_number_of_sources()}")
 
 
 if __name__ == "__main__":
diff --git a/embedding-generation/generate-vectors.py b/embedding-generation/generate-vectors.py
new file mode 100644
index 0000000..da9851a
--- /dev/null
+++ b/embedding-generation/generate-vectors.py
@@ -0,0 +1,335 @@
+# Copyright © 2025, Arm Limited and Contributors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import os
+import re
+import csv
+
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import quote
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+from playwright.async_api import async_playwright
+import asyncio
+
+from document_chunking import (
+    chunk_parsed_document,
+    parse_document_content,
+)
+
+from generate_common import (
+    Chunk,
+    createChunk,
+    chunkSaveAndTrack,
+    fetch_with_logging,
+    register_source,
+    save_sources_csv,
+    load_existing_sources,
+    get_number_of_sources,
+    ensure_intrinsic_chunks_from_s3,
+    yaml_dir,
+    details_file,
+    http_session
+)
+
+
+@dataclass
+class CapturedSearchRequest:  # snapshot of one search API call, built by capture_DeveloperArmComSearch()
+    url: str  # URL of the captured response
+    method: str  # HTTP method of the request that produced it
+    headers: Dict[str, str]  # headers of that request
+    post_data: Optional[str]  # raw request body; replay_DeveloperArmComSearch() parses it as JSON
+    response_json: Dict[str, Any]  # decoded JSON body of the captured response
+
+async def capture_DeveloperArmComSearch(page_url: str) -> CapturedSearchRequest:
+    apicount = 0  # matching search-API responses seen so far (updated by the predicate below)
+    def is_search_response(resp) -> bool:
+        nonlocal apicount
+        if "coveo.com/rest/search/v2" in resp.url and "querySuggest" not in resp.url:  # search call, not a suggestion call
+            apicount += 1
+            return (
+                resp.request.method.upper() == "POST"
+                and resp.request.post_data is not None
+                and resp.status == 200
+                and apicount > 1  # NOTE(review): the first matching response is never accepted - presumably a preliminary query; confirm
+            )
+        else:
+            return False
+    # Open page_url in headless Chromium and wait for a response the predicate accepts.
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+        try:
+            async with page.expect_response(is_search_response, timeout=30_000) as response_info:
+                await page.goto(page_url, wait_until="domcontentloaded")
+            # The matched response and its JSON body are read while the browser is still open.
+            response = await response_info.value
+            data = await response.json()
+        finally:
+            await browser.close()  # always release the browser, even on failure
+        # Only a JSON object carrying a "results" key is accepted as a search payload.
+        if not(isinstance(data, dict) and "results" in data):
+            raise RuntimeError("No search API response was captured. ")
+        else:
+            return CapturedSearchRequest(
+                url=response.url,
+                method=response.request.method,
+                headers=dict(response.request.headers),
+                post_data=response.request.post_data,
+                response_json=data,
+            )
+
+def replay_DeveloperArmComSearch(
+    captured: CapturedSearchRequest,
+    query: str,
+    first_result: int = 0,
+    number_of_results: int = 48,
+) -> Dict[str, Any]:
+    """Re-send the captured search POST with a new query and paging window; return the decoded JSON reply."""
+    def _merge_headers(base_headers: Dict[str, str]) -> Dict[str, str]:
+        keep = {}
+        drop = {"host", "content-length", "accept-encoding", "connection", "origin", "referer", "cookie"}  # never replayed
+        for k, v in base_headers.items():
+            if k.lower() not in drop:
+                keep[k] = v
+        keep.setdefault("accept", "application/json, text/plain, */*")  # defaults are added only if that exact key is absent
+        keep.setdefault("content-type", "application/json")
+        keep.setdefault("user-agent", "Mozilla/5.0")
+        return keep
+    # A capture without a request body cannot be replayed.
+    if not captured.post_data:
+        raise RuntimeError("Captured request had no POST body to replay.")
+    # Reuse the captured JSON body, overriding only the query text and the paging fields.
+    body = json.loads(captured.post_data)
+    body["q"] = query
+    body["firstResult"] = first_result
+    body["numberOfResults"] = number_of_results
+    headers = _merge_headers(captured.headers)
+    # Sent with plain requests.post, not the shared http_session.
+    r = requests.post(captured.url, headers=headers, json=body, timeout=60)
+    r.raise_for_status()  # 4xx/5xx replies raise requests.HTTPError
+    return r.json()
+
+def getDeveloperArmComSearchResults(searchterm: str, searchurl: str, maxitems: int = 20000):
+    """Return at most maxitems search hits for searchterm as plain dicts (title, url, type, author, ...)."""
+    def extract_result(item: Dict[str, Any]) -> Dict[str, Any]:
+        raw = item.get("raw") or {}
+        return {
+            "title": item.get("title") or raw.get("title"),
+            "url": item.get("clickUri") or item.get("uri") or item.get("url"),
+            "type": raw.get("navigationhierarchiescontenttype"),
+            "author": raw.get("author") or raw.get("sysauthor"),
+            "products": raw.get("navigationhierarchiesproducts"),
+            "objecttype": raw.get("objecttype"),
+            "keywords": raw.get("navigationhierarchiestopics")
+        }
+
+    print('Searching developer.arm.com for "'+searchterm+'"')
+    captured = asyncio.run(capture_DeveloperArmComSearch(searchurl))
+    all_rows = []
+    page_size = 48
+    start = 0
+    while len(all_rows) < maxitems:
+        payload = replay_DeveloperArmComSearch(
+            captured,
+            query=searchterm,
+            first_result=start,
+            number_of_results=page_size,
+        )
+        results = payload.get("results") or []
+        all_rows.extend(extract_result(x) for x in results)
+        if len(results) < page_size:
+            break  # a short page means the result set is exhausted
+        start += page_size
+    del all_rows[maxitems:]  # the last page may overshoot the requested cap
+    print("Found "+str(len(all_rows))+" results")
+    return all_rows
+
+def processDeveloperArmCom(url, title, type, keywords, emit_chunks=True):
+    """Register one developer.arm.com page as a source and optionally chunk it.
+
+    The page is fetched once via fetch_with_logging(); when the fetch fails the
+    source is still registered under the search-result title and no chunks are
+    emitted. `type` is the search content type (e.g. "Blog Post").
+    """
+    response = fetch_with_logging(url)
+
+    # Pages with a #blog-title element use its text; otherwise keep the search-result title.
+    heading = None
+    if response is not None:
+        heading = BeautifulSoup(response.text, 'html.parser').find(id='blog-title')
+    itemtitle = 'Arm '+type+' - '+(heading.get_text() if heading else title)
+
+    register_source(
+        site_name='Arm Developer',
+        license_type='Arm Proprietary',
+        display_name=itemtitle,
+        url=url,
+        keywords=keywords
+    )
+    # Discovery-only runs stop here; so do pages that could not be fetched.
+    if not emit_chunks or response is None:
+        return
+
+    parsed_document = parse_document_content(
+        source_url=url,
+        resolved_url=url,
+        response_content=response.content,
+        content_type=response.headers.get("content-type", "text/html"),
+        fallback_title=itemtitle,
+    )
+    chunk_payloads = chunk_parsed_document(
+        parsed_document,
+        doc_type=type,
+        keywords=keywords,
+    )
+
+    # Wrap each payload with its metadata and persist it via chunkSaveAndTrack().
+    for payload in chunk_payloads:
+        chunk = createChunk(
+            payload["content"],
+            url,
+            keywords,
+            payload["title"],
+            heading=payload["heading"],
+            heading_path=payload["heading_path"],
+            doc_type=payload["doc_type"],
+            product=payload["product"],
+            version=payload["version"],
+            resolved_url=payload["resolved_url"],
+            content_type=payload["content_type"],
+        )
+        chunkSaveAndTrack(url, chunk)
+
+def item_is_relevant(item) -> bool:  # hand-maintained allow-list deciding which search hits are kept
+    if not item.get("url"):  # a hit without a URL is rejected outright
+        return False
+    match item["type"]:
+        case "Guide":
+            return item["title"] in {  # exact title match
+                    "What is SME/SME2?",
+                    "Overview of SME",
+                    "Assembly code",
+                    "Streaming SVE",
+                    "Load and Store",
+                    "Z registers",
+                    "Real world examples",
+                    "ZA storage",
+                    "Predication"
+            }
+
+        case "Programmer's Guide":
+            for pattern in {  # regexes searched anywhere in the URL; any match accepts the item
+                r"/SME-Overview/",
+                r"/CME",
+                r"/matmul-fp32",
+                r"/lut-gemv-rm-int8",
+                r"/matmul-int8",
+                r"/gemv-cm-int8.+/",
+                r"/109246/.*/Introduction(\?|/The.+/)",
+                r"/Introduction-to-CME",
+                r"/Toolchains-and-model-support/(?!Quick-start)",
+                r"/Memory-access.(?!Implications)",
+                r"/Performance-monitoring",
+                r"/Matrix-Multiply-Unit"
+            }:
+                if item.get("url") and re.search(pattern, item["url"]):  # the URL is already known to be non-empty here
+                    return True
+            return False
+
+        case "Blog Post":
+            title = item.get("title") or ""  # missing or None fields are treated as empty strings
+            author = item.get("author") or ""
+            if author in {"Zenon_Xiu", "KhalidS"} and title.startswith("Part") and "SME" in title:
+                return True
+            if author == "mweidmann" and title.startswith("Introducing the Scalable Matrix Extension"):
+                return True
+            return False
+
+        case _:  # any other content type is rejected
+            return False
+
+def createDeveloperArmComChunks(emit_chunks=True):
+    """Search developer.arm.com for each search term and process every hit item_is_relevant() accepts."""
+    search_base = "https://developer.arm.com/search#numberOfResults=48&f-navigationhierarchiescontenttype="
+    content_types = ["Blog Post", "Guide", "Programmer's Guide"]
+    search_url = search_base+",".join(quote(x) for x in content_types)+"&q="
+
+    for searchterm in ["SME"]:
+        # Percent-encode the term so multi-word or punctuated terms still form a valid URL.
+        pages = getDeveloperArmComSearchResults(searchterm, search_url+quote(searchterm))
+        relevant = 0
+        for page in pages:
+            if not item_is_relevant(page):
+                continue
+            # Sorted so the keyword order is stable across runs (set iteration order is not).
+            topics = [key for key_list in (page["keywords"] or []) for key in key_list.split(sep="|")]
+            products = [key for key_list in (page["products"] or []) for key in key_list.split(sep="|")[2:]]
+            keywords = sorted({searchterm, *topics, *products})
+            processDeveloperArmCom(page["url"], page["title"], page["type"], keywords, emit_chunks=emit_chunks)
+            relevant += 1
+        print("Keeping "+str(relevant)+" relevant items out of "+str(len(pages)))
+
+def main():
+    """Discover developer.arm.com sources and rewrite the sources CSV given on the command line."""
+    # Parse arguments first so --help and usage errors exit before any S3 or file-system work.
+    parser = argparse.ArgumentParser(
+        description="Generates list of Arm documentation sources for vector database ingestion. "
+                    "Discovers developer.arm.com entries, "
+                    "then updates the sources CSV with any new entries found."
+    )
+    parser.add_argument(
+        "sources_file",
+        help="Path to vector-db-sources.csv. This file is read for existing sources "
+             "(to avoid duplicates) and WILL BE OVERWRITTEN with the combined list "
+             "of existing + newly discovered sources."
+    )
+    args = parser.parse_args()
+    sources_file = args.sources_file
+    skip_discovery = os.getenv("SKIP_DISCOVERY", "").lower() in {"1", "true", "yes"}
+
+    # Ensure intrinsic_chunks folder and files from S3 are present
+    ensure_intrinsic_chunks_from_s3()
+
+    # Load existing sources from vector-db-sources.csv (for deduplication)
+    load_existing_sources(sources_file)
+
+    # 0) Initialize files
+    # NOTE(review): this deletes existing chunk_*.yaml files and resets the details CSV even
+    # though discovery below runs with emit_chunks=False - confirm that this is intended.
+    os.makedirs(yaml_dir, exist_ok=True) # create if doesn't exist
+    details_dir = os.path.dirname(details_file)
+    if details_dir:
+        os.makedirs(details_dir, exist_ok=True)
+    for filename in os.listdir(yaml_dir):
+        if filename.startswith('chunk_') and filename.endswith('.yaml'):
+            os.remove(os.path.join(yaml_dir, filename))
+    with open(details_file, mode='w', newline='') as file:
+        writer = csv.writer(file)
+        writer.writerow(['URL','Date', 'Number of Words', 'Number of Chunks','Chunk IDs'])
+
+    # 1) Discover developer.arm.com pages; sources are registered but no chunks are emitted
+    if not skip_discovery:
+        createDeveloperArmComChunks(emit_chunks=False)
+
+    # 2) Save updated sources CSV with all discovered sources
+    save_sources_csv(sources_file)
+    print("\n=== Source tracking complete ===")
+    print(f"Total sources in {sources_file}: {get_number_of_sources()}")
+
+if __name__ == "__main__":
+    main()
diff --git a/embedding-generation/generate_common.py b/embedding-generation/generate_common.py
new file mode 100644
index 0000000..2e4efb4
--- /dev/null
+++ b/embedding-generation/generate_common.py
@@ -0,0 +1,362 @@
+# Copyright © 2025, Arm Limited and Contributors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+import yaml
+import csv
+import datetime
+
+import boto3
+from botocore.exceptions import NoCredentialsError, ClientError
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
+# Create a session with retry logic for resilient HTTP requests
+def create_retry_session(retries=5, backoff_factor=1, status_forcelist=(500, 502, 503, 504)):
+    """Create a requests session with automatic retry on failures."""
+    session = requests.Session()
+    retry = Retry(
+        total=retries,  # the same budget is used for total, read and connect retries
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,  # response codes that trigger a retry
+        allowed_methods=["HEAD", "GET", "OPTIONS"]  # POST is not listed here
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)  # one adapter, so both schemes share the retry policy
+    session.mount("https://", adapter)
+    return session
+
+# Global session for all HTTP requests
+http_session = create_retry_session()
+
+
+def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
+                                    s3_bucket='arm-github-copilot-extension',
+                                    s3_prefix='embedding_data/intrinsic_chunks/'):
+    """
+    Ensure the local 'intrinsic_chunks' folder exists and is populated with files from S3.
+    If the folder does not exist, create it and download all files from the S3 prefix.
+    """
+    if not os.path.exists(local_folder):
+        os.makedirs(local_folder, exist_ok=True)  # NOTE(review): created before the download, so a failed download leaves a folder that makes later runs skip S3
+        print(f"Created local folder: {local_folder}")
+        s3 = boto3.client('s3')
+        try:
+            paginator = s3.get_paginator('list_objects_v2')
+            for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):
+                for obj in page.get('Contents', []):  # a page without 'Contents' contributes nothing
+                    key = obj['Key']
+                    if key.endswith('/'):
+                        continue  # skip folders
+                    filename = os.path.basename(key)  # keys are flattened: only the last path segment is kept
+                    local_path = os.path.join(local_folder, filename)
+                    print(f"Downloading {key} to {local_path}")
+                    s3.download_file(s3_bucket, key, local_path)
+        except NoCredentialsError:  # every failure below is printed, not raised
+            print("AWS credentials not found. Please configure them.")
+        except ClientError as e:
+            print(f"S3 ClientError: {e}")
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+    else:
+        print(f"Folder '{local_folder}' already exists. Skipping S3 download.")
+
+
+yaml_dir = os.getenv('YAML_OUTPUT_DIR', 'yaml_data')
+details_file = os.getenv('CHUNK_DETAILS_FILE', 'info/chunk_details.csv')
+
+# Global tracking for vector-db-sources.csv
+# Set of URLs already in the CSV (for deduplication)
+known_source_urls = set()
+# List of all source entries (including existing and new)
+# Each entry is a dict: {site_name, license_type, display_name, url, keywords}
+all_sources = []
+
+
+def get_number_of_sources():
+    global all_sources
+    return len(all_sources)  # number of entries currently held in the module-level all_sources list
+
+
+def load_existing_sources(csv_file):
+    """
+    Load existing sources from vector-db-sources.csv into memory.
+    Populates known_source_urls set and all_sources list.
+    """
+    global known_source_urls, all_sources
+    known_source_urls = set()  # both containers are reset, so repeated calls do not accumulate
+    all_sources = []
+
+    if not os.path.exists(csv_file):  # a missing file is not an error
+        print(f"Sources file '{csv_file}' does not exist. Starting fresh.")
+        return
+
+    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
+        reader = csv.DictReader(file)  # columns are looked up by header name
+        for row in reader:
+            url = row.get('URL', '').strip()
+            if url:  # rows without a URL are dropped
+                known_source_urls.add(url)
+                all_sources.append({
+                    'site_name': row.get('Site Name', ''),
+                    'license_type': row.get('License Type', ''),
+                    'display_name': row.get('Display Name', ''),
+                    'url': url,
+                    'keywords': row.get('Keywords', '')
+                })
+
+    print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'")
+
+
+def register_source(site_name, license_type, display_name, url, keywords):
+    """
+    Register a new source URL. If the URL already exists, skip it.
+    Returns True if the source was added, False if it was a duplicate.
+    """
+    global known_source_urls, all_sources
+
+    # Normalize URL for comparison
+    url = url.strip()  # only surrounding whitespace is removed
+
+    if url in known_source_urls:
+        return False
+
+    known_source_urls.add(url)
+    source_entry = {
+        'site_name': site_name,
+        'license_type': license_type,
+        'display_name': display_name,
+        'url': url,
+        'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords)  # non-string keywords are joined with '; '
+    }
+
+    # Keep discovered sources grouped with their existing site section instead of
+    # appending them to the very end of the CSV and fragmenting that block.
+    insert_at = None
+    for index, existing_source in enumerate(all_sources):  # no break: ends on the last entry of this site
+        if existing_source.get('site_name') == site_name:
+            insert_at = index + 1
+
+    if insert_at is None:  # first source for this site goes to the end
+        all_sources.append(source_entry)
+    else:
+        all_sources.insert(insert_at, source_entry)
+
+    print(f"[NEW SOURCE] {display_name}: {url}")
+    return True
+
+
+def save_sources_csv(csv_file):
+    """
+    Write all sources (existing + new) to vector-db-sources.csv.
+    """
+    with open(csv_file, 'w', newline='', encoding='utf-8') as file:  # mode 'w' replaces any existing file
+        writer = csv.writer(file)
+        writer.writerow(['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords'])  # same column names load_existing_sources() reads
+        for source in all_sources:
+            writer.writerow([
+                source['site_name'],
+                source['license_type'],
+                source['display_name'],
+                source['url'],
+                source['keywords']
+            ])
+
+    print(f"Saved {len(all_sources)} sources to '{csv_file}'")
+
+class Chunk:  # one text snippet plus the metadata written to its YAML file
+    def __init__(
+        self,
+        title,
+        url,
+        uuid,
+        keywords,
+        content,
+        heading="",
+        heading_path=None,
+        doc_type="",
+        product="",
+        version="",
+        resolved_url="",
+        content_type="",
+    ):
+        self.title = title
+        self.url = url
+        self.uuid = uuid
+        self.content = content
+        self.heading = heading
+        self.heading_path = heading_path or []  # None becomes a fresh empty list
+        self.doc_type = doc_type
+        self.product = product
+        self.version = version
+        self.resolved_url = resolved_url
+        self.content_type = content_type
+
+        # Translate the keyword list into a lowercase, comma-separated string.
+        self.keywords = self.formatKeywords(keywords)
+
+    def formatKeywords(self, keywords):
+        """Format keywords list into a lowercase, comma-separated string."""
+        return ', '.join(k.strip() for k in keywords).lower()  # NOTE(review): a plain str argument would be iterated character by character
+
+    # Used to dump into a yaml file without difficulty
+    def toDict(self):
+        return {
+            'title': self.title,
+            'url': self.url,
+            'uuid': self.uuid,
+            'keywords': self.keywords,
+            'content': self.content,
+            'heading': self.heading,
+            'heading_path': self.heading_path,
+            'doc_type': self.doc_type,
+            'product': self.product,
+            'version': self.version,
+            'resolved_url': self.resolved_url,
+            'content_type': self.content_type,
+        }
+
+    def __repr__(self):
+        return f"Chunk(title={self.title}, url={self.url}, uuid={self.uuid}, heading={self.heading})"
+
+
+def fetch_with_logging(url):
+    """GET *url* through the shared retrying session.
+
+    Returns the response on success. On any failure the error is printed,
+    appended to 'info/errors.csv' as a (url, message) row, and None is
+    returned, so callers only need an `is None` check.
+    """
+    try:
+        response = http_session.get(url, timeout=60)
+        response.raise_for_status()
+        return response
+    except requests.exceptions.HTTPError as http_err:
+        print(f"HTTP error occurred: {http_err}")
+        error = http_err
+    except Exception as err:
+        # Boundary handler: connection errors, timeouts, etc. are logged, not raised.
+        print(f"Other error occurred: {err}")
+        error = err
+    # Create the log directory on demand so that logging cannot itself raise.
+    os.makedirs('info', exist_ok=True)
+    with open('info/errors.csv', 'a', newline='') as csvfile:
+        csv.writer(csvfile).writerow([url, str(error)])
+    return None
+
+
+def createChunk(
+    text_snippet,
+    WEBSITE_url,
+    keywords,
+    title,
+    heading="",
+    heading_path=None,
+    doc_type="",
+    product="",
+    version="",
+    resolved_url="",
+    content_type="",
+):
+    chunk = Chunk(  # thin factory: maps the arguments onto Chunk fields and assigns a new ID
+        title        = title,
+        url          = WEBSITE_url,
+        uuid         = str(uuid.uuid4()),  # fresh random UUID for every chunk
+        keywords     = keywords,
+        content      = text_snippet,
+        heading      = heading,
+        heading_path = heading_path or [],  # None becomes an empty list
+        doc_type     = doc_type,
+        product      = product,
+        version      = version,
+        resolved_url = resolved_url,
+        content_type = content_type,
+    )
+
+    return chunk
+
+
+def printChunks(chunks):
+    for chunk_dict in chunks:  # items are subscripted by key, so they must be dicts (e.g. Chunk.toDict() output)
+        print('='*100)
+        print("Title:", chunk_dict['title'])
+        print("Keywords:", chunk_dict['keywords'])
+        print("URL:", chunk_dict['url'])
+        print("Unique ID:", chunk_dict['uuid'])
+        print("Content:", chunk_dict['content'])
+        print('='*100)
+
+
+def chunkSaveAndTrack(url,chunk):
+    """Write chunk to its own YAML file and add it to the per-URL tally kept in details_file."""
+    def addNewRow(current_date,chunk_words,chunk_id):
+        return [url,current_date,chunk_words,'1',chunk_id]
+
+    def addToExistingRow(row,chunk_words,chunk_id):
+        url = row[0] # same URL
+        date = row[1] # same date
+        words = str(int(row[2]) + chunk_words) # update words
+        chunks = str(int(row[3]) + 1) # update number of chunks
+        ids = row[4]+ f", {chunk_id}" # update chunk IDs
+        return [url,date,words,chunks,ids]
+
+
+    def recordChunk():
+        current_date = datetime.date.today().strftime('%Y-%m-%d')
+        chunk_words  = len(chunk.content.split())
+        chunk_id     = f'chunk_{chunk.uuid}'
+
+        new_rows = []
+
+        with open(details_file, mode='r', newline='', encoding='utf-8') as file:
+            csv_reader = csv.reader(file)
+            try:
+                headers = next(csv_reader)
+                new_rows.append(headers) # keep in memory
+            except StopIteration:
+                pass
+
+            url_found = False  # Track if the URL is found in any row
+
+            # Loop through all the rows after the header (blank rows are kept as they are)
+            for row in csv_reader:
+                if row and row[0] == url:
+                    new_rows.append(addToExistingRow(row, chunk_words, chunk_id))  # Modify and append the row
+                    url_found = True  # Mark that the URL was found
+                else:
+                    new_rows.append(row)  # Append the row without modification
+
+            # If the URL was not found, append a new row
+            if not url_found:
+                new_rows.append(addNewRow(current_date, chunk_words, chunk_id))
+
+        # Write with the same encoding the file is read with, whatever the locale default is.
+        # Overwrite csv with new info
+        with open(details_file, mode='w', newline='', encoding='utf-8') as file:
+            csv_writer = csv.writer(file, delimiter=',')
+            csv_writer.writerows(new_rows)
+
+    # Save chunk
+    file_name = f"{yaml_dir}/chunk_{chunk.uuid}.yaml"
+    with open(file_name, 'w') as file:
+        yaml.dump(chunk.toDict(), file, default_flow_style=False, sort_keys=False)
+
+    # Record chunk
+    recordChunk()
+    print(f"{file_name} === {chunk.title}")

From 5d632f5417185763629388d75dab968ea841cb35 Mon Sep 17 00:00:00 2001
From: Andrew Pickard <andrew.pickard@arm.com>
Date: Thu, 18 Jun 2026 17:24:41 +0100
Subject: [PATCH 4/6] Duplicated csv field size limit into generate_common.py

---
 embedding-generation/generate-chunks.py | 3 ---
 embedding-generation/generate_common.py | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
index 0ef5401..3edc10a 100644
--- a/embedding-generation/generate-chunks.py
+++ b/embedding-generation/generate-chunks.py
@@ -69,14 +69,11 @@
 # multi-megabyte HTML document for every source row.
 ecosystem_dashboard_entries = None
 
-
 # Increase the file size limit, which defaults to '131,072'
 csv.field_size_limit(10**9) #1,000,000,000 (1 billion), smaller than 64-bit space but avoids 'python overflowerror'
 
 
 
-
-
 def build_ecosystem_dashboard_entries():
     """Load and cache package-level snippets from the ecosystem dashboard."""
     global ecosystem_dashboard_entries
diff --git a/embedding-generation/generate_common.py b/embedding-generation/generate_common.py
index 2e4efb4..4c6c417 100644
--- a/embedding-generation/generate_common.py
+++ b/embedding-generation/generate_common.py
@@ -88,6 +88,9 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
 # Each entry is a dict: {site_name, license_type, display_name, url, keywords}
 all_sources = []
 
+# Raise the csv module's maximum field size, which defaults to 131,072
+csv.field_size_limit(10**9) # 10**9 = 1,000,000,000 (1 billion); kept well below the 64-bit maximum to avoid an OverflowError
+
 
 def get_number_of_sources():
     global all_sources

From 285523e64c805b011b7136da438148fee4413220 Mon Sep 17 00:00:00 2001
From: Andrew Pickard <andrew.pickard@arm.com>
Date: Thu, 18 Jun 2026 17:38:42 +0100
Subject: [PATCH 5/6] Updated vector-db-sources.csv with developer.arm.com
 items

---
 embedding-generation/vector-db-sources.csv | 80 ++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/embedding-generation/vector-db-sources.csv b/embedding-generation/vector-db-sources.csv
index 5c5e6ea..2cb715b 100755
--- a/embedding-generation/vector-db-sources.csv
+++ b/embedding-generation/vector-db-sources.csv
@@ -1840,3 +1840,83 @@ Learning Paths,CC4.0,Learning Path - Deploy multi-network device meshes using De
 Learning Paths,CC4.0,Learning Path - Implement post-quantum cryptography on Arm Cortex-M4,https://learn.arm.com/learning-paths/embedded-and-microcontrollers/pqc_pqm4/,Security; Linux; macOS; C; Python; GCC; stlink; QEMU
 Learning Paths,CC4.0,Learning Path - Device-to-Device communication with Device Connect,https://learn.arm.com/learning-paths/embedded-and-microcontrollers/device-connect-d2d/,Libraries; Linux; macOS; Windows; Python
 Learning Paths,CC4.0,Learning Path - Create and deploy a custom Topo Template,https://learn.arm.com/learning-paths/cross-platform/create-your-own-topo-templates/,Containers and Virtualization; Linux; macOS; Windows; Topo; Docker; SSH
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME and SME2,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME context save restore,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-context-save-restore?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME ZA storage,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-ZA-storage?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - If SME and SME2 are supported,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2/If-SME-and-SME2-are-supported?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - How to run an SME application,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/How-to-run-an-SME-application?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Guide - What is SME/SME2?,https://developer.arm.com/documentation/109974/0100/What-is-SME-SME2-?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Guide - Overview of SME,https://developer.arm.com/documentation/109974/0100/Overview-of-SME?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME2 multi-vector predication,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2/SME2-multi-vector-predication?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Use multi-vector SME loads for better efficiency,https://developer.arm.com/documentation/110636/0100/Memory-access/Use-multi-vector-SME-loads-for-better-efficiency?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Blog Post - Part 1: Arm Scalable Matrix Extension (SME) Introduction,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction,SME
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME2 lookup table,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-and-SME2/SME2-lookup-table?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - SME2 multi-vector operands,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME2-multi-vector-operands?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Streaming SVE mode,https://developer.arm.com/documentation/109246/0101/SME-Overview/Streaming-SVE-mode?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Compiler options and pragmas,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Compiler-support/Compiler-options-and-pragmas?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Guide - Assembly code,https://developer.arm.com/documentation/109974/0100/Basic-SME-example/Assembly-code?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - ZA array vector access and ZA tile mapping,https://developer.arm.com/documentation/109246/0101/SME-Overview/SME-ZA-storage/ZA-array-vector-access-and-ZA-tile-mapping?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Compiler support,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Compiler-support?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_r function overview,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-r-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Controlling the use of streaming mode,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Controlling-the-use-of-streaming-mode?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Debug tools,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Debug-tools?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Managing streaming mode across function boundaries,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Controlling-the-use-of-streaming-mode/Managing-streaming-mode-across-function-boundaries?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function overview,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-l-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Guide - Streaming SVE,https://developer.arm.com/documentation/109974/0100/Streaming-SVE?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Calling conventions,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_opt function overview,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/lut-gemv-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Controlling the use of ZA storage,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Controlling-the-use-of-ZA-storage?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Guide - Load and Store,https://developer.arm.com/documentation/109974/0100/Introduction-to-SME-instructions/Load-and-Store?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt function overview,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/matmul-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Blog Post - Introducing the Scalable Matrix Extension for the Armv9-A Architecture,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/scalable-matrix-extension-armv9-a-architecture,SME; A-Profile
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_fp32: Single precision matrix-by-matrix multiplication,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_int8: 8-bit integer to 32-bit integer matrix-by-matrix multiplication,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_rm_int8: Compressed 8-bit integer to 32-bit integer matrix-by-vector multiplication,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the gemv_cm_int8 algorithm,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/Overview-of-the-gemv-cm-int8-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the matmul_fp32 algorithm,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/Overview-of-the-matmul-fp32-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the lut_gemv_rm_int8 algorithm,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/Overview-of-the-lut-gemv-rm-int8-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Blog Post - Part 2: Arm Scalable Matrix Extension (SME) Instructions,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2,SME
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt function overview,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/matmul-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Preparation for entering and exiting streaming mode,https://developer.arm.com/documentation/109246/0101/Toolchains-and-model-support/Calling-conventions/Preparation-for-entering-and-exiting-streaming-mode?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Overview of the matmul_int8 algorithm,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/Overview-of-the-matmul-int8-algorithm?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,"Arm Blog Post - Part 3: Matrix-matrix multiplication. Neon, SVE, and SME compared",https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/matrix-matrix-multiplication-neon-sve-and-sme-compared,SME
+Arm Developer,Arm Proprietary,Arm Guide - Z registers,https://developer.arm.com/documentation/109974/0100/ZA-storage/Z-registers?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Blog Post - Part4: Arm SME2 Introduction,https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/part4-arm-sme2-introduction,SME; SVE; SIMD ISAs
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Introduction,https://developer.arm.com/documentation/109246/0101/Introduction?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - gemv_opt function overview,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/gemv-opt-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l code,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/preprocess-l-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_r code,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-r-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Guide - Real world examples,https://developer.arm.com/documentation/109974/0100/Why-are-matrices-used-/Real-world-examples?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function overview,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/preprocess-l-function-overview?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Streaming SVE mode and ZA storage,https://developer.arm.com/documentation/109246/0101/Introduction/The-Scalable-Matrix-Extensions/Streaming-SVE-mode-and-ZA-storage?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l code,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-l-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Memory access,https://developer.arm.com/documentation/110636/0100/Memory-access?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt code,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/matmul-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt code,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/matmul-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_r function details,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-r-function-details?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Introduction to CME,https://developer.arm.com/documentation/110636/0100/Introduction-to-CME?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Implications for programmers,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Implications-for-programmers?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Avoid access conflicts in 1KB regions,https://developer.arm.com/documentation/110636/0100/Memory-access/Avoid-access-conflicts-in-1KB-regions?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Guide - ZA storage,https://developer.arm.com/documentation/109974/0100/ZA-storage?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Performance monitoring,https://developer.arm.com/documentation/110636/0100/Performance-monitoring?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function details,https://developer.arm.com/documentation/109246/0101/matmul-int8--8-bit-integer-to-32-bit-integer-matrix-by-matrix-multiplication/preprocess-l-function-details?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_opt function details,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/lut-gemv-opt-function-details?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME components,https://developer.arm.com/documentation/110636/0100/Introduction-to-CME/CME-components?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME and the PMU,https://developer.arm.com/documentation/110636/0100/Performance-monitoring/CME-and-the-PMU?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - lut_gemv_opt code,https://developer.arm.com/documentation/109246/0101/lut-gemv-rm-int8--Compressed-8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/lut-gemv-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - gemv_opt code,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/gemv-opt-code?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Guide - Predication,https://developer.arm.com/documentation/109974/0100/Predication?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - matmul_opt function details,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/matmul-opt-function-details?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - preprocess_l function details,https://developer.arm.com/documentation/109246/0101/matmul-fp32--Single-precision-matrix-by-matrix-multiplication/preprocess-l-function-details?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - gemv_opt function details,https://developer.arm.com/documentation/109246/0101/gemv-cm-int8--8-bit-integer-to-32-bit-integer-matrix-by-vector-multiplication/gemv-opt-function-details?lang=en,SME; A-Profile; Software development; Armv9-A
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME and the SPE,https://developer.arm.com/documentation/110636/0100/Performance-monitoring/CME-and-the-SPE?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Implications for programmers,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution/Implications-for-programmers?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Fast Context Switching Instructions,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Fast-Context-Switching-Instructions?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME system configurations,https://developer.arm.com/documentation/110636/0100/CME-system-configurations?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Example 1: efficient loop,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution/Example-1--efficient-loop?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Example 2: inefficient loop,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution/Example-2--inefficient-loop?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Implications for programmers,https://developer.arm.com/documentation/110636/0100/Matrix-Multiply-Unit/Implications-for-programmers?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Matrix Multiply Unit,https://developer.arm.com/documentation/110636/0100/Matrix-Multiply-Unit?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - CME instruction execution,https://developer.arm.com/documentation/110636/0100/CME-instruction-execution?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Multi-CME systems,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Multi-CME-systems?lang=en,SME; Software development
+Arm Developer,Arm Proprietary,Arm Programmer's Guide - Single-CME systems,https://developer.arm.com/documentation/110636/0100/CME-system-configurations/Single-CME-systems?lang=en,SME; Software development

From 0f511948543950589258f7667c1a8844d4973299 Mon Sep 17 00:00:00 2001
From: Andrew Pickard <andrew.pickard@arm.com>
Date: Thu, 18 Jun 2026 18:48:49 +0100
Subject: [PATCH 6/6] Fixed unit tests to reflect the functions that have moved
 from generate-chunks.py into generate_common.py.

---
 embedding-generation/tests/conftest.py        | 21 ++++
 .../tests/test_generate_chunks.py             | 98 +++++++++----------
 2 files changed, 70 insertions(+), 49 deletions(-)

diff --git a/embedding-generation/tests/conftest.py b/embedding-generation/tests/conftest.py
index 22f243b..2e2a1e5 100644
--- a/embedding-generation/tests/conftest.py
+++ b/embedding-generation/tests/conftest.py
@@ -40,9 +40,19 @@ def _load_generate_chunks():
     spec.loader.exec_module(module)
     return module
 
+def _load_generate_common():
+    """Load generate_common.py by file path and return the executed module."""
+    source_path = os.path.join(_PARENT_DIR, "generate_common.py")
+    loader_spec = importlib.util.spec_from_file_location("generate_common", source_path)
+    # Nothing here adds the module to sys.modules, so every call executes
+    # the file again and returns a separate module object.
+    loaded = importlib.util.module_from_spec(loader_spec)
+    loader_spec.loader.exec_module(loaded)
+    return loaded
 
 # Load module once at conftest import time
 _generate_chunks_module = _load_generate_chunks()
+_generate_common_module = _load_generate_common()
 
 
 @pytest.fixture
@@ -55,3 +65,14 @@ def gc():
     # Clean up after test
     _generate_chunks_module.known_source_urls = set()
     _generate_chunks_module.all_sources = []
+
+@pytest.fixture
+def gcom():
+    """Yield the generate_common module with empty source-tracking globals."""
+    module = _generate_common_module
+    # Start each test from a clean slate: no known URLs, no sources
+    module.known_source_urls, module.all_sources = set(), []
+    yield module
+    # Drop whatever the test registered so it cannot leak into the
+    # next test that uses this module
+    module.known_source_urls, module.all_sources = set(), []
diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py
index 9812634..25ddee7 100644
--- a/embedding-generation/tests/test_generate_chunks.py
+++ b/embedding-generation/tests/test_generate_chunks.py
@@ -259,9 +259,9 @@ class TestSourceTracking:
     known_source_urls and all_sources before and after each test.
     """
 
-    def test_register_source_new(self, gc):
+    def test_register_source_new(self, gcom):
         """Test registering a new source."""
-        result = gc.register_source(
+        result = gcom.register_source(
             site_name="Test Site",
             license_type="MIT",
             display_name="Test Display",
@@ -270,14 +270,14 @@ def test_register_source_new(self, gc):
         )
         
         assert result is True
-        assert "https://example.com/test" in gc.known_source_urls
-        assert len(gc.all_sources) == 1
-        assert gc.all_sources[0]['url'] == "https://example.com/test"
-        assert gc.all_sources[0]['keywords'] == "test; example"
+        assert "https://example.com/test" in gcom.known_source_urls
+        assert len(gcom.all_sources) == 1
+        assert gcom.all_sources[0]['url'] == "https://example.com/test"
+        assert gcom.all_sources[0]['keywords'] == "test; example"
 
-    def test_register_source_duplicate(self, gc):
+    def test_register_source_duplicate(self, gcom):
         """Test that duplicate URLs are rejected."""
-        gc.register_source(
+        gcom.register_source(
             site_name="Test Site",
             license_type="MIT",
             display_name="Test Display",
@@ -285,7 +285,7 @@ def test_register_source_duplicate(self, gc):
             keywords="test"
         )
         
-        result = gc.register_source(
+        result = gcom.register_source(
             site_name="Test Site 2",
             license_type="Apache",
             display_name="Different Display",
@@ -294,11 +294,11 @@ def test_register_source_duplicate(self, gc):
         )
         
         assert result is False
-        assert len(gc.all_sources) == 1
+        assert len(gcom.all_sources) == 1
 
-    def test_register_source_inserts_after_matching_site_group(self, gc):
+    def test_register_source_inserts_after_matching_site_group(self, gcom):
         """Test that new sources stay grouped with existing sources from the same site."""
-        gc.all_sources = [
+        gcom.all_sources = [
             {
                 'site_name': 'Google Cloud',
                 'license_type': 'CC4.0',
@@ -328,9 +328,9 @@ def test_register_source_inserts_after_matching_site_group(self, gc):
                 'keywords': 'a1'
             },
         ]
-        gc.known_source_urls = {source['url'] for source in gc.all_sources}
+        gcom.known_source_urls = {source['url'] for source in gcom.all_sources}
 
-        result = gc.register_source(
+        result = gcom.register_source(
             site_name="Ecosystem Dashboard",
             license_type="Arm Proprietary",
             display_name="Dashboard 3",
@@ -339,7 +339,7 @@ def test_register_source_inserts_after_matching_site_group(self, gc):
         )
 
         assert result is True
-        assert [source['display_name'] for source in gc.all_sources] == [
+        assert [source['display_name'] for source in gcom.all_sources] == [
             'Google 1',
             'Dashboard 1',
             'Dashboard 2',
@@ -347,9 +347,9 @@ def test_register_source_inserts_after_matching_site_group(self, gc):
             'Graviton 1',
         ]
 
-    def test_register_source_url_normalization(self, gc):
+    def test_register_source_url_normalization(self, gcom):
         """Test that URLs are stripped of whitespace."""
-        gc.register_source(
+        gcom.register_source(
             site_name="Test",
             license_type="MIT",
             display_name="Test",
@@ -357,11 +357,11 @@ def test_register_source_url_normalization(self, gc):
             keywords="test"
         )
         
-        assert "https://example.com/test" in gc.known_source_urls
+        assert "https://example.com/test" in gcom.known_source_urls
 
-    def test_register_source_string_keywords(self, gc):
+    def test_register_source_string_keywords(self, gcom):
         """Test that string keywords are preserved as-is."""
-        gc.register_source(
+        gcom.register_source(
             site_name="Test",
             license_type="MIT",
             display_name="Test",
@@ -369,16 +369,16 @@ def test_register_source_string_keywords(self, gc):
             keywords="already; formatted; string"
         )
         
-        assert gc.all_sources[0]['keywords'] == "already; formatted; string"
+        assert gcom.all_sources[0]['keywords'] == "already; formatted; string"
 
-    def test_load_existing_sources_file_not_exists(self, gc, tmp_path):
+    def test_load_existing_sources_file_not_exists(self, gcom, tmp_path):
         """Test loading from non-existent file."""
-        gc.load_existing_sources(str(tmp_path / "nonexistent.csv"))
+        gcom.load_existing_sources(str(tmp_path / "nonexistent.csv"))
         
-        assert len(gc.all_sources) == 0
-        assert len(gc.known_source_urls) == 0
+        assert len(gcom.all_sources) == 0
+        assert len(gcom.known_source_urls) == 0
 
-    def test_load_existing_sources(self, gc, tmp_path):
+    def test_load_existing_sources(self, gcom, tmp_path):
         """Test loading sources from CSV file."""
         csv_file = tmp_path / "sources.csv"
         csv_file.write_text(
@@ -387,17 +387,17 @@ def test_load_existing_sources(self, gc, tmp_path):
             "Another Site,Apache,Another Display,https://example.com/2,key3\n"
         )
         
-        gc.load_existing_sources(str(csv_file))
+        gcom.load_existing_sources(str(csv_file))
         
-        assert len(gc.all_sources) == 2
-        assert "https://example.com/1" in gc.known_source_urls
-        assert "https://example.com/2" in gc.known_source_urls
-        assert gc.all_sources[0]['site_name'] == "Test Site"
-        assert gc.all_sources[1]['display_name'] == "Another Display"
+        assert len(gcom.all_sources) == 2
+        assert "https://example.com/1" in gcom.known_source_urls
+        assert "https://example.com/2" in gcom.known_source_urls
+        assert gcom.all_sources[0]['site_name'] == "Test Site"
+        assert gcom.all_sources[1]['display_name'] == "Another Display"
 
-    def test_save_sources_csv(self, gc, tmp_path):
+    def test_save_sources_csv(self, gcom, tmp_path):
         """Test saving sources to CSV file."""
-        gc.all_sources = [
+        gcom.all_sources = [
             {
                 'site_name': 'Site 1',
                 'license_type': 'MIT',
@@ -415,7 +415,7 @@ def test_save_sources_csv(self, gc, tmp_path):
         ]
         
         csv_file = tmp_path / "output.csv"
-        gc.save_sources_csv(str(csv_file))
+        gcom.save_sources_csv(str(csv_file))
         
         # Read and verify
         with open(csv_file, 'r') as f:
@@ -426,7 +426,7 @@ def test_save_sources_csv(self, gc, tmp_path):
         assert rows[1] == ['Site 1', 'MIT', 'Display 1', 'https://example.com/1', 'key1; key2']
         assert rows[2] == ['Site 2', 'Apache', 'Display 2', 'https://example.com/2', 'key3']
 
-    def test_load_and_save_roundtrip(self, gc, tmp_path):
+    def test_load_and_save_roundtrip(self, gcom, tmp_path):
         """Test that loading and saving preserves data."""
         csv_file = tmp_path / "sources.csv"
         original_content = (
@@ -436,10 +436,10 @@ def test_load_and_save_roundtrip(self, gc, tmp_path):
         csv_file.write_text(original_content)
         
         # Load
-        gc.load_existing_sources(str(csv_file))
+        gcom.load_existing_sources(str(csv_file))
         
         # Add a new source
-        gc.register_source(
+        gcom.register_source(
             site_name="New Site",
             license_type="Apache",
             display_name="New Display",
@@ -448,16 +448,16 @@ def test_load_and_save_roundtrip(self, gc, tmp_path):
         )
         
         # Save
-        gc.save_sources_csv(str(csv_file))
+        gcom.save_sources_csv(str(csv_file))
         
         # Verify
-        gc.known_source_urls = set()
-        gc.all_sources = []
-        gc.load_existing_sources(str(csv_file))
+        gcom.known_source_urls = set()
+        gcom.all_sources = []
+        gcom.load_existing_sources(str(csv_file))
         
-        assert len(gc.all_sources) == 2
-        assert "https://example.com/test" in gc.known_source_urls
-        assert "https://new.example.com" in gc.known_source_urls
+        assert len(gcom.all_sources) == 2
+        assert "https://example.com/test" in gcom.known_source_urls
+        assert "https://new.example.com" in gcom.known_source_urls
 
 
 class TestGetMarkdownGitHubURLsFromPage:
@@ -873,18 +873,18 @@ def fake_fetch(url):
 class TestCreateRetrySession:
     """Tests for create_retry_session function."""
 
-    def test_creates_session(self, gc):
+    def test_creates_session(self, gcom):
         """Test that a session is created."""
-        session = gc.create_retry_session()
+        session = gcom.create_retry_session()
         
         assert session is not None
         # Check that adapters are mounted
         assert 'http://' in session.adapters
         assert 'https://' in session.adapters
 
-    def test_custom_retry_settings(self, gc):
+    def test_custom_retry_settings(self, gcom):
         """Test session with custom retry settings."""
-        session = gc.create_retry_session(
+        session = gcom.create_retry_session(
             retries=3,
             backoff_factor=2,
             status_forcelist=(500, 503)