From 45cd3ef5f808603865c23528c01a723ad38eb4f7 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:07:09 +0100 Subject: [PATCH 01/54] feat(example-data): add Python pango lineage collection seeder Adds example-data/lineages/seed.py, a Python script that fetches pango lineage definitions from the upstream summary JSON and creates one backend collection per lineage (nucleotide substitutions as variants). Mirrors the patterns of seed.mjs: idempotent, supports --wait, --url, --user-id, and --limit (default 10 for testing, 0 for all). Co-Authored-By: Claude Sonnet 4.6 --- example-data/lineages/seed.py | 176 ++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 example-data/lineages/seed.py diff --git a/example-data/lineages/seed.py b/example-data/lineages/seed.py new file mode 100644 index 000000000..e6b27e053 --- /dev/null +++ b/example-data/lineages/seed.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +"""Seeds pango lineage collections into the backend. + +For each pango lineage definition fetched from the upstream summary JSON, +creates one collection whose variants are the cumulative nucleotide substitutions +that define that lineage. + +Idempotent: skips any collection whose name already exists for the seed user. + +Run with --help for usage. 
+""" + +import argparse +import sys +import time +import os + +import requests + +DATA_URL = ( + "https://raw.githubusercontent.com/corneliusroemer/pango-sequences" + "/refs/heads/main/data/pango-consensus-sequences_summary.json" +) + +RETRY_ATTEMPTS = 30 +RETRY_DELAY_S = 2 +ORGANISM = "covid" +DEFAULT_LIMIT = 10 + + +def parse_args(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-u", "--url", + default=os.environ.get("BACKEND_URL", "http://localhost:8080"), + help="Backend base URL (default: $BACKEND_URL or http://localhost:8080)", + ) + parser.add_argument( + "--user-id", + default=os.environ.get("SEED_USER_ID", "example-data-seeder"), + help="User ID (default: $SEED_USER_ID or example-data-seeder)", + ) + parser.add_argument( + "--wait", + action="store_true", + default=not sys.stdout.isatty(), + help="Retry until backend is ready (auto-enabled when no TTY)", + ) + parser.add_argument( + "--limit", + type=int, + default=DEFAULT_LIMIT, + metavar="N", + help=f"Only process the first N lineages (default: {DEFAULT_LIMIT}; 0 = all)", + ) + return parser.parse_args() + + +def fetch_lineages(limit): + print(f"Fetching lineage data from {DATA_URL} ...") + response = requests.get(DATA_URL, timeout=60) + response.raise_for_status() + data = response.json() + lineages = list(data.values()) + if limit: + lineages = lineages[:limit] + print(f" Loaded {len(lineages)} lineage(s).") + return lineages + + +def build_collection(entry): + lineage = entry["lineage"] + parent = entry.get("parent") or "—" + clade = entry.get("nextstrainClade") or "—" + date = entry.get("designationDate") or "unknown" + + subs = [s for s in entry.get("nucSubstitutions", []) if s] + variants = [ + { + "type": "filterObject", + "name": sub, + "filterObject": {"nucleotideMutations": [sub]}, + } + for sub in subs + ] + + description = ( + f"Pango lineage {lineage}. " + f"Parent: {parent}. 
" + f"Nextstrain clade: {clade}. " + f"Designated: {date}." + ) + + return { + "name": lineage, + "organism": ORGANISM, + "description": description, + "variants": variants, + } + + +def wait_for_backend(backend_url, user_id): + url = f"{backend_url}/collections" + params = {"userId": user_id, "organism": ORGANISM} + for attempt in range(1, RETRY_ATTEMPTS + 1): + try: + r = requests.get(url, params=params, timeout=5) + if r.ok or r.status_code == 404: + return + except requests.RequestException: + pass + print(f"Waiting for backend... (attempt {attempt}/{RETRY_ATTEMPTS})") + time.sleep(RETRY_DELAY_S) + print(f"Backend at {backend_url} did not become ready after {RETRY_ATTEMPTS} attempts.", file=sys.stderr) + sys.exit(1) + + +def fetch_existing_collections(backend_url, user_id): + url = f"{backend_url}/collections" + params = {"userId": user_id, "organism": ORGANISM} + r = requests.get(url, params=params, timeout=10) + if not r.ok: + raise RuntimeError(f"GET /collections failed: {r.status_code} {r.text}") + return r.json() + + +def create_collection(backend_url, user_id, collection): + url = f"{backend_url}/collections" + params = {"userId": user_id} + r = requests.post(url, params=params, json=collection, timeout=10) + if r.status_code != 201: + raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}") + return r.json()["id"] + + +def main(): + args = parse_args() + backend_url = args.url.rstrip("/") + user_id = args.user_id + + print(f"Seeding pango lineage collections against {backend_url} as user '{user_id}'...") + + if args.wait: + wait_for_backend(backend_url, user_id) + + lineages = fetch_lineages(args.limit) + collections = [build_collection(e) for e in lineages if e.get("nucSubstitutions")] + # Filter out lineages that had no non-empty substitutions + collections = [c for c in collections if c["variants"]] + + existing = fetch_existing_collections(backend_url, user_id) + existing_names = {c["name"] for c in existing} + + created = 0 + 
skipped = 0 + for collection in collections: + if collection["name"] in existing_names: + print(f" SKIP {collection['name']}") + skipped += 1 + else: + col_id = create_collection(backend_url, user_id, collection) + print(f" OK id={col_id} {collection['name']}") + created += 1 + + print(f"\nDone. Created: {created}, skipped (already exist): {skipped}.") + + +if __name__ == "__main__": + try: + main() + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) From ee8ff3af750c30d3fa2b8c554393166cec674996 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:10:23 +0100 Subject: [PATCH 02/54] feat(example-data): add requirements.txt and Dockerfile for lineages seeder Co-Authored-By: Claude Sonnet 4.6 --- example-data/lineages/Dockerfile | 6 ++++++ example-data/lineages/requirements.txt | 1 + 2 files changed, 7 insertions(+) create mode 100644 example-data/lineages/Dockerfile create mode 100644 example-data/lineages/requirements.txt diff --git a/example-data/lineages/Dockerfile b/example-data/lineages/Dockerfile new file mode 100644 index 000000000..1e44a9f90 --- /dev/null +++ b/example-data/lineages/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.13-alpine +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY seed.py . 
+CMD ["python3", "seed.py"] diff --git a/example-data/lineages/requirements.txt b/example-data/lineages/requirements.txt new file mode 100644 index 000000000..f2293605c --- /dev/null +++ b/example-data/lineages/requirements.txt @@ -0,0 +1 @@ +requests From b7f327e1d7b431f0cab98daf2d1d5dea4fcd5893 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:29:41 +0100 Subject: [PATCH 03/54] refactor(example-data): unified Python seeder with source modules Replaces the split JS/Python approach with a single Python codebase: - seed.py: main entry point with argparse subcommands (covid-resistance-mutations, covid-pango-lineages) - backend.py: shared BackendClient (wait, fetch, create) - sources/resistance_mutations.py: port of seed.mjs resistance data - sources/pango_lineages.py: pango lineage fetcher - Dockerfile updated to run python3 seed.py Running without a subcommand seeds all sources. --limit only applies to the covid-pango-lineages subcommand (default: 10, 0 = all). Co-Authored-By: Claude Sonnet 4.6 --- example-data/Dockerfile | 9 +- example-data/backend.py | 47 +++++ example-data/lineages/Dockerfile | 6 - example-data/lineages/seed.py | 176 ------------------- example-data/{lineages => }/requirements.txt | 0 example-data/seed.py | 133 ++++++++++++++ example-data/sources/__init__.py | 0 example-data/sources/pango_lineages.py | 58 ++++++ example-data/sources/resistance_mutations.py | 136 ++++++++++++++ 9 files changed, 380 insertions(+), 185 deletions(-) create mode 100644 example-data/backend.py delete mode 100644 example-data/lineages/Dockerfile delete mode 100644 example-data/lineages/seed.py rename example-data/{lineages => }/requirements.txt (100%) create mode 100644 example-data/seed.py create mode 100644 example-data/sources/__init__.py create mode 100644 example-data/sources/pango_lineages.py create mode 100644 example-data/sources/resistance_mutations.py diff --git a/example-data/Dockerfile b/example-data/Dockerfile index 4ebbbed8e..ae727a863 
100644 --- a/example-data/Dockerfile +++ b/example-data/Dockerfile @@ -1,4 +1,7 @@ -FROM node:24-alpine +FROM python:3.13-alpine WORKDIR /app -COPY seed.mjs . -CMD ["node", "seed.mjs"] +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY seed.py backend.py . +COPY sources/ sources/ +CMD ["python3", "seed.py"] diff --git a/example-data/backend.py b/example-data/backend.py new file mode 100644 index 000000000..222779ba9 --- /dev/null +++ b/example-data/backend.py @@ -0,0 +1,47 @@ +"""Shared backend API client for collection seeders.""" + +import sys +import time + +import requests + +RETRY_ATTEMPTS = 30 +RETRY_DELAY_S = 2 + + +class BackendClient: + def __init__(self, base_url: str, user_id: str): + self.base_url = base_url.rstrip("/") + self.user_id = user_id + self._collections_url = f"{self.base_url}/collections" + + def wait_for_backend(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELAY_S): + params = {"userId": self.user_id, "organism": "covid"} + for attempt in range(1, attempts + 1): + try: + r = requests.get(self._collections_url, params=params, timeout=5) + if r.ok or r.status_code == 404: + return + except requests.RequestException: + pass + print(f"Waiting for backend... 
(attempt {attempt}/{attempts})") + time.sleep(delay) + print( + f"Backend at {self.base_url} did not become ready after {attempts} attempts.", + file=sys.stderr, + ) + sys.exit(1) + + def fetch_existing_collections(self, organism: str) -> list[dict]: + params = {"userId": self.user_id, "organism": organism} + r = requests.get(self._collections_url, params=params, timeout=10) + if not r.ok: + raise RuntimeError(f"GET /collections failed: {r.status_code} {r.text}") + return r.json() + + def create_collection(self, collection: dict) -> str: + params = {"userId": self.user_id} + r = requests.post(self._collections_url, params=params, json=collection, timeout=10) + if r.status_code != 201: + raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}") + return r.json()["id"] diff --git a/example-data/lineages/Dockerfile b/example-data/lineages/Dockerfile deleted file mode 100644 index 1e44a9f90..000000000 --- a/example-data/lineages/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM python:3.13-alpine -WORKDIR /app -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt -COPY seed.py . -CMD ["python3", "seed.py"] diff --git a/example-data/lineages/seed.py b/example-data/lineages/seed.py deleted file mode 100644 index e6b27e053..000000000 --- a/example-data/lineages/seed.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python3 -"""Seeds pango lineage collections into the backend. - -For each pango lineage definition fetched from the upstream summary JSON, -creates one collection whose variants are the cumulative nucleotide substitutions -that define that lineage. - -Idempotent: skips any collection whose name already exists for the seed user. - -Run with --help for usage. 
-""" - -import argparse -import sys -import time -import os - -import requests - -DATA_URL = ( - "https://raw.githubusercontent.com/corneliusroemer/pango-sequences" - "/refs/heads/main/data/pango-consensus-sequences_summary.json" -) - -RETRY_ATTEMPTS = 30 -RETRY_DELAY_S = 2 -ORGANISM = "covid" -DEFAULT_LIMIT = 10 - - -def parse_args(): - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "-u", "--url", - default=os.environ.get("BACKEND_URL", "http://localhost:8080"), - help="Backend base URL (default: $BACKEND_URL or http://localhost:8080)", - ) - parser.add_argument( - "--user-id", - default=os.environ.get("SEED_USER_ID", "example-data-seeder"), - help="User ID (default: $SEED_USER_ID or example-data-seeder)", - ) - parser.add_argument( - "--wait", - action="store_true", - default=not sys.stdout.isatty(), - help="Retry until backend is ready (auto-enabled when no TTY)", - ) - parser.add_argument( - "--limit", - type=int, - default=DEFAULT_LIMIT, - metavar="N", - help=f"Only process the first N lineages (default: {DEFAULT_LIMIT}; 0 = all)", - ) - return parser.parse_args() - - -def fetch_lineages(limit): - print(f"Fetching lineage data from {DATA_URL} ...") - response = requests.get(DATA_URL, timeout=60) - response.raise_for_status() - data = response.json() - lineages = list(data.values()) - if limit: - lineages = lineages[:limit] - print(f" Loaded {len(lineages)} lineage(s).") - return lineages - - -def build_collection(entry): - lineage = entry["lineage"] - parent = entry.get("parent") or "—" - clade = entry.get("nextstrainClade") or "—" - date = entry.get("designationDate") or "unknown" - - subs = [s for s in entry.get("nucSubstitutions", []) if s] - variants = [ - { - "type": "filterObject", - "name": sub, - "filterObject": {"nucleotideMutations": [sub]}, - } - for sub in subs - ] - - description = ( - f"Pango lineage {lineage}. " - f"Parent: {parent}. 
" - f"Nextstrain clade: {clade}. " - f"Designated: {date}." - ) - - return { - "name": lineage, - "organism": ORGANISM, - "description": description, - "variants": variants, - } - - -def wait_for_backend(backend_url, user_id): - url = f"{backend_url}/collections" - params = {"userId": user_id, "organism": ORGANISM} - for attempt in range(1, RETRY_ATTEMPTS + 1): - try: - r = requests.get(url, params=params, timeout=5) - if r.ok or r.status_code == 404: - return - except requests.RequestException: - pass - print(f"Waiting for backend... (attempt {attempt}/{RETRY_ATTEMPTS})") - time.sleep(RETRY_DELAY_S) - print(f"Backend at {backend_url} did not become ready after {RETRY_ATTEMPTS} attempts.", file=sys.stderr) - sys.exit(1) - - -def fetch_existing_collections(backend_url, user_id): - url = f"{backend_url}/collections" - params = {"userId": user_id, "organism": ORGANISM} - r = requests.get(url, params=params, timeout=10) - if not r.ok: - raise RuntimeError(f"GET /collections failed: {r.status_code} {r.text}") - return r.json() - - -def create_collection(backend_url, user_id, collection): - url = f"{backend_url}/collections" - params = {"userId": user_id} - r = requests.post(url, params=params, json=collection, timeout=10) - if r.status_code != 201: - raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}") - return r.json()["id"] - - -def main(): - args = parse_args() - backend_url = args.url.rstrip("/") - user_id = args.user_id - - print(f"Seeding pango lineage collections against {backend_url} as user '{user_id}'...") - - if args.wait: - wait_for_backend(backend_url, user_id) - - lineages = fetch_lineages(args.limit) - collections = [build_collection(e) for e in lineages if e.get("nucSubstitutions")] - # Filter out lineages that had no non-empty substitutions - collections = [c for c in collections if c["variants"]] - - existing = fetch_existing_collections(backend_url, user_id) - existing_names = {c["name"] for c in existing} - - created = 0 - 
skipped = 0 - for collection in collections: - if collection["name"] in existing_names: - print(f" SKIP {collection['name']}") - skipped += 1 - else: - col_id = create_collection(backend_url, user_id, collection) - print(f" OK id={col_id} {collection['name']}") - created += 1 - - print(f"\nDone. Created: {created}, skipped (already exist): {skipped}.") - - -if __name__ == "__main__": - try: - main() - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - sys.exit(1) diff --git a/example-data/lineages/requirements.txt b/example-data/requirements.txt similarity index 100% rename from example-data/lineages/requirements.txt rename to example-data/requirements.txt diff --git a/example-data/seed.py b/example-data/seed.py new file mode 100644 index 000000000..dd65adb9b --- /dev/null +++ b/example-data/seed.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +"""Seeds example collections into the backend from one or more data sources. + +Idempotent: skips any collection whose name already exists for the seed user. + +Run with --help for usage, or --help for source-specific options. 
+""" + +import argparse +import os +import sys + +from backend import BackendClient +from sources import pango_lineages, resistance_mutations + +ALL_SOURCES = [resistance_mutations, pango_lineages] +DEFAULT_LINEAGE_LIMIT = 10 + + +def make_parser() -> argparse.ArgumentParser: + parent = argparse.ArgumentParser(add_help=False) + parent.add_argument( + "-u", "--url", + default=os.environ.get("BACKEND_URL", "http://localhost:8080"), + help="Backend base URL (default: $BACKEND_URL or http://localhost:8080)", + ) + parent.add_argument( + "--user-id", + default=os.environ.get("SEED_USER_ID", "example-data-seeder"), + help="User ID (default: $SEED_USER_ID or example-data-seeder)", + ) + parent.add_argument( + "--wait", + action="store_true", + default=not sys.stdout.isatty(), + help="Retry until backend is ready (auto-enabled when no TTY)", + ) + + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + parents=[parent], + ) + subparsers = parser.add_subparsers(dest="source", metavar="source") + + subparsers.add_parser( + resistance_mutations.NAME, + parents=[parent], + help="Seed SARS-CoV-2 antiviral resistance mutation collections", + ) + + lineages_parser = subparsers.add_parser( + pango_lineages.NAME, + parents=[parent], + help="Seed pango lineage collections", + ) + lineages_parser.add_argument( + "--limit", + type=int, + default=DEFAULT_LINEAGE_LIMIT, + metavar="N", + help=f"Only process the first N lineages (default: {DEFAULT_LINEAGE_LIMIT}; 0 = all)", + ) + + return parser + + +def seed_source(client: BackendClient, source_name: str, collections: list[dict]): + print(f"\n[{source_name}]") + + organisms = {} + for c in collections: + organisms.setdefault(c["organism"], []).append(c) + + created = 0 + skipped = 0 + for organism, org_collections in organisms.items(): + existing = client.fetch_existing_collections(organism) + existing_names = {c["name"] for c in existing} + for collection in org_collections: + 
if collection["name"] in existing_names: + print(f" SKIP {collection['name']}") + skipped += 1 + else: + col_id = client.create_collection(collection) + print(f" OK id={col_id} {collection['name']}") + created += 1 + + print(f" Created: {created}, skipped: {skipped}.") + return created, skipped + + +def main(): + parser = make_parser() + args = parser.parse_args() + + client = BackendClient(args.url, args.user_id) + print(f"Seeding collections against {args.url} as user '{args.user_id}'...") + + if args.wait: + client.wait_for_backend() + + lineage_limit = getattr(args, "limit", DEFAULT_LINEAGE_LIMIT) + + if args.source == resistance_mutations.NAME: + active = [(resistance_mutations, {})] + elif args.source == pango_lineages.NAME: + active = [(pango_lineages, {"limit": lineage_limit})] + else: + # No subcommand: run all sources + active = [ + (resistance_mutations, {}), + (pango_lineages, {"limit": lineage_limit}), + ] + + total_created = 0 + total_skipped = 0 + for source, kwargs in active: + collections = source.get_collections(**kwargs) + c, s = seed_source(client, source.NAME, collections) + total_created += c + total_skipped += s + + if len(active) > 1: + print(f"\nTotal — created: {total_created}, skipped: {total_skipped}.") + + +if __name__ == "__main__": + try: + main() + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) diff --git a/example-data/sources/__init__.py b/example-data/sources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/example-data/sources/pango_lineages.py b/example-data/sources/pango_lineages.py new file mode 100644 index 000000000..44c00070c --- /dev/null +++ b/example-data/sources/pango_lineages.py @@ -0,0 +1,58 @@ +"""Source: Pango lineage definitions from corneliusroemer/pango-sequences. + +Creates one collection per lineage, with nucleotide substitutions as variants. 
+""" + +import requests + +NAME = "covid-pango-lineages" + +DATA_URL = ( + "https://raw.githubusercontent.com/corneliusroemer/pango-sequences" + "/refs/heads/main/data/pango-consensus-sequences_summary.json" +) + + +def _build_collection(entry: dict) -> dict: + lineage = entry["lineage"] + parent = entry.get("parent") or "—" + clade = entry.get("nextstrainClade") or "—" + date = entry.get("designationDate") or "unknown" + + subs = [s for s in entry.get("nucSubstitutions", []) if s] + variants = [ + { + "type": "filterObject", + "name": sub, + "filterObject": {"nucleotideMutations": [sub]}, + } + for sub in subs + ] + + description = ( + f"Pango lineage {lineage}. " + f"Parent: {parent}. " + f"Nextstrain clade: {clade}. " + f"Designated: {date}." + ) + + return { + "name": lineage, + "organism": "covid", + "description": description, + "variants": variants, + } + + +def get_collections(limit: int = 0) -> list[dict]: + print(f"Fetching lineage data from {DATA_URL} ...") + response = requests.get(DATA_URL, timeout=60) + response.raise_for_status() + entries = list(response.json().values()) + if limit: + entries = entries[:limit] + print(f" Loaded {len(entries)} lineage(s).") + + collections = [_build_collection(e) for e in entries] + # Drop lineages that ended up with no variants after filtering blank subs + return [c for c in collections if c["variants"]] diff --git a/example-data/sources/resistance_mutations.py b/example-data/sources/resistance_mutations.py new file mode 100644 index 000000000..ebdda993c --- /dev/null +++ b/example-data/sources/resistance_mutations.py @@ -0,0 +1,136 @@ +"""Source: SARS-CoV-2 antiviral resistance mutations (ported from seed.mjs). + +Three collections covering 3CLpro, RdRp, and Spike mAb resistance mutations +as per the Stanford Coronavirus Antiviral & Resistance database. 
+""" + +NAME = "covid-resistance-mutations" + +CLPRO_MUTATIONS = [ + 'ORF1a:T3284I', 'ORF1a:T3288A', 'ORF1a:T3288N', 'ORF1a:T3308I', 'ORF1a:D3311Y', + 'ORF1a:M3312I', 'ORF1a:M3312L', 'ORF1a:M3312T', 'ORF1a:M3312-', 'ORF1a:L3313F', + 'ORF1a:G3401S', 'ORF1a:F3403L', 'ORF1a:F3403S', 'ORF1a:N3405D', 'ORF1a:N3405L', + 'ORF1a:N3405S', 'ORF1a:G3406S', 'ORF1a:S3407A', 'ORF1a:S3407E', 'ORF1a:S3407L', + 'ORF1a:S3407P', 'ORF1a:C3423F', 'ORF1a:M3428R', 'ORF1a:M3428T', 'ORF1a:E3429A', + 'ORF1a:E3429G', 'ORF1a:E3429K', 'ORF1a:E3429Q', 'ORF1a:E3429V', 'ORF1a:L3430F', + 'ORF1a:P3431-', 'ORF1a:T3432I', 'ORF1a:H3435L', 'ORF1a:H3435N', 'ORF1a:H3435Q', + 'ORF1a:H3435Y', 'ORF1a:A3436T', 'ORF1a:A3436V', 'ORF1a:V3449A', 'ORF1a:R3451G', + 'ORF1a:R3451S', 'ORF1a:Q3452I', 'ORF1a:Q3452K', 'ORF1a:T3453I', 'ORF1a:A3454T', + 'ORF1a:A3454V', 'ORF1a:Q3455A', 'ORF1a:Q3455C', 'ORF1a:Q3455D', 'ORF1a:Q3455E', + 'ORF1a:Q3455F', 'ORF1a:Q3455G', 'ORF1a:Q3455H', 'ORF1a:Q3455I', 'ORF1a:Q3455K', + 'ORF1a:Q3455L', 'ORF1a:Q3455N', 'ORF1a:Q3455P', 'ORF1a:Q3455R', 'ORF1a:Q3455S', + 'ORF1a:Q3455T', 'ORF1a:Q3455V', 'ORF1a:Q3455W', 'ORF1a:Q3455Y', 'ORF1a:A3456P', + 'ORF1a:A3457S', 'ORF1a:P3515L', 'ORF1a:V3560A', 'ORF1a:S3564P', 'ORF1a:T3567I', + 'ORF1a:F3568L', +] + +RDRP_MUTATIONS = [ + 'ORF1b:V157A', 'ORF1b:V157L', 'ORF1b:N189S', 'ORF1b:R276C', 'ORF1b:A367V', + 'ORF1b:A440V', 'ORF1b:F471L', 'ORF1b:D475Y', 'ORF1b:A517V', 'ORF1b:V548L', + 'ORF1b:G662S', 'ORF1b:S750A', 'ORF1b:V783I', 'ORF1b:E787G', 'ORF1b:C790F', + 'ORF1b:C790R', 'ORF1b:E793A', 'ORF1b:E793D', 'ORF1b:M915R', +] + +SPIKE_MUTATIONS = [ + 'S:P337H', 'S:P337L', 'S:P337R', 'S:P337S', 'S:P337T', + 'S:E340A', 'S:E340D', 'S:E340G', 'S:E340K', 'S:E340Q', 'S:E340V', + 'S:T345P', + 'S:R346G', 'S:R346I', 'S:R346K', 'S:R346S', 'S:R346T', + 'S:K356Q', 'S:K356T', + 'S:S371F', 'S:S371L', + 'S:D405E', 'S:D405N', 'S:E406D', + 'S:K417E', 'S:K417H', 'S:K417I', 'S:K417M', 'S:K417N', 'S:K417R', 'S:K417S', 'S:K417T', + 'S:D420A', 'S:D420N', + 'S:N439K', + 'S:N440D', 
'S:N440E', 'S:N440I', 'S:N440K', 'S:N440R', 'S:N440T', 'S:N440Y', + 'S:S443Y', + 'S:K444E', 'S:K444F', 'S:K444I', 'S:K444L', 'S:K444M', 'S:K444N', 'S:K444R', 'S:K444T', + 'S:V445A', 'S:V445D', 'S:V445F', 'S:V445I', 'S:V445L', + 'S:G446A', 'S:G446D', 'S:G446I', 'S:G446N', 'S:G446R', 'S:G446S', 'S:G446T', 'S:G446V', + 'S:G447C', 'S:G447D', 'S:G447F', 'S:G447S', 'S:G447V', + 'S:N448D', 'S:N448K', 'S:N448T', 'S:N448Y', + 'S:Y449D', + 'S:N450D', 'S:N450K', + 'S:L452M', 'S:L452Q', 'S:L452R', 'S:L452W', + 'S:Y453F', 'S:Y453H', + 'S:L455F', 'S:L455M', 'S:L455S', 'S:L455W', + 'S:F456C', 'S:F456L', 'S:F456V', + 'S:S459P', + 'S:N460D', 'S:N460H', 'S:N460I', 'S:N460K', 'S:N460S', 'S:N460T', 'S:N460Y', + 'S:A475D', 'S:A475V', + 'S:G476D', 'S:G476R', 'S:G476T', + 'S:V483A', + 'S:E484A', 'S:E484D', 'S:E484G', 'S:E484K', 'S:E484P', 'S:E484Q', 'S:E484R', 'S:E484S', 'S:E484T', 'S:E484V', + 'S:G485D', 'S:G485R', + 'S:F486D', 'S:F486I', 'S:F486L', 'S:F486N', 'S:F486P', 'S:F486S', 'S:F486T', 'S:F486V', + 'S:N487D', 'S:N487H', 'S:N487S', + 'S:Y489H', 'S:Y489W', + 'S:F490G', 'S:F490I', 'S:F490L', 'S:F490R', 'S:F490S', 'S:F490V', 'S:F490Y', + 'S:Q493D', 'S:Q493E', 'S:Q493H', 'S:Q493K', 'S:Q493L', 'S:Q493R', 'S:Q493V', + 'S:S494P', 'S:S494R', + 'S:G496S', + 'S:Q498H', + 'S:P499H', 'S:P499R', 'S:P499S', 'S:P499T', + 'S:N501T', 'S:N501Y', + 'S:G504C', 'S:G504D', 'S:G504I', 'S:G504L', 'S:G504N', 'S:G504R', 'S:G504V', + 'S:P507A', + 'S:N856K', 'S:N969K', 'S:E990A', 'S:T1009I', +] + + +def _mature_name(mutation: str, set_name: str, offset: int) -> str: + """Convert a genomic mutation code to a mature protein name with the given offset. + + e.g. 
_mature_name("ORF1a:T3284I", "3CLpro", -3263) -> "3CLpro:T21I" + """ + mut_part = mutation[mutation.index(':') + 1:] + original_base = mut_part[0] + new_base = mut_part[-1] + position = int(''.join(c for c in mut_part if c.isdigit())) + return f"{set_name}:{original_base}{position + offset}{new_base}" + + +def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[dict]: + return [ + { + "type": "filterObject", + "name": _mature_name(m, set_name, offset), + "filterObject": {"aminoAcidMutations": [m]}, + } + for m in mutations + ] + + +def get_collections(limit: int = 0) -> list[dict]: + return [ + { + "name": "3CLpro resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), + }, + { + "name": "RdRp resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), + }, + { + "name": "Spike mAb resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." 
+ ), + "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), + }, + ] From 2ffef39fffc62352afcf23693aeb9567d74d1ad2 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:34:29 +0100 Subject: [PATCH 04/54] feat(example-data): replace requirements.txt with pixi - pixi.toml with [workspace] config, python 3.13, requests via PyPI - pixi.lock committed for reproducibility - Dockerfile updated to multi-stage: pixi builder copies site-packages into python:3.13-slim final image - Defines tasks: seed, seed-lineages, seed-all-lineages, seed-resistance Co-Authored-By: Claude Sonnet 4.6 --- example-data/Dockerfile | 15 +- example-data/pixi.lock | 916 ++++++++++++++++++++++++++++++++++ example-data/pixi.toml | 17 + example-data/requirements.txt | 1 - 4 files changed, 944 insertions(+), 5 deletions(-) create mode 100644 example-data/pixi.lock create mode 100644 example-data/pixi.toml delete mode 100644 example-data/requirements.txt diff --git a/example-data/Dockerfile b/example-data/Dockerfile index ae727a863..72c322b8c 100644 --- a/example-data/Dockerfile +++ b/example-data/Dockerfile @@ -1,7 +1,14 @@ -FROM python:3.13-alpine +# Stage 1: use pixi to resolve and install dependencies +FROM ghcr.io/prefix-dev/pixi:0.58.0 AS builder WORKDIR /app -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt +COPY pixi.toml pixi.lock . +RUN pixi install --frozen + +# Stage 2: slim runtime image — copy only the installed site-packages +FROM python:3.13-slim AS final +WORKDIR /app +COPY --from=builder /app/.pixi/envs/default/lib/python3.13/site-packages \ + /usr/local/lib/python3.13/site-packages COPY seed.py backend.py . 
COPY sources/ sources/ -CMD ["python3", "seed.py"] +CMD ["python", "seed.py"] diff --git a/example-data/pixi.lock b/example-data/pixi.lock new file mode 100644 index 000000000..f26f3573e --- /dev/null +++ b/example-data/pixi.lock @@ -0,0 +1,916 @@ +version: 6 +environments: + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.8.0-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.1-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.13-h6add32d_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + linux-aarch64: + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.8.0-hfae3067_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.3-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.53.1-h022381a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.42-h1022ec0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.6-hf8d1292_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.2-h546c87b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.13.13-h11c0449_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + osx-64: + - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/icu-78.3-h25d91c4_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.8.0-hcc62823_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.5.2-hd1f9c09_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.3-hbb4bfdb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libmpdec-4.0.0-hf3981d6_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.53.1-h8f8c405_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.2-hbb4bfdb_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.6-hcc0dc9a_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.6.2-hc881268_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.13.13-h3d5d122_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.3-h68b038d_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h7142dee_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl + - pypi: 
https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + osx-arm64: + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.8.0-hf6b4638_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.5.2-hcf2aa1b_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.3-h8088a28_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libmpdec-4.0.0-h84a0fba_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.53.1-h1b79a29_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.2-h8088a28_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.6-h1d4f5a5_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.2-hd24854e_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.13.13-h20e6be0_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.3-h46df422_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h010d191_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl + - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 + md5: a9f577daf3de00bca7c3c76c0ecbd1de + depends: + - __glibc >=2.17,<3.0.a0 + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 28948 + timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: a2527b1d81792a0ccd2c05850960df119c2b6d8f5fdec97f2db7d25dc23b1068 + md5: 468fd3bb9e1f671d36c2cbc677e56f1d + depends: + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 28926 + timestamp: 1770939656741 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 + md5: d2ffd7602c02f2b316fd921d39876885 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 260182 + timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda + sha256: b3495077889dde6bb370938e7db82be545c73e8589696ad0843a32221520ad4c + md5: 840d8fc0d7b3209be93080bc20e07f2d 
+ depends: + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 192412 + timestamp: 1771350241232 +- conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda + sha256: 9f242f13537ef1ce195f93f0cc162965d6cc79da578568d6d8e50f70dd025c42 + md5: 4173ac3b19ec0a4f400b4f782910368b + depends: + - __osx >=10.13 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 133427 + timestamp: 1771350680709 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda + sha256: 540fe54be35fac0c17feefbdc3e29725cce05d7367ffedfaaa1bdda234b019df + md5: 620b85a3f45526a8bc4d23fd78fc22f0 + depends: + - __osx >=11.0 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 124834 + timestamp: 1771350416561 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + sha256: c9dbcc8039a52023660d6d1bbf87594a93dd69c6ac5a2a44323af2c92976728d + md5: e18ad67cf881dcadee8b8d9e2f8e5f73 + depends: + - __unix + license: ISC + purls: [] + size: 131039 + timestamp: 1776865545798 +- pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + name: certifi + version: 2026.4.22 + sha256: 3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + name: charset-normalizer + version: 3.4.7 + sha256: 0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl + name: charset-normalizer + version: 3.4.7 + sha256: 
f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: charset-normalizer + version: 3.4.7 + sha256: e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd + requires_python: '>=3.7' +- conda: https://conda.anaconda.org/conda-forge/osx-64/icu-78.3-h25d91c4_0.conda + sha256: 1294117122d55246bb83ad5b589e2a031aacdf2d0b1f99fd338aa4394f881735 + md5: 627eca44e62e2b665eeec57a984a7f00 + depends: + - __osx >=11.0 + license: MIT + license_family: MIT + purls: [] + size: 12273764 + timestamp: 1773822733780 +- pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + name: idna + version: '3.13' + sha256: 892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3 + requires_dist: + - ruff>=0.6.2 ; extra == 'all' + - mypy>=1.11.2 ; extra == 'all' + - pytest>=8.3.2 ; extra == 'all' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c + md5: 18335a698559cdbcd86150a48bf54ba6 + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 728002 + timestamp: 1774197446916 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda + sha256: 7abd913d81a9bf00abb699e8987966baa2065f5132e37e815f92d90fc6bba530 + md5: a21644fc4a83da26452a718dc9468d5f + depends: + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-aarch64 2.45.1 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 
875596 + timestamp: 1774197520746 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.8.0-hecca717_0.conda + sha256: ea33c40977ea7a2c3658c522230058395bc2ee0d89d99f0711390b6a1ee80d12 + md5: a3b390520c563d78cc58974de95a03e5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.8.0.* + license: MIT + license_family: MIT + purls: [] + size: 77241 + timestamp: 1777846112704 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.8.0-hfae3067_0.conda + sha256: 206c422a7f4b462d1dc17d558f0299088d0992bd3309ae83f5440fcc4f130602 + md5: 3bacd6171f0a3f8fddd06c3d5ae01955 + depends: + - libgcc >=14 + constrains: + - expat 2.8.0.* + license: MIT + license_family: MIT + purls: [] + size: 76996 + timestamp: 1777846096032 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.8.0-hcc62823_0.conda + sha256: 5ebcc413d0a75da926a8b9b681d7d12c9562993991ba49c90a9881c4a59bdc11 + md5: d2e01f78c1daaeb4d2aa870125ebcd7e + depends: + - __osx >=11.0 + constrains: + - expat 2.8.0.* + license: MIT + license_family: MIT + purls: [] + size: 75242 + timestamp: 1777846416221 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.8.0-hf6b4638_0.conda + sha256: f4b1cafc59afaede8fa0a2d9cf376840f1c553001acd72f6ead18bbc8ac8c49c + md5: 65466e82c09e888ca7560c11a97d5450 + depends: + - __osx >=11.0 + constrains: + - expat 2.8.0.* + license: MIT + license_family: MIT + purls: [] + size: 68789 + timestamp: 1777846180142 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda + sha256: 3df4c539449aabc3443bbe8c492c01d401eea894603087fca2917aa4e1c2dea9 + 
md5: 2f364feefb6a7c00423e80dcb12db62a + depends: + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 55952 + timestamp: 1769456078358 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.5.2-hd1f9c09_0.conda + sha256: 951958d1792238006fdc6fce7f71f1b559534743b26cc1333497d46e5903a2d6 + md5: 66a0dc7464927d0853b590b6f53ba3ea + depends: + - __osx >=10.13 + license: MIT + license_family: MIT + purls: [] + size: 53583 + timestamp: 1769456300951 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.5.2-hcf2aa1b_0.conda + sha256: 6686a26466a527585e6a75cc2a242bf4a3d97d6d6c86424a441677917f28bec7 + md5: 43c04d9cb46ef176bb2a4c77e324d599 + depends: + - __osx >=11.0 + license: MIT + license_family: MIT + purls: [] + size: 40979 + timestamp: 1769456747661 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 + md5: 0aa00f03f9e39fb9876085dee11a85d4 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 1041788 + timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda + sha256: 43df385bedc1cab11993c4369e1f3b04b4ca5d0ea16cba6a0e7f18dbc129fcc9 + md5: 552567ea2b61e3a3035759b2fdb3f9a6 + depends: + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 h8acb6b2_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 622900 + timestamp: 1771378128706 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 + md5: 239c5e9546c38a1e884d69effcf4c882 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + 
license_family: GPL + purls: [] + size: 603262 + timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda + sha256: fc716f11a6a8525e27a5d332ef6a689210b0d2a4dd1133edc0f530659aa9faa6 + md5: 4faa39bf919939602e594253bd673958 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 588060 + timestamp: 1771378040807 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + sha256: ec30e52a3c1bf7d0425380a189d209a52baa03f22fb66dd3eb587acaa765bd6d + md5: b88d90cad08e6bc8ad540cb310a761fb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 113478 + timestamp: 1775825492909 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.3-he30d5cf_0.conda + sha256: d61962b9cd54c3554361550203c64d5b65b71e3058a285b66e4b04b9769f0a5c + md5: 76298a9e6d71ee6e832a8d0d7373b261 + depends: + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 126102 + timestamp: 1775828008518 +- conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.3-hbb4bfdb_0.conda + sha256: d9e2006051529aec5578c6efeb13bb6a7200a014b2d5a77a579e83a8049d5f3c + md5: becdfbfe7049fa248e52aa37a9df09e2 + depends: + - __osx >=11.0 + constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 105724 + timestamp: 1775826029494 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.3-h8088a28_0.conda + sha256: 34878d87275c298f1a732c6806349125cebbf340d24c6c23727268184bba051e + md5: b1fd823b5ae54fbec272cea0811bd8a9 + depends: + - __osx >=11.0 + constrains: + - xz 5.8.3.* + license: 0BSD + purls: [] + size: 92472 + timestamp: 1775825802659 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc 
>=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda + sha256: 57c0dd12d506e84541c4e877898bd2a59cca141df493d34036f18b2751e0a453 + md5: 7b9813e885482e3ccb1fa212b86d7fd0 + depends: + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 114056 + timestamp: 1769482343003 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libmpdec-4.0.0-hf3981d6_1.conda + sha256: 1096c740109386607938ab9f09a7e9bca06d86770a284777586d6c378b8fb3fd + md5: ec88ba8a245855935b871a7324373105 + depends: + - __osx >=10.13 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 79899 + timestamp: 1769482558610 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libmpdec-4.0.0-h84a0fba_1.conda + sha256: 1089c7f15d5b62c622625ec6700732ece83be8b705da8c6607f4dabb0c4bd6d2 + md5: 57c4be259f5e0b99a5983799a228ae55 + depends: + - __osx >=11.0 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 73690 + timestamp: 1769482560514 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.1-h0c1763c_0.conda + sha256: 54cdcd3214313b62c2a8ee277e6f42150d9b748264c1b70d958bf735e420ef8d + md5: 7dc38adcbf71e6b38748e919e16e0dce + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + purls: [] + size: 954962 + timestamp: 1777986471789 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.53.1-h022381a_0.conda + sha256: ad03b7d8e4d08001f0df88ee7a56108bb35bae4795a42b9a04cc1abfa822bd07 + md5: 2ec1119217d8f0d086e9a62f3cb0e5ea + depends: + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + purls: [] + size: 955361 + timestamp: 1777986487553 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.53.1-h8f8c405_0.conda + sha256: 5e964e07a14180ce20decfd4897e8f81d48ec78c1cbf4af85c5520f535d9510c + 
md5: 9273c877f78b7486b0dfdd9268327a79 + depends: + - __osx >=11.0 + - icu >=78.3,<79.0a0 + - libzlib >=1.3.2,<2.0a0 + license: blessing + purls: [] + size: 1007171 + timestamp: 1777987093870 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.53.1-h1b79a29_0.conda + sha256: 49daec7c83e70d4efc17b813547824bc2bcf2f7256d84061d24fbfe537da9f74 + md5: 6681822ea9d362953206352371b6a904 + depends: + - __osx >=11.0 + - libzlib >=1.3.2,<2.0a0 + license: blessing + purls: [] + size: 920047 + timestamp: 1777987051643 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + sha256: bc1b08c92626c91500fd9f26f2c797f3eb153b627d53e9c13cd167f1e12b2829 + md5: 38ffe67b78c9d4de527be8315e5ada2c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 40297 + timestamp: 1775052476770 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.42-h1022ec0_0.conda + sha256: 7d427edf58c702c337bf62bc90f355b7fc374a65fd9f70ea7a490f13bb76b1b9 + md5: a0b5de740d01c390bdbb46d7503c9fab + depends: + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 43567 + timestamp: 1775052485727 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + sha256: 55044c403570f0dc26e6364de4dc5368e5f3fc7ff103e867c487e2b5ab2bcda9 + md5: d87ff7921124eccd67248aa483c23fec + depends: + - __glibc >=2.17,<3.0.a0 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + purls: [] + size: 63629 + timestamp: 1774072609062 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda + sha256: eb111e32e5a7313a5bf799c7fb2419051fa2fe7eff74769fac8d5a448b309f7f + md5: 502006882cf5461adced436e410046d1 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + purls: [] + size: 69833 + timestamp: 1774072605429 +- conda: 
https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.2-hbb4bfdb_2.conda + sha256: 4c6da089952b2d70150c74234679d6f7ac04f4a98f9432dec724968f912691e7 + md5: 30439ff30578e504ee5e0b390afc8c65 + depends: + - __osx >=11.0 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + purls: [] + size: 59000 + timestamp: 1774073052242 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.2-h8088a28_2.conda + sha256: 361415a698514b19a852f5d1123c5da746d4642139904156ddfca7c922d23a05 + md5: bc5a5721b6439f2f62a84f2548136082 + depends: + - __osx >=11.0 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + purls: [] + size: 47759 + timestamp: 1774072956767 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + sha256: fc89f74bbe362fb29fa3c037697a89bec140b346a2469a90f7936d1d7ea4d8a3 + md5: fc21868a1a5aacc937e7a18747acb8a5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: X11 AND BSD-3-Clause + purls: [] + size: 918956 + timestamp: 1777422145199 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.6-hf8d1292_0.conda + sha256: 369db85c5cd8d99dde364ce70725d76511d9c8199e5b820c740414091bf5bcca + md5: b2a43456aa56fe80c2477a5094899eff + depends: + - libgcc >=14 + license: X11 AND BSD-3-Clause + purls: [] + size: 960036 + timestamp: 1777422174534 +- conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.6-hcc0dc9a_0.conda + sha256: f5f7e006ff4271305ab4cc08eedd855c67a571793c3d18aff73f645f088a8cae + md5: 31b8740cf1b2588d4e61c81191004061 + depends: + - __osx >=11.0 + license: X11 AND BSD-3-Clause + purls: [] + size: 831711 + timestamp: 1777423052277 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.6-h1d4f5a5_0.conda + sha256: 4ea6c620b87bd1d42bb2ccc2c87cd2483fa2d7f9e905b14c223f11ff3f4c455d + md5: 343d10ed5b44030a2f67193905aea159 + depends: + - __osx >=11.0 + license: X11 AND BSD-3-Clause + purls: [] + size: 805509 + timestamp: 1777423252320 
+- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + sha256: c0ef482280e38c71a08ad6d71448194b719630345b0c9c60744a2010e8a8e0cb + md5: da1b85b6a87e141f5140bb9924cecab0 + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3167099 + timestamp: 1775587756857 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.2-h546c87b_0.conda + sha256: 348cb74c1530ac241215d047ef65d134cf797af935c97a68655319362b7e6a01 + md5: 3b129669089e4d6a5c6871dbb4669b99 + depends: + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3706406 + timestamp: 1775589602258 +- conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.6.2-hc881268_0.conda + sha256: 334fd49ea31b99114f5afb1ec44555dc8c90640648302a4f8f838ee345d1ec50 + md5: 5cf0ece4375c73d7a5765e83565a69c7 + depends: + - __osx >=11.0 + - ca-certificates + license: Apache-2.0 + license_family: Apache + purls: [] + size: 2776564 + timestamp: 1775589970694 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.2-hd24854e_0.conda + sha256: c91bf510c130a1ea1b6ff023e28bac0ccaef869446acd805e2016f69ebdc49ea + md5: 25dcccd4f80f1638428613e0d7c9b4e1 + depends: + - __osx >=11.0 + - ca-certificates + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3106008 + timestamp: 1775587972483 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.13-h6add32d_100_cp313.conda + build_number: 100 + sha256: 7f77eb57648f545c1f58e10035d0d9d66b0a0efb7c4b58d3ed89ec7269afdde1 + md5: 05051be49267378d2fcd12931e319ac3 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libuuid >=2.42,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses 
>=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + purls: [] + size: 37358322 + timestamp: 1775614712638 + python_site_packages_path: lib/python3.13/site-packages +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.13.13-h11c0449_100_cp313.conda + build_number: 100 + sha256: d14e731e871d6379f8b82f3af5eb3382caa444880a9fc9d1d12033748277eb14 + md5: 81809cabd4647dee1127f2623a6a3005 + depends: + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-aarch64 >=2.36.1 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libuuid >=2.42,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + purls: [] + size: 34042952 + timestamp: 1775613691 + python_site_packages_path: lib/python3.13/site-packages +- conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.13.13-h3d5d122_100_cp313.conda + build_number: 100 + sha256: 6f71b48fe93ebc0dd42c80358b75020f6ad12ed4772fb3555da36000139c0dc7 + md5: 8948c8c7c653ad668d55bbbd6836178b + depends: + - __osx >=11.0 + - bzip2 >=1.0.8,<2.0a0 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + purls: [] + size: 17650454 + timestamp: 1775616128232 + python_site_packages_path: lib/python3.13/site-packages +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.13.13-h20e6be0_100_cp313.conda + build_number: 100 + sha256: d0fffc5fde21d1ae350da545dfb9e115a8c53bed8a9c5761f9efd4a5581853c1 + md5: 
9991a930e81d3873eba7a299ba783ec4 + depends: + - __osx >=11.0 + - bzip2 >=1.0.8,<2.0a0 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + purls: [] + size: 12966447 + timestamp: 1775615694085 + python_site_packages_path: lib/python3.13/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + build_number: 8 + sha256: 210bffe7b121e651419cb196a2a63687b087497595c9be9d20ebe97dd06060a7 + md5: 94305520c52a4aa3f6c2b1ff6008d9f8 + constrains: + - python 3.13.* *_cp313 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 7002 + timestamp: 1752805902938 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 345073 + timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda + sha256: fe695f9d215e9a2e3dd0ca7f56435ab4df24f5504b83865e3d295df36e88d216 + md5: 3d49cad61f829f4f0e0611547a9cda12 + depends: + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 357597 + timestamp: 1765815673644 +- conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.3-h68b038d_0.conda + sha256: 4614af680aa0920e82b953fece85a03007e0719c3399f13d7de64176874b80d5 + md5: eefd65452dfe7cce476a519bece46704 + depends: + - __osx >=10.13 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 317819 + timestamp: 1765813692798 +- conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.3-h46df422_0.conda + sha256: a77010528efb4b548ac2a4484eaf7e1c3907f2aec86123ed9c5212ae44502477 + md5: f8381319127120ce51e081dce4865cf4 + depends: + - __osx >=11.0 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 313930 + timestamp: 1765813902568 +- pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + name: requests + version: 2.33.1 + sha256: 4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a + requires_dist: + - charset-normalizer>=2,<4 + - idna>=2.5,<4 + - urllib3>=1.26,<3 + - certifi>=2023.5.7 + - pysocks>=1.5.6,!=1.5.7 ; extra == 'socks' + - chardet>=3.0.2,<8 ; extra == 'use-chardet-on-py3' + requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda + sha256: e25c314b52764219f842b41aea2c98a059f06437392268f09b03561e4f6e5309 + md5: 7fc6affb9b01e567d2ef1d05b84aa6ed + depends: + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3368666 + timestamp: 1769464148928 +- conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h7142dee_3.conda + sha256: 7f0d9c320288532873e2d8486c331ec6d87919c9028208d3f6ac91dc8f99a67b + md5: 6e6efb7463f8cef69dbcb4c2205bf60e + depends: + - __osx >=10.13 + - libzlib >=1.3.1,<2.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3282953 + 
timestamp: 1769460532442 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h010d191_3.conda + sha256: 799cab4b6cde62f91f750149995d149bc9db525ec12595e8a1d91b9317f038b3 + md5: a9d86bc62f39b94c4661716624eb21b0 + depends: + - __osx >=11.0 + - libzlib >=1.3.1,<2.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3127137 + timestamp: 1769460817696 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + purls: [] + size: 119135 + timestamp: 1767016325805 +- pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + name: urllib3 + version: 2.6.3 + sha256: bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4 + requires_dist: + - brotli>=1.2.0 ; platform_python_implementation == 'CPython' and extra == 'brotli' + - brotlicffi>=1.2.0.0 ; platform_python_implementation != 'CPython' and extra == 'brotli' + - h2>=4,<5 ; extra == 'h2' + - pysocks>=1.5.6,!=1.5.7,<2.0 ; extra == 'socks' + - backports-zstd>=1.0.0 ; python_full_version < '3.14' and extra == 'zstd' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 601375 + timestamp: 1764777111296 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda + sha256: 569990cf12e46f9df540275146da567d9c618c1e9c7a0bc9d9cfefadaed20b75 + md5: c3655f82dcea2aa179b291e7099c1fcc + depends: + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 614429 + timestamp: 
1764777145593 diff --git a/example-data/pixi.toml b/example-data/pixi.toml new file mode 100644 index 000000000..f2a12f7a6 --- /dev/null +++ b/example-data/pixi.toml @@ -0,0 +1,17 @@ +[workspace] +name = "example-data-seeder" +version = "0.1.0" +channels = ["conda-forge"] +platforms = ["linux-64", "osx-arm64", "osx-64", "linux-aarch64"] + +[dependencies] +python = "3.13.*" + +[pypi-dependencies] +requests = "*" + +[tasks] +seed = "python seed.py" +seed-lineages = "python seed.py covid-pango-lineages" +seed-all-lineages = "python seed.py covid-pango-lineages --limit 0" +seed-resistance = "python seed.py covid-resistance-mutations" diff --git a/example-data/requirements.txt b/example-data/requirements.txt deleted file mode 100644 index f2293605c..000000000 --- a/example-data/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -requests From 4121ad8c7ec29210a5cfe42d8898aac3b5f257e3 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:41:48 +0100 Subject: [PATCH 05/54] feat(example-data): upsert seed user via POST /users/sync before seeding BackendClient now calls POST /users/sync (githubId=9999999999, name="GenSpectrum Team") to obtain the internal user id before any collection API calls. wait_for_backend() uses this call for polling. Removes the --user-id CLI flag. 
Co-Authored-By: Claude Sonnet 4.6 --- example-data/backend.py | 26 +++++++++++++++++++------- example-data/seed.py | 15 +++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/example-data/backend.py b/example-data/backend.py index 222779ba9..ae31563cf 100644 --- a/example-data/backend.py +++ b/example-data/backend.py @@ -9,20 +9,32 @@ RETRY_DELAY_S = 2 +SYNC_GITHUB_ID = "9999999999" +SYNC_NAME = "GenSpectrum Team" + + class BackendClient: - def __init__(self, base_url: str, user_id: str): + def __init__(self, base_url: str): self.base_url = base_url.rstrip("/") - self.user_id = user_id + self.user_id: int | None = None self._collections_url = f"{self.base_url}/collections" + def sync_user(self, github_id: str = SYNC_GITHUB_ID, name: str = SYNC_NAME, email: str | None = None) -> int: + """Upsert the seed user and store the returned internal id.""" + body = {"githubId": github_id, "name": name, "email": email} + r = requests.post(f"{self.base_url}/users/sync", json=body, timeout=10) + if not r.ok: + raise RuntimeError(f"POST /users/sync failed: {r.status_code} {r.text}") + self.user_id = r.json()["id"] + return self.user_id + def wait_for_backend(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELAY_S): - params = {"userId": self.user_id, "organism": "covid"} + """Poll until the backend is ready by repeatedly attempting user sync.""" for attempt in range(1, attempts + 1): try: - r = requests.get(self._collections_url, params=params, timeout=5) - if r.ok or r.status_code == 404: - return - except requests.RequestException: + self.sync_user() + return + except (requests.RequestException, RuntimeError): pass print(f"Waiting for backend... 
(attempt {attempt}/{attempts})") time.sleep(delay) diff --git a/example-data/seed.py b/example-data/seed.py index dd65adb9b..b74cea74b 100644 --- a/example-data/seed.py +++ b/example-data/seed.py @@ -24,11 +24,6 @@ def make_parser() -> argparse.ArgumentParser: default=os.environ.get("BACKEND_URL", "http://localhost:8080"), help="Backend base URL (default: $BACKEND_URL or http://localhost:8080)", ) - parent.add_argument( - "--user-id", - default=os.environ.get("SEED_USER_ID", "example-data-seeder"), - help="User ID (default: $SEED_USER_ID or example-data-seeder)", - ) parent.add_argument( "--wait", action="store_true", @@ -94,11 +89,15 @@ def main(): parser = make_parser() args = parser.parse_args() - client = BackendClient(args.url, args.user_id) - print(f"Seeding collections against {args.url} as user '{args.user_id}'...") + client = BackendClient(args.url) + print(f"Seeding collections against {args.url} ...") if args.wait: - client.wait_for_backend() + client.wait_for_backend() # syncs user as part of polling + else: + client.sync_user() + + print(f"Seeding as user id={client.user_id}.") lineage_limit = getattr(args, "limit", DEFAULT_LINEAGE_LIMIT) From 965bad30f73a54074e6f7bd17133e64abc845366 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:47:57 +0100 Subject: [PATCH 06/54] chore: add Python and pixi entries to .gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 35cd109ad..7ac7e8013 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,10 @@ logs node_modules/ .env + +# Python +__pycache__/ +*.pyc + +# pixi +.pixi/ From 78cd794e7f4d3dbc6c72378e3cd4b7ed20aba669 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:54:32 +0100 Subject: [PATCH 07/54] docs(example-data): update README for Python/pixi seeder --- example-data/README.md | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/example-data/README.md 
b/example-data/README.md index 3c33232c0..90cda84f2 100644 --- a/example-data/README.md +++ b/example-data/README.md @@ -1,6 +1,9 @@ # example-data -Seeds the backend with example collections (resistance mutation data for 3CLpro, RdRp, and Spike mAb). +Seeds the backend with example collections: + +- **covid-resistance-mutations** — resistance mutation data for 3CLpro, RdRp, and Spike mAb +- **covid-pango-lineages** — one collection per pango lineage, with nucleotide substitutions as variants The script is idempotent — re-running it will skip collections that already exist. @@ -12,16 +15,27 @@ The seeder runs automatically as part of Docker Compose: BACKEND_TAG=latest WEBSITE_TAG=latest SEEDER_TAG=latest docker compose up ``` -## Running locally by hand +## Running locally + +Requires [pixi](https://pixi.sh). Install dependencies once: + +```bash +pixi install +``` -Requires a current version of NodeJS. No `npm install` needed. +Then use the provided tasks: ```bash -# Local backend running on :8080 -node seed.mjs +pixi run seed # all sources (resistance mutations + first 10 lineages) +pixi run seed-resistance # resistance mutations only +pixi run seed-lineages # pango lineages (first 10) +pixi run seed-all-lineages # all ~4976 pango lineages +``` -# Local backend on a different port -node seed.mjs --url http://localhost:9021 +To target a different backend: + +```bash +pixi run seed --url http://localhost:9021 ``` -Run `node seed.mjs --help` for all options. +Run `pixi run seed --help` or `pixi run seed covid-pango-lineages --help` for all options. From 42ea432a364a3b88f2f749178de5255d2bbec80e Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 11:56:31 +0100 Subject: [PATCH 08/54] feat(example-data): upsert collections instead of skipping existing ones Collections are now always created or updated (matched by name). Adds BackendClient.update_collection() using PUT /collections/{id}. 
Co-Authored-By: Claude Sonnet 4.6 --- example-data/README.md | 2 +- example-data/backend.py | 6 ++++++ example-data/seed.py | 26 ++++++++++++++------------ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/example-data/README.md b/example-data/README.md index 90cda84f2..9d4b2a86b 100644 --- a/example-data/README.md +++ b/example-data/README.md @@ -5,7 +5,7 @@ Seeds the backend with example collections: - **covid-resistance-mutations** — resistance mutation data for 3CLpro, RdRp, and Spike mAb - **covid-pango-lineages** — one collection per pango lineage, with nucleotide substitutions as variants -The script is idempotent — re-running it will skip collections that already exist. +The script is idempotent — re-running it will create new collections or update existing ones (matched by name). ## Via Docker Compose diff --git a/example-data/backend.py b/example-data/backend.py index ae31563cf..5ce2740a8 100644 --- a/example-data/backend.py +++ b/example-data/backend.py @@ -57,3 +57,9 @@ def create_collection(self, collection: dict) -> str: if r.status_code != 201: raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}") return r.json()["id"] + + def update_collection(self, collection_id: int, collection: dict) -> None: + params = {"userId": self.user_id} + r = requests.put(f"{self._collections_url}/{collection_id}", params=params, json=collection, timeout=10) + if not r.ok: + raise RuntimeError(f"PUT /collections/{collection_id} failed: {r.status_code} {r.text}") diff --git a/example-data/seed.py b/example-data/seed.py index b74cea74b..e94da987d 100644 --- a/example-data/seed.py +++ b/example-data/seed.py @@ -68,21 +68,23 @@ def seed_source(client: BackendClient, source_name: str, collections: list[dict] organisms.setdefault(c["organism"], []).append(c) created = 0 - skipped = 0 + updated = 0 for organism, org_collections in organisms.items(): existing = client.fetch_existing_collections(organism) - existing_names = {c["name"] for c in 
existing} + existing_by_name = {c["name"]: c for c in existing} for collection in org_collections: - if collection["name"] in existing_names: - print(f" SKIP {collection['name']}") - skipped += 1 + existing_entry = existing_by_name.get(collection["name"]) + if existing_entry: + client.update_collection(existing_entry["id"], collection) + print(f" UPDATE id={existing_entry['id']} {collection['name']}") + updated += 1 else: col_id = client.create_collection(collection) - print(f" OK id={col_id} {collection['name']}") + print(f" CREATE id={col_id} {collection['name']}") created += 1 - print(f" Created: {created}, skipped: {skipped}.") - return created, skipped + print(f" Created: {created}, updated: {updated}.") + return created, updated def main(): @@ -113,15 +115,15 @@ def main(): ] total_created = 0 - total_skipped = 0 + total_updated = 0 for source, kwargs in active: collections = source.get_collections(**kwargs) - c, s = seed_source(client, source.NAME, collections) + c, u = seed_source(client, source.NAME, collections) total_created += c - total_skipped += s + total_updated += u if len(active) > 1: - print(f"\nTotal — created: {total_created}, skipped: {total_skipped}.") + print(f"\nTotal — created: {total_created}, updated: {total_updated}.") if __name__ == "__main__": From 4ed9c5b9945b290739dc21583ce52780eae2e078 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 12:01:54 +0100 Subject: [PATCH 09/54] test(example-data): add test suite with mock source and mocked HTTP backend 38 tests across 4 files: - test_backend.py: BackendClient (responses library for HTTP mocking) - test_resistance_mutations.py: mature_name offset math, collection structure - test_pango_lineages.py: collection building, variant filtering, HTTP fetch - test_seed.py: seed_source create/update/mixed upsert logic Run with: pixi run -e test test Co-Authored-By: Claude Sonnet 4.6 --- example-data/pixi.lock | 219 ++++++++++++++++++ example-data/pixi.toml | 10 + 
example-data/pytest.ini | 3 + example-data/tests/__init__.py | 0 example-data/tests/mock_source.py | 22 ++ example-data/tests/test_backend.py | 136 +++++++++++ example-data/tests/test_pango_lineages.py | 108 +++++++++ .../tests/test_resistance_mutations.py | 51 ++++ example-data/tests/test_seed.py | 74 ++++++ 9 files changed, 623 insertions(+) create mode 100644 example-data/pytest.ini create mode 100644 example-data/tests/__init__.py create mode 100644 example-data/tests/mock_source.py create mode 100644 example-data/tests/test_backend.py create mode 100644 example-data/tests/test_pango_lineages.py create mode 100644 example-data/tests/test_resistance_mutations.py create mode 100644 example-data/tests/test_seed.py diff --git a/example-data/pixi.lock b/example-data/pixi.lock index f26f3573e..b8ae0977c 100644 --- a/example-data/pixi.lock +++ b/example-data/pixi.lock @@ -103,6 +103,137 @@ environments: - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + test: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.8.0-hecca717_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.1-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.13-h6add32d_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + linux-aarch64: + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.8.0-hfae3067_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.3-he30d5cf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.53.1-h022381a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.42-h1022ec0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.6-hf8d1292_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.2-h546c87b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.13.13-h11c0449_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + - pypi: 
https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + osx-64: + - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/icu-78.3-h25d91c4_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.8.0-hcc62823_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.5.2-hd1f9c09_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.3-hbb4bfdb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libmpdec-4.0.0-hf3981d6_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.53.1-h8f8c405_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.2-hbb4bfdb_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.6-hcc0dc9a_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.6.2-hc881268_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.13.13-h3d5d122_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.3-h68b038d_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h7142dee_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl + - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + osx-arm64: + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.8.0-hf6b4638_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.5.2-hcf2aa1b_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/liblzma-5.8.3-h8088a28_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libmpdec-4.0.0-h84a0fba_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.53.1-h1b79a29_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.2-h8088a28_2.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.6-h1d4f5a5_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.2-hd24854e_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.13.13-h20e6be0_100_cp313.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.3-h46df422_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h010d191_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - pypi: https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl + - pypi: https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl + - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda build_number: 20 @@ -220,6 +351,11 @@ packages: - mypy>=1.11.2 ; extra == 'all' - pytest>=8.3.2 ; extra == 'all' requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl + name: iniconfig + version: 2.3.0 + sha256: f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12 + requires_python: '>=3.10' - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c md5: 18335a698559cdbcd86150a48bf54ba6 @@ -657,6 +793,49 @@ packages: purls: [] size: 3106008 timestamp: 1775587972483 +- pypi: https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl + name: packaging + version: '26.2' + sha256: 5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + name: pluggy + version: 1.6.0 + sha256: e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746 + requires_dist: + - pre-commit ; extra == 'dev' + - tox ; extra == 'dev' + - pytest ; extra == 'testing' + - pytest-benchmark ; extra == 'testing' + - coverage ; extra == 'testing' + requires_python: '>=3.9' +- pypi: 
https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl + name: pygments + version: 2.20.0 + sha256: 81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176 + requires_dist: + - colorama>=0.4.6 ; extra == 'windows-terminal' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl + name: pytest + version: 9.0.3 + sha256: 2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9 + requires_dist: + - colorama>=0.4 ; sys_platform == 'win32' + - exceptiongroup>=1 ; python_full_version < '3.11' + - iniconfig>=1.0.1 + - packaging>=22 + - pluggy>=1.5,<2 + - pygments>=2.7.2 + - tomli>=1 ; python_full_version < '3.11' + - argcomplete ; extra == 'dev' + - attrs>=19.2 ; extra == 'dev' + - hypothesis>=3.56 ; extra == 'dev' + - mock ; extra == 'dev' + - requests ; extra == 'dev' + - setuptools ; extra == 'dev' + - xmlschema ; extra == 'dev' + requires_python: '>=3.10' - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.13-h6add32d_100_cp313.conda build_number: 100 sha256: 7f77eb57648f545c1f58e10035d0d9d66b0a0efb7c4b58d3ed89ec7269afdde1 @@ -769,6 +948,26 @@ packages: purls: [] size: 7002 timestamp: 1752805902938 +- pypi: https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl + name: pyyaml + version: 6.0.3 + sha256: ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: pyyaml + version: 6.0.3 + sha256: 
0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl + name: pyyaml + version: 6.0.3 + sha256: 2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl + name: pyyaml + version: 6.0.3 + sha256: 8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8 + requires_python: '>=3.8' - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 md5: d7d95fc8287ea7bf33e0e7116d2b95ec @@ -826,6 +1025,26 @@ packages: - pysocks>=1.5.6,!=1.5.7 ; extra == 'socks' - chardet>=3.0.2,<8 ; extra == 'use-chardet-on-py3' requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + name: responses + version: 0.26.0 + sha256: 03ec4409088cd5c66b71ecbbbd27fe2c58ddfad801c66203457b3e6a04868c37 + requires_dist: + - requests>=2.30.0,<3.0 + - urllib3>=1.25.10,<3.0 + - pyyaml + - pytest>=7.0.0 ; extra == 'tests' + - coverage>=6.0.0 ; extra == 'tests' + - pytest-cov ; extra == 'tests' + - pytest-asyncio ; extra == 'tests' + - pytest-httpserver ; extra == 'tests' + - flake8 ; extra == 'tests' + - types-pyyaml ; extra == 'tests' + - types-requests ; extra == 'tests' + - mypy ; extra == 'tests' + - tomli ; python_full_version < '3.11' and extra == 'tests' + - tomli-w ; extra == 'tests' + requires_python: '>=3.8' - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac md5: 
cffd3bdd58090148f4cfcd831f4b26ab diff --git a/example-data/pixi.toml b/example-data/pixi.toml index f2a12f7a6..7b6794266 100644 --- a/example-data/pixi.toml +++ b/example-data/pixi.toml @@ -15,3 +15,13 @@ seed = "python seed.py" seed-lineages = "python seed.py covid-pango-lineages" seed-all-lineages = "python seed.py covid-pango-lineages --limit 0" seed-resistance = "python seed.py covid-resistance-mutations" + +[feature.test.pypi-dependencies] +pytest = "*" +responses = "*" + +[feature.test.tasks] +test = "pytest" + +[environments] +test = { features = ["test"] } diff --git a/example-data/pytest.ini b/example-data/pytest.ini new file mode 100644 index 000000000..c7b23ecb1 --- /dev/null +++ b/example-data/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +pythonpath = . +testpaths = tests diff --git a/example-data/tests/__init__.py b/example-data/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/example-data/tests/mock_source.py b/example-data/tests/mock_source.py new file mode 100644 index 000000000..4b17db6b4 --- /dev/null +++ b/example-data/tests/mock_source.py @@ -0,0 +1,22 @@ +"""Mock data source for use in tests.""" + +NAME = "mock-source" + +COLLECTIONS = [ + { + "name": "Mock Collection A", + "organism": "covid", + "description": "A mock collection for testing.", + "variants": [{"type": "filterObject", "name": "C123T", "filterObject": {"nucleotideMutations": ["C123T"]}}], + }, + { + "name": "Mock Collection B", + "organism": "covid", + "description": "Another mock collection for testing.", + "variants": [], + }, +] + + +def get_collections(limit: int = 0) -> list[dict]: + return list(COLLECTIONS) diff --git a/example-data/tests/test_backend.py b/example-data/tests/test_backend.py new file mode 100644 index 000000000..cad1fd6cf --- /dev/null +++ b/example-data/tests/test_backend.py @@ -0,0 +1,136 @@ +import sys +import pytest +import responses as rsps_lib + +from backend import BackendClient, SYNC_GITHUB_ID, SYNC_NAME + +BASE = 
"http://localhost:8080" +SYNC_URL = f"{BASE}/users/sync" +COLLECTIONS_URL = f"{BASE}/collections" + + +@pytest.fixture +def client(): + return BackendClient(BASE) + + +# --- sync_user --- + +@rsps_lib.activate +def test_sync_user_sets_user_id(client): + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 42}, status=200) + result = client.sync_user() + assert result == 42 + assert client.user_id == 42 + + +@rsps_lib.activate +def test_sync_user_sends_correct_body(client): + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 1}, status=200) + client.sync_user() + body = rsps_lib.calls[0].request.body + import json + parsed = json.loads(body) + assert parsed["githubId"] == SYNC_GITHUB_ID + assert parsed["name"] == SYNC_NAME + + +@rsps_lib.activate +def test_sync_user_raises_on_error(client): + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"error": "bad"}, status=500) + with pytest.raises(RuntimeError, match="POST /users/sync failed"): + client.sync_user() + + +# --- fetch_existing_collections --- + +@rsps_lib.activate +def test_fetch_existing_collections(client): + client.user_id = 7 + existing = [{"id": 1, "name": "Col A"}, {"id": 2, "name": "Col B"}] + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=existing, status=200) + + result = client.fetch_existing_collections("covid") + + assert result == existing + req = rsps_lib.calls[0].request + assert "userId=7" in req.url + assert "organism=covid" in req.url + + +@rsps_lib.activate +def test_fetch_existing_collections_raises_on_error(client): + client.user_id = 7 + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) + with pytest.raises(RuntimeError, match="GET /collections failed"): + client.fetch_existing_collections("covid") + + +# --- create_collection --- + +@rsps_lib.activate +def test_create_collection_returns_id(client): + client.user_id = 7 + rsps_lib.add(rsps_lib.POST, COLLECTIONS_URL, json={"id": 99}, status=201) + col = {"name": "Test", "organism": "covid", "description": "", "variants": []} + 
result = client.create_collection(col) + assert result == 99 + + +@rsps_lib.activate +def test_create_collection_raises_on_non_201(client): + client.user_id = 7 + rsps_lib.add(rsps_lib.POST, COLLECTIONS_URL, json={}, status=200) + with pytest.raises(RuntimeError, match="POST /collections failed"): + client.create_collection({"name": "X", "organism": "covid", "description": "", "variants": []}) + + +# --- update_collection --- + +@rsps_lib.activate +def test_update_collection_puts_correct_url(client): + client.user_id = 7 + rsps_lib.add(rsps_lib.PUT, f"{COLLECTIONS_URL}/55", json={}, status=200) + col = {"name": "Updated", "organism": "covid", "description": "", "variants": []} + client.update_collection(55, col) + + req = rsps_lib.calls[0].request + assert "/collections/55" in req.url + assert "userId=7" in req.url + + +@rsps_lib.activate +def test_update_collection_raises_on_error(client): + client.user_id = 7 + rsps_lib.add(rsps_lib.PUT, f"{COLLECTIONS_URL}/55", json={}, status=404) + with pytest.raises(RuntimeError, match="PUT /collections/55 failed"): + client.update_collection(55, {"name": "X", "organism": "covid", "description": "", "variants": []}) + + +# --- wait_for_backend --- + +@rsps_lib.activate +def test_wait_for_backend_succeeds_immediately(client): + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 5}, status=200) + client.wait_for_backend() + assert client.user_id == 5 + assert len(rsps_lib.calls) == 1 + + +@rsps_lib.activate +def test_wait_for_backend_retries_then_succeeds(client): + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={}, status=500) + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={}, status=500) + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 8}, status=200) + client.wait_for_backend(attempts=5, delay=0) + assert client.user_id == 8 + assert len(rsps_lib.calls) == 3 + + +@rsps_lib.activate +def test_wait_for_backend_exits_after_max_attempts(client): + for _ in range(3): + rsps_lib.add(rsps_lib.POST, SYNC_URL, json={}, status=500) + with 
pytest.raises(SystemExit) as exc: + client.wait_for_backend(attempts=3, delay=0) + assert exc.value.code == 1 diff --git a/example-data/tests/test_pango_lineages.py b/example-data/tests/test_pango_lineages.py new file mode 100644 index 000000000..5820ae998 --- /dev/null +++ b/example-data/tests/test_pango_lineages.py @@ -0,0 +1,108 @@ +import json +import responses as rsps_lib + +from sources.pango_lineages import _build_collection, get_collections, DATA_URL, NAME + +SAMPLE_DATA = { + "BA.2": { + "lineage": "BA.2", + "unaliased": "B.1.1.529.2", + "parent": "BA", + "nextstrainClade": "22C", + "nucSubstitutions": ["C241T", "A23403G", ""], + "designationDate": "2022-01-20", + }, + "XBB": { + "lineage": "XBB", + "unaliased": "XBB", + "parent": "", + "nextstrainClade": "", + "nucSubstitutions": [""], + "designationDate": "", + }, + "BA.5": { + "lineage": "BA.5", + "unaliased": "B.1.1.529.5", + "parent": "BA", + "nextstrainClade": "22B", + "nucSubstitutions": ["C241T", "T19955C"], + "designationDate": "2022-05-06", + }, +} + + +def test_name(): + assert NAME == "covid-pango-lineages" + + +# --- _build_collection --- + +def test_build_collection_basic(): + col = _build_collection(SAMPLE_DATA["BA.2"]) + assert col["name"] == "BA.2" + assert col["organism"] == "covid" + + +def test_build_collection_description_format(): + col = _build_collection(SAMPLE_DATA["BA.2"]) + assert "BA.2" in col["description"] + assert "BA" in col["description"] # parent + assert "22C" in col["description"] # clade + assert "2022-01-20" in col["description"] + + +def test_build_collection_filters_blank_subs(): + col = _build_collection(SAMPLE_DATA["BA.2"]) + # nucSubstitutions has ["C241T", "A23403G", ""] — blank should be dropped + assert len(col["variants"]) == 2 + names = [v["name"] for v in col["variants"]] + assert "C241T" in names + assert "A23403G" in names + + +def test_build_collection_variant_structure(): + col = _build_collection(SAMPLE_DATA["BA.2"]) + for v in col["variants"]: + assert 
v["type"] == "filterObject" + assert "nucleotideMutations" in v["filterObject"] + assert len(v["filterObject"]["nucleotideMutations"]) == 1 + + +def test_build_collection_missing_fields_use_defaults(): + col = _build_collection(SAMPLE_DATA["XBB"]) + assert "—" in col["description"] # parent and clade fallback + assert "unknown" in col["description"] # date fallback + + +# --- get_collections --- + +@rsps_lib.activate +def test_get_collections_fetches_data_url(): + rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) + get_collections() + assert len(rsps_lib.calls) == 1 + assert rsps_lib.calls[0].request.url == DATA_URL + + +@rsps_lib.activate +def test_get_collections_excludes_empty_variants(): + rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) + cols = get_collections() + # XBB has only blank subs → should be excluded + names = [c["name"] for c in cols] + assert "XBB" not in names + + +@rsps_lib.activate +def test_get_collections_respects_limit(): + rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) + cols = get_collections(limit=1) + assert len(cols) <= 1 + + +@rsps_lib.activate +def test_get_collections_no_limit_returns_all_valid(): + rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) + cols = get_collections(limit=0) + # BA.2 and BA.5 have valid subs; XBB does not + assert len(cols) == 2 diff --git a/example-data/tests/test_resistance_mutations.py b/example-data/tests/test_resistance_mutations.py new file mode 100644 index 000000000..cfa48c234 --- /dev/null +++ b/example-data/tests/test_resistance_mutations.py @@ -0,0 +1,51 @@ +from sources.resistance_mutations import _mature_name, get_collections, NAME + + +def test_name(): + assert NAME == "covid-resistance-mutations" + + +# --- _mature_name --- + +def test_mature_name_clpro_offset(): + # ORF1a position 3284, offset -3263 → position 21 + assert _mature_name("ORF1a:T3284I", "3CLpro", -3263) == "3CLpro:T21I" + + +def test_mature_name_rdrp_offset(): 
+ # ORF1b position 157, offset +9 → position 166 + assert _mature_name("ORF1b:V157A", "RdRp", 9) == "RdRp:V166A" + + +def test_mature_name_spike_zero_offset(): + assert _mature_name("S:E484K", "Spike", 0) == "Spike:E484K" + + +def test_mature_name_deletion(): + # Deletions use '-' as new base + assert _mature_name("ORF1a:M3312-", "3CLpro", -3263) == "3CLpro:M49-" + + +# --- get_collections --- + +def test_get_collections_returns_three(): + cols = get_collections() + assert len(cols) == 3 + + +def test_get_collections_all_covid(): + for col in get_collections(): + assert col["organism"] == "covid" + + +def test_get_collections_variant_structure(): + for col in get_collections(): + assert col["variants"], f"'{col['name']}' has no variants" + for v in col["variants"]: + assert v["type"] == "filterObject" + assert "aminoAcidMutations" in v["filterObject"] + assert len(v["filterObject"]["aminoAcidMutations"]) == 1 + + +def test_get_collections_limit_ignored(): + assert get_collections(limit=1) == get_collections(limit=0) diff --git a/example-data/tests/test_seed.py b/example-data/tests/test_seed.py new file mode 100644 index 000000000..139d66d4f --- /dev/null +++ b/example-data/tests/test_seed.py @@ -0,0 +1,74 @@ +from unittest.mock import MagicMock, call + +from seed import seed_source +from tests.mock_source import COLLECTIONS + + +def make_client(existing=None): + client = MagicMock() + client.fetch_existing_collections.return_value = existing or [] + client.create_collection.return_value = 99 + return client + + +# --- seed_source: create / update / mixed --- + +def test_all_new_creates_all(): + client = make_client(existing=[]) + created, updated = seed_source(client, "mock-source", list(COLLECTIONS)) + assert created == len(COLLECTIONS) + assert updated == 0 + assert client.create_collection.call_count == len(COLLECTIONS) + client.update_collection.assert_not_called() + + +def test_all_existing_updates_all(): + existing = [{"id": i + 1, "name": c["name"]} for i, c 
in enumerate(COLLECTIONS)] + client = make_client(existing=existing) + created, updated = seed_source(client, "mock-source", list(COLLECTIONS)) + assert created == 0 + assert updated == len(COLLECTIONS) + assert client.update_collection.call_count == len(COLLECTIONS) + client.create_collection.assert_not_called() + + +def test_mixed_creates_and_updates(): + # Only the first collection already exists + existing = [{"id": 10, "name": COLLECTIONS[0]["name"]}] + client = make_client(existing=existing) + created, updated = seed_source(client, "mock-source", list(COLLECTIONS)) + assert created == len(COLLECTIONS) - 1 + assert updated == 1 + + +def test_update_uses_correct_id(): + existing = [{"id": 42, "name": COLLECTIONS[0]["name"]}] + client = make_client(existing=existing) + seed_source(client, "mock-source", [COLLECTIONS[0]]) + client.update_collection.assert_called_once_with(42, COLLECTIONS[0]) + + +def test_create_passes_full_collection(): + client = make_client(existing=[]) + seed_source(client, "mock-source", [COLLECTIONS[0]]) + client.create_collection.assert_called_once_with(COLLECTIONS[0]) + + +def test_fetch_called_once_per_organism(): + # Two collections with different organisms + multi = [ + {**COLLECTIONS[0], "organism": "covid"}, + {**COLLECTIONS[1], "organism": "mpox"}, + ] + client = make_client(existing=[]) + seed_source(client, "mock-source", multi) + assert client.fetch_existing_collections.call_count == 2 + organisms_fetched = {c.args[0] for c in client.fetch_existing_collections.call_args_list} + assert organisms_fetched == {"covid", "mpox"} + + +def test_returns_zero_counts_for_empty_collections(): + client = make_client(existing=[]) + created, updated = seed_source(client, "mock-source", []) + assert created == 0 + assert updated == 0 From f0dd9cc95566e2806498ee117e1a0902b7e47094 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 12:02:40 +0100 Subject: [PATCH 10/54] chore: rename example-data to collection-seeding --- 
{example-data => collection-seeding}/Dockerfile | 0 {example-data => collection-seeding}/README.md | 0 {example-data => collection-seeding}/backend.py | 0 {example-data => collection-seeding}/pixi.lock | 0 {example-data => collection-seeding}/pixi.toml | 0 {example-data => collection-seeding}/pytest.ini | 0 {example-data => collection-seeding}/seed.mjs | 0 {example-data => collection-seeding}/seed.py | 0 {example-data => collection-seeding}/sources/__init__.py | 0 {example-data => collection-seeding}/sources/pango_lineages.py | 0 .../sources/resistance_mutations.py | 0 {example-data => collection-seeding}/tests/__init__.py | 0 {example-data => collection-seeding}/tests/mock_source.py | 0 {example-data => collection-seeding}/tests/test_backend.py | 0 {example-data => collection-seeding}/tests/test_pango_lineages.py | 0 .../tests/test_resistance_mutations.py | 0 {example-data => collection-seeding}/tests/test_seed.py | 0 17 files changed, 0 insertions(+), 0 deletions(-) rename {example-data => collection-seeding}/Dockerfile (100%) rename {example-data => collection-seeding}/README.md (100%) rename {example-data => collection-seeding}/backend.py (100%) rename {example-data => collection-seeding}/pixi.lock (100%) rename {example-data => collection-seeding}/pixi.toml (100%) rename {example-data => collection-seeding}/pytest.ini (100%) rename {example-data => collection-seeding}/seed.mjs (100%) rename {example-data => collection-seeding}/seed.py (100%) rename {example-data => collection-seeding}/sources/__init__.py (100%) rename {example-data => collection-seeding}/sources/pango_lineages.py (100%) rename {example-data => collection-seeding}/sources/resistance_mutations.py (100%) rename {example-data => collection-seeding}/tests/__init__.py (100%) rename {example-data => collection-seeding}/tests/mock_source.py (100%) rename {example-data => collection-seeding}/tests/test_backend.py (100%) rename {example-data => collection-seeding}/tests/test_pango_lineages.py (100%) 
rename {example-data => collection-seeding}/tests/test_resistance_mutations.py (100%) rename {example-data => collection-seeding}/tests/test_seed.py (100%) diff --git a/example-data/Dockerfile b/collection-seeding/Dockerfile similarity index 100% rename from example-data/Dockerfile rename to collection-seeding/Dockerfile diff --git a/example-data/README.md b/collection-seeding/README.md similarity index 100% rename from example-data/README.md rename to collection-seeding/README.md diff --git a/example-data/backend.py b/collection-seeding/backend.py similarity index 100% rename from example-data/backend.py rename to collection-seeding/backend.py diff --git a/example-data/pixi.lock b/collection-seeding/pixi.lock similarity index 100% rename from example-data/pixi.lock rename to collection-seeding/pixi.lock diff --git a/example-data/pixi.toml b/collection-seeding/pixi.toml similarity index 100% rename from example-data/pixi.toml rename to collection-seeding/pixi.toml diff --git a/example-data/pytest.ini b/collection-seeding/pytest.ini similarity index 100% rename from example-data/pytest.ini rename to collection-seeding/pytest.ini diff --git a/example-data/seed.mjs b/collection-seeding/seed.mjs similarity index 100% rename from example-data/seed.mjs rename to collection-seeding/seed.mjs diff --git a/example-data/seed.py b/collection-seeding/seed.py similarity index 100% rename from example-data/seed.py rename to collection-seeding/seed.py diff --git a/example-data/sources/__init__.py b/collection-seeding/sources/__init__.py similarity index 100% rename from example-data/sources/__init__.py rename to collection-seeding/sources/__init__.py diff --git a/example-data/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py similarity index 100% rename from example-data/sources/pango_lineages.py rename to collection-seeding/sources/pango_lineages.py diff --git a/example-data/sources/resistance_mutations.py 
b/collection-seeding/sources/resistance_mutations.py similarity index 100% rename from example-data/sources/resistance_mutations.py rename to collection-seeding/sources/resistance_mutations.py diff --git a/example-data/tests/__init__.py b/collection-seeding/tests/__init__.py similarity index 100% rename from example-data/tests/__init__.py rename to collection-seeding/tests/__init__.py diff --git a/example-data/tests/mock_source.py b/collection-seeding/tests/mock_source.py similarity index 100% rename from example-data/tests/mock_source.py rename to collection-seeding/tests/mock_source.py diff --git a/example-data/tests/test_backend.py b/collection-seeding/tests/test_backend.py similarity index 100% rename from example-data/tests/test_backend.py rename to collection-seeding/tests/test_backend.py diff --git a/example-data/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py similarity index 100% rename from example-data/tests/test_pango_lineages.py rename to collection-seeding/tests/test_pango_lineages.py diff --git a/example-data/tests/test_resistance_mutations.py b/collection-seeding/tests/test_resistance_mutations.py similarity index 100% rename from example-data/tests/test_resistance_mutations.py rename to collection-seeding/tests/test_resistance_mutations.py diff --git a/example-data/tests/test_seed.py b/collection-seeding/tests/test_seed.py similarity index 100% rename from example-data/tests/test_seed.py rename to collection-seeding/tests/test_seed.py From c168c0276d37618f0abbe663d9d012f8c40d5f2f Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 12:06:37 +0100 Subject: [PATCH 11/54] fix(ci): update example-data path to collection-seeding, remove unused SEED_USER_ID --- .github/workflows/example-data-seeder.yml | 2 +- docker-compose.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/example-data-seeder.yml b/.github/workflows/example-data-seeder.yml index 1b06a9f8c..4ea69aa21 100644 
--- a/.github/workflows/example-data-seeder.yml +++ b/.github/workflows/example-data-seeder.yml @@ -37,7 +37,7 @@ jobs: - name: Build and push image uses: docker/build-push-action@v7 with: - context: ./example-data + context: ./collection-seeding tags: ${{ steps.dockerMetadata.outputs.tags }} cache-from: type=gha,scope=example-data-seeder-${{ github.ref }} cache-to: type=gha,mode=max,scope=example-data-seeder-${{ github.ref }} diff --git a/docker-compose.yml b/docker-compose.yml index a5c499de7..3af0825fe 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,7 +42,6 @@ services: - backend environment: BACKEND_URL: http://backend:8080 - SEED_USER_ID: example-data-seeder restart: "no" volumes: From 2af00d3c9b596b1552775cbd05270a1c4bc398c7 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 12:08:45 +0100 Subject: [PATCH 12/54] fix(collection-seeding): use genspectrum-bot GitHub ID (218605180) for user sync --- collection-seeding/README.md | 4 +++- collection-seeding/backend.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 9d4b2a86b..288a42604 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -1,4 +1,4 @@ -# example-data +# collection-seeding Seeds the backend with example collections: @@ -7,6 +7,8 @@ Seeds the backend with example collections: The script is idempotent — re-running it will create new collections or update existing ones (matched by name). +Collections are seeded under the [genspectrum-bot](https://github.com/genspectrum-bot) account (GitHub ID `218605180`), which is upserted automatically via `POST /users/sync` before seeding. 
+ ## Via Docker Compose The seeder runs automatically as part of Docker Compose: diff --git a/collection-seeding/backend.py b/collection-seeding/backend.py index 5ce2740a8..96aad066d 100644 --- a/collection-seeding/backend.py +++ b/collection-seeding/backend.py @@ -9,7 +9,7 @@ RETRY_DELAY_S = 2 -SYNC_GITHUB_ID = "9999999999" +SYNC_GITHUB_ID = "218605180" # https://github.com/genspectrum-bot SYNC_NAME = "GenSpectrum Team" From bd97def95ef70573bae910ffa06bf54fd02a8c6c Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 7 May 2026 12:18:07 +0100 Subject: [PATCH 13/54] refactor(collection-seeding): add TypedDict types (Collection, Variant, FilterObject, ExistingCollection) --- collection-seeding/backend.py | 8 ++++-- collection-seeding/models.py | 27 ++++++++++++++++++ collection-seeding/seed.py | 3 +- collection-seeding/sources/pango_lineages.py | 28 ++++++++++--------- .../sources/resistance_mutations.py | 6 ++-- 5 files changed, 53 insertions(+), 19 deletions(-) create mode 100644 collection-seeding/models.py diff --git a/collection-seeding/backend.py b/collection-seeding/backend.py index 96aad066d..71440cedd 100644 --- a/collection-seeding/backend.py +++ b/collection-seeding/backend.py @@ -5,6 +5,8 @@ import requests +from models import Collection, ExistingCollection + RETRY_ATTEMPTS = 30 RETRY_DELAY_S = 2 @@ -44,21 +46,21 @@ def wait_for_backend(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_ ) sys.exit(1) - def fetch_existing_collections(self, organism: str) -> list[dict]: + def fetch_existing_collections(self, organism: str) -> list[ExistingCollection]: params = {"userId": self.user_id, "organism": organism} r = requests.get(self._collections_url, params=params, timeout=10) if not r.ok: raise RuntimeError(f"GET /collections failed: {r.status_code} {r.text}") return r.json() - def create_collection(self, collection: dict) -> str: + def create_collection(self, collection: Collection) -> int: params = {"userId": self.user_id} r = 
requests.post(self._collections_url, params=params, json=collection, timeout=10) if r.status_code != 201: raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}") return r.json()["id"] - def update_collection(self, collection_id: int, collection: dict) -> None: + def update_collection(self, collection_id: int, collection: Collection) -> None: params = {"userId": self.user_id} r = requests.put(f"{self._collections_url}/{collection_id}", params=params, json=collection, timeout=10) if not r.ok: diff --git a/collection-seeding/models.py b/collection-seeding/models.py new file mode 100644 index 000000000..2e0425561 --- /dev/null +++ b/collection-seeding/models.py @@ -0,0 +1,27 @@ +"""Shared type definitions for collection seeding.""" + +from typing import TypedDict + + +class FilterObject(TypedDict, total=False): + aminoAcidMutations: list[str] + nucleotideMutations: list[str] + + +class Variant(TypedDict): + type: str + name: str + filterObject: FilterObject + + +class Collection(TypedDict): + name: str + organism: str + description: str + variants: list[Variant] + + +class ExistingCollection(TypedDict): + """A collection as returned by the backend (includes the assigned id).""" + id: int + name: str diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index e94da987d..764bb126e 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -11,6 +11,7 @@ import sys from backend import BackendClient +from models import Collection from sources import pango_lineages, resistance_mutations ALL_SOURCES = [resistance_mutations, pango_lineages] @@ -60,7 +61,7 @@ def make_parser() -> argparse.ArgumentParser: return parser -def seed_source(client: BackendClient, source_name: str, collections: list[dict]): +def seed_source(client: BackendClient, source_name: str, collections: list[Collection]) -> tuple[int, int]: print(f"\n[{source_name}]") organisms = {} diff --git a/collection-seeding/sources/pango_lineages.py 
b/collection-seeding/sources/pango_lineages.py index 44c00070c..85c520252 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -5,6 +5,8 @@ import requests +from models import Collection, Variant + NAME = "covid-pango-lineages" DATA_URL = ( @@ -13,14 +15,14 @@ ) -def _build_collection(entry: dict) -> dict: - lineage = entry["lineage"] - parent = entry.get("parent") or "—" - clade = entry.get("nextstrainClade") or "—" - date = entry.get("designationDate") or "unknown" +def _build_collection(entry: dict) -> Collection: + lineage: str = entry["lineage"] + parent: str = entry.get("parent") or "—" + clade: str = entry.get("nextstrainClade") or "—" + date: str = entry.get("designationDate") or "unknown" subs = [s for s in entry.get("nucSubstitutions", []) if s] - variants = [ + variants: list[Variant] = [ { "type": "filterObject", "name": sub, @@ -36,15 +38,15 @@ def _build_collection(entry: dict) -> dict: f"Designated: {date}." ) - return { - "name": lineage, - "organism": "covid", - "description": description, - "variants": variants, - } + return Collection( + name=lineage, + organism="covid", + description=description, + variants=variants, + ) -def get_collections(limit: int = 0) -> list[dict]: +def get_collections(limit: int = 0) -> list[Collection]: print(f"Fetching lineage data from {DATA_URL} ...") response = requests.get(DATA_URL, timeout=60) response.raise_for_status() diff --git a/collection-seeding/sources/resistance_mutations.py b/collection-seeding/sources/resistance_mutations.py index ebdda993c..82e3b9b2d 100644 --- a/collection-seeding/sources/resistance_mutations.py +++ b/collection-seeding/sources/resistance_mutations.py @@ -4,6 +4,8 @@ as per the Stanford Coronavirus Antiviral & Resistance database. 
""" +from models import Collection, Variant + NAME = "covid-resistance-mutations" CLPRO_MUTATIONS = [ @@ -90,7 +92,7 @@ def _mature_name(mutation: str, set_name: str, offset: int) -> str: return f"{set_name}:{original_base}{position + offset}{new_base}" -def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[dict]: +def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[Variant]: return [ { "type": "filterObject", @@ -101,7 +103,7 @@ def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[di ] -def get_collections(limit: int = 0) -> list[dict]: +def get_collections(limit: int = 0) -> list[Collection]: return [ { "name": "3CLpro resistance mutations", From 5cf49fb2a721f64afa87bef17435784dad9a6326 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 12:09:24 +0200 Subject: [PATCH 14/54] refactor(collection-seeding): authenticate via API key through website proxy Replace the /users/sync + userId query param approach with a pre-provisioned API key sent as Authorization: Bearer. The seeder now talks to the website proxy (/api/collections) instead of the backend directly, removing the need for backend access inside docker-compose. 
Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/backend.py | 49 +++++--------- collection-seeding/seed.py | 20 +++--- collection-seeding/tests/test_backend.py | 81 +++++++----------------- docker-compose.yml | 5 +- 4 files changed, 54 insertions(+), 101 deletions(-) diff --git a/collection-seeding/backend.py b/collection-seeding/backend.py index 71440cedd..dae9afdf9 100644 --- a/collection-seeding/backend.py +++ b/collection-seeding/backend.py @@ -11,57 +11,42 @@ RETRY_DELAY_S = 2 -SYNC_GITHUB_ID = "218605180" # https://github.com/genspectrum-bot -SYNC_NAME = "GenSpectrum Team" - - class BackendClient: - def __init__(self, base_url: str): + def __init__(self, base_url: str, api_key: str): self.base_url = base_url.rstrip("/") - self.user_id: int | None = None - self._collections_url = f"{self.base_url}/collections" - - def sync_user(self, github_id: str = SYNC_GITHUB_ID, name: str = SYNC_NAME, email: str | None = None) -> int: - """Upsert the seed user and store the returned internal id.""" - body = {"githubId": github_id, "name": name, "email": email} - r = requests.post(f"{self.base_url}/users/sync", json=body, timeout=10) - if not r.ok: - raise RuntimeError(f"POST /users/sync failed: {r.status_code} {r.text}") - self.user_id = r.json()["id"] - return self.user_id + self._collections_url = f"{self.base_url}/api/collections" + self._auth_headers = {"Authorization": f"Bearer {api_key}"} - def wait_for_backend(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELAY_S): - """Poll until the backend is ready by repeatedly attempting user sync.""" + def wait_for_api(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELAY_S): + """Poll until the API is ready by checking the collections endpoint.""" for attempt in range(1, attempts + 1): try: - self.sync_user() - return - except (requests.RequestException, RuntimeError): + r = requests.get(self._collections_url, timeout=10) + if r.ok: + return + except requests.RequestException: pass - 
print(f"Waiting for backend... (attempt {attempt}/{attempts})") + print(f"Waiting for API... (attempt {attempt}/{attempts})") time.sleep(delay) print( - f"Backend at {self.base_url} did not become ready after {attempts} attempts.", + f"API at {self.base_url} did not become ready after {attempts} attempts.", file=sys.stderr, ) sys.exit(1) def fetch_existing_collections(self, organism: str) -> list[ExistingCollection]: - params = {"userId": self.user_id, "organism": organism} - r = requests.get(self._collections_url, params=params, timeout=10) + r = requests.get(self._collections_url, params={"organism": organism}, headers=self._auth_headers, timeout=10) if not r.ok: - raise RuntimeError(f"GET /collections failed: {r.status_code} {r.text}") + raise RuntimeError(f"GET /api/collections failed: {r.status_code} {r.text}") return r.json() def create_collection(self, collection: Collection) -> int: - params = {"userId": self.user_id} - r = requests.post(self._collections_url, params=params, json=collection, timeout=10) + r = requests.post(self._collections_url, headers=self._auth_headers, json=collection, timeout=10) if r.status_code != 201: - raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}") + raise RuntimeError(f"POST /api/collections failed: {r.status_code} {r.text}") return r.json()["id"] def update_collection(self, collection_id: int, collection: Collection) -> None: - params = {"userId": self.user_id} - r = requests.put(f"{self._collections_url}/{collection_id}", params=params, json=collection, timeout=10) + r = requests.put(f"{self._collections_url}/{collection_id}", headers=self._auth_headers, json=collection, timeout=10) if not r.ok: - raise RuntimeError(f"PUT /collections/{collection_id} failed: {r.status_code} {r.text}") + raise RuntimeError(f"PUT /api/collections/{collection_id} failed: {r.status_code} {r.text}") diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 764bb126e..dadb1bf49 100644 --- 
a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -22,14 +22,20 @@ def make_parser() -> argparse.ArgumentParser: parent = argparse.ArgumentParser(add_help=False) parent.add_argument( "-u", "--url", - default=os.environ.get("BACKEND_URL", "http://localhost:8080"), - help="Backend base URL (default: $BACKEND_URL or http://localhost:8080)", + default=os.environ.get("API_URL", "http://localhost:4321"), + help="API base URL (default: $API_URL or http://localhost:4321)", + ) + parent.add_argument( + "-k", "--api-key", + default=os.environ.get("API_KEY"), + required=not os.environ.get("API_KEY"), + help="API key for authentication (default: $API_KEY)", ) parent.add_argument( "--wait", action="store_true", default=not sys.stdout.isatty(), - help="Retry until backend is ready (auto-enabled when no TTY)", + help="Retry until API is ready (auto-enabled when no TTY)", ) parser = argparse.ArgumentParser( @@ -92,15 +98,11 @@ def main(): parser = make_parser() args = parser.parse_args() - client = BackendClient(args.url) + client = BackendClient(args.url, args.api_key) print(f"Seeding collections against {args.url} ...") if args.wait: - client.wait_for_backend() # syncs user as part of polling - else: - client.sync_user() - - print(f"Seeding as user id={client.user_id}.") + client.wait_for_api() lineage_limit = getattr(args, "limit", DEFAULT_LINEAGE_LIMIT) diff --git a/collection-seeding/tests/test_backend.py b/collection-seeding/tests/test_backend.py index cad1fd6cf..c980304ff 100644 --- a/collection-seeding/tests/test_backend.py +++ b/collection-seeding/tests/test_backend.py @@ -2,51 +2,22 @@ import pytest import responses as rsps_lib -from backend import BackendClient, SYNC_GITHUB_ID, SYNC_NAME +from backend import BackendClient BASE = "http://localhost:8080" -SYNC_URL = f"{BASE}/users/sync" -COLLECTIONS_URL = f"{BASE}/collections" +COLLECTIONS_URL = f"{BASE}/api/collections" +API_KEY = "test-api-key" @pytest.fixture def client(): - return BackendClient(BASE) - 
- -# --- sync_user --- - -@rsps_lib.activate -def test_sync_user_sets_user_id(client): - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 42}, status=200) - result = client.sync_user() - assert result == 42 - assert client.user_id == 42 - - -@rsps_lib.activate -def test_sync_user_sends_correct_body(client): - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 1}, status=200) - client.sync_user() - body = rsps_lib.calls[0].request.body - import json - parsed = json.loads(body) - assert parsed["githubId"] == SYNC_GITHUB_ID - assert parsed["name"] == SYNC_NAME - - -@rsps_lib.activate -def test_sync_user_raises_on_error(client): - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"error": "bad"}, status=500) - with pytest.raises(RuntimeError, match="POST /users/sync failed"): - client.sync_user() + return BackendClient(BASE, API_KEY) # --- fetch_existing_collections --- @rsps_lib.activate def test_fetch_existing_collections(client): - client.user_id = 7 existing = [{"id": 1, "name": "Col A"}, {"id": 2, "name": "Col B"}] rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=existing, status=200) @@ -54,15 +25,14 @@ def test_fetch_existing_collections(client): assert result == existing req = rsps_lib.calls[0].request - assert "userId=7" in req.url assert "organism=covid" in req.url + assert req.headers["Authorization"] == f"Bearer {API_KEY}" @rsps_lib.activate def test_fetch_existing_collections_raises_on_error(client): - client.user_id = 7 rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) - with pytest.raises(RuntimeError, match="GET /collections failed"): + with pytest.raises(RuntimeError, match="GET /api/collections failed"): client.fetch_existing_collections("covid") @@ -70,18 +40,17 @@ def test_fetch_existing_collections_raises_on_error(client): @rsps_lib.activate def test_create_collection_returns_id(client): - client.user_id = 7 rsps_lib.add(rsps_lib.POST, COLLECTIONS_URL, json={"id": 99}, status=201) col = {"name": "Test", "organism": "covid", "description": 
"", "variants": []} result = client.create_collection(col) assert result == 99 + assert rsps_lib.calls[0].request.headers["Authorization"] == f"Bearer {API_KEY}" @rsps_lib.activate def test_create_collection_raises_on_non_201(client): - client.user_id = 7 rsps_lib.add(rsps_lib.POST, COLLECTIONS_URL, json={}, status=200) - with pytest.raises(RuntimeError, match="POST /collections failed"): + with pytest.raises(RuntimeError, match="POST /api/collections failed"): client.create_collection({"name": "X", "organism": "covid", "description": "", "variants": []}) @@ -89,48 +58,44 @@ def test_create_collection_raises_on_non_201(client): @rsps_lib.activate def test_update_collection_puts_correct_url(client): - client.user_id = 7 rsps_lib.add(rsps_lib.PUT, f"{COLLECTIONS_URL}/55", json={}, status=200) col = {"name": "Updated", "organism": "covid", "description": "", "variants": []} client.update_collection(55, col) req = rsps_lib.calls[0].request - assert "/collections/55" in req.url - assert "userId=7" in req.url + assert "/api/collections/55" in req.url + assert req.headers["Authorization"] == f"Bearer {API_KEY}" @rsps_lib.activate def test_update_collection_raises_on_error(client): - client.user_id = 7 rsps_lib.add(rsps_lib.PUT, f"{COLLECTIONS_URL}/55", json={}, status=404) - with pytest.raises(RuntimeError, match="PUT /collections/55 failed"): + with pytest.raises(RuntimeError, match="PUT /api/collections/55 failed"): client.update_collection(55, {"name": "X", "organism": "covid", "description": "", "variants": []}) -# --- wait_for_backend --- +# --- wait_for_api --- @rsps_lib.activate -def test_wait_for_backend_succeeds_immediately(client): - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 5}, status=200) - client.wait_for_backend() - assert client.user_id == 5 +def test_wait_for_api_succeeds_immediately(client): + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=[], status=200) + client.wait_for_api() assert len(rsps_lib.calls) == 1 @rsps_lib.activate -def 
test_wait_for_backend_retries_then_succeeds(client): - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={}, status=500) - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={}, status=500) - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={"id": 8}, status=200) - client.wait_for_backend(attempts=5, delay=0) - assert client.user_id == 8 +def test_wait_for_api_retries_then_succeeds(client): + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=[], status=200) + client.wait_for_api(attempts=5, delay=0) assert len(rsps_lib.calls) == 3 @rsps_lib.activate -def test_wait_for_backend_exits_after_max_attempts(client): +def test_wait_for_api_exits_after_max_attempts(client): for _ in range(3): - rsps_lib.add(rsps_lib.POST, SYNC_URL, json={}, status=500) + rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) with pytest.raises(SystemExit) as exc: - client.wait_for_backend(attempts=3, delay=0) + client.wait_for_api(attempts=3, delay=0) assert exc.value.code == 1 diff --git a/docker-compose.yml b/docker-compose.yml index 3af0825fe..c1e857396 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,9 +39,10 @@ services: example-data-seeder: image: ghcr.io/genspectrum/dashboards/example-data-seeder:${SEEDER_TAG} depends_on: - - backend + - website environment: - BACKEND_URL: http://backend:8080 + API_URL: http://website:4321 + API_KEY: ${SEEDER_API_KEY} restart: "no" volumes: From 0faea388ad742f2d300fe0660f2386698e038033 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 15:23:17 +0200 Subject: [PATCH 15/54] refactor(collection-seeding): rename backend.py to api.py, BackendClient to ApiClient Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/{backend.py => api.py} | 2 +- collection-seeding/seed.py | 6 +++--- collection-seeding/tests/test_backend.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) rename 
collection-seeding/{backend.py => api.py} (99%) diff --git a/collection-seeding/backend.py b/collection-seeding/api.py similarity index 99% rename from collection-seeding/backend.py rename to collection-seeding/api.py index dae9afdf9..9673e99af 100644 --- a/collection-seeding/backend.py +++ b/collection-seeding/api.py @@ -11,7 +11,7 @@ RETRY_DELAY_S = 2 -class BackendClient: +class ApiClient: def __init__(self, base_url: str, api_key: str): self.base_url = base_url.rstrip("/") self._collections_url = f"{self.base_url}/api/collections" diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index dadb1bf49..7ad1cb943 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -10,7 +10,7 @@ import os import sys -from backend import BackendClient +from api import ApiClient from models import Collection from sources import pango_lineages, resistance_mutations @@ -67,7 +67,7 @@ def make_parser() -> argparse.ArgumentParser: return parser -def seed_source(client: BackendClient, source_name: str, collections: list[Collection]) -> tuple[int, int]: +def seed_source(client: ApiClient, source_name: str, collections: list[Collection]) -> tuple[int, int]: print(f"\n[{source_name}]") organisms = {} @@ -98,7 +98,7 @@ def main(): parser = make_parser() args = parser.parse_args() - client = BackendClient(args.url, args.api_key) + client = ApiClient(args.url, args.api_key) print(f"Seeding collections against {args.url} ...") if args.wait: diff --git a/collection-seeding/tests/test_backend.py b/collection-seeding/tests/test_backend.py index c980304ff..1b842ebae 100644 --- a/collection-seeding/tests/test_backend.py +++ b/collection-seeding/tests/test_backend.py @@ -2,7 +2,7 @@ import pytest import responses as rsps_lib -from backend import BackendClient +from api import ApiClient BASE = "http://localhost:8080" COLLECTIONS_URL = f"{BASE}/api/collections" @@ -11,7 +11,7 @@ @pytest.fixture def client(): - return BackendClient(BASE, API_KEY) + return 
ApiClient(BASE, API_KEY) # --- fetch_existing_collections --- From f09b3600605950cf898303f6ebe51fa245867750 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 15:33:36 +0200 Subject: [PATCH 16/54] refactor(collection-seeding): rename test_backend.py to test_api.py, document name-matching caveat Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 2 +- collection-seeding/seed.py | 2 + collection-seeding/tests/test_backend.py | 101 ----------------------- 3 files changed, 3 insertions(+), 102 deletions(-) delete mode 100644 collection-seeding/tests/test_backend.py diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 288a42604..f2b991bbc 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -5,7 +5,7 @@ Seeds the backend with example collections: - **covid-resistance-mutations** — resistance mutation data for 3CLpro, RdRp, and Spike mAb - **covid-pango-lineages** — one collection per pango lineage, with nucleotide substitutions as variants -The script is idempotent — re-running it will create new collections or update existing ones (matched by name). +The script is idempotent — re-running it will create new collections or update existing ones (matched by name). If a collection's name changes in the source, the old entry is orphaned and a new one is created. Collections are seeded under the [genspectrum-bot](https://github.com/genspectrum-bot) account (GitHub ID `218605180`), which is upserted automatically via `POST /users/sync` before seeding. diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 7ad1cb943..15544464c 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -68,6 +68,8 @@ def make_parser() -> argparse.ArgumentParser: def seed_source(client: ApiClient, source_name: str, collections: list[Collection]) -> tuple[int, int]: + """Upsert collections for one source, grouped by organism. Returns (created, updated) counts. 
+ Matching is by name — if a collection's name changes in the source, the old entry is orphaned and a new one is created.""" print(f"\n[{source_name}]") organisms = {} diff --git a/collection-seeding/tests/test_backend.py b/collection-seeding/tests/test_backend.py deleted file mode 100644 index 1b842ebae..000000000 --- a/collection-seeding/tests/test_backend.py +++ /dev/null @@ -1,101 +0,0 @@ -import sys -import pytest -import responses as rsps_lib - -from api import ApiClient - -BASE = "http://localhost:8080" -COLLECTIONS_URL = f"{BASE}/api/collections" -API_KEY = "test-api-key" - - -@pytest.fixture -def client(): - return ApiClient(BASE, API_KEY) - - -# --- fetch_existing_collections --- - -@rsps_lib.activate -def test_fetch_existing_collections(client): - existing = [{"id": 1, "name": "Col A"}, {"id": 2, "name": "Col B"}] - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=existing, status=200) - - result = client.fetch_existing_collections("covid") - - assert result == existing - req = rsps_lib.calls[0].request - assert "organism=covid" in req.url - assert req.headers["Authorization"] == f"Bearer {API_KEY}" - - -@rsps_lib.activate -def test_fetch_existing_collections_raises_on_error(client): - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) - with pytest.raises(RuntimeError, match="GET /api/collections failed"): - client.fetch_existing_collections("covid") - - -# --- create_collection --- - -@rsps_lib.activate -def test_create_collection_returns_id(client): - rsps_lib.add(rsps_lib.POST, COLLECTIONS_URL, json={"id": 99}, status=201) - col = {"name": "Test", "organism": "covid", "description": "", "variants": []} - result = client.create_collection(col) - assert result == 99 - assert rsps_lib.calls[0].request.headers["Authorization"] == f"Bearer {API_KEY}" - - -@rsps_lib.activate -def test_create_collection_raises_on_non_201(client): - rsps_lib.add(rsps_lib.POST, COLLECTIONS_URL, json={}, status=200) - with pytest.raises(RuntimeError, match="POST 
/api/collections failed"): - client.create_collection({"name": "X", "organism": "covid", "description": "", "variants": []}) - - -# --- update_collection --- - -@rsps_lib.activate -def test_update_collection_puts_correct_url(client): - rsps_lib.add(rsps_lib.PUT, f"{COLLECTIONS_URL}/55", json={}, status=200) - col = {"name": "Updated", "organism": "covid", "description": "", "variants": []} - client.update_collection(55, col) - - req = rsps_lib.calls[0].request - assert "/api/collections/55" in req.url - assert req.headers["Authorization"] == f"Bearer {API_KEY}" - - -@rsps_lib.activate -def test_update_collection_raises_on_error(client): - rsps_lib.add(rsps_lib.PUT, f"{COLLECTIONS_URL}/55", json={}, status=404) - with pytest.raises(RuntimeError, match="PUT /api/collections/55 failed"): - client.update_collection(55, {"name": "X", "organism": "covid", "description": "", "variants": []}) - - -# --- wait_for_api --- - -@rsps_lib.activate -def test_wait_for_api_succeeds_immediately(client): - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=[], status=200) - client.wait_for_api() - assert len(rsps_lib.calls) == 1 - - -@rsps_lib.activate -def test_wait_for_api_retries_then_succeeds(client): - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json=[], status=200) - client.wait_for_api(attempts=5, delay=0) - assert len(rsps_lib.calls) == 3 - - -@rsps_lib.activate -def test_wait_for_api_exits_after_max_attempts(client): - for _ in range(3): - rsps_lib.add(rsps_lib.GET, COLLECTIONS_URL, json={}, status=500) - with pytest.raises(SystemExit) as exc: - client.wait_for_api(attempts=3, delay=0) - assert exc.value.code == 1 From 65f6b849d90f468baa77e994162fda1383fa2385 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 15:38:56 +0200 Subject: [PATCH 17/54] chore(collection-seeding): pin pixi dependencies, fix README default URL 
Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 2 +- collection-seeding/pixi.toml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index f2b991bbc..8351cd7a1 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -37,7 +37,7 @@ pixi run seed-all-lineages # all ~4976 pango lineages To target a different backend: ```bash -pixi run seed --url http://localhost:9021 +pixi run seed --url http://localhost:4321 ``` Run `pixi run seed --help` or `pixi run seed --help` for all options. diff --git a/collection-seeding/pixi.toml b/collection-seeding/pixi.toml index 7b6794266..bd7fe3ca5 100644 --- a/collection-seeding/pixi.toml +++ b/collection-seeding/pixi.toml @@ -8,7 +8,7 @@ platforms = ["linux-64", "osx-arm64", "osx-64", "linux-aarch64"] python = "3.13.*" [pypi-dependencies] -requests = "*" +requests = ">=2.33.1" [tasks] seed = "python seed.py" @@ -17,8 +17,8 @@ seed-all-lineages = "python seed.py covid-pango-lineages --limit 0" seed-resistance = "python seed.py covid-resistance-mutations" [feature.test.pypi-dependencies] -pytest = "*" -responses = "*" +pytest = ">=9.0.3" +responses = ">=0.26.0" [feature.test.tasks] test = "pytest" From 74c63a68b91d4c91efc51a8db30f674ab5d34562 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 15:47:32 +0200 Subject: [PATCH 18/54] =?UTF-8?q?fix(collection-seeding):=20fix=20Dockerfi?= =?UTF-8?q?le=20COPY=20after=20backend.py=E2=86=92api.py=20rename,=20add?= =?UTF-8?q?=20missing=20models.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collection-seeding/Dockerfile b/collection-seeding/Dockerfile index 72c322b8c..21727058c 100644 --- a/collection-seeding/Dockerfile +++ b/collection-seeding/Dockerfile @@ -9,6 +9,6 @@ FROM 
python:3.13-slim AS final WORKDIR /app COPY --from=builder /app/.pixi/envs/default/lib/python3.13/site-packages \ /usr/local/lib/python3.13/site-packages -COPY seed.py backend.py . +COPY seed.py api.py models.py . COPY sources/ sources/ CMD ["python", "seed.py"] From 7646933b4543aa757ef4a8c923027c44d1f8dc8d Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 15:56:10 +0200 Subject: [PATCH 19/54] feat(collection-seeding): add repeat loop via REPEAT_INTERVAL_HOURS env var, run every 8h in compose Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 18 +++++++++++++----- docker-compose.yml | 3 ++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 15544464c..ffd49d079 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -9,6 +9,7 @@ import argparse import os import sys +import time from api import ApiClient from models import Collection @@ -132,8 +133,15 @@ def main(): if __name__ == "__main__": - try: - main() - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - sys.exit(1) + repeat_hours = os.environ.get("REPEAT_INTERVAL_HOURS") + while True: + try: + main() + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + if not repeat_hours: + sys.exit(1) + if not repeat_hours: + break + print(f"\nSleeping for {repeat_hours}h ...") + time.sleep(float(repeat_hours) * 3600) diff --git a/docker-compose.yml b/docker-compose.yml index c1e857396..9d4562443 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,7 +43,8 @@ services: environment: API_URL: http://website:4321 API_KEY: ${SEEDER_API_KEY} - restart: "no" + REPEAT_INTERVAL_HOURS: 8 + restart: always volumes: database-data: From 8b3e342669c99c7ece0016f02e8a841c2dfbb6fa Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:00:48 +0200 Subject: [PATCH 20/54] docs(collection-seeding): document REPEAT_INTERVAL_HOURS in README Co-Authored-By: Claude 
Sonnet 4.6 --- collection-seeding/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 8351cd7a1..1c1c3380a 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -7,6 +7,8 @@ Seeds the backend with example collections: The script is idempotent — re-running it will create new collections or update existing ones (matched by name). If a collection's name changes in the source, the old entry is orphaned and a new one is created. +Set `REPEAT_INTERVAL_HOURS` to run on a loop (e.g. `REPEAT_INTERVAL_HOURS=8` re-seeds every 8 hours). Without it, the script runs once and exits. + Collections are seeded under the [genspectrum-bot](https://github.com/genspectrum-bot) account (GitHub ID `218605180`), which is upserted automatically via `POST /users/sync` before seeding. ## Via Docker Compose From 578fc24d60ff439ec32fbb8db1e6cc9add2321b0 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:19:09 +0200 Subject: [PATCH 21/54] refactor(collection-seeding): convert sources to classes with Source ABC Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 44 +++++------- collection-seeding/sources/__init__.py | 11 +++ collection-seeding/sources/pango_lineages.py | 32 +++++---- .../sources/resistance_mutations.py | 72 ++++++++++--------- collection-seeding/tests/mock_source.py | 15 ++-- .../tests/test_pango_lineages.py | 13 ++-- .../tests/test_resistance_mutations.py | 14 ++-- collection-seeding/tests/test_seed.py | 20 +++--- 8 files changed, 117 insertions(+), 104 deletions(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index ffd49d079..64c242b20 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -13,10 +13,9 @@ from api import ApiClient from models import Collection -from sources import pango_lineages, resistance_mutations - -ALL_SOURCES = [resistance_mutations, pango_lineages] -DEFAULT_LINEAGE_LIMIT = 10 
+from sources import Source +from sources.pango_lineages import PangoLineagesSource, DEFAULT_LIMIT +from sources.resistance_mutations import ResistanceMutationsSource def make_parser() -> argparse.ArgumentParser: @@ -47,33 +46,34 @@ def make_parser() -> argparse.ArgumentParser: subparsers = parser.add_subparsers(dest="source", metavar="source") subparsers.add_parser( - resistance_mutations.NAME, + ResistanceMutationsSource.name, parents=[parent], help="Seed SARS-CoV-2 antiviral resistance mutation collections", ) lineages_parser = subparsers.add_parser( - pango_lineages.NAME, + PangoLineagesSource.name, parents=[parent], help="Seed pango lineage collections", ) lineages_parser.add_argument( "--limit", type=int, - default=DEFAULT_LINEAGE_LIMIT, + default=DEFAULT_LIMIT, metavar="N", - help=f"Only process the first N lineages (default: {DEFAULT_LINEAGE_LIMIT}; 0 = all)", + help=f"Only process the first N lineages (default: {DEFAULT_LIMIT}; 0 = all)", ) return parser -def seed_source(client: ApiClient, source_name: str, collections: list[Collection]) -> tuple[int, int]: +def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: """Upsert collections for one source, grouped by organism. Returns (created, updated) counts. 
Matching is by name — if a collection's name changes in the source, the old entry is orphaned and a new one is created.""" - print(f"\n[{source_name}]") + collections = source.get_collections() + print(f"\n[{source.name}]") - organisms = {} + organisms: dict[str, list[Collection]] = {} for c in collections: organisms.setdefault(c["organism"], []).append(c) @@ -107,24 +107,18 @@ def main(): if args.wait: client.wait_for_api() - lineage_limit = getattr(args, "limit", DEFAULT_LINEAGE_LIMIT) + lineage_limit = getattr(args, "limit", DEFAULT_LIMIT) - if args.source == resistance_mutations.NAME: - active = [(resistance_mutations, {})] - elif args.source == pango_lineages.NAME: - active = [(pango_lineages, {"limit": lineage_limit})] - else: - # No subcommand: run all sources - active = [ - (resistance_mutations, {}), - (pango_lineages, {"limit": lineage_limit}), - ] + source_map: dict[str, Source] = { + ResistanceMutationsSource.name: ResistanceMutationsSource(), + PangoLineagesSource.name: PangoLineagesSource(limit=lineage_limit), + } + active = [source_map[args.source]] if args.source else list(source_map.values()) total_created = 0 total_updated = 0 - for source, kwargs in active: - collections = source.get_collections(**kwargs) - c, u = seed_source(client, source.NAME, collections) + for source in active: + c, u = seed_source(client, source) total_created += c total_updated += u diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index e69de29bb..ebecccae7 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -0,0 +1,11 @@ +from abc import ABC, abstractmethod + +from models import Collection + + +class Source(ABC): + name: str + + @abstractmethod + def get_collections(self) -> list[Collection]: + ... 
diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index 85c520252..7737cee50 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -6,14 +6,15 @@ import requests from models import Collection, Variant - -NAME = "covid-pango-lineages" +from sources import Source DATA_URL = ( "https://raw.githubusercontent.com/corneliusroemer/pango-sequences" "/refs/heads/main/data/pango-consensus-sequences_summary.json" ) +DEFAULT_LIMIT = 10 + def _build_collection(entry: dict) -> Collection: lineage: str = entry["lineage"] @@ -46,15 +47,20 @@ def _build_collection(entry: dict) -> Collection: ) -def get_collections(limit: int = 0) -> list[Collection]: - print(f"Fetching lineage data from {DATA_URL} ...") - response = requests.get(DATA_URL, timeout=60) - response.raise_for_status() - entries = list(response.json().values()) - if limit: - entries = entries[:limit] - print(f" Loaded {len(entries)} lineage(s).") +class PangoLineagesSource(Source): + name = "covid-pango-lineages" + + def __init__(self, limit: int = DEFAULT_LIMIT): + self._limit = limit - collections = [_build_collection(e) for e in entries] - # Drop lineages that ended up with no variants after filtering blank subs - return [c for c in collections if c["variants"]] + def get_collections(self) -> list[Collection]: + print(f"Fetching lineage data from {DATA_URL} ...") + response = requests.get(DATA_URL, timeout=60) + response.raise_for_status() + entries = list(response.json().values()) + if self._limit: + entries = entries[:self._limit] + print(f" Loaded {len(entries)} lineage(s).") + collections = [_build_collection(e) for e in entries] + # Drop lineages that ended up with no variants after filtering blank subs + return [c for c in collections if c["variants"]] diff --git a/collection-seeding/sources/resistance_mutations.py b/collection-seeding/sources/resistance_mutations.py index 82e3b9b2d..39e15e39a 100644 
--- a/collection-seeding/sources/resistance_mutations.py +++ b/collection-seeding/sources/resistance_mutations.py @@ -5,8 +5,7 @@ """ from models import Collection, Variant - -NAME = "covid-resistance-mutations" +from sources import Source CLPRO_MUTATIONS = [ 'ORF1a:T3284I', 'ORF1a:T3288A', 'ORF1a:T3288N', 'ORF1a:T3308I', 'ORF1a:D3311Y', @@ -103,36 +102,39 @@ def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[Va ] -def get_collections(limit: int = 0) -> list[Collection]: - return [ - { - "name": "3CLpro resistance mutations", - "organism": "covid", - "description": ( - "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " - "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." - ), - "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), - }, - { - "name": "RdRp resistance mutations", - "organism": "covid", - "description": ( - "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " - "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." - ), - "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), - }, - { - "name": "Spike mAb resistance mutations", - "organism": "covid", - "description": ( - "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " - "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." - ), - "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), - }, - ] +class ResistanceMutationsSource(Source): + name = "covid-resistance-mutations" + + def get_collections(self) -> list[Collection]: + return [ + { + "name": "3CLpro resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." 
+ ), + "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), + }, + { + "name": "RdRp resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), + }, + { + "name": "Spike mAb resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), + }, + ] diff --git a/collection-seeding/tests/mock_source.py b/collection-seeding/tests/mock_source.py index 4b17db6b4..fb6b76779 100644 --- a/collection-seeding/tests/mock_source.py +++ b/collection-seeding/tests/mock_source.py @@ -1,8 +1,9 @@ """Mock data source for use in tests.""" -NAME = "mock-source" +from models import Collection +from sources import Source -COLLECTIONS = [ +COLLECTIONS: list[Collection] = [ { "name": "Mock Collection A", "organism": "covid", @@ -18,5 +19,11 @@ ] -def get_collections(limit: int = 0) -> list[dict]: - return list(COLLECTIONS) +class MockSource(Source): + name = "mock-source" + + def __init__(self, collections: list[Collection] | None = None): + self._collections = list(COLLECTIONS) if collections is None else collections + + def get_collections(self) -> list[Collection]: + return self._collections diff --git a/collection-seeding/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py index 5820ae998..70c1383f2 100644 --- a/collection-seeding/tests/test_pango_lineages.py +++ b/collection-seeding/tests/test_pango_lineages.py @@ -1,7 +1,6 @@ -import json import responses as rsps_lib -from sources.pango_lineages import _build_collection, get_collections, DATA_URL, NAME +from 
sources.pango_lineages import PangoLineagesSource, _build_collection, DATA_URL SAMPLE_DATA = { "BA.2": { @@ -32,7 +31,7 @@ def test_name(): - assert NAME == "covid-pango-lineages" + assert PangoLineagesSource.name == "covid-pango-lineages" # --- _build_collection --- @@ -79,7 +78,7 @@ def test_build_collection_missing_fields_use_defaults(): @rsps_lib.activate def test_get_collections_fetches_data_url(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - get_collections() + PangoLineagesSource(limit=0).get_collections() assert len(rsps_lib.calls) == 1 assert rsps_lib.calls[0].request.url == DATA_URL @@ -87,7 +86,7 @@ def test_get_collections_fetches_data_url(): @rsps_lib.activate def test_get_collections_excludes_empty_variants(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - cols = get_collections() + cols = PangoLineagesSource(limit=0).get_collections() # XBB has only blank subs → should be excluded names = [c["name"] for c in cols] assert "XBB" not in names @@ -96,13 +95,13 @@ def test_get_collections_excludes_empty_variants(): @rsps_lib.activate def test_get_collections_respects_limit(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - cols = get_collections(limit=1) + cols = PangoLineagesSource(limit=1).get_collections() assert len(cols) <= 1 @rsps_lib.activate def test_get_collections_no_limit_returns_all_valid(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - cols = get_collections(limit=0) + cols = PangoLineagesSource(limit=0).get_collections() # BA.2 and BA.5 have valid subs; XBB does not assert len(cols) == 2 diff --git a/collection-seeding/tests/test_resistance_mutations.py b/collection-seeding/tests/test_resistance_mutations.py index cfa48c234..84a7754fa 100644 --- a/collection-seeding/tests/test_resistance_mutations.py +++ b/collection-seeding/tests/test_resistance_mutations.py @@ -1,8 +1,8 @@ -from sources.resistance_mutations import _mature_name, get_collections, NAME 
+from sources.resistance_mutations import ResistanceMutationsSource, _mature_name def test_name(): - assert NAME == "covid-resistance-mutations" + assert ResistanceMutationsSource.name == "covid-resistance-mutations" # --- _mature_name --- @@ -29,23 +29,19 @@ def test_mature_name_deletion(): # --- get_collections --- def test_get_collections_returns_three(): - cols = get_collections() + cols = ResistanceMutationsSource().get_collections() assert len(cols) == 3 def test_get_collections_all_covid(): - for col in get_collections(): + for col in ResistanceMutationsSource().get_collections(): assert col["organism"] == "covid" def test_get_collections_variant_structure(): - for col in get_collections(): + for col in ResistanceMutationsSource().get_collections(): assert col["variants"], f"'{col['name']}' has no variants" for v in col["variants"]: assert v["type"] == "filterObject" assert "aminoAcidMutations" in v["filterObject"] assert len(v["filterObject"]["aminoAcidMutations"]) == 1 - - -def test_get_collections_limit_ignored(): - assert get_collections(limit=1) == get_collections(limit=0) diff --git a/collection-seeding/tests/test_seed.py b/collection-seeding/tests/test_seed.py index 139d66d4f..6e98c56da 100644 --- a/collection-seeding/tests/test_seed.py +++ b/collection-seeding/tests/test_seed.py @@ -1,7 +1,7 @@ -from unittest.mock import MagicMock, call +from unittest.mock import MagicMock from seed import seed_source -from tests.mock_source import COLLECTIONS +from tests.mock_source import COLLECTIONS, MockSource def make_client(existing=None): @@ -15,7 +15,7 @@ def make_client(existing=None): def test_all_new_creates_all(): client = make_client(existing=[]) - created, updated = seed_source(client, "mock-source", list(COLLECTIONS)) + created, updated = seed_source(client, MockSource()) assert created == len(COLLECTIONS) assert updated == 0 assert client.create_collection.call_count == len(COLLECTIONS) @@ -25,7 +25,7 @@ def test_all_new_creates_all(): def 
test_all_existing_updates_all(): existing = [{"id": i + 1, "name": c["name"]} for i, c in enumerate(COLLECTIONS)] client = make_client(existing=existing) - created, updated = seed_source(client, "mock-source", list(COLLECTIONS)) + created, updated = seed_source(client, MockSource()) assert created == 0 assert updated == len(COLLECTIONS) assert client.update_collection.call_count == len(COLLECTIONS) @@ -33,10 +33,9 @@ def test_all_existing_updates_all(): def test_mixed_creates_and_updates(): - # Only the first collection already exists existing = [{"id": 10, "name": COLLECTIONS[0]["name"]}] client = make_client(existing=existing) - created, updated = seed_source(client, "mock-source", list(COLLECTIONS)) + created, updated = seed_source(client, MockSource()) assert created == len(COLLECTIONS) - 1 assert updated == 1 @@ -44,24 +43,23 @@ def test_mixed_creates_and_updates(): def test_update_uses_correct_id(): existing = [{"id": 42, "name": COLLECTIONS[0]["name"]}] client = make_client(existing=existing) - seed_source(client, "mock-source", [COLLECTIONS[0]]) + seed_source(client, MockSource(collections=[COLLECTIONS[0]])) client.update_collection.assert_called_once_with(42, COLLECTIONS[0]) def test_create_passes_full_collection(): client = make_client(existing=[]) - seed_source(client, "mock-source", [COLLECTIONS[0]]) + seed_source(client, MockSource(collections=[COLLECTIONS[0]])) client.create_collection.assert_called_once_with(COLLECTIONS[0]) def test_fetch_called_once_per_organism(): - # Two collections with different organisms multi = [ {**COLLECTIONS[0], "organism": "covid"}, {**COLLECTIONS[1], "organism": "mpox"}, ] client = make_client(existing=[]) - seed_source(client, "mock-source", multi) + seed_source(client, MockSource(collections=multi)) assert client.fetch_existing_collections.call_count == 2 organisms_fetched = {c.args[0] for c in client.fetch_existing_collections.call_args_list} assert organisms_fetched == {"covid", "mpox"} @@ -69,6 +67,6 @@ def 
test_fetch_called_once_per_organism(): def test_returns_zero_counts_for_empty_collections(): client = make_client(existing=[]) - created, updated = seed_source(client, "mock-source", []) + created, updated = seed_source(client, MockSource(collections=[])) assert created == 0 assert updated == 0 From a35bb0e558beac19d019b2c1d364648a64e4576d Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:22:09 +0200 Subject: [PATCH 22/54] README --- collection-seeding/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 1c1c3380a..9a06aa9c4 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -9,8 +9,6 @@ The script is idempotent — re-running it will create new collections or update Set `REPEAT_INTERVAL_HOURS` to run on a loop (e.g. `REPEAT_INTERVAL_HOURS=8` re-seeds every 8 hours). Without it, the script runs once and exits. -Collections are seeded under the [genspectrum-bot](https://github.com/genspectrum-bot) account (GitHub ID `218605180`), which is upserted automatically via `POST /users/sync` before seeding. 
- ## Via Docker Compose The seeder runs automatically as part of Docker Compose: From 8932f9a729a4a6a6bdef6c26fdebb73c40404c59 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:29:10 +0200 Subject: [PATCH 23/54] refactor(collection-seeding): move _build_collection into class, default to no lineage limit, add sample task Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 6 +- collection-seeding/pixi.toml | 2 +- collection-seeding/sources/pango_lineages.py | 66 +++++++++---------- .../tests/test_pango_lineages.py | 12 ++-- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 9a06aa9c4..5d56f239c 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -28,10 +28,10 @@ pixi install Then use the provided tasks: ```bash -pixi run seed # all sources (resistance mutations + first 10 lineages) +pixi run seed # all sources (resistance mutations + all pango lineages) pixi run seed-resistance # resistance mutations only -pixi run seed-lineages # pango lineages (first 10) -pixi run seed-all-lineages # all ~4976 pango lineages +pixi run seed-lineages # pango lineages only +pixi run seed-lineages-sample # first 10 pango lineages (quick test) ``` To target a different backend: diff --git a/collection-seeding/pixi.toml b/collection-seeding/pixi.toml index bd7fe3ca5..5d75c7750 100644 --- a/collection-seeding/pixi.toml +++ b/collection-seeding/pixi.toml @@ -13,7 +13,7 @@ requests = ">=2.33.1" [tasks] seed = "python seed.py" seed-lineages = "python seed.py covid-pango-lineages" -seed-all-lineages = "python seed.py covid-pango-lineages --limit 0" +seed-lineages-sample = "python seed.py covid-pango-lineages --limit 10" seed-resistance = "python seed.py covid-resistance-mutations" [feature.test.pypi-dependencies] diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index 7737cee50..04415326d 100644 
--- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -13,38 +13,7 @@ "/refs/heads/main/data/pango-consensus-sequences_summary.json" ) -DEFAULT_LIMIT = 10 - - -def _build_collection(entry: dict) -> Collection: - lineage: str = entry["lineage"] - parent: str = entry.get("parent") or "—" - clade: str = entry.get("nextstrainClade") or "—" - date: str = entry.get("designationDate") or "unknown" - - subs = [s for s in entry.get("nucSubstitutions", []) if s] - variants: list[Variant] = [ - { - "type": "filterObject", - "name": sub, - "filterObject": {"nucleotideMutations": [sub]}, - } - for sub in subs - ] - - description = ( - f"Pango lineage {lineage}. " - f"Parent: {parent}. " - f"Nextstrain clade: {clade}. " - f"Designated: {date}." - ) - - return Collection( - name=lineage, - organism="covid", - description=description, - variants=variants, - ) +DEFAULT_LIMIT = 0 class PangoLineagesSource(Source): @@ -61,6 +30,37 @@ def get_collections(self) -> list[Collection]: if self._limit: entries = entries[:self._limit] print(f" Loaded {len(entries)} lineage(s).") - collections = [_build_collection(e) for e in entries] + collections = [self._build_collection(e) for e in entries] # Drop lineages that ended up with no variants after filtering blank subs return [c for c in collections if c["variants"]] + + @staticmethod + def _build_collection(entry: dict) -> Collection: + lineage: str = entry["lineage"] + parent: str = entry.get("parent") or "—" + clade: str = entry.get("nextstrainClade") or "—" + date: str = entry.get("designationDate") or "unknown" + + subs = [s for s in entry.get("nucSubstitutions", []) if s] + variants: list[Variant] = [ + { + "type": "filterObject", + "name": sub, + "filterObject": {"nucleotideMutations": [sub]}, + } + for sub in subs + ] + + description = ( + f"Pango lineage {lineage}. " + f"Parent: {parent}. " + f"Nextstrain clade: {clade}. " + f"Designated: {date}." 
+ ) + + return Collection( + name=lineage, + organism="covid", + description=description, + variants=variants, + ) diff --git a/collection-seeding/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py index 70c1383f2..2c3146996 100644 --- a/collection-seeding/tests/test_pango_lineages.py +++ b/collection-seeding/tests/test_pango_lineages.py @@ -1,6 +1,6 @@ import responses as rsps_lib -from sources.pango_lineages import PangoLineagesSource, _build_collection, DATA_URL +from sources.pango_lineages import PangoLineagesSource, DATA_URL SAMPLE_DATA = { "BA.2": { @@ -37,13 +37,13 @@ def test_name(): # --- _build_collection --- def test_build_collection_basic(): - col = _build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) assert col["name"] == "BA.2" assert col["organism"] == "covid" def test_build_collection_description_format(): - col = _build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) assert "BA.2" in col["description"] assert "BA" in col["description"] # parent assert "22C" in col["description"] # clade @@ -51,7 +51,7 @@ def test_build_collection_description_format(): def test_build_collection_filters_blank_subs(): - col = _build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) # nucSubstitutions has ["C241T", "A23403G", ""] — blank should be dropped assert len(col["variants"]) == 2 names = [v["name"] for v in col["variants"]] @@ -60,7 +60,7 @@ def test_build_collection_filters_blank_subs(): def test_build_collection_variant_structure(): - col = _build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) for v in col["variants"]: assert v["type"] == "filterObject" assert "nucleotideMutations" in v["filterObject"] @@ -68,7 +68,7 @@ def test_build_collection_variant_structure(): def test_build_collection_missing_fields_use_defaults(): - col = 
_build_collection(SAMPLE_DATA["XBB"]) + col = PangoLineagesSource._build_collection(SAMPLE_DATA["XBB"]) assert "—" in col["description"] # parent and clade fallback assert "unknown" in col["description"] # date fallback From de989d65029286a6051c25fd9d2349ad21888247 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:31:36 +0200 Subject: [PATCH 24/54] docs(collection-seeding): add docstring to Source ABC Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/sources/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index ebecccae7..3715d3e87 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -4,6 +4,11 @@ class Source(ABC): + """A data source that produces collections to be seeded into the backend. + + Implement this to add a new source: set a unique `name` (used as the CLI subcommand) + and implement `get_collections` to return the collections to upsert. 
+ """ name: str @abstractmethod From 33233a5c892a3ae189570464026da4c4a9b2ccf2 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:37:45 +0200 Subject: [PATCH 25/54] refactor(collection-seeding): use None for no limit, move docstrings to classes, reorder resistance_mutations.py Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 8 +- collection-seeding/sources/pango_lineages.py | 16 +-- .../sources/resistance_mutations.py | 134 +++++++++--------- .../tests/test_pango_lineages.py | 6 +- 4 files changed, 80 insertions(+), 84 deletions(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 64c242b20..4ffd29598 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -14,7 +14,7 @@ from api import ApiClient from models import Collection from sources import Source -from sources.pango_lineages import PangoLineagesSource, DEFAULT_LIMIT +from sources.pango_lineages import PangoLineagesSource from sources.resistance_mutations import ResistanceMutationsSource @@ -59,9 +59,9 @@ def make_parser() -> argparse.ArgumentParser: lineages_parser.add_argument( "--limit", type=int, - default=DEFAULT_LIMIT, + default=None, metavar="N", - help=f"Only process the first N lineages (default: {DEFAULT_LIMIT}; 0 = all)", + help="Only process the first N lineages (default: all)", ) return parser @@ -107,7 +107,7 @@ def main(): if args.wait: client.wait_for_api() - lineage_limit = getattr(args, "limit", DEFAULT_LIMIT) + lineage_limit = getattr(args, "limit", None) source_map: dict[str, Source] = { ResistanceMutationsSource.name: ResistanceMutationsSource(), diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index 04415326d..7b85f8eb1 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -1,8 +1,3 @@ -"""Source: Pango lineage definitions from corneliusroemer/pango-sequences. 
- -Creates one collection per lineage, with nucleotide substitutions as variants. -""" - import requests from models import Collection, Variant @@ -13,13 +8,14 @@ "/refs/heads/main/data/pango-consensus-sequences_summary.json" ) -DEFAULT_LIMIT = 0 - - class PangoLineagesSource(Source): + """Source: Pango lineage definitions from corneliusroemer/pango-sequences. + + Creates one collection per lineage, with nucleotide substitutions as variants. + """ name = "covid-pango-lineages" - def __init__(self, limit: int = DEFAULT_LIMIT): + def __init__(self, limit: int | None = None): self._limit = limit def get_collections(self) -> list[Collection]: @@ -27,7 +23,7 @@ def get_collections(self) -> list[Collection]: response = requests.get(DATA_URL, timeout=60) response.raise_for_status() entries = list(response.json().values()) - if self._limit: + if self._limit is not None: entries = entries[:self._limit] print(f" Loaded {len(entries)} lineage(s).") collections = [self._build_collection(e) for e in entries] diff --git a/collection-seeding/sources/resistance_mutations.py b/collection-seeding/sources/resistance_mutations.py index 39e15e39a..27aad53a0 100644 --- a/collection-seeding/sources/resistance_mutations.py +++ b/collection-seeding/sources/resistance_mutations.py @@ -1,12 +1,73 @@ -"""Source: SARS-CoV-2 antiviral resistance mutations (ported from seed.mjs). - -Three collections covering 3CLpro, RdRp, and Spike mAb resistance mutations -as per the Stanford Coronavirus Antiviral & Resistance database. -""" - from models import Collection, Variant from sources import Source + +class ResistanceMutationsSource(Source): + """Source: SARS-CoV-2 antiviral resistance mutations (ported from seed.mjs). + + Three collections covering 3CLpro, RdRp, and Spike mAb resistance mutations + as per the Stanford Coronavirus Antiviral & Resistance database. 
+ """ + name = "covid-resistance-mutations" + + def get_collections(self) -> list[Collection]: + return [ + { + "name": "3CLpro resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), + }, + { + "name": "RdRp resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), + }, + { + "name": "Spike mAb resistance mutations", + "organism": "covid", + "description": ( + "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " + "as per Stanford Coronavirus Antiviral & Resistance database " + "(last updated 21 August 2024)." + ), + "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), + }, + ] + + +def _mature_name(mutation: str, set_name: str, offset: int) -> str: + """Convert a genomic mutation code to a mature protein name with the given offset. + + e.g. 
_mature_name("ORF1a:T3284I", "3CLpro", -3263) -> "3CLpro:T21I" + """ + mut_part = mutation[mutation.index(':') + 1:] + original_base = mut_part[0] + new_base = mut_part[-1] + position = int(''.join(c for c in mut_part if c.isdigit())) + return f"{set_name}:{original_base}{position + offset}{new_base}" + + +def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[Variant]: + return [ + { + "type": "filterObject", + "name": _mature_name(m, set_name, offset), + "filterObject": {"aminoAcidMutations": [m]}, + } + for m in mutations + ] + + CLPRO_MUTATIONS = [ 'ORF1a:T3284I', 'ORF1a:T3288A', 'ORF1a:T3288N', 'ORF1a:T3308I', 'ORF1a:D3311Y', 'ORF1a:M3312I', 'ORF1a:M3312L', 'ORF1a:M3312T', 'ORF1a:M3312-', 'ORF1a:L3313F', @@ -77,64 +138,3 @@ 'S:P507A', 'S:N856K', 'S:N969K', 'S:E990A', 'S:T1009I', ] - - -def _mature_name(mutation: str, set_name: str, offset: int) -> str: - """Convert a genomic mutation code to a mature protein name with the given offset. - - e.g. _mature_name("ORF1a:T3284I", "3CLpro", -3263) -> "3CLpro:T21I" - """ - mut_part = mutation[mutation.index(':') + 1:] - original_base = mut_part[0] - new_base = mut_part[-1] - position = int(''.join(c for c in mut_part if c.isdigit())) - return f"{set_name}:{original_base}{position + offset}{new_base}" - - -def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[Variant]: - return [ - { - "type": "filterObject", - "name": _mature_name(m, set_name, offset), - "filterObject": {"aminoAcidMutations": [m]}, - } - for m in mutations - ] - - -class ResistanceMutationsSource(Source): - name = "covid-resistance-mutations" - - def get_collections(self) -> list[Collection]: - return [ - { - "name": "3CLpro resistance mutations", - "organism": "covid", - "description": ( - "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " - "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." 
- ), - "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), - }, - { - "name": "RdRp resistance mutations", - "organism": "covid", - "description": ( - "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " - "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." - ), - "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), - }, - { - "name": "Spike mAb resistance mutations", - "organism": "covid", - "description": ( - "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " - "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." - ), - "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), - }, - ] diff --git a/collection-seeding/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py index 2c3146996..38c0571d4 100644 --- a/collection-seeding/tests/test_pango_lineages.py +++ b/collection-seeding/tests/test_pango_lineages.py @@ -78,7 +78,7 @@ def test_build_collection_missing_fields_use_defaults(): @rsps_lib.activate def test_get_collections_fetches_data_url(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - PangoLineagesSource(limit=0).get_collections() + PangoLineagesSource().get_collections() assert len(rsps_lib.calls) == 1 assert rsps_lib.calls[0].request.url == DATA_URL @@ -86,7 +86,7 @@ def test_get_collections_fetches_data_url(): @rsps_lib.activate def test_get_collections_excludes_empty_variants(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - cols = PangoLineagesSource(limit=0).get_collections() + cols = PangoLineagesSource().get_collections() # XBB has only blank subs → should be excluded names = [c["name"] for c in cols] assert "XBB" not in names @@ -102,6 +102,6 @@ def test_get_collections_respects_limit(): @rsps_lib.activate def test_get_collections_no_limit_returns_all_valid(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) - cols 
= PangoLineagesSource(limit=0).get_collections() + cols = PangoLineagesSource().get_collections() # BA.2 and BA.5 have valid subs; XBB does not assert len(cols) == 2 From 95a7100f5a018a9b23739735781904f4a4e9b921 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:40:28 +0200 Subject: [PATCH 26/54] refactor(collection-seeding): move test collections out of MockSource into test_seed.py Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/tests/mock_source.py | 21 ++--------------- collection-seeding/tests/test_seed.py | 31 ++++++++++++++++++------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/collection-seeding/tests/mock_source.py b/collection-seeding/tests/mock_source.py index fb6b76779..28a16d734 100644 --- a/collection-seeding/tests/mock_source.py +++ b/collection-seeding/tests/mock_source.py @@ -1,29 +1,12 @@ -"""Mock data source for use in tests.""" - from models import Collection from sources import Source -COLLECTIONS: list[Collection] = [ - { - "name": "Mock Collection A", - "organism": "covid", - "description": "A mock collection for testing.", - "variants": [{"type": "filterObject", "name": "C123T", "filterObject": {"nucleotideMutations": ["C123T"]}}], - }, - { - "name": "Mock Collection B", - "organism": "covid", - "description": "Another mock collection for testing.", - "variants": [], - }, -] - class MockSource(Source): name = "mock-source" - def __init__(self, collections: list[Collection] | None = None): - self._collections = list(COLLECTIONS) if collections is None else collections + def __init__(self, collections: list[Collection]): + self._collections = collections def get_collections(self) -> list[Collection]: return self._collections diff --git a/collection-seeding/tests/test_seed.py b/collection-seeding/tests/test_seed.py index 6e98c56da..e2f18e491 100644 --- a/collection-seeding/tests/test_seed.py +++ b/collection-seeding/tests/test_seed.py @@ -1,7 +1,22 @@ from unittest.mock import MagicMock from seed 
import seed_source -from tests.mock_source import COLLECTIONS, MockSource +from tests.mock_source import MockSource + +COLLECTIONS = [ + { + "name": "Mock Collection A", + "organism": "covid", + "description": "A mock collection for testing.", + "variants": [{"type": "filterObject", "name": "C123T", "filterObject": {"nucleotideMutations": ["C123T"]}}], + }, + { + "name": "Mock Collection B", + "organism": "covid", + "description": "Another mock collection for testing.", + "variants": [], + }, +] def make_client(existing=None): @@ -15,7 +30,7 @@ def make_client(existing=None): def test_all_new_creates_all(): client = make_client(existing=[]) - created, updated = seed_source(client, MockSource()) + created, updated = seed_source(client, MockSource(COLLECTIONS)) assert created == len(COLLECTIONS) assert updated == 0 assert client.create_collection.call_count == len(COLLECTIONS) @@ -25,7 +40,7 @@ def test_all_new_creates_all(): def test_all_existing_updates_all(): existing = [{"id": i + 1, "name": c["name"]} for i, c in enumerate(COLLECTIONS)] client = make_client(existing=existing) - created, updated = seed_source(client, MockSource()) + created, updated = seed_source(client, MockSource(COLLECTIONS)) assert created == 0 assert updated == len(COLLECTIONS) assert client.update_collection.call_count == len(COLLECTIONS) @@ -35,7 +50,7 @@ def test_all_existing_updates_all(): def test_mixed_creates_and_updates(): existing = [{"id": 10, "name": COLLECTIONS[0]["name"]}] client = make_client(existing=existing) - created, updated = seed_source(client, MockSource()) + created, updated = seed_source(client, MockSource(COLLECTIONS)) assert created == len(COLLECTIONS) - 1 assert updated == 1 @@ -43,13 +58,13 @@ def test_mixed_creates_and_updates(): def test_update_uses_correct_id(): existing = [{"id": 42, "name": COLLECTIONS[0]["name"]}] client = make_client(existing=existing) - seed_source(client, MockSource(collections=[COLLECTIONS[0]])) + seed_source(client, 
MockSource([COLLECTIONS[0]])) client.update_collection.assert_called_once_with(42, COLLECTIONS[0]) def test_create_passes_full_collection(): client = make_client(existing=[]) - seed_source(client, MockSource(collections=[COLLECTIONS[0]])) + seed_source(client, MockSource([COLLECTIONS[0]])) client.create_collection.assert_called_once_with(COLLECTIONS[0]) @@ -59,7 +74,7 @@ def test_fetch_called_once_per_organism(): {**COLLECTIONS[1], "organism": "mpox"}, ] client = make_client(existing=[]) - seed_source(client, MockSource(collections=multi)) + seed_source(client, MockSource(multi)) assert client.fetch_existing_collections.call_count == 2 organisms_fetched = {c.args[0] for c in client.fetch_existing_collections.call_args_list} assert organisms_fetched == {"covid", "mpox"} @@ -67,6 +82,6 @@ def test_fetch_called_once_per_organism(): def test_returns_zero_counts_for_empty_collections(): client = make_client(existing=[]) - created, updated = seed_source(client, MockSource(collections=[])) + created, updated = seed_source(client, MockSource([])) assert created == 0 assert updated == 0 From 4be0043092af4274b4bcddaf6b5bc4255d2c149c Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:52:46 +0200 Subject: [PATCH 27/54] refactor(collection-seeding): --source flag, ALL_SOURCES registry, PangoLineagesSampleSource, --list command Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 19 +++++- collection-seeding/pixi.toml | 6 +- collection-seeding/seed.py | 67 ++++++++------------ collection-seeding/sources/__init__.py | 13 +++- collection-seeding/sources/pango_lineages.py | 8 +++ 5 files changed, 68 insertions(+), 45 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 5d56f239c..515959adc 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -28,7 +28,7 @@ pixi install Then use the provided tasks: ```bash -pixi run seed # all sources (resistance mutations + all pango lineages) +pixi 
run seed # all sources pixi run seed-resistance # resistance mutations only pixi run seed-lineages # pango lineages only pixi run seed-lineages-sample # first 10 pango lineages (quick test) @@ -40,4 +40,19 @@ To target a different backend: pixi run seed --url http://localhost:4321 ``` -Run `pixi run seed --help` or `pixi run seed --help` for all options. +Run `pixi run seed --help` for all options. Use `pixi run seed --list` to print all available sources. + +## Adding a new source + +1. Create `sources/your_source.py` and implement the `Source` ABC: + ```python + from sources import Source + from models import Collection + + class YourSource(Source): + name = "your-source-name" # used with --source flag + + def get_collections(self) -> list[Collection]: + ... + ``` +2. Register it in `sources/__init__.py` by adding it to `ALL_SOURCES`. diff --git a/collection-seeding/pixi.toml b/collection-seeding/pixi.toml index 5d75c7750..5fe7a1927 100644 --- a/collection-seeding/pixi.toml +++ b/collection-seeding/pixi.toml @@ -12,9 +12,9 @@ requests = ">=2.33.1" [tasks] seed = "python seed.py" -seed-lineages = "python seed.py covid-pango-lineages" -seed-lineages-sample = "python seed.py covid-pango-lineages --limit 10" -seed-resistance = "python seed.py covid-resistance-mutations" +seed-lineages = "python seed.py --source covid-pango-lineages" +seed-lineages-sample = "python seed.py --source covid-pango-lineages-sample" +seed-resistance = "python seed.py --source covid-resistance-mutations" [feature.test.pypi-dependencies] pytest = ">=9.0.3" diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 4ffd29598..e977163b6 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -3,7 +3,7 @@ Idempotent: skips any collection whose name already exists for the seed user. -Run with --help for usage, or --help for source-specific options. +Run with --help for usage. 
""" import argparse @@ -13,57 +13,41 @@ from api import ApiClient from models import Collection -from sources import Source -from sources.pango_lineages import PangoLineagesSource -from sources.resistance_mutations import ResistanceMutationsSource +from sources import Source, ALL_SOURCES def make_parser() -> argparse.ArgumentParser: - parent = argparse.ArgumentParser(add_help=False) - parent.add_argument( + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( "-u", "--url", default=os.environ.get("API_URL", "http://localhost:4321"), help="API base URL (default: $API_URL or http://localhost:4321)", ) - parent.add_argument( + parser.add_argument( "-k", "--api-key", default=os.environ.get("API_KEY"), required=not os.environ.get("API_KEY"), help="API key for authentication (default: $API_KEY)", ) - parent.add_argument( + parser.add_argument( "--wait", action="store_true", default=not sys.stdout.isatty(), help="Retry until API is ready (auto-enabled when no TTY)", ) - - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - parents=[parent], - ) - subparsers = parser.add_subparsers(dest="source", metavar="source") - - subparsers.add_parser( - ResistanceMutationsSource.name, - parents=[parent], - help="Seed SARS-CoV-2 antiviral resistance mutation collections", - ) - - lineages_parser = subparsers.add_parser( - PangoLineagesSource.name, - parents=[parent], - help="Seed pango lineage collections", + parser.add_argument( + "--source", + metavar="NAME", + help=f"Only run this source (default: all). 
Use --list to see available sources.", ) - lineages_parser.add_argument( - "--limit", - type=int, - default=None, - metavar="N", - help="Only process the first N lineages (default: all)", + parser.add_argument( + "--list", + action="store_true", + help="List available sources and exit", ) - return parser @@ -101,18 +85,23 @@ def main(): parser = make_parser() args = parser.parse_args() + source_map = {cls.name: cls() for cls in ALL_SOURCES} + + if args.list: + for name in source_map: + print(name) + return + + if args.source and args.source not in source_map: + print(f"Unknown source '{args.source}'. Use --list to see available sources.", file=sys.stderr) + sys.exit(1) + client = ApiClient(args.url, args.api_key) print(f"Seeding collections against {args.url} ...") if args.wait: client.wait_for_api() - lineage_limit = getattr(args, "limit", None) - - source_map: dict[str, Source] = { - ResistanceMutationsSource.name: ResistanceMutationsSource(), - PangoLineagesSource.name: PangoLineagesSource(limit=lineage_limit), - } active = [source_map[args.source]] if args.source else list(source_map.values()) total_created = 0 diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index 3715d3e87..4ee88e3fa 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -6,11 +6,22 @@ class Source(ABC): """A data source that produces collections to be seeded into the backend. - Implement this to add a new source: set a unique `name` (used as the CLI subcommand) + Implement this to add a new source: set a unique `name` (used as the --source flag value) and implement `get_collections` to return the collections to upsert. + Then register it in ALL_SOURCES below. """ name: str @abstractmethod def get_collections(self) -> list[Collection]: ... 
+ + +from sources.resistance_mutations import ResistanceMutationsSource # noqa: E402 +from sources.pango_lineages import PangoLineagesSource, PangoLineagesSampleSource # noqa: E402 + +ALL_SOURCES: list[type[Source]] = [ + ResistanceMutationsSource, + PangoLineagesSource, + PangoLineagesSampleSource, +] diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index 7b85f8eb1..d9441cb7a 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -60,3 +60,11 @@ def _build_collection(entry: dict) -> Collection: description=description, variants=variants, ) + + +class PangoLineagesSampleSource(PangoLineagesSource): + """Same as PangoLineagesSource but limited to the first 10 lineages, for quick testing.""" + name = "covid-pango-lineages-sample" + + def __init__(self): + super().__init__(limit=10) From 0cce98f71099bda1853ab40374065f8e471655ad Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:56:07 +0200 Subject: [PATCH 28/54] refactor(collection-seeding): move ALL_SOURCES to dedicated sources/registry.py Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 2 +- collection-seeding/seed.py | 3 ++- collection-seeding/sources/__init__.py | 10 +--------- collection-seeding/sources/registry.py | 15 +++++++++++++++ 4 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 collection-seeding/sources/registry.py diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 515959adc..7818e19f1 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -55,4 +55,4 @@ Run `pixi run seed --help` for all options. Use `pixi run seed --list` to print def get_collections(self) -> list[Collection]: ... ``` -2. Register it in `sources/__init__.py` by adding it to `ALL_SOURCES`. +2. Register it in `sources/registry.py` by adding it to `ALL_SOURCES`. 
diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index e977163b6..c2f7cf66f 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -13,7 +13,8 @@ from api import ApiClient from models import Collection -from sources import Source, ALL_SOURCES +from sources import Source +from sources.registry import ALL_SOURCES def make_parser() -> argparse.ArgumentParser: diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index 4ee88e3fa..3b7976365 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -8,7 +8,7 @@ class Source(ABC): Implement this to add a new source: set a unique `name` (used as the --source flag value) and implement `get_collections` to return the collections to upsert. - Then register it in ALL_SOURCES below. + Then register it in sources/registry.py. """ name: str @@ -17,11 +17,3 @@ def get_collections(self) -> list[Collection]: ... -from sources.resistance_mutations import ResistanceMutationsSource # noqa: E402 -from sources.pango_lineages import PangoLineagesSource, PangoLineagesSampleSource # noqa: E402 - -ALL_SOURCES: list[type[Source]] = [ - ResistanceMutationsSource, - PangoLineagesSource, - PangoLineagesSampleSource, -] diff --git a/collection-seeding/sources/registry.py b/collection-seeding/sources/registry.py new file mode 100644 index 000000000..88d704005 --- /dev/null +++ b/collection-seeding/sources/registry.py @@ -0,0 +1,15 @@ +"""Registry of all available seeding sources. + +To add a new source, import it here and add it to ALL_SOURCES. This is the only +place that needs to change — seed.py discovers sources exclusively through this list. 
+""" + +from sources.pango_lineages import PangoLineagesSource, PangoLineagesSampleSource +from sources.resistance_mutations import ResistanceMutationsSource +from sources import Source + +ALL_SOURCES: list[type[Source]] = [ + ResistanceMutationsSource, + PangoLineagesSource, + PangoLineagesSampleSource, +] From 09c27b5c8a26f361fbb04166950afdd85bc5aba2 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 16:57:57 +0200 Subject: [PATCH 29/54] refactor(collection-seeding): reorder seed.py with main first Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 130 ++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index c2f7cf66f..4871e191b 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -17,71 +17,6 @@ from sources.registry import ALL_SOURCES -def make_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "-u", "--url", - default=os.environ.get("API_URL", "http://localhost:4321"), - help="API base URL (default: $API_URL or http://localhost:4321)", - ) - parser.add_argument( - "-k", "--api-key", - default=os.environ.get("API_KEY"), - required=not os.environ.get("API_KEY"), - help="API key for authentication (default: $API_KEY)", - ) - parser.add_argument( - "--wait", - action="store_true", - default=not sys.stdout.isatty(), - help="Retry until API is ready (auto-enabled when no TTY)", - ) - parser.add_argument( - "--source", - metavar="NAME", - help=f"Only run this source (default: all). Use --list to see available sources.", - ) - parser.add_argument( - "--list", - action="store_true", - help="List available sources and exit", - ) - return parser - - -def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: - """Upsert collections for one source, grouped by organism. 
Returns (created, updated) counts. - Matching is by name — if a collection's name changes in the source, the old entry is orphaned and a new one is created.""" - collections = source.get_collections() - print(f"\n[{source.name}]") - - organisms: dict[str, list[Collection]] = {} - for c in collections: - organisms.setdefault(c["organism"], []).append(c) - - created = 0 - updated = 0 - for organism, org_collections in organisms.items(): - existing = client.fetch_existing_collections(organism) - existing_by_name = {c["name"]: c for c in existing} - for collection in org_collections: - existing_entry = existing_by_name.get(collection["name"]) - if existing_entry: - client.update_collection(existing_entry["id"], collection) - print(f" UPDATE id={existing_entry['id']} {collection['name']}") - updated += 1 - else: - col_id = client.create_collection(collection) - print(f" CREATE id={col_id} {collection['name']}") - created += 1 - - print(f" Created: {created}, updated: {updated}.") - return created, updated - - def main(): parser = make_parser() args = parser.parse_args() @@ -116,6 +51,71 @@ def main(): print(f"\nTotal — created: {total_created}, updated: {total_updated}.") +def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: + """Upsert collections for one source, grouped by organism. Returns (created, updated) counts. 
+ Matching is by name — if a collection's name changes in the source, the old entry is orphaned and a new one is created.""" + collections = source.get_collections() + print(f"\n[{source.name}]") + + organisms: dict[str, list[Collection]] = {} + for c in collections: + organisms.setdefault(c["organism"], []).append(c) + + created = 0 + updated = 0 + for organism, org_collections in organisms.items(): + existing = client.fetch_existing_collections(organism) + existing_by_name = {c["name"]: c for c in existing} + for collection in org_collections: + existing_entry = existing_by_name.get(collection["name"]) + if existing_entry: + client.update_collection(existing_entry["id"], collection) + print(f" UPDATE id={existing_entry['id']} {collection['name']}") + updated += 1 + else: + col_id = client.create_collection(collection) + print(f" CREATE id={col_id} {collection['name']}") + created += 1 + + print(f" Created: {created}, updated: {updated}.") + return created, updated + + +def make_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-u", "--url", + default=os.environ.get("API_URL", "http://localhost:4321"), + help="API base URL (default: $API_URL or http://localhost:4321)", + ) + parser.add_argument( + "-k", "--api-key", + default=os.environ.get("API_KEY"), + required=not os.environ.get("API_KEY"), + help="API key for authentication (default: $API_KEY)", + ) + parser.add_argument( + "--wait", + action="store_true", + default=not sys.stdout.isatty(), + help="Retry until API is ready (auto-enabled when no TTY)", + ) + parser.add_argument( + "--source", + metavar="NAME", + help="Only run this source (default: all). 
Use --list to see available sources.", + ) + parser.add_argument( + "--list", + action="store_true", + help="List available sources and exit", + ) + return parser + + if __name__ == "__main__": repeat_hours = os.environ.get("REPEAT_INTERVAL_HOURS") while True: From 474d95b4d4c7710ddc80a61c3f54607bc64c296e Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 17:03:38 +0200 Subject: [PATCH 30/54] fix(collection-seeding): don't require --api-key for --list Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 4871e191b..5c78c4284 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -32,6 +32,10 @@ def main(): print(f"Unknown source '{args.source}'. Use --list to see available sources.", file=sys.stderr) sys.exit(1) + if not args.api_key: + print("Error: --api-key is required (or set $API_KEY).", file=sys.stderr) + sys.exit(1) + client = ApiClient(args.url, args.api_key) print(f"Seeding collections against {args.url} ...") @@ -94,7 +98,6 @@ def make_parser() -> argparse.ArgumentParser: parser.add_argument( "-k", "--api-key", default=os.environ.get("API_KEY"), - required=not os.environ.get("API_KEY"), help="API key for authentication (default: $API_KEY)", ) parser.add_argument( From a665b93636f9b15c8a8baf2dfeee9830f6fd8ad6 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 17:08:20 +0200 Subject: [PATCH 31/54] refactor(collection-seeding): move repeat loop into main(), add --repeat-interval-hours flag Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 2 +- collection-seeding/seed.py | 48 ++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 7818e19f1..63dc153b2 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -7,7 +7,7 
@@ Seeds the backend with example collections: The script is idempotent — re-running it will create new collections or update existing ones (matched by name). If a collection's name changes in the source, the old entry is orphaned and a new one is created. -Set `REPEAT_INTERVAL_HOURS` to run on a loop (e.g. `REPEAT_INTERVAL_HOURS=8` re-seeds every 8 hours). Without it, the script runs once and exits. +Use `--repeat-interval-hours N` (or `$REPEAT_INTERVAL_HOURS`) to run on a loop — re-seeds every N hours. Without it, the script runs once and exits. ## Via Docker Compose diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 5c78c4284..9c653b4f5 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -44,15 +44,24 @@ def main(): active = [source_map[args.source]] if args.source else list(source_map.values()) - total_created = 0 - total_updated = 0 - for source in active: - c, u = seed_source(client, source) - total_created += c - total_updated += u - - if len(active) > 1: - print(f"\nTotal — created: {total_created}, updated: {total_updated}.") + while True: + try: + total_created = 0 + total_updated = 0 + for source in active: + c, u = seed_source(client, source) + total_created += c + total_updated += u + if len(active) > 1: + print(f"\nTotal — created: {total_created}, updated: {total_updated}.") + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + if not args.repeat_interval_hours: + sys.exit(1) + if not args.repeat_interval_hours: + break + print(f"\nSleeping for {args.repeat_interval_hours}h ...") + time.sleep(args.repeat_interval_hours * 3600) def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: @@ -86,6 +95,7 @@ def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: def make_parser() -> argparse.ArgumentParser: + _repeat_env = os.environ.get("REPEAT_INTERVAL_HOURS") parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, 
@@ -116,19 +126,15 @@ def make_parser() -> argparse.ArgumentParser: action="store_true", help="List available sources and exit", ) + parser.add_argument( + "--repeat-interval-hours", + type=float, + default=float(_repeat_env) if _repeat_env else None, + metavar="HOURS", + help="Re-seed every N hours instead of exiting (default: $REPEAT_INTERVAL_HOURS or run once)", + ) return parser if __name__ == "__main__": - repeat_hours = os.environ.get("REPEAT_INTERVAL_HOURS") - while True: - try: - main() - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - if not repeat_hours: - sys.exit(1) - if not repeat_hours: - break - print(f"\nSleeping for {repeat_hours}h ...") - time.sleep(float(repeat_hours) * 3600) + main() From 069af1cf69efc7226e2dd6afe8df98c9a19f0431 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 17:12:54 +0200 Subject: [PATCH 32/54] docs(collection-seeding): update README with sample source, SEEDER_API_KEY, and new CLI flags Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/collection-seeding/README.md b/collection-seeding/README.md index 63dc153b2..87a40071f 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -4,6 +4,7 @@ Seeds the backend with example collections: - **covid-resistance-mutations** — resistance mutation data for 3CLpro, RdRp, and Spike mAb - **covid-pango-lineages** — one collection per pango lineage, with nucleotide substitutions as variants +- **covid-pango-lineages-sample** — same as above but limited to 10 lineages, for quick testing The script is idempotent — re-running it will create new collections or update existing ones (matched by name). If a collection's name changes in the source, the old entry is orphaned and a new one is created. 
@@ -14,7 +15,7 @@ Use `--repeat-interval-hours N` (or `$REPEAT_INTERVAL_HOURS`) to run on a loop The seeder runs automatically as part of Docker Compose: ```bash -BACKEND_TAG=latest WEBSITE_TAG=latest SEEDER_TAG=latest docker compose up +BACKEND_TAG=latest WEBSITE_TAG=latest SEEDER_TAG=latest SEEDER_API_KEY=... docker compose up ``` ## Running locally @@ -40,7 +41,7 @@ To target a different backend: pixi run seed --url http://localhost:4321 ``` -Run `pixi run seed --help` for all options. Use `pixi run seed --list` to print all available sources. +Run `pixi run seed --help` for all options, including `--source`, `--list`, `--repeat-interval-hours`, and `--url`. ## Adding a new source From ce59d149c768f6631f738f79d1da56f063df7ce7 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 20 May 2026 17:28:04 +0200 Subject: [PATCH 33/54] fix(collection-seeding): crash on error in repeat mode instead of swallowing Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 9c653b4f5..ad80958da 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -56,8 +56,7 @@ def main(): print(f"\nTotal — created: {total_created}, updated: {total_updated}.") except Exception as e: print(f"Error: {e}", file=sys.stderr) - if not args.repeat_interval_hours: - sys.exit(1) + sys.exit(1) if not args.repeat_interval_hours: break print(f"\nSleeping for {args.repeat_interval_hours}h ...") From c4c3f88ddf606998b9521387d094fa59b8fc5476 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:04:52 +0200 Subject: [PATCH 34/54] feat(collection-seeding): restructure pango lineage variants into 4 fixed types Each collection now has exactly 4 variants (nucleotide substitutions, amino acid substitutions, new nucleotide substitutions, new amino acid substitutions) instead of one variant per individual nucleotide substitution. 
Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/sources/pango_lineages.py | 32 ++++++-- .../tests/test_pango_lineages.py | 81 ++++++++++++++----- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index d9441cb7a..c62645817 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -26,9 +26,7 @@ def get_collections(self) -> list[Collection]: if self._limit is not None: entries = entries[:self._limit] print(f" Loaded {len(entries)} lineage(s).") - collections = [self._build_collection(e) for e in entries] - # Drop lineages that ended up with no variants after filtering blank subs - return [c for c in collections if c["variants"]] + return [self._build_collection(e) for e in entries] @staticmethod def _build_collection(entry: dict) -> Collection: @@ -37,14 +35,32 @@ def _build_collection(entry: dict) -> Collection: clade: str = entry.get("nextstrainClade") or "—" date: str = entry.get("designationDate") or "unknown" - subs = [s for s in entry.get("nucSubstitutions", []) if s] + nuc_subs = [s for s in entry.get("nucSubstitutions", []) if s] + aa_subs = [s for s in entry.get("aaSubstitutions", []) if s] + nuc_subs_new = [s for s in entry.get("nucSubstitutionsNew", []) if s] + aa_subs_new = [s for s in entry.get("aaSubstitutionsNew", []) if s] + variants: list[Variant] = [ { "type": "filterObject", - "name": sub, - "filterObject": {"nucleotideMutations": [sub]}, - } - for sub in subs + "name": "Nucleotide substitutions", + "filterObject": {"nucleotideMutations": nuc_subs}, + }, + { + "type": "filterObject", + "name": "Amino acid substitutions", + "filterObject": {"aminoAcidMutations": aa_subs}, + }, + { + "type": "filterObject", + "name": "New nucleotide substitutions", + "filterObject": {"nucleotideMutations": nuc_subs_new}, + }, + { + "type": "filterObject", + "name": "New amino acid substitutions", + 
"filterObject": {"aminoAcidMutations": aa_subs_new}, + }, ] description = ( diff --git a/collection-seeding/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py index 38c0571d4..2025642fa 100644 --- a/collection-seeding/tests/test_pango_lineages.py +++ b/collection-seeding/tests/test_pango_lineages.py @@ -9,6 +9,9 @@ "parent": "BA", "nextstrainClade": "22C", "nucSubstitutions": ["C241T", "A23403G", ""], + "aaSubstitutions": ["S:N501Y", ""], + "nucSubstitutionsNew": ["A23403G"], + "aaSubstitutionsNew": ["S:N501Y"], "designationDate": "2022-01-20", }, "XBB": { @@ -17,6 +20,9 @@ "parent": "", "nextstrainClade": "", "nucSubstitutions": [""], + "aaSubstitutions": [""], + "nucSubstitutionsNew": [""], + "aaSubstitutionsNew": [""], "designationDate": "", }, "BA.5": { @@ -25,6 +31,9 @@ "parent": "BA", "nextstrainClade": "22B", "nucSubstitutions": ["C241T", "T19955C"], + "aaSubstitutions": ["S:L452R"], + "nucSubstitutionsNew": ["T19955C"], + "aaSubstitutionsNew": [], "designationDate": "2022-05-06", }, } @@ -50,27 +59,60 @@ def test_build_collection_description_format(): assert "2022-01-20" in col["description"] -def test_build_collection_filters_blank_subs(): +def test_build_collection_missing_fields_use_defaults(): + col = PangoLineagesSource._build_collection(SAMPLE_DATA["XBB"]) + assert "—" in col["description"] # parent and clade fallback + assert "unknown" in col["description"] # date fallback + + +def test_build_collection_always_four_variants(): + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + assert len(col["variants"]) == 4 + + +def test_build_collection_variant_names(): col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) - # nucSubstitutions has ["C241T", "A23403G", ""] — blank should be dropped - assert len(col["variants"]) == 2 names = [v["name"] for v in col["variants"]] - assert "C241T" in names - assert "A23403G" in names + assert names == [ + "Nucleotide substitutions", + "Amino acid substitutions", 
+ "New nucleotide substitutions", + "New amino acid substitutions", + ] -def test_build_collection_variant_structure(): +def test_build_collection_variant_filter_keys(): col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) - for v in col["variants"]: - assert v["type"] == "filterObject" - assert "nucleotideMutations" in v["filterObject"] - assert len(v["filterObject"]["nucleotideMutations"]) == 1 + variants = col["variants"] + assert "nucleotideMutations" in variants[0]["filterObject"] + assert "aminoAcidMutations" in variants[1]["filterObject"] + assert "nucleotideMutations" in variants[2]["filterObject"] + assert "aminoAcidMutations" in variants[3]["filterObject"] -def test_build_collection_missing_fields_use_defaults(): +def test_build_collection_variant_contents(): + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + variants = col["variants"] + assert variants[0]["filterObject"]["nucleotideMutations"] == ["C241T", "A23403G"] + assert variants[1]["filterObject"]["aminoAcidMutations"] == ["S:N501Y"] + assert variants[2]["filterObject"]["nucleotideMutations"] == ["A23403G"] + assert variants[3]["filterObject"]["aminoAcidMutations"] == ["S:N501Y"] + + +def test_build_collection_filters_blank_subs(): + col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + # nucSubstitutions has ["C241T", "A23403G", ""] — blank should be dropped + nuc = col["variants"][0]["filterObject"]["nucleotideMutations"] + assert "" not in nuc + assert len(nuc) == 2 + + +def test_build_collection_empty_lists_when_all_blanks(): col = PangoLineagesSource._build_collection(SAMPLE_DATA["XBB"]) - assert "—" in col["description"] # parent and clade fallback - assert "unknown" in col["description"] # date fallback + assert len(col["variants"]) == 4 + for v in col["variants"]: + lists = list(v["filterObject"].values()) + assert lists == [[]] # --- get_collections --- @@ -84,12 +126,14 @@ def test_get_collections_fetches_data_url(): @rsps_lib.activate -def 
test_get_collections_excludes_empty_variants(): +def test_get_collections_includes_all_lineages(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) cols = PangoLineagesSource().get_collections() - # XBB has only blank subs → should be excluded + # All lineages included regardless of empty subs names = [c["name"] for c in cols] - assert "XBB" not in names + assert "BA.2" in names + assert "XBB" in names + assert "BA.5" in names @rsps_lib.activate @@ -100,8 +144,7 @@ def test_get_collections_respects_limit(): @rsps_lib.activate -def test_get_collections_no_limit_returns_all_valid(): +def test_get_collections_no_limit_returns_all(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) cols = PangoLineagesSource().get_collections() - # BA.2 and BA.5 have valid subs; XBB does not - assert len(cols) == 2 + assert len(cols) == 3 From 3ded99ec8147bc8426402b557ae2eb17fa8e72ce Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:09:36 +0200 Subject: [PATCH 35/54] fix(collection-seeding): strip organism from PUT body to avoid 400 CollectionUpdate has no organism field and the backend is configured with fail-on-unknown-properties=true, so including organism caused every update to fail with a Bad Request. 
Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/collection-seeding/api.py b/collection-seeding/api.py index 9673e99af..8057ddc70 100644 --- a/collection-seeding/api.py +++ b/collection-seeding/api.py @@ -47,6 +47,8 @@ def create_collection(self, collection: Collection) -> int: return r.json()["id"] def update_collection(self, collection_id: int, collection: Collection) -> None: - r = requests.put(f"{self._collections_url}/{collection_id}", headers=self._auth_headers, json=collection, timeout=10) + # CollectionUpdate has no organism field; sending it causes a 400 (fail-on-unknown-properties=true) + body = {k: v for k, v in collection.items() if k != "organism"} + r = requests.put(f"{self._collections_url}/{collection_id}", headers=self._auth_headers, json=body, timeout=10) if not r.ok: raise RuntimeError(f"PUT /api/collections/{collection_id} failed: {r.status_code} {r.text}") From 431a9a202110185a9d15f71dd7c7ca54cbc9919f Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:38:43 +0200 Subject: [PATCH 36/54] refactor(collection-seeding): defer REPEAT_INTERVAL_HOURS parsing to argparse, remove seed.mjs Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.mjs | 261 ------------------------------------ collection-seeding/seed.py | 3 +- 2 files changed, 1 insertion(+), 263 deletions(-) delete mode 100644 collection-seeding/seed.mjs diff --git a/collection-seeding/seed.mjs b/collection-seeding/seed.mjs deleted file mode 100644 index 3ecdb4dc9..000000000 --- a/collection-seeding/seed.mjs +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env node -// Seeds example collections into the backend. -// Idempotent: skips any collection whose name already exists for the seed user. -// -// Run with --help for usage. 
- -import { parseArgs } from 'node:util'; - -const HELP = `\ -Usage: node seed.mjs [options] - -Options: - -u, --url Backend base URL (default: $BACKEND_URL or http://localhost:8080) - --user-id User ID (default: $SEED_USER_ID or example-data-seeder) - --wait Retry until backend is ready (auto-enabled when no TTY) - -h, --help Show this help - -Examples: - # Local backend running on :8080 - node seed.mjs - - # Local backend on a different port - node seed.mjs --url http://localhost:9021 -`; - -let parsedArgs; -try { - parsedArgs = parseArgs({ - options: { - url: { type: 'string', short: 'u' }, - 'user-id': { type: 'string' }, - wait: { type: 'boolean' }, - help: { type: 'boolean', short: 'h' }, - }, - }); -} catch (err) { - console.error(`Error: ${err.message}\n`); - console.error(HELP); - process.exit(1); -} - -const { values } = parsedArgs; - -if (values.help) { - console.log(HELP); - process.exit(0); -} - -const BACKEND_URL = values.url ?? process.env.BACKEND_URL ?? 'http://localhost:8080'; -const SEED_USER_ID = values['user-id'] ?? process.env.SEED_USER_ID ?? 'example-data-seeder'; -// Auto-enable wait when there's no TTY (e.g. running inside Docker). -const WAIT = values.wait ?? !process.stdout.isTTY; - -const RETRY_ATTEMPTS = 30; -const RETRY_DELAY_MS = 2000; -const COLLECTIONS_BASE = `${BACKEND_URL}/collections`; - -// Converts a genomic mutation code to a mature protein name with the given offset. -// e.g. 
matureName("ORF1a:T3284I", "3CLpro", -3263) => "3CLpro:T21I" -function matureName(mutation, setName, offset) { - const mutationPart = mutation.slice(mutation.indexOf(':') + 1); - const originalBase = mutationPart[0]; - const newBase = mutationPart[mutationPart.length - 1]; - const position = parseInt(mutationPart.match(/\d+/)[0], 10); - return `${setName}:${originalBase}${position + offset}${newBase}`; -} - -function buildVariants(mutations, setName, offset) { - return mutations.map((mutation) => ({ - type: 'filterObject', - name: matureName(mutation, setName, offset), - filterObject: { aminoAcidMutations: [mutation] }, - })); -} - -// --- Collection definitions --- - -const CLPRO_MUTATIONS = [ - 'ORF1a:T3284I', 'ORF1a:T3288A', 'ORF1a:T3288N', 'ORF1a:T3308I', 'ORF1a:D3311Y', - 'ORF1a:M3312I', 'ORF1a:M3312L', 'ORF1a:M3312T', 'ORF1a:M3312-', 'ORF1a:L3313F', - 'ORF1a:G3401S', 'ORF1a:F3403L', 'ORF1a:F3403S', 'ORF1a:N3405D', 'ORF1a:N3405L', - 'ORF1a:N3405S', 'ORF1a:G3406S', 'ORF1a:S3407A', 'ORF1a:S3407E', 'ORF1a:S3407L', - 'ORF1a:S3407P', 'ORF1a:C3423F', 'ORF1a:M3428R', 'ORF1a:M3428T', 'ORF1a:E3429A', - 'ORF1a:E3429G', 'ORF1a:E3429K', 'ORF1a:E3429Q', 'ORF1a:E3429V', 'ORF1a:L3430F', - 'ORF1a:P3431-', 'ORF1a:T3432I', 'ORF1a:H3435L', 'ORF1a:H3435N', 'ORF1a:H3435Q', - 'ORF1a:H3435Y', 'ORF1a:A3436T', 'ORF1a:A3436V', 'ORF1a:V3449A', 'ORF1a:R3451G', - 'ORF1a:R3451S', 'ORF1a:Q3452I', 'ORF1a:Q3452K', 'ORF1a:T3453I', 'ORF1a:A3454T', - 'ORF1a:A3454V', 'ORF1a:Q3455A', 'ORF1a:Q3455C', 'ORF1a:Q3455D', 'ORF1a:Q3455E', - 'ORF1a:Q3455F', 'ORF1a:Q3455G', 'ORF1a:Q3455H', 'ORF1a:Q3455I', 'ORF1a:Q3455K', - 'ORF1a:Q3455L', 'ORF1a:Q3455N', 'ORF1a:Q3455P', 'ORF1a:Q3455R', 'ORF1a:Q3455S', - 'ORF1a:Q3455T', 'ORF1a:Q3455V', 'ORF1a:Q3455W', 'ORF1a:Q3455Y', 'ORF1a:A3456P', - 'ORF1a:A3457S', 'ORF1a:P3515L', 'ORF1a:V3560A', 'ORF1a:S3564P', 'ORF1a:T3567I', - 'ORF1a:F3568L', -]; - -const RDRP_MUTATIONS = [ - 'ORF1b:V157A', 'ORF1b:V157L', 'ORF1b:N189S', 'ORF1b:R276C', 'ORF1b:A367V', - 'ORF1b:A440V', 
'ORF1b:F471L', 'ORF1b:D475Y', 'ORF1b:A517V', 'ORF1b:V548L', - 'ORF1b:G662S', 'ORF1b:S750A', 'ORF1b:V783I', 'ORF1b:E787G', 'ORF1b:C790F', - 'ORF1b:C790R', 'ORF1b:E793A', 'ORF1b:E793D', 'ORF1b:M915R', -]; - -const SPIKE_MUTATIONS = [ - 'S:P337H', 'S:P337L', 'S:P337R', 'S:P337S', 'S:P337T', - 'S:E340A', 'S:E340D', 'S:E340G', 'S:E340K', 'S:E340Q', 'S:E340V', - 'S:T345P', - 'S:R346G', 'S:R346I', 'S:R346K', 'S:R346S', 'S:R346T', - 'S:K356Q', 'S:K356T', - 'S:S371F', 'S:S371L', - 'S:D405E', 'S:D405N', 'S:E406D', - 'S:K417E', 'S:K417H', 'S:K417I', 'S:K417M', 'S:K417N', 'S:K417R', 'S:K417S', 'S:K417T', - 'S:D420A', 'S:D420N', - 'S:N439K', - 'S:N440D', 'S:N440E', 'S:N440I', 'S:N440K', 'S:N440R', 'S:N440T', 'S:N440Y', - 'S:S443Y', - 'S:K444E', 'S:K444F', 'S:K444I', 'S:K444L', 'S:K444M', 'S:K444N', 'S:K444R', 'S:K444T', - 'S:V445A', 'S:V445D', 'S:V445F', 'S:V445I', 'S:V445L', - 'S:G446A', 'S:G446D', 'S:G446I', 'S:G446N', 'S:G446R', 'S:G446S', 'S:G446T', 'S:G446V', - 'S:G447C', 'S:G447D', 'S:G447F', 'S:G447S', 'S:G447V', - 'S:N448D', 'S:N448K', 'S:N448T', 'S:N448Y', - 'S:Y449D', - 'S:N450D', 'S:N450K', - 'S:L452M', 'S:L452Q', 'S:L452R', 'S:L452W', - 'S:Y453F', 'S:Y453H', - 'S:L455F', 'S:L455M', 'S:L455S', 'S:L455W', - 'S:F456C', 'S:F456L', 'S:F456V', - 'S:S459P', - 'S:N460D', 'S:N460H', 'S:N460I', 'S:N460K', 'S:N460S', 'S:N460T', 'S:N460Y', - 'S:A475D', 'S:A475V', - 'S:G476D', 'S:G476R', 'S:G476T', - 'S:V483A', - 'S:E484A', 'S:E484D', 'S:E484G', 'S:E484K', 'S:E484P', 'S:E484Q', 'S:E484R', 'S:E484S', 'S:E484T', 'S:E484V', - 'S:G485D', 'S:G485R', - 'S:F486D', 'S:F486I', 'S:F486L', 'S:F486N', 'S:F486P', 'S:F486S', 'S:F486T', 'S:F486V', - 'S:N487D', 'S:N487H', 'S:N487S', - 'S:Y489H', 'S:Y489W', - 'S:F490G', 'S:F490I', 'S:F490L', 'S:F490R', 'S:F490S', 'S:F490V', 'S:F490Y', - 'S:Q493D', 'S:Q493E', 'S:Q493H', 'S:Q493K', 'S:Q493L', 'S:Q493R', 'S:Q493V', - 'S:S494P', 'S:S494R', - 'S:G496S', - 'S:Q498H', - 'S:P499H', 'S:P499R', 'S:P499S', 'S:P499T', - 'S:N501T', 'S:N501Y', - 'S:G504C', 
'S:G504D', 'S:G504I', 'S:G504L', 'S:G504N', 'S:G504R', 'S:G504V', - 'S:P507A', - 'S:N856K', 'S:N969K', 'S:E990A', 'S:T1009I', -]; - -const COLLECTIONS = [ - { - name: '3CLpro resistance mutations', - organism: 'covid', - description: - 'SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations as per Stanford Coronavirus Antiviral & Resistance database (last updated 21 August 2024).', - variants: buildVariants(CLPRO_MUTATIONS, '3CLpro', -3263), - }, - { - name: 'RdRp resistance mutations', - organism: 'covid', - description: - 'SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations as per Stanford Coronavirus Antiviral & Resistance database (last updated 21 August 2024).', - variants: buildVariants(RDRP_MUTATIONS, 'RdRp', 9), - }, - { - name: 'Spike mAb resistance mutations', - organism: 'covid', - description: - 'SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations as per Stanford Coronavirus Antiviral & Resistance database (last updated 21 August 2024).', - variants: buildVariants(SPIKE_MUTATIONS, 'Spike', 0), - }, -]; - -// --- API helpers --- - -async function sleep(ms) { - return new Promise((resolve) => setTimeout(resolve, ms)); -} - -async function waitForBackend() { - for (let attempt = 1; attempt <= RETRY_ATTEMPTS; attempt++) { - try { - const response = await fetch( - `${BACKEND_URL}/collections?userId=${SEED_USER_ID}&organism=covid`, - ); - if (response.ok || response.status === 404) return; - } catch { - // backend not ready yet - } - console.log(`Waiting for backend... 
(attempt ${attempt}/${RETRY_ATTEMPTS})`); - await sleep(RETRY_DELAY_MS); - } - console.error(`Backend at ${BACKEND_URL} did not become ready after ${RETRY_ATTEMPTS} attempts.`); - process.exit(1); -} - -async function fetchExistingCollections(organism) { - const url = `${COLLECTIONS_BASE}?userId=${encodeURIComponent(SEED_USER_ID)}&organism=${encodeURIComponent(organism)}`; - const response = await fetch(url); - if (!response.ok) { - throw new Error(`GET /collections failed: ${response.status} ${await response.text()}`); - } - return response.json(); -} - -async function createCollection(collection) { - const url = `${COLLECTIONS_BASE}?userId=${encodeURIComponent(SEED_USER_ID)}`; - const response = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(collection), - }); - if (response.status !== 201) { - throw new Error(`POST /collections failed: ${response.status} ${await response.text()}`); - } - const created = await response.json(); - return created.id; -} - -// --- Main --- - -async function main() { - console.log(`Seeding example data against ${BACKEND_URL} as user '${SEED_USER_ID}'...`); - - if (WAIT) { - await waitForBackend(); - } - - // Group collections by organism to minimise GET requests - const byOrganism = {}; - for (const collection of COLLECTIONS) { - (byOrganism[collection.organism] ??= []).push(collection); - } - - let created = 0; - let skipped = 0; - - for (const [organism, collections] of Object.entries(byOrganism)) { - const existing = await fetchExistingCollections(organism); - const existingNames = new Set(existing.map((c) => c.name)); - - for (const collection of collections) { - if (existingNames.has(collection.name)) { - console.log(` SKIP ${collection.name}`); - skipped++; - } else { - const id = await createCollection(collection); - console.log(` OK id=${id} ${collection.name}`); - created++; - } - } - } - - console.log(`\nDone. 
Created: ${created}, skipped (already exist): ${skipped}.`); -} - -try { - await main(); -} catch (err) { - console.error(err); - process.exit(1); -} diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index ad80958da..1d5f214fe 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -94,7 +94,6 @@ def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: def make_parser() -> argparse.ArgumentParser: - _repeat_env = os.environ.get("REPEAT_INTERVAL_HOURS") parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, @@ -128,7 +127,7 @@ def make_parser() -> argparse.ArgumentParser: parser.add_argument( "--repeat-interval-hours", type=float, - default=float(_repeat_env) if _repeat_env else None, + default=os.environ.get("REPEAT_INTERVAL_HOURS"), metavar="HOURS", help="Re-seed every N hours instead of exiting (default: $REPEAT_INTERVAL_HOURS or run once)", ) From 031bdda89fa1707e0d91ccee8dccf04cd482bde9 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:42:08 +0200 Subject: [PATCH 37/54] chore(collection-seeding): rename pixi workspace from example-data-seeder to collection-seeding Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/pixi.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collection-seeding/pixi.toml b/collection-seeding/pixi.toml index 5fe7a1927..5f7eca32a 100644 --- a/collection-seeding/pixi.toml +++ b/collection-seeding/pixi.toml @@ -1,5 +1,5 @@ [workspace] -name = "example-data-seeder" +name = "collection-seeding" version = "0.1.0" channels = ["conda-forge"] platforms = ["linux-64", "osx-arm64", "osx-64", "linux-aarch64"] From 11e288190b57976968b893a4c3e922fe5d6d4bef Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:47:52 +0200 Subject: [PATCH 38/54] refactor(collection-seeding): use dict literal instead of Collection() constructor in pango_lineages Co-Authored-By: Claude Sonnet 4.6 --- 
collection-seeding/sources/pango_lineages.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index c62645817..193b198aa 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -70,12 +70,7 @@ def _build_collection(entry: dict) -> Collection: f"Designated: {date}." ) - return Collection( - name=lineage, - organism="covid", - description=description, - variants=variants, - ) + return {"name": lineage, "organism": "covid", "description": description, "variants": variants} class PangoLineagesSampleSource(PangoLineagesSource): From 5fef10c70d2ecae93fe95641478f49a004ceb7e9 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:52:52 +0200 Subject: [PATCH 39/54] refactor(collection-seeding): make Source.name an abstract property to enforce subclass definition Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/sources/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index 3b7976365..42a5eb8c5 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -10,7 +10,9 @@ class Source(ABC): and implement `get_collections` to return the collections to upsert. Then register it in sources/registry.py. """ - name: str + @property + @abstractmethod + def name(self) -> str: ... 
@abstractmethod def get_collections(self) -> list[Collection]: From fefb936ea10b0406ee972d476392f67f61b64eda Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:55:48 +0200 Subject: [PATCH 40/54] chore(collection-seeding): add .dockerignore to exclude .pixi, tests, and docs from build context Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/.dockerignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 collection-seeding/.dockerignore diff --git a/collection-seeding/.dockerignore b/collection-seeding/.dockerignore new file mode 100644 index 000000000..704b0a8c9 --- /dev/null +++ b/collection-seeding/.dockerignore @@ -0,0 +1,4 @@ +.pixi/ +__pycache__/ +tests/ +*.md From be538a5f7deeecb283d19ed599d08118bde3ba09 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 27 May 2026 15:59:09 +0200 Subject: [PATCH 41/54] chore(collection-seeding): add ruff for linting and formatting Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/api.py | 33 +- collection-seeding/models.py | 1 + collection-seeding/pixi.lock | 24 ++ collection-seeding/pixi.toml | 3 + collection-seeding/seed.py | 11 +- collection-seeding/sources/__init__.py | 6 +- collection-seeding/sources/pango_lineages.py | 12 +- .../sources/resistance_mutations.py | 321 ++++++++++++++---- .../tests/test_pango_lineages.py | 8 +- .../tests/test_resistance_mutations.py | 2 + collection-seeding/tests/test_seed.py | 13 +- 11 files changed, 350 insertions(+), 84 deletions(-) diff --git a/collection-seeding/api.py b/collection-seeding/api.py index 8057ddc70..35350bf76 100644 --- a/collection-seeding/api.py +++ b/collection-seeding/api.py @@ -17,7 +17,9 @@ def __init__(self, base_url: str, api_key: str): self._collections_url = f"{self.base_url}/api/collections" self._auth_headers = {"Authorization": f"Bearer {api_key}"} - def wait_for_api(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELAY_S): + def wait_for_api( + self, attempts: int = RETRY_ATTEMPTS, delay: float = 
RETRY_DELAY_S + ): """Poll until the API is ready by checking the collections endpoint.""" for attempt in range(1, attempts + 1): try: @@ -35,20 +37,39 @@ def wait_for_api(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELA sys.exit(1) def fetch_existing_collections(self, organism: str) -> list[ExistingCollection]: - r = requests.get(self._collections_url, params={"organism": organism}, headers=self._auth_headers, timeout=10) + r = requests.get( + self._collections_url, + params={"organism": organism}, + headers=self._auth_headers, + timeout=10, + ) if not r.ok: raise RuntimeError(f"GET /api/collections failed: {r.status_code} {r.text}") return r.json() def create_collection(self, collection: Collection) -> int: - r = requests.post(self._collections_url, headers=self._auth_headers, json=collection, timeout=10) + r = requests.post( + self._collections_url, + headers=self._auth_headers, + json=collection, + timeout=10, + ) if r.status_code != 201: - raise RuntimeError(f"POST /api/collections failed: {r.status_code} {r.text}") + raise RuntimeError( + f"POST /api/collections failed: {r.status_code} {r.text}" + ) return r.json()["id"] def update_collection(self, collection_id: int, collection: Collection) -> None: # CollectionUpdate has no organism field; sending it causes a 400 (fail-on-unknown-properties=true) body = {k: v for k, v in collection.items() if k != "organism"} - r = requests.put(f"{self._collections_url}/{collection_id}", headers=self._auth_headers, json=body, timeout=10) + r = requests.put( + f"{self._collections_url}/{collection_id}", + headers=self._auth_headers, + json=body, + timeout=10, + ) if not r.ok: - raise RuntimeError(f"PUT /api/collections/{collection_id} failed: {r.status_code} {r.text}") + raise RuntimeError( + f"PUT /api/collections/{collection_id} failed: {r.status_code} {r.text}" + ) diff --git a/collection-seeding/models.py b/collection-seeding/models.py index 2e0425561..2f5a84e1e 100644 --- a/collection-seeding/models.py 
+++ b/collection-seeding/models.py @@ -23,5 +23,6 @@ class Collection(TypedDict): class ExistingCollection(TypedDict): """A collection as returned by the backend (includes the assigned id).""" + id: int name: str diff --git a/collection-seeding/pixi.lock b/collection-seeding/pixi.lock index b8ae0977c..37e2e83f2 100644 --- a/collection-seeding/pixi.lock +++ b/collection-seeding/pixi.lock @@ -142,6 +142,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1e/c0/56472c251d09858a53e51efbd485b09e1995d8731668b76d52e5dd6ee0f1/ruff-0.15.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl linux-aarch64: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda @@ -176,6 +177,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/8d/f1/b15a7839fa4f332f8acec78e20564f26bb2d866e3d21710b877fd0263000/ruff-0.15.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl osx-64: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda @@ -205,6 +207,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/45/91/254a35c20acc38a7223c9d2d594af12e794432464f2cdeb52af1dc4a892d/ruff-0.15.14-py3-none-macosx_10_12_x86_64.whl - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl osx-arm64: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda @@ -233,6 +236,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/56/9e/d13e40f83b8d0a94430e6778ce1d94a43b38cf2efe63278bdd2b4c65abbf/ruff-0.15.14-py3-none-macosx_11_0_arm64.whl - pypi: 
https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda @@ -1045,6 +1049,26 @@ packages: - tomli ; python_full_version < '3.11' and extra == 'tests' - tomli-w ; extra == 'tests' requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/1e/c0/56472c251d09858a53e51efbd485b09e1995d8731668b76d52e5dd6ee0f1/ruff-0.15.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: ruff + version: 0.15.14 + sha256: 715c543cf450c4888251f91c52f1942a800541d9bddd7ac060aa4e6b77ae7cba + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/45/91/254a35c20acc38a7223c9d2d594af12e794432464f2cdeb52af1dc4a892d/ruff-0.15.14-py3-none-macosx_10_12_x86_64.whl + name: ruff + version: 0.15.14 + sha256: be4ff55af755bd71a00ab3dc6bd7ffc467bd76e0df6881e286c2e3d23e8fb43b + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/56/9e/d13e40f83b8d0a94430e6778ce1d94a43b38cf2efe63278bdd2b4c65abbf/ruff-0.15.14-py3-none-macosx_11_0_arm64.whl + name: ruff + version: 0.15.14 + sha256: 48d5909d7d06276ce7dde6d32bfa4b0d4cb2651145cd8ee4b440722cbc77832f + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/8d/f1/b15a7839fa4f332f8acec78e20564f26bb2d866e3d21710b877fd0263000/ruff-0.15.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl + name: ruff + version: 0.15.14 + sha256: ca8cbfa94c4f90984a67561978602746d4cd27103568f745fa90eee3f0d4107d + requires_python: '>=3.7' - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac md5: cffd3bdd58090148f4cfcd831f4b26ab diff --git a/collection-seeding/pixi.toml b/collection-seeding/pixi.toml index 5f7eca32a..f495ccb5b 100644 --- a/collection-seeding/pixi.toml +++ b/collection-seeding/pixi.toml @@ 
-19,9 +19,12 @@ seed-resistance = "python seed.py --source covid-resistance-mutations" [feature.test.pypi-dependencies] pytest = ">=9.0.3" responses = ">=0.26.0" +ruff = ">=0.9" [feature.test.tasks] test = "pytest" +lint = "ruff check ." +format = "ruff format ." [environments] test = { features = ["test"] } diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 1d5f214fe..cd1a8bc23 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -29,7 +29,10 @@ def main(): return if args.source and args.source not in source_map: - print(f"Unknown source '{args.source}'. Use --list to see available sources.", file=sys.stderr) + print( + f"Unknown source '{args.source}'. Use --list to see available sources.", + file=sys.stderr, + ) sys.exit(1) if not args.api_key: @@ -99,12 +102,14 @@ def make_parser() -> argparse.ArgumentParser: formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( - "-u", "--url", + "-u", + "--url", default=os.environ.get("API_URL", "http://localhost:4321"), help="API base URL (default: $API_URL or http://localhost:4321)", ) parser.add_argument( - "-k", "--api-key", + "-k", + "--api-key", default=os.environ.get("API_KEY"), help="API key for authentication (default: $API_KEY)", ) diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index 42a5eb8c5..eb6ef56dc 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -10,12 +10,10 @@ class Source(ABC): and implement `get_collections` to return the collections to upsert. Then register it in sources/registry.py. """ + @property @abstractmethod def name(self) -> str: ... @abstractmethod - def get_collections(self) -> list[Collection]: - ... - - + def get_collections(self) -> list[Collection]: ... 
diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index 193b198aa..cfc5cecf1 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -8,11 +8,13 @@ "/refs/heads/main/data/pango-consensus-sequences_summary.json" ) + class PangoLineagesSource(Source): """Source: Pango lineage definitions from corneliusroemer/pango-sequences. Creates one collection per lineage, with nucleotide substitutions as variants. """ + name = "covid-pango-lineages" def __init__(self, limit: int | None = None): @@ -24,7 +26,7 @@ def get_collections(self) -> list[Collection]: response.raise_for_status() entries = list(response.json().values()) if self._limit is not None: - entries = entries[:self._limit] + entries = entries[: self._limit] print(f" Loaded {len(entries)} lineage(s).") return [self._build_collection(e) for e in entries] @@ -70,11 +72,17 @@ def _build_collection(entry: dict) -> Collection: f"Designated: {date}." ) - return {"name": lineage, "organism": "covid", "description": description, "variants": variants} + return { + "name": lineage, + "organism": "covid", + "description": description, + "variants": variants, + } class PangoLineagesSampleSource(PangoLineagesSource): """Same as PangoLineagesSource but limited to the first 10 lineages, for quick testing.""" + name = "covid-pango-lineages-sample" def __init__(self): diff --git a/collection-seeding/sources/resistance_mutations.py b/collection-seeding/sources/resistance_mutations.py index 27aad53a0..1feeba92b 100644 --- a/collection-seeding/sources/resistance_mutations.py +++ b/collection-seeding/sources/resistance_mutations.py @@ -8,6 +8,7 @@ class ResistanceMutationsSource(Source): Three collections covering 3CLpro, RdRp, and Spike mAb resistance mutations as per the Stanford Coronavirus Antiviral & Resistance database. 
""" + name = "covid-resistance-mutations" def get_collections(self) -> list[Collection]: @@ -50,10 +51,10 @@ def _mature_name(mutation: str, set_name: str, offset: int) -> str: e.g. _mature_name("ORF1a:T3284I", "3CLpro", -3263) -> "3CLpro:T21I" """ - mut_part = mutation[mutation.index(':') + 1:] + mut_part = mutation[mutation.index(":") + 1 :] original_base = mut_part[0] new_base = mut_part[-1] - position = int(''.join(c for c in mut_part if c.isdigit())) + position = int("".join(c for c in mut_part if c.isdigit())) return f"{set_name}:{original_base}{position + offset}{new_base}" @@ -69,72 +70,264 @@ def _build_variants(mutations: list[str], set_name: str, offset: int) -> list[Va CLPRO_MUTATIONS = [ - 'ORF1a:T3284I', 'ORF1a:T3288A', 'ORF1a:T3288N', 'ORF1a:T3308I', 'ORF1a:D3311Y', - 'ORF1a:M3312I', 'ORF1a:M3312L', 'ORF1a:M3312T', 'ORF1a:M3312-', 'ORF1a:L3313F', - 'ORF1a:G3401S', 'ORF1a:F3403L', 'ORF1a:F3403S', 'ORF1a:N3405D', 'ORF1a:N3405L', - 'ORF1a:N3405S', 'ORF1a:G3406S', 'ORF1a:S3407A', 'ORF1a:S3407E', 'ORF1a:S3407L', - 'ORF1a:S3407P', 'ORF1a:C3423F', 'ORF1a:M3428R', 'ORF1a:M3428T', 'ORF1a:E3429A', - 'ORF1a:E3429G', 'ORF1a:E3429K', 'ORF1a:E3429Q', 'ORF1a:E3429V', 'ORF1a:L3430F', - 'ORF1a:P3431-', 'ORF1a:T3432I', 'ORF1a:H3435L', 'ORF1a:H3435N', 'ORF1a:H3435Q', - 'ORF1a:H3435Y', 'ORF1a:A3436T', 'ORF1a:A3436V', 'ORF1a:V3449A', 'ORF1a:R3451G', - 'ORF1a:R3451S', 'ORF1a:Q3452I', 'ORF1a:Q3452K', 'ORF1a:T3453I', 'ORF1a:A3454T', - 'ORF1a:A3454V', 'ORF1a:Q3455A', 'ORF1a:Q3455C', 'ORF1a:Q3455D', 'ORF1a:Q3455E', - 'ORF1a:Q3455F', 'ORF1a:Q3455G', 'ORF1a:Q3455H', 'ORF1a:Q3455I', 'ORF1a:Q3455K', - 'ORF1a:Q3455L', 'ORF1a:Q3455N', 'ORF1a:Q3455P', 'ORF1a:Q3455R', 'ORF1a:Q3455S', - 'ORF1a:Q3455T', 'ORF1a:Q3455V', 'ORF1a:Q3455W', 'ORF1a:Q3455Y', 'ORF1a:A3456P', - 'ORF1a:A3457S', 'ORF1a:P3515L', 'ORF1a:V3560A', 'ORF1a:S3564P', 'ORF1a:T3567I', - 'ORF1a:F3568L', + "ORF1a:T3284I", + "ORF1a:T3288A", + "ORF1a:T3288N", + "ORF1a:T3308I", + "ORF1a:D3311Y", + "ORF1a:M3312I", + 
"ORF1a:M3312L", + "ORF1a:M3312T", + "ORF1a:M3312-", + "ORF1a:L3313F", + "ORF1a:G3401S", + "ORF1a:F3403L", + "ORF1a:F3403S", + "ORF1a:N3405D", + "ORF1a:N3405L", + "ORF1a:N3405S", + "ORF1a:G3406S", + "ORF1a:S3407A", + "ORF1a:S3407E", + "ORF1a:S3407L", + "ORF1a:S3407P", + "ORF1a:C3423F", + "ORF1a:M3428R", + "ORF1a:M3428T", + "ORF1a:E3429A", + "ORF1a:E3429G", + "ORF1a:E3429K", + "ORF1a:E3429Q", + "ORF1a:E3429V", + "ORF1a:L3430F", + "ORF1a:P3431-", + "ORF1a:T3432I", + "ORF1a:H3435L", + "ORF1a:H3435N", + "ORF1a:H3435Q", + "ORF1a:H3435Y", + "ORF1a:A3436T", + "ORF1a:A3436V", + "ORF1a:V3449A", + "ORF1a:R3451G", + "ORF1a:R3451S", + "ORF1a:Q3452I", + "ORF1a:Q3452K", + "ORF1a:T3453I", + "ORF1a:A3454T", + "ORF1a:A3454V", + "ORF1a:Q3455A", + "ORF1a:Q3455C", + "ORF1a:Q3455D", + "ORF1a:Q3455E", + "ORF1a:Q3455F", + "ORF1a:Q3455G", + "ORF1a:Q3455H", + "ORF1a:Q3455I", + "ORF1a:Q3455K", + "ORF1a:Q3455L", + "ORF1a:Q3455N", + "ORF1a:Q3455P", + "ORF1a:Q3455R", + "ORF1a:Q3455S", + "ORF1a:Q3455T", + "ORF1a:Q3455V", + "ORF1a:Q3455W", + "ORF1a:Q3455Y", + "ORF1a:A3456P", + "ORF1a:A3457S", + "ORF1a:P3515L", + "ORF1a:V3560A", + "ORF1a:S3564P", + "ORF1a:T3567I", + "ORF1a:F3568L", ] RDRP_MUTATIONS = [ - 'ORF1b:V157A', 'ORF1b:V157L', 'ORF1b:N189S', 'ORF1b:R276C', 'ORF1b:A367V', - 'ORF1b:A440V', 'ORF1b:F471L', 'ORF1b:D475Y', 'ORF1b:A517V', 'ORF1b:V548L', - 'ORF1b:G662S', 'ORF1b:S750A', 'ORF1b:V783I', 'ORF1b:E787G', 'ORF1b:C790F', - 'ORF1b:C790R', 'ORF1b:E793A', 'ORF1b:E793D', 'ORF1b:M915R', + "ORF1b:V157A", + "ORF1b:V157L", + "ORF1b:N189S", + "ORF1b:R276C", + "ORF1b:A367V", + "ORF1b:A440V", + "ORF1b:F471L", + "ORF1b:D475Y", + "ORF1b:A517V", + "ORF1b:V548L", + "ORF1b:G662S", + "ORF1b:S750A", + "ORF1b:V783I", + "ORF1b:E787G", + "ORF1b:C790F", + "ORF1b:C790R", + "ORF1b:E793A", + "ORF1b:E793D", + "ORF1b:M915R", ] SPIKE_MUTATIONS = [ - 'S:P337H', 'S:P337L', 'S:P337R', 'S:P337S', 'S:P337T', - 'S:E340A', 'S:E340D', 'S:E340G', 'S:E340K', 'S:E340Q', 'S:E340V', - 'S:T345P', - 'S:R346G', 'S:R346I', 'S:R346K', 
'S:R346S', 'S:R346T', - 'S:K356Q', 'S:K356T', - 'S:S371F', 'S:S371L', - 'S:D405E', 'S:D405N', 'S:E406D', - 'S:K417E', 'S:K417H', 'S:K417I', 'S:K417M', 'S:K417N', 'S:K417R', 'S:K417S', 'S:K417T', - 'S:D420A', 'S:D420N', - 'S:N439K', - 'S:N440D', 'S:N440E', 'S:N440I', 'S:N440K', 'S:N440R', 'S:N440T', 'S:N440Y', - 'S:S443Y', - 'S:K444E', 'S:K444F', 'S:K444I', 'S:K444L', 'S:K444M', 'S:K444N', 'S:K444R', 'S:K444T', - 'S:V445A', 'S:V445D', 'S:V445F', 'S:V445I', 'S:V445L', - 'S:G446A', 'S:G446D', 'S:G446I', 'S:G446N', 'S:G446R', 'S:G446S', 'S:G446T', 'S:G446V', - 'S:G447C', 'S:G447D', 'S:G447F', 'S:G447S', 'S:G447V', - 'S:N448D', 'S:N448K', 'S:N448T', 'S:N448Y', - 'S:Y449D', - 'S:N450D', 'S:N450K', - 'S:L452M', 'S:L452Q', 'S:L452R', 'S:L452W', - 'S:Y453F', 'S:Y453H', - 'S:L455F', 'S:L455M', 'S:L455S', 'S:L455W', - 'S:F456C', 'S:F456L', 'S:F456V', - 'S:S459P', - 'S:N460D', 'S:N460H', 'S:N460I', 'S:N460K', 'S:N460S', 'S:N460T', 'S:N460Y', - 'S:A475D', 'S:A475V', - 'S:G476D', 'S:G476R', 'S:G476T', - 'S:V483A', - 'S:E484A', 'S:E484D', 'S:E484G', 'S:E484K', 'S:E484P', 'S:E484Q', 'S:E484R', 'S:E484S', 'S:E484T', 'S:E484V', - 'S:G485D', 'S:G485R', - 'S:F486D', 'S:F486I', 'S:F486L', 'S:F486N', 'S:F486P', 'S:F486S', 'S:F486T', 'S:F486V', - 'S:N487D', 'S:N487H', 'S:N487S', - 'S:Y489H', 'S:Y489W', - 'S:F490G', 'S:F490I', 'S:F490L', 'S:F490R', 'S:F490S', 'S:F490V', 'S:F490Y', - 'S:Q493D', 'S:Q493E', 'S:Q493H', 'S:Q493K', 'S:Q493L', 'S:Q493R', 'S:Q493V', - 'S:S494P', 'S:S494R', - 'S:G496S', - 'S:Q498H', - 'S:P499H', 'S:P499R', 'S:P499S', 'S:P499T', - 'S:N501T', 'S:N501Y', - 'S:G504C', 'S:G504D', 'S:G504I', 'S:G504L', 'S:G504N', 'S:G504R', 'S:G504V', - 'S:P507A', - 'S:N856K', 'S:N969K', 'S:E990A', 'S:T1009I', + "S:P337H", + "S:P337L", + "S:P337R", + "S:P337S", + "S:P337T", + "S:E340A", + "S:E340D", + "S:E340G", + "S:E340K", + "S:E340Q", + "S:E340V", + "S:T345P", + "S:R346G", + "S:R346I", + "S:R346K", + "S:R346S", + "S:R346T", + "S:K356Q", + "S:K356T", + "S:S371F", + "S:S371L", + 
"S:D405E", + "S:D405N", + "S:E406D", + "S:K417E", + "S:K417H", + "S:K417I", + "S:K417M", + "S:K417N", + "S:K417R", + "S:K417S", + "S:K417T", + "S:D420A", + "S:D420N", + "S:N439K", + "S:N440D", + "S:N440E", + "S:N440I", + "S:N440K", + "S:N440R", + "S:N440T", + "S:N440Y", + "S:S443Y", + "S:K444E", + "S:K444F", + "S:K444I", + "S:K444L", + "S:K444M", + "S:K444N", + "S:K444R", + "S:K444T", + "S:V445A", + "S:V445D", + "S:V445F", + "S:V445I", + "S:V445L", + "S:G446A", + "S:G446D", + "S:G446I", + "S:G446N", + "S:G446R", + "S:G446S", + "S:G446T", + "S:G446V", + "S:G447C", + "S:G447D", + "S:G447F", + "S:G447S", + "S:G447V", + "S:N448D", + "S:N448K", + "S:N448T", + "S:N448Y", + "S:Y449D", + "S:N450D", + "S:N450K", + "S:L452M", + "S:L452Q", + "S:L452R", + "S:L452W", + "S:Y453F", + "S:Y453H", + "S:L455F", + "S:L455M", + "S:L455S", + "S:L455W", + "S:F456C", + "S:F456L", + "S:F456V", + "S:S459P", + "S:N460D", + "S:N460H", + "S:N460I", + "S:N460K", + "S:N460S", + "S:N460T", + "S:N460Y", + "S:A475D", + "S:A475V", + "S:G476D", + "S:G476R", + "S:G476T", + "S:V483A", + "S:E484A", + "S:E484D", + "S:E484G", + "S:E484K", + "S:E484P", + "S:E484Q", + "S:E484R", + "S:E484S", + "S:E484T", + "S:E484V", + "S:G485D", + "S:G485R", + "S:F486D", + "S:F486I", + "S:F486L", + "S:F486N", + "S:F486P", + "S:F486S", + "S:F486T", + "S:F486V", + "S:N487D", + "S:N487H", + "S:N487S", + "S:Y489H", + "S:Y489W", + "S:F490G", + "S:F490I", + "S:F490L", + "S:F490R", + "S:F490S", + "S:F490V", + "S:F490Y", + "S:Q493D", + "S:Q493E", + "S:Q493H", + "S:Q493K", + "S:Q493L", + "S:Q493R", + "S:Q493V", + "S:S494P", + "S:S494R", + "S:G496S", + "S:Q498H", + "S:P499H", + "S:P499R", + "S:P499S", + "S:P499T", + "S:N501T", + "S:N501Y", + "S:G504C", + "S:G504D", + "S:G504I", + "S:G504L", + "S:G504N", + "S:G504R", + "S:G504V", + "S:P507A", + "S:N856K", + "S:N969K", + "S:E990A", + "S:T1009I", ] diff --git a/collection-seeding/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py index 2025642fa..2576068fb 
100644 --- a/collection-seeding/tests/test_pango_lineages.py +++ b/collection-seeding/tests/test_pango_lineages.py @@ -45,6 +45,7 @@ def test_name(): # --- _build_collection --- + def test_build_collection_basic(): col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) assert col["name"] == "BA.2" @@ -54,14 +55,14 @@ def test_build_collection_basic(): def test_build_collection_description_format(): col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) assert "BA.2" in col["description"] - assert "BA" in col["description"] # parent - assert "22C" in col["description"] # clade + assert "BA" in col["description"] # parent + assert "22C" in col["description"] # clade assert "2022-01-20" in col["description"] def test_build_collection_missing_fields_use_defaults(): col = PangoLineagesSource._build_collection(SAMPLE_DATA["XBB"]) - assert "—" in col["description"] # parent and clade fallback + assert "—" in col["description"] # parent and clade fallback assert "unknown" in col["description"] # date fallback @@ -117,6 +118,7 @@ def test_build_collection_empty_lists_when_all_blanks(): # --- get_collections --- + @rsps_lib.activate def test_get_collections_fetches_data_url(): rsps_lib.add(rsps_lib.GET, DATA_URL, json=SAMPLE_DATA, status=200) diff --git a/collection-seeding/tests/test_resistance_mutations.py b/collection-seeding/tests/test_resistance_mutations.py index 84a7754fa..c708f7673 100644 --- a/collection-seeding/tests/test_resistance_mutations.py +++ b/collection-seeding/tests/test_resistance_mutations.py @@ -7,6 +7,7 @@ def test_name(): # --- _mature_name --- + def test_mature_name_clpro_offset(): # ORF1a position 3284, offset -3263 → position 21 assert _mature_name("ORF1a:T3284I", "3CLpro", -3263) == "3CLpro:T21I" @@ -28,6 +29,7 @@ def test_mature_name_deletion(): # --- get_collections --- + def test_get_collections_returns_three(): cols = ResistanceMutationsSource().get_collections() assert len(cols) == 3 diff --git 
a/collection-seeding/tests/test_seed.py b/collection-seeding/tests/test_seed.py index e2f18e491..60bd4181c 100644 --- a/collection-seeding/tests/test_seed.py +++ b/collection-seeding/tests/test_seed.py @@ -8,7 +8,13 @@ "name": "Mock Collection A", "organism": "covid", "description": "A mock collection for testing.", - "variants": [{"type": "filterObject", "name": "C123T", "filterObject": {"nucleotideMutations": ["C123T"]}}], + "variants": [ + { + "type": "filterObject", + "name": "C123T", + "filterObject": {"nucleotideMutations": ["C123T"]}, + } + ], }, { "name": "Mock Collection B", @@ -28,6 +34,7 @@ def make_client(existing=None): # --- seed_source: create / update / mixed --- + def test_all_new_creates_all(): client = make_client(existing=[]) created, updated = seed_source(client, MockSource(COLLECTIONS)) @@ -76,7 +83,9 @@ def test_fetch_called_once_per_organism(): client = make_client(existing=[]) seed_source(client, MockSource(multi)) assert client.fetch_existing_collections.call_count == 2 - organisms_fetched = {c.args[0] for c in client.fetch_existing_collections.call_args_list} + organisms_fetched = { + c.args[0] for c in client.fetch_existing_collections.call_args_list + } assert organisms_fetched == {"covid", "mpox"} From c387cb7ece083bc52b83f7df53cd91e5451e15a6 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 09:40:34 +0200 Subject: [PATCH 42/54] chore(collection-seeding): use whitelist dockerignore with COPY . . 
Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/.dockerignore | 11 +++++++---- collection-seeding/Dockerfile | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/collection-seeding/.dockerignore b/collection-seeding/.dockerignore index 704b0a8c9..2688cbdae 100644 --- a/collection-seeding/.dockerignore +++ b/collection-seeding/.dockerignore @@ -1,4 +1,7 @@ -.pixi/ -__pycache__/ -tests/ -*.md +* +!pixi.toml +!pixi.lock +!seed.py +!api.py +!models.py +!sources/ diff --git a/collection-seeding/Dockerfile b/collection-seeding/Dockerfile index 21727058c..da7b06e63 100644 --- a/collection-seeding/Dockerfile +++ b/collection-seeding/Dockerfile @@ -9,6 +9,5 @@ FROM python:3.13-slim AS final WORKDIR /app COPY --from=builder /app/.pixi/envs/default/lib/python3.13/site-packages \ /usr/local/lib/python3.13/site-packages -COPY seed.py api.py models.py . -COPY sources/ sources/ +COPY . . CMD ["python", "seed.py"] From 627f4f8367b37aa12f8df165bea929da0c6b1687 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 09:45:05 +0200 Subject: [PATCH 43/54] refactor(collection-seeding): raise RuntimeError instead of sys.exit in api.py Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/api.py | 7 ++----- collection-seeding/seed.py | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/collection-seeding/api.py b/collection-seeding/api.py index 35350bf76..5711c2105 100644 --- a/collection-seeding/api.py +++ b/collection-seeding/api.py @@ -1,6 +1,5 @@ """Shared backend API client for collection seeders.""" -import sys import time import requests @@ -30,11 +29,9 @@ def wait_for_api( pass print(f"Waiting for API... (attempt {attempt}/{attempts})") time.sleep(delay) - print( - f"API at {self.base_url} did not become ready after {attempts} attempts.", - file=sys.stderr, + raise RuntimeError( + f"API at {self.base_url} did not become ready after {attempts} attempts." 
) - sys.exit(1) def fetch_existing_collections(self, organism: str) -> list[ExistingCollection]: r = requests.get( diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index cd1a8bc23..2daf3f781 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -42,13 +42,13 @@ def main(): client = ApiClient(args.url, args.api_key) print(f"Seeding collections against {args.url} ...") - if args.wait: - client.wait_for_api() - active = [source_map[args.source]] if args.source else list(source_map.values()) - while True: - try: + try: + if args.wait: + client.wait_for_api() + + while True: total_created = 0 total_updated = 0 for source in active: @@ -57,13 +57,13 @@ def main(): total_updated += u if len(active) > 1: print(f"\nTotal — created: {total_created}, updated: {total_updated}.") - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - sys.exit(1) - if not args.repeat_interval_hours: - break - print(f"\nSleeping for {args.repeat_interval_hours}h ...") - time.sleep(args.repeat_interval_hours * 3600) + if not args.repeat_interval_hours: + break + print(f"\nSleeping for {args.repeat_interval_hours}h ...") + time.sleep(args.repeat_interval_hours * 3600) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: From d280a52f251e2f47a6b41f2f14395584edd1374e Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 10:24:27 +0200 Subject: [PATCH 44/54] docs(collection-seeding): fix idempotency description in seed.py Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 2daf3f781..2f5d99895 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Seeds example collections into the backend from one or more data sources. 
-Idempotent: skips any collection whose name already exists for the seed user. +Idempotent: upserts collections by name — creates new ones, updates existing ones. Run with --help for usage. """ From d45ee0874a9e99d19f17c0d6153130361166d683 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 13:58:37 +0200 Subject: [PATCH 45/54] fix(collection-seeding): convert REPEAT_INTERVAL_HOURS env var to float Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 2f5d99895..97d7ec041 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -132,7 +132,9 @@ def make_parser() -> argparse.ArgumentParser: parser.add_argument( "--repeat-interval-hours", type=float, - default=os.environ.get("REPEAT_INTERVAL_HOURS"), + default=float(os.environ["REPEAT_INTERVAL_HOURS"]) + if os.environ.get("REPEAT_INTERVAL_HOURS") + else None, metavar="HOURS", help="Re-seed every N hours instead of exiting (default: $REPEAT_INTERVAL_HOURS or run once)", ) From 74056c695e04429a1757a2db0c79305d24b4693d Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 14:14:17 +0200 Subject: [PATCH 46/54] docs: update collection-seeding ADR with Kotlin rationale Co-Authored-By: Claude Sonnet 4.6 --- docs/arc42/09-architecture-decisions.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/arc42/09-architecture-decisions.md b/docs/arc42/09-architecture-decisions.md index 6f0853a9c..2d6151a19 100644 --- a/docs/arc42/09-architecture-decisions.md +++ b/docs/arc42/09-architecture-decisions.md @@ -29,3 +29,25 @@ We would have needed to host it on one of our servers and configure it properly. Although is it relatively easy to get a Keycloak Docker image running, it is still difficult to get the configuration right. 
None of the team members was an expert in Keycloak and misconfigurations can lead to security issues. + +## Auto-creation of collections + +### Context + +As GenSpectrum, we want to provide a few collections that we create based on other online resources. +Ideally, others would maintain their own collections, but for now we want to do it ourselves. +For example, we create collections for mutations that are relevant for vaccine resistance, based +on online lists. Or we create collections based on canonical lineage definitions. + +Since we want others to also be able to easily generate collections, the code should also serve +as a kind of reference implementation on how one would generate collections. Therefore we want the +code to be completely independent from the rest of the codebase, and also understandable to +bioinformatics researchers. + +### Decision + +We decided to write the code in Python. +The alternative would have been to use JavaScript, since we already have that. But since we want +researchers to reuse or copy the code, Python is better suited. +Kotlin was not considered — it is heavier than both Python and JavaScript and even less familiar +to the target audience. 
From aecff722dec1e39881ec6a3009ca6f272f5640f2 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 14:19:53 +0200 Subject: [PATCH 47/54] feat(collection-seeding): exclude sample source from default run Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/seed.py | 6 +++++- collection-seeding/sources/__init__.py | 5 +++++ collection-seeding/sources/pango_lineages.py | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 97d7ec041..839bcef19 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -42,7 +42,11 @@ def main(): client = ApiClient(args.url, args.api_key) print(f"Seeding collections against {args.url} ...") - active = [source_map[args.source]] if args.source else list(source_map.values()) + active = ( + [source_map[args.source]] + if args.source + else [s for s in source_map.values() if s.include_in_default_run] + ) try: if args.wait: diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index eb6ef56dc..12b9fb9e8 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -9,8 +9,13 @@ class Source(ABC): Implement this to add a new source: set a unique `name` (used as the --source flag value) and implement `get_collections` to return the collections to upsert. Then register it in sources/registry.py. + + Set `include_in_default_run = False` for sources that should only be used via --source + (e.g. demo/sample sources that overlap with a full source). """ + include_in_default_run: bool = True + @property @abstractmethod def name(self) -> str: ... 
diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index cfc5cecf1..67e1e53d1 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -84,6 +84,7 @@ class PangoLineagesSampleSource(PangoLineagesSource): """Same as PangoLineagesSource but limited to the first 10 lineages, for quick testing.""" name = "covid-pango-lineages-sample" + include_in_default_run = False def __init__(self): super().__init__(limit=10) From a8191b60b2518ee294b881e94df309e3083f442e Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 14:24:14 +0200 Subject: [PATCH 48/54] ci(collection-seeding): run Python tests before building Docker image Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/example-data-seeder.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/example-data-seeder.yml b/.github/workflows/example-data-seeder.yml index 4ea69aa21..c5d807f87 100644 --- a/.github/workflows/example-data-seeder.yml +++ b/.github/workflows/example-data-seeder.yml @@ -6,8 +6,23 @@ env: DOCKER_IMAGE_NAME: ghcr.io/genspectrum/dashboards/example-data-seeder jobs: + test: + name: Test Example Data Seeder + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - uses: prefix-dev/setup-pixi@v0.9.6 + with: + environments: test + + - name: Run tests + working-directory: ./collection-seeding + run: pixi run -e test test + dockerImage: name: Build Example Data Seeder Docker Image + needs: test runs-on: ubuntu-latest permissions: contents: read From c19425aa984c65619ded8598e8a8d9824bbb412b Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 14:32:17 +0200 Subject: [PATCH 49/54] fix(docker-compose): hardcode dummy system user API key for both backend and seeder Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/README.md | 2 +- docker-compose.yml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/collection-seeding/README.md b/collection-seeding/README.md index 87a40071f..3ed909a5f 100644 --- a/collection-seeding/README.md +++ b/collection-seeding/README.md @@ -15,7 +15,7 @@ Use `--repeat-interval-hours N` (or `$REPEAT_INTERVAL_HOURS`) to run on a loop The seeder runs automatically as part of Docker Compose: ```bash -BACKEND_TAG=latest WEBSITE_TAG=latest SEEDER_TAG=latest SEEDER_API_KEY=... docker compose up +BACKEND_TAG=latest WEBSITE_TAG=latest SEEDER_TAG=latest docker compose up ``` ## Running locally diff --git a/docker-compose.yml b/docker-compose.yml index 9d4562443..16e0848a5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,6 +19,8 @@ services: - "127.0.0.1:9021:8080" depends_on: - database + environment: + DASHBOARDS_SYSTEM_USER_API_KEY: dummy-system-user-api-key-insecure command: - --spring.datasource.url=jdbc:postgresql://database:5432/dashboards-backend-db - --spring.datasource.username=postgres @@ -42,7 +44,7 @@ services: - website environment: API_URL: http://website:4321 - API_KEY: ${SEEDER_API_KEY} + API_KEY: dummy-system-user-api-key-insecure REPEAT_INTERVAL_HOURS: 8 restart: always From 6ac8098c9bfadc828ae4aeec26cb666fb5aa316b Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 14:33:28 +0200 Subject: [PATCH 50/54] fix(ci): set manifest-path for setup-pixi in seeder workflow Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/example-data-seeder.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/example-data-seeder.yml b/.github/workflows/example-data-seeder.yml index c5d807f87..c759a77e3 100644 --- a/.github/workflows/example-data-seeder.yml +++ b/.github/workflows/example-data-seeder.yml @@ -14,6 +14,7 @@ jobs: - uses: prefix-dev/setup-pixi@v0.9.6 with: + manifest-path: collection-seeding/pixi.toml environments: test - name: Run tests From 478734260de889c28e7b8a27ca12b6cd192d1223 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 15:02:38 +0200 Subject: [PATCH 51/54] 
feat(collection-seeding): add searchable tags to collection descriptions Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/sources/pango_lineages.py | 3 ++- collection-seeding/sources/resistance_mutations.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index 67e1e53d1..dabc28587 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -69,7 +69,8 @@ def _build_collection(entry: dict) -> Collection: f"Pango lineage {lineage}. " f"Parent: {parent}. " f"Nextstrain clade: {clade}. " - f"Designated: {date}." + f"Designated: {date}. " + f"#pango-lineage" ) return { diff --git a/collection-seeding/sources/resistance_mutations.py b/collection-seeding/sources/resistance_mutations.py index 1feeba92b..f783f60ef 100644 --- a/collection-seeding/sources/resistance_mutations.py +++ b/collection-seeding/sources/resistance_mutations.py @@ -19,7 +19,8 @@ def get_collections(self) -> list[Collection]: "description": ( "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." + "(last updated 21 August 2024). " + "#resistance-mutation" ), "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), }, @@ -29,7 +30,8 @@ def get_collections(self) -> list[Collection]: "description": ( "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." + "(last updated 21 August 2024). 
" + "#resistance-mutation" ), "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), }, @@ -39,7 +41,8 @@ def get_collections(self) -> list[Collection]: "description": ( "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024)." + "(last updated 21 August 2024). " + "#resistance-mutation" ), "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), }, From f9671a42a787c377ca0f7b60df86fb46443a9f30 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 28 May 2026 17:04:21 +0200 Subject: [PATCH 52/54] feat(collection-seeding): detect and delete orphaned pango lineage collections Co-Authored-By: Claude Sonnet 4.6 --- collection-seeding/api.py | 11 +++ collection-seeding/models.py | 1 + collection-seeding/seed.py | 69 +++++++++----- collection-seeding/sources/__init__.py | 13 ++- collection-seeding/sources/pango_lineages.py | 7 +- .../sources/resistance_mutations.py | 11 +-- collection-seeding/tests/mock_source.py | 2 + .../tests/test_pango_lineages.py | 18 ++-- collection-seeding/tests/test_seed.py | 95 +++++++++++++++---- 9 files changed, 163 insertions(+), 64 deletions(-) diff --git a/collection-seeding/api.py b/collection-seeding/api.py index 5711c2105..bce08cdb3 100644 --- a/collection-seeding/api.py +++ b/collection-seeding/api.py @@ -57,6 +57,17 @@ def create_collection(self, collection: Collection) -> int: ) return r.json()["id"] + def delete_collection(self, collection_id: int) -> None: + r = requests.delete( + f"{self._collections_url}/{collection_id}", + headers=self._auth_headers, + timeout=10, + ) + if not r.ok: + raise RuntimeError( + f"DELETE /api/collections/{collection_id} failed: {r.status_code} {r.text}" + ) + def update_collection(self, collection_id: int, collection: Collection) -> None: # CollectionUpdate has no organism field; sending it causes a 400 (fail-on-unknown-properties=true) body = {k: v for k, v in collection.items() if k != 
"organism"} diff --git a/collection-seeding/models.py b/collection-seeding/models.py index 2f5a84e1e..e0e636282 100644 --- a/collection-seeding/models.py +++ b/collection-seeding/models.py @@ -26,3 +26,4 @@ class ExistingCollection(TypedDict): id: int name: str + description: str | None diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 839bcef19..2567a59e5 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -55,12 +55,16 @@ def main(): while True: total_created = 0 total_updated = 0 + total_deleted = 0 for source in active: - c, u = seed_source(client, source) + c, u, d = seed_source(client, source) total_created += c total_updated += u + total_deleted += d if len(active) > 1: - print(f"\nTotal — created: {total_created}, updated: {total_updated}.") + print( + f"\nTotal — created: {total_created}, updated: {total_updated}, deleted: {total_deleted}." + ) if not args.repeat_interval_hours: break print(f"\nSleeping for {args.repeat_interval_hours}h ...") @@ -70,34 +74,53 @@ def main(): sys.exit(1) -def seed_source(client: ApiClient, source: Source) -> tuple[int, int]: - """Upsert collections for one source, grouped by organism. Returns (created, updated) counts. +def seed_source(client: ApiClient, source: Source) -> tuple[int, int, int]: + """Upsert collections for one source. Returns (created, updated, deleted) counts. 
Matching is by name — if a collection's name changes in the source, the old entry is orphaned and a new one is created.""" collections = source.get_collections() print(f"\n[{source.name}]") - organisms: dict[str, list[Collection]] = {} - for c in collections: - organisms.setdefault(c["organism"], []).append(c) + existing = client.fetch_existing_collections(source.organism) + existing_by_name = { + c["name"]: c for c in existing if source.owned_tag in (c["description"] or "") + } created = 0 updated = 0 - for organism, org_collections in organisms.items(): - existing = client.fetch_existing_collections(organism) - existing_by_name = {c["name"]: c for c in existing} - for collection in org_collections: - existing_entry = existing_by_name.get(collection["name"]) - if existing_entry: - client.update_collection(existing_entry["id"], collection) - print(f" UPDATE id={existing_entry['id']} {collection['name']}") - updated += 1 - else: - col_id = client.create_collection(collection) - print(f" CREATE id={col_id} {collection['name']}") - created += 1 - - print(f" Created: {created}, updated: {updated}.") - return created, updated + for collection in collections: + c, u = _upsert_collection(client, collection, existing_by_name) + existing_by_name.pop(collection["name"], None) + created += c + updated += u + + orphan_ids = [e["id"] for e in existing_by_name.values()] + deleted = _delete_collections(client, orphan_ids) + + print(f" Created: {created}, updated: {updated}, deleted: {deleted}.") + return created, updated, deleted + + +def _upsert_collection( + client: ApiClient, + collection: Collection, + existing_by_name: dict, +) -> tuple[int, int]: + existing_entry = existing_by_name.get(collection["name"]) + if existing_entry: + client.update_collection(existing_entry["id"], collection) + print(f" UPDATE id={existing_entry['id']} {collection['name']}") + return 0, 1 + else: + col_id = client.create_collection(collection) + print(f" CREATE id={col_id} {collection['name']}") + 
return 1, 0 + + +def _delete_collections(client: ApiClient, collection_ids: list[int]) -> int: + for col_id in collection_ids: + client.delete_collection(col_id) + print(f" DELETE id={col_id}") + return len(collection_ids) def make_parser() -> argparse.ArgumentParser: diff --git a/collection-seeding/sources/__init__.py b/collection-seeding/sources/__init__.py index 12b9fb9e8..5f898d4a5 100644 --- a/collection-seeding/sources/__init__.py +++ b/collection-seeding/sources/__init__.py @@ -6,8 +6,9 @@ class Source(ABC): """A data source that produces collections to be seeded into the backend. - Implement this to add a new source: set a unique `name` (used as the --source flag value) - and implement `get_collections` to return the collections to upsert. + Implement this to add a new source: set a unique `name` (used as the --source flag value), + an `organism`, an `owned_tag` (appended to each description and used to identify orphaned + collections for deletion), and implement `get_collections` to return the collections to upsert. Then register it in sources/registry.py. Set `include_in_default_run = False` for sources that should only be used via --source @@ -20,5 +21,13 @@ class Source(ABC): @abstractmethod def name(self) -> str: ... + @property + @abstractmethod + def organism(self) -> str: ... + + @property + @abstractmethod + def owned_tag(self) -> str: ... + @abstractmethod def get_collections(self) -> list[Collection]: ... 
diff --git a/collection-seeding/sources/pango_lineages.py b/collection-seeding/sources/pango_lineages.py index dabc28587..048d89b64 100644 --- a/collection-seeding/sources/pango_lineages.py +++ b/collection-seeding/sources/pango_lineages.py @@ -16,6 +16,8 @@ class PangoLineagesSource(Source): """ name = "covid-pango-lineages" + organism = "covid" + owned_tag = "#pango-lineage" def __init__(self, limit: int | None = None): self._limit = limit @@ -30,8 +32,7 @@ def get_collections(self) -> list[Collection]: print(f" Loaded {len(entries)} lineage(s).") return [self._build_collection(e) for e in entries] - @staticmethod - def _build_collection(entry: dict) -> Collection: + def _build_collection(self, entry: dict) -> Collection: lineage: str = entry["lineage"] parent: str = entry.get("parent") or "—" clade: str = entry.get("nextstrainClade") or "—" @@ -70,7 +71,7 @@ def _build_collection(entry: dict) -> Collection: f"Parent: {parent}. " f"Nextstrain clade: {clade}. " f"Designated: {date}. " - f"#pango-lineage" + f"{self.owned_tag}" ) return { diff --git a/collection-seeding/sources/resistance_mutations.py b/collection-seeding/sources/resistance_mutations.py index f783f60ef..30ca66501 100644 --- a/collection-seeding/sources/resistance_mutations.py +++ b/collection-seeding/sources/resistance_mutations.py @@ -10,6 +10,8 @@ class ResistanceMutationsSource(Source): """ name = "covid-resistance-mutations" + organism = "covid" + owned_tag = "#resistance-mutation" def get_collections(self) -> list[Collection]: return [ @@ -19,8 +21,7 @@ def get_collections(self) -> list[Collection]: "description": ( "SARS-CoV-2 3C-like protease (3CLpro/Mpro) inhibitor resistance mutations " "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024). " - "#resistance-mutation" + f"(last updated 21 August 2024). 
{self.owned_tag}" ), "variants": _build_variants(CLPRO_MUTATIONS, "3CLpro", -3263), }, @@ -30,8 +31,7 @@ def get_collections(self) -> list[Collection]: "description": ( "SARS-CoV-2 RNA-dependent RNA polymerase (RdRp) inhibitor resistance mutations " "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024). " - "#resistance-mutation" + f"(last updated 21 August 2024). {self.owned_tag}" ), "variants": _build_variants(RDRP_MUTATIONS, "RdRp", 9), }, @@ -41,8 +41,7 @@ def get_collections(self) -> list[Collection]: "description": ( "SARS-CoV-2 Spike monoclonal antibody (mAb) resistance mutations " "as per Stanford Coronavirus Antiviral & Resistance database " - "(last updated 21 August 2024). " - "#resistance-mutation" + f"(last updated 21 August 2024). {self.owned_tag}" ), "variants": _build_variants(SPIKE_MUTATIONS, "Spike", 0), }, diff --git a/collection-seeding/tests/mock_source.py b/collection-seeding/tests/mock_source.py index 28a16d734..05e9bc9a7 100644 --- a/collection-seeding/tests/mock_source.py +++ b/collection-seeding/tests/mock_source.py @@ -4,6 +4,8 @@ class MockSource(Source): name = "mock-source" + organism = "covid" + owned_tag = "#mock-tag" def __init__(self, collections: list[Collection]): self._collections = collections diff --git a/collection-seeding/tests/test_pango_lineages.py b/collection-seeding/tests/test_pango_lineages.py index 2576068fb..057301e30 100644 --- a/collection-seeding/tests/test_pango_lineages.py +++ b/collection-seeding/tests/test_pango_lineages.py @@ -47,13 +47,13 @@ def test_name(): def test_build_collection_basic(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) assert col["name"] == "BA.2" assert col["organism"] == "covid" def test_build_collection_description_format(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = 
PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) assert "BA.2" in col["description"] assert "BA" in col["description"] # parent assert "22C" in col["description"] # clade @@ -61,18 +61,18 @@ def test_build_collection_description_format(): def test_build_collection_missing_fields_use_defaults(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["XBB"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["XBB"]) assert "—" in col["description"] # parent and clade fallback assert "unknown" in col["description"] # date fallback def test_build_collection_always_four_variants(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) assert len(col["variants"]) == 4 def test_build_collection_variant_names(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) names = [v["name"] for v in col["variants"]] assert names == [ "Nucleotide substitutions", @@ -83,7 +83,7 @@ def test_build_collection_variant_names(): def test_build_collection_variant_filter_keys(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) variants = col["variants"] assert "nucleotideMutations" in variants[0]["filterObject"] assert "aminoAcidMutations" in variants[1]["filterObject"] @@ -92,7 +92,7 @@ def test_build_collection_variant_filter_keys(): def test_build_collection_variant_contents(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) variants = col["variants"] assert variants[0]["filterObject"]["nucleotideMutations"] == ["C241T", "A23403G"] assert variants[1]["filterObject"]["aminoAcidMutations"] == ["S:N501Y"] @@ -101,7 +101,7 @@ def test_build_collection_variant_contents(): def test_build_collection_filters_blank_subs(): - col = 
PangoLineagesSource._build_collection(SAMPLE_DATA["BA.2"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["BA.2"]) # nucSubstitutions has ["C241T", "A23403G", ""] — blank should be dropped nuc = col["variants"][0]["filterObject"]["nucleotideMutations"] assert "" not in nuc @@ -109,7 +109,7 @@ def test_build_collection_filters_blank_subs(): def test_build_collection_empty_lists_when_all_blanks(): - col = PangoLineagesSource._build_collection(SAMPLE_DATA["XBB"]) + col = PangoLineagesSource()._build_collection(SAMPLE_DATA["XBB"]) assert len(col["variants"]) == 4 for v in col["variants"]: lists = list(v["filterObject"].values()) diff --git a/collection-seeding/tests/test_seed.py b/collection-seeding/tests/test_seed.py index 60bd4181c..9a474dc73 100644 --- a/collection-seeding/tests/test_seed.py +++ b/collection-seeding/tests/test_seed.py @@ -37,33 +37,43 @@ def make_client(existing=None): def test_all_new_creates_all(): client = make_client(existing=[]) - created, updated = seed_source(client, MockSource(COLLECTIONS)) + created, updated, deleted = seed_source(client, MockSource(COLLECTIONS)) assert created == len(COLLECTIONS) assert updated == 0 + assert deleted == 0 assert client.create_collection.call_count == len(COLLECTIONS) client.update_collection.assert_not_called() +def existing_entry(id: int, name: str) -> dict: + return { + "id": id, + "name": name, + "description": f"A collection. 
{MockSource.owned_tag}", + } + + def test_all_existing_updates_all(): - existing = [{"id": i + 1, "name": c["name"]} for i, c in enumerate(COLLECTIONS)] + existing = [existing_entry(i + 1, c["name"]) for i, c in enumerate(COLLECTIONS)] client = make_client(existing=existing) - created, updated = seed_source(client, MockSource(COLLECTIONS)) + created, updated, deleted = seed_source(client, MockSource(COLLECTIONS)) assert created == 0 assert updated == len(COLLECTIONS) + assert deleted == 0 assert client.update_collection.call_count == len(COLLECTIONS) client.create_collection.assert_not_called() def test_mixed_creates_and_updates(): - existing = [{"id": 10, "name": COLLECTIONS[0]["name"]}] + existing = [existing_entry(10, COLLECTIONS[0]["name"])] client = make_client(existing=existing) - created, updated = seed_source(client, MockSource(COLLECTIONS)) + created, updated, deleted = seed_source(client, MockSource(COLLECTIONS)) assert created == len(COLLECTIONS) - 1 assert updated == 1 def test_update_uses_correct_id(): - existing = [{"id": 42, "name": COLLECTIONS[0]["name"]}] + existing = [existing_entry(42, COLLECTIONS[0]["name"])] client = make_client(existing=existing) seed_source(client, MockSource([COLLECTIONS[0]])) client.update_collection.assert_called_once_with(42, COLLECTIONS[0]) @@ -75,22 +85,65 @@ def test_create_passes_full_collection(): client.create_collection.assert_called_once_with(COLLECTIONS[0]) -def test_fetch_called_once_per_organism(): - multi = [ - {**COLLECTIONS[0], "organism": "covid"}, - {**COLLECTIONS[1], "organism": "mpox"}, - ] - client = make_client(existing=[]) - seed_source(client, MockSource(multi)) - assert client.fetch_existing_collections.call_count == 2 - organisms_fetched = { - c.args[0] for c in client.fetch_existing_collections.call_args_list - } - assert organisms_fetched == {"covid", "mpox"} - - def test_returns_zero_counts_for_empty_collections(): client = make_client(existing=[]) - created, updated = seed_source(client, 
MockSource([])) + created, updated, deleted = seed_source(client, MockSource([])) assert created == 0 assert updated == 0 + assert deleted == 0 + + +# --- seed_source: orphan deletion --- + +TAG = "#test-tag" + + +class TaggedMockSource(MockSource): + owned_tag = TAG + + +def tagged(name: str, description: str = "") -> dict: + return { + "name": name, + "organism": "covid", + "description": description or f"A collection. {TAG}", + "variants": [], + } + + +def test_orphan_with_tag_is_deleted(): + existing = [ + {"id": 5, "name": "OldLineage", "description": f"Old. {TAG}"}, + {"id": 6, "name": "CurrentLineage", "description": f"Current. {TAG}"}, + ] + client = make_client(existing=existing) + created, updated, deleted = seed_source( + client, TaggedMockSource([tagged("CurrentLineage")]) + ) + assert deleted == 1 + client.delete_collection.assert_called_once_with(5) + + +def test_orphan_without_tag_is_not_deleted(): + existing = [{"id": 5, "name": "ManualCollection", "description": "No tag here."}] + client = make_client(existing=existing) + created, updated, deleted = seed_source(client, TaggedMockSource([])) + assert deleted == 0 + client.delete_collection.assert_not_called() + + +def test_no_deletion_when_owned_tag_is_none(): + existing = [{"id": 5, "name": "OldLineage", "description": f"Old. 
{TAG}"}] + client = make_client(existing=existing) + created, updated, deleted = seed_source(client, MockSource([])) + assert deleted == 0 + client.delete_collection.assert_not_called() + + +def test_current_collections_are_not_deleted(): + col = tagged("ExistingLineage") + existing = [{"id": 5, "name": "ExistingLineage", "description": col["description"]}] + client = make_client(existing=existing) + created, updated, deleted = seed_source(client, TaggedMockSource([col])) + assert deleted == 0 + client.delete_collection.assert_not_called() From 35aab82112a585246f776156861a37409499ed75 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Fri, 29 May 2026 14:22:49 +0100 Subject: [PATCH 53/54] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- collection-seeding/seed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collection-seeding/seed.py b/collection-seeding/seed.py index 2567a59e5..c52140a55 100644 --- a/collection-seeding/seed.py +++ b/collection-seeding/seed.py @@ -132,7 +132,7 @@ def make_parser() -> argparse.ArgumentParser: "-u", "--url", default=os.environ.get("API_URL", "http://localhost:4321"), - help="API base URL (default: $API_URL or http://localhost:4321)", + help="Website base URL serving /api/* (default: $API_URL or http://localhost:4321)", ) parser.add_argument( "-k", From f50037db4a6b3a752e7a056c672f57160863d2db Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Fri, 29 May 2026 14:23:01 +0100 Subject: [PATCH 54/54] Update docs/arc42/09-architecture-decisions.md Co-authored-by: Fabian Engelniederhammer <92720311+fengelniederhammer@users.noreply.github.com> --- docs/arc42/09-architecture-decisions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/arc42/09-architecture-decisions.md b/docs/arc42/09-architecture-decisions.md index 2d6151a19..32e4c44f1 100644 --- a/docs/arc42/09-architecture-decisions.md +++ 
b/docs/arc42/09-architecture-decisions.md @@ -40,8 +40,8 @@ For example, we create collections for mutations that are relevant for vaccine r on online lists. Or we create collections based on canonical lineage definitions. Since we want others to also be able to easily generate collections, the code should also serve -as a kind of reference implementation on how one would generate collections. Therefore we want the -code to be completely independent from the rest of the codebase, and also understandable to +as a kind of reference implementation on how one would generate collections. Therefore, we want the +code to be completely independent of the rest of the codebase, and also understandable to bioinformatics researchers. ### Decision