diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
new file mode 100644
index 000000000..e6da75312
--- /dev/null
+++ b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,353 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no
+# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane
+# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each job lands on the SKU's
+# self-hosted runner and invokes that SKU's launch script — the same
+# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use.
+
+on:
+  push:
+    branches:
+      - collectivex
+    paths:
+      - 'experimental/CollectiveX/**'
+      - '.github/workflows/collectivex-experimental.yml'
+  workflow_dispatch:
+    inputs:
+      sku:
+        # Only SKUs with a matching launchers/launch_<prefix>.sh are offered —
+        # runner.name's prefix selects the script, so an SKU without one fails.
+        description: Self-hosted runner pool (must have a CollectiveX launcher)
+        type: choice
+        default: gb200
+        options: [gb200, b200-dgxc, b200-multinode, mi355x, h100-dgxc, h200, b300, gb300]
+      benchmark:
+        # mori runs only on mi355x; nccl/deepep/uccl/all + the collective benches on NVIDIA SKUs.
+        # offload/copy-engine/kv-cache are single-process memcpy-family collectives (family!=moe).
+        description: Which benchmark to run
+        type: choice
+        default: nccl
+        options: [nccl, deepep, deepep-hybrid, mori, uccl, nccl-ep, flashinfer, flashinfer-combine-fp8, flashinfer-combine-fp8-directcast, flashinfer-combine-nvfp4, nixl, mori-io, nccl-kv, mooncake, offload, copy-engine, kv-cache, rl-mesh, allreduce-fw, allreduce-fw-vllm, all]
+      ops:
+        description: NCCL ops (space-separated); blank = default set
+        type: string
+        default: ''
+      min_bytes:
+        description: nccl-tests min message size
+        type: string
+        default: '8'
+      max_bytes:
+        description: nccl-tests max message size
+        type: string
+        default: '8G'
+      nodes:
+        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
+        type: string
+        default: ''
+      phase:
+        # EP only. 'both' fans out to one job per phase (decode + prefill).
+        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
+        type: choice
+        default: both
+        options: [both, decode, prefill]
+      timing:
+        # Combined timing knobs "iters:trials:warmup" (GitHub caps workflow_dispatch at 25 inputs,
+        # so these share one). Blank = harness defaults (200:3:32). LOWER all three for the MoRI/
+        # MI355X large-T probe (e.g. "8:1:4"): MoRI wedges (unkillable D-state) under SUSTAINED
+        # collectives at T>=32; minimal iters/trials/warmup is the only way to reach >64 tok/rank.
+        description: 'Timing "iters:trials:warmup" (blank = 200:3:32; e.g. 8:1:4 for the MoRI large-T probe)'
+        type: string
+        default: ''
+      tokens_ladder:
+        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
+        type: string
+        default: ''
+      dispatch_dtype:
+        description: EP dispatch payload precision (fp8 scale-layout recipes + FlashInfer OCP-microscaling mxfp8/nvfp4)
+        type: choice
+        default: bf16
+        options: [bf16, fp8, fp8-pertoken, fp8-directcast, mxfp8, mxfp4, nvfp4]
+      mode:
+        # normal = high-throughput kernels (decode+prefill); ll = DeepEP low-latency
+        # (decode-shaped, fp8 cast in-kernel). LL is rejected on backends without it
+        # (MoRI) and aborts on fabrics that lack it (B300) — run only where supported.
+        description: EP kernel path — normal or low-latency (LL)
+        type: choice
+        default: normal
+        options: [normal, ll]
+      resource_mode:
+        # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples);
+        # tuned = each backend's own recommended/default launch config.
+        description: Comm resource regime
+        type: choice
+        default: normalized
+        options: [normalized, tuned, default]
+      contract:
+        # layout-and-dispatch-v1 = dispatch timing includes routing-layout gen (the only
+        # contract MoRI honors; use for cross-vendor).
+        # cached-layout-comm-only-v1 = layout hoisted out, pure-comm dispatch (DeepEP normal only).
+        # runtime-visible-v1 = serving-realistic boundary (DeepEP times fp8 cast + layout + comm +
+        # recv-dequant inside dispatch).
+        description: Measurement contract (timing boundary)
+        type: choice
+        default: layout-and-dispatch-v1
+        options: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+      routing:
+        # Routing distribution of the shared trace. uniform=realistic; balanced=load-equalized;
+        # zipf*=skewed; hotspot-*=one hot expert (static/moving); alternating-groups=toggling halves.
+        description: EP routing distribution
+        type: choice
+        default: uniform
+        options: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate,
+                  zipf-heavy, hotspot-single, hotspot-moving, alternating-groups]
+      eplb:
+        # EPLB = replicate hot experts + balanced-place (the remedy for skewed routing). A pure
+        # routing-trace transform; experts -> num_logical+redundant. Meaningful with zipf*.
+        description: Apply EPLB expert replication/placement
+        type: boolean
+        default: false
+      canonical:
+        # Consume a CANONICAL serialized workload (generated deterministically in-container) instead
+        # of seeded-runtime. A canonical-serialized run with full GHA provenance is publication
+        # 'official' — this is the switch that promotes a cohort past comparable-experimental.
+        description: Use canonical serialized workload (official-grade workload identity)
+        type: boolean
+        default: false
+      deepep_v2:
+        # Build DeepEP V2 (NCCL Gin backend) from source in-container, overriding the image's V1
+        # (1.2.1). Hopper(SM90)+Blackwell(SM100) only. Needs compute-node network + NCCL>=2.30.4.
+        description: Use DeepEP V2 kernels (build from source; NVIDIA SM90+ only)
+        type: boolean
+        default: false
+      activation_profile:
+        # Activation VALUE distribution of expert inputs. normal = headline; the others stress a
+        # future quantized combine (latency-neutral under bf16 — the expected null result).
+        description: Activation value profile
+        type: choice
+        default: normal
+        options: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+      placement:
+        # Rank->node/domain placement (locality). Single-node SKUs make these identical; meaningful
+        # on multi-domain SKUs (GB300 NVL72). packed=fill a domain first; striped=spread; adversarial.
+        description: Rank placement
+        type: choice
+        default: packed
+        options: [packed, striped, runtime-native, adversarial]
+      sm_fraction:
+        # normalized comm-resource fraction (DeepEP sm_fraction*SMs / MoRI ~*CUs). Sweep this with
+        # resource_mode=normalized to build the resource-Pareto (latency vs comm fraction). Blank =
+        # harness default 0.18.
+        description: Normalized comm-resource fraction (resource_mode=normalized)
+        type: string
+        default: ''
+      hidden:
+        # MoE hidden dim — set (with topk/experts) for model-derived workloads (ep-models-v1).
+        # Blank = ds-like-ref default 7168.
+        description: MoE hidden dim (model-derived workloads); blank = 7168
+        type: string
+        default: ''
+      topk:
+        description: MoE top-k (model-derived workloads); blank = 8
+        type: string
+        default: ''
+      experts:
+        description: MoE total experts (model-derived workloads); blank = 256
+        type: string
+        default: ''
+      routing_step:
+        # temporal snapshot index for hotspot-moving / alternating-groups (ep-temporal-v1).
+        description: Temporal routing step (hotspot-moving / alternating-groups)
+        type: string
+        default: ''
+      uneven_tokens:
+        # per-rank source-token allocation skew (ep-uneven-tokens-v1).
+        description: Uneven source-token allocation
+        type: choice
+        default: none
+        options: [none, linear, empty-rank]
+
+concurrency:
+  # Group per (SKU + FULL config): GitHub keeps only one running + one pending per group and
+  # cancels the rest, so a coarse per-SKU group made a fan-out of many configs on one SKU
+  # self-cancel down to ~2. Including dtype/mode/contract/routing/eplb/phase gives each config
+  # its OWN group -> all configs survive; they queue only on the runner's own capacity, not on
+  # GitHub concurrency. cancel-in-progress FALSE so a re-dispatch of the SAME config queues.
+  # The group includes the resource/value/placement axes (sm_fraction, resource_mode,
+  # activation_profile, placement) too — otherwise a Pareto sm-fraction sweep or an activation/
+  # placement sweep (same dtype/mode/contract/routing/phase) would self-cancel down to ~2 runs.
+  # For the same reason it also carries the workload-identity / build / ladder / timing / size
+  # inputs (canonical, deepep_v2, tokens_ladder, timing, ops, min_bytes, max_bytes): dispatches
+  # that differ only in one of those are different configs and must not share a group.
+  # On push every inputs.* is empty, so the push group is keyed by ref + event only.
+  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}-${{ inputs.benchmark }}-${{ inputs.dispatch_dtype }}-${{ inputs.mode }}-${{ inputs.contract }}-${{ inputs.routing }}-${{ inputs.eplb }}-${{ inputs.phase }}-${{ inputs.resource_mode }}-${{ inputs.sm_fraction }}-${{ inputs.activation_profile }}-${{ inputs.placement }}-${{ inputs.hidden }}-${{ inputs.topk }}-${{ inputs.experts }}-${{ inputs.routing_step }}-${{ inputs.uneven_tokens }}-${{ inputs.nodes }}-${{ inputs.canonical }}-${{ inputs.deepep_v2 }}-${{ inputs.tokens_ladder }}-${{ inputs.timing }}-${{ inputs.ops }}-${{ inputs.min_bytes }}-${{ inputs.max_bytes }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  # Push-triggered job: MI355X MoRI dispatch/combine. It lands on a free mi355x-amds
+  # runner and runs launch_mi355x-amds.sh (CX_BENCH=mori) — the launcher is picked from
+  # the runner-name prefix. No CX_STAGE_DIR is set because the AMD workspace is
+  # compute-visible; the launcher defaults to 8 GPUs.
+  experimental:
+    name: CollectiveX Experimental (${{ matrix.phase }})
+    if: ${{ github.event_name == 'push' }}
+    timeout-minutes: 90
+    runs-on: mi355x
+    strategy:
+      fail-fast: false
+      matrix:
+        # Push is only a fast MoRI SMOKE (decode); the full sweep goes through workflow_dispatch.
+        phase:
+          - decode
+    env:
+      CX_BENCH: mori
+      CX_PHASE: ${{ matrix.phase }}
+      # The smoke ladder stops at T<=16: MoRI with realistic (fan-out≈5.3) routing currently
+      # WEDGES at T>=32 (under investigation; DeepEP is fine), and an unguarded run hung
+      # ~1 h before the job timeout. Stay in the known-good range on push and run the full
+      # sweep via workflow_dispatch (timeout-guarded). Remove the cap once fixed.
+      CX_TOKENS_LADDER: '1 2 4 8 16'
+      CX_RUN_TIMEOUT: '600'
+      # Pinned to the MI355X nodes that hold the node-local squash and have a writable
+      # /var/lib/squash; other nodes need a slow cold import that can fail on lock/
+      # cache permissions. Widen once the squash is staged cluster-wide.
+      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with:
+          clean: true
+      - name: Launch MI355X MoRI (${{ matrix.phase }})
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        # ${RUNNER_NAME%%_*} keeps the runner name up to its first underscore.
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: ${{ always() }}
+        run: |
+          python3 experimental/CollectiveX/summarize.py \
+            --results-dir experimental/CollectiveX/results \
+            --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results
+        if: ${{ always() }}
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
+  dispatch:
+    if: github.event_name == 'workflow_dispatch'
+    # The bare `h200` label spans TWO clusters: 14 h200-dgxc runners (login-0; the EP
+    # path is validated there) and 2 h200-cw (CoreWeave) runners that have no
+    # launch_h200-cw.sh and die exit 127. Pin h200 to the h200-dgxc pool so every
+    # dispatch lands where the launcher + FS + partition are known-good. Other SKUs are
+    # single-pool, so pass the sku through unchanged.
+    runs-on: ${{ inputs.sku == 'h200' && 'h200-dgxc' || inputs.sku }}
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        # nccl/rccl are collective primitives — phase is meaningless, so run ONE job (not
+        # the same work twice). EP backends: 'both' -> decode + prefill; else a single job.
+        phase: ${{ fromJSON((inputs.benchmark == 'nccl' || inputs.benchmark == 'rccl') && '["na"]' || (inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase))) }}
+    env:
+      # flashinfer-combine-{fp8,nvfp4} = the flashinfer EP backend with a QUANTIZED COMBINE OUTPUT
+      # (MXFP8 e4m3+e8m0, or NVFP4 e2m1, via the flashinfer-main moe_a2a_combine output_dtype). Map to
+      # CX_BENCH=flashinfer + CX_COMBINE_DTYPE (run_flashinfer_suite builds flashinfer-main when
+      # CX_COMBINE_DTYPE!=bf16). Input-cap-safe (a benchmark CHOICE, not a new input).
+      CX_BENCH: ${{ startsWith(inputs.benchmark, 'flashinfer-combine') && 'flashinfer' || (inputs.benchmark == 'allreduce-fw-vllm' && 'allreduce-fw' || inputs.benchmark) }}
+      # allreduce-fw-vllm = the framework all-reduce bench in a vLLM container (container switch for
+      # the vLLM custom-AR, goal 215) — set CX_IMAGE to a vLLM cuda image; the launcher uses CX_IMAGE
+      # when non-empty, else cx_default_image. Input-cap-safe (a benchmark CHOICE).
+      CX_IMAGE: ${{ inputs.benchmark == 'allreduce-fw-vllm' && 'vllm/vllm-openai:latest' || '' }}
+      # startsWith catches both flashinfer-combine-fp8 and -fp8-directcast (both fp8 combine output;
+      # the -directcast variant differs only in CX_QC_SCALE=scalar below — a single output_scalar_scale,
+      # no per-block scales = the unscaled direct-cast fp8 combine).
+      CX_COMBINE_DTYPE: ${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'bf16') }}
+      CX_COMBINE_QUANT_MODE: ${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'none') }}
+      CX_QC_SCALE: ${{ inputs.benchmark == 'flashinfer-combine-fp8-directcast' && 'scalar' || '' }}
+      CX_OPS: ${{ inputs.ops }}
+      CX_MIN_BYTES: ${{ inputs.min_bytes }}
+      CX_MAX_BYTES: ${{ inputs.max_bytes }}
+      CX_NODES: ${{ inputs.nodes }}
+      CX_PHASE: ${{ matrix.phase }}
+      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
+      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
+      CX_MODE: ${{ inputs.mode }}
+      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
+      CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }}
+      CX_ROUTING: ${{ inputs.routing }}
+      CX_EPLB: ${{ inputs.eplb && '1' || '' }}
+      # canonical serialized workload (official-grade identity) + value/placement axes (goal P1/P2).
+      CX_CANONICAL: ${{ inputs.canonical && '1' || '' }}
+      CX_DEEPEP_V2: ${{ inputs.deepep_v2 && '1' || '' }}
+      CX_ACTIVATION_PROFILE: ${{ inputs.activation_profile }}
+      CX_PLACEMENT: ${{ inputs.placement }}
+      CX_SM_FRACTION: ${{ inputs.sm_fraction }}
+      # model-derived workload dims (blank = ds-like-ref defaults) + temporal/uneven axes.
+      CX_HIDDEN: ${{ inputs.hidden }}
+      CX_TOPK: ${{ inputs.topk }}
+      CX_EXPERTS: ${{ inputs.experts }}
+      CX_ROUTING_STEP: ${{ inputs.routing_step }}
+      CX_UNEVEN_TOKENS: ${{ inputs.uneven_tokens }}
+      CX_TIMING: ${{ inputs.timing }}
+      # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job) -> a GHA result
+      # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical).
+      COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+      COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
+      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      # MI355X: pin to the warm-squash, writable nodes (see the push job).
+      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      # Reject an unsupported backend/SKU/mode/dtype/contract BEFORE consuming the runner
+      # (review #3): fail fast on the login node, not after a salloc. 'all' fans out per
+      # vendor in-container, so skip the single-combo check for it.
+      # The arguments are read from the job-level env (CX_BENCH, CX_MODE, CX_DISPATCH_DTYPE,
+      # CX_MEASUREMENT_CONTRACT, CX_COMBINE_DTYPE, CX_COMBINE_QUANT_MODE) rather than
+      # re-deriving the same expressions here, so the combo that is validated is by construction
+      # the combo the launcher receives, and no ${{ }} value is spliced into the script text.
+      - name: Validate capability
+        if: inputs.benchmark != 'all'
+        env:
+          DISPATCH_SKU: ${{ inputs.sku }}
+        run: |
+          python3 experimental/CollectiveX/tests/capability.py \
+            --sku "$DISPATCH_SKU" \
+            --backend "$CX_BENCH" \
+            --mode "$CX_MODE" --dtype "$CX_DISPATCH_DTYPE" \
+            --contract "$CX_MEASUREMENT_CONTRACT" \
+            --combine-dtype "$CX_COMBINE_DTYPE" \
+            --combine-quant-mode "$CX_COMBINE_QUANT_MODE"
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  # Sends a repository_dispatch event (event_type update-collectivex-data) carrying this
+  # run's id to SemiAnalysisAI/InferenceX-app once the benchmark job for the triggering event
+  # has succeeded.
+  update-frontend-snapshot:
+    name: Update InferenceX-app snapshot
+    needs: [experimental, dispatch]
+    # always() is required: exactly one of the two needed jobs is skipped on every run (each is
+    # gated on a different event), and a skipped dependency would otherwise skip this job too.
+    if: >-
+      always() &&
+      (
+        (github.event_name == 'push' && needs.experimental.result == 'success') ||
+        (github.event_name == 'workflow_dispatch' && needs.dispatch.result == 'success')
+      )
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger CollectiveX snapshot update
+        env:
+          FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }}
+        run: |
+          set -euo pipefail
+          # --fail-with-body (instead of -f) still exits non-zero on an HTTP error but prints the
+          # API's JSON error message, so a bad/expired PAT or a wrong repo is diagnosable from the log.
+          curl -sS --fail-with-body -X POST \
+            -H "Authorization: Bearer $FRONTEND_PAT" \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
+            -d '{
+              "event_type": "update-collectivex-data",
+              "client_payload": {
+                "source_run_id": "${{ github.run_id }}"
+              }
+            }'
diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml
new file mode 100644
index 000000000..76a91b4ad
--- /dev/null
+++ b/.github/workflows/collectivex-sweep.yml
@@ -0,0 +1,215 @@
+# CollectiveX Sweep — one structured run instead of thousands of dispatches.
+#
+# Shape (mirrors the InferenceX CI tracker): setup -> sweep (a MATRIX job = "a job with other jobs
+# in it") -> aggregate (the collector "at the end"). The matrix unit is a SHARD = one allocation that
+# sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked
+# so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job
+# collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't
+# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer /
+# deepep-hybrid / nccl-ep, + deepep_v2) for full parity.
+name: CollectiveX Sweep
+on:
+  workflow_dispatch:
+    inputs:
+      backend:
+        description: "EP library to sweep — 'all' = every backend in ONE combined matrix run (recommended)"
+        type: choice
+        default: all
+        options: [all, deepep, uccl, flashinfer, deepep-hybrid, nccl-ep]
+      deepep_v2:
+        description: DeepEP V2 from-source kernels (kernel_gen=v2; only for a single-backend deepep run — 'all' already includes a deepep-v2 variant)
+        type: boolean
+        default: false
+      suites:
+        description: "'all' or comma-list of suite names"
+        type: string
+        default: all
+      only_sku:
+        description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all
+        type: string
+        default: ''
+      min_nodes:
+        description: Keep only shards with >= this tray count (2 = rack-scale EP8 only; blank = all)
+        type: string
+        default: ''
+      max_nodes:
+        description: Keep only shards with <= this tray count (1 = single-tray EP4 only; blank = all)
+        type: string
+        default: ''
+      max_cases:
+        description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites)
+        type: string
+        default: '128'
+      flashinfer_upgrade:
+        description: Upgrade FlashInfer to the newer (MNNVL-fixed) wheel for plain flashinfer runs too (fixes h100 completion-flag deadlock)
+        type: boolean
+        default: false
+
+concurrency:
+  # One group per (ref, backend, deepep_v2, only_sku); a re-dispatch of the same key queues
+  # behind the running sweep instead of cancelling it.
+  group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }}
+  cancel-in-progress: false
+
+# Least privilege for GITHUB_TOKEN, matching collectivex-experimental.yml: the jobs only need to
+# check out this repository; the frontend update authenticates with its own PAT secret.
+permissions:
+  contents: read
+
+jobs:
+  # ---- setup: resolve the suites into the shard matrix (the "pending jobs" node) ----
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.gen.outputs.matrix }}
+      n: ${{ steps.gen.outputs.n }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - run: pip install --quiet pyyaml
+      - id: gen
+        working-directory: experimental/CollectiveX
+        # Dispatch inputs reach the script through env vars instead of being spliced into the
+        # script text, so a free-form value (suites / only_sku / min_nodes / max_nodes / max_cases)
+        # containing quotes, `;` or `$(...)` cannot inject shell. Arguments are collected in a bash
+        # array so each value becomes exactly one argv entry (no word-splitting or globbing).
+        env:
+          IN_BACKEND: ${{ inputs.backend }}
+          IN_DEEPEP_V2: ${{ inputs.deepep_v2 }}
+          IN_SUITES: ${{ inputs.suites }}
+          IN_ONLY_SKU: ${{ inputs.only_sku }}
+          IN_MIN_NODES: ${{ inputs.min_nodes }}
+          IN_MAX_NODES: ${{ inputs.max_nodes }}
+          IN_MAX_CASES: ${{ inputs.max_cases }}
+        run: |
+          set -euo pipefail
+          args=(--suites "$IN_SUITES" --max-cases "$IN_MAX_CASES")
+          # backend='all' or a comma-list -> ONE combined multi-backend matrix; else a single
+          # backend. 'deepep' passes no backend flag at all.
+          case "$IN_BACKEND" in
+            all|*,*) args+=(--backends "$IN_BACKEND") ;;
+            deepep)  ;;
+            *)       args+=(--backend "$IN_BACKEND") ;;
+          esac
+          # Optional filters: only forwarded when the corresponding input is set.
+          if [ "$IN_DEEPEP_V2" = "true" ]; then args+=(--deepep-v2); fi
+          if [ -n "$IN_ONLY_SKU" ]; then args+=(--only-sku "$IN_ONLY_SKU"); fi
+          if [ -n "$IN_MIN_NODES" ]; then args+=(--min-nodes "$IN_MIN_NODES"); fi
+          if [ -n "$IN_MAX_NODES" ]; then args+=(--max-nodes "$IN_MAX_NODES"); fi
+          # full matrix (with cases) -> artifact for the cells; slim (no cases) -> the strategy output.
+          python3 sweep_matrix.py "${args[@]}" --out matrix_full.json >/dev/null
+          SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))")
+          echo "matrix=$SLIM" >> "$GITHUB_OUTPUT"
+          echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT"
+          python3 -c "import json;m=json.load(open('matrix_full.json'));print('shard-cells:',len(m['include']),'cases:',sum(x['n'] for x in m['include']))"
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: cxsweep-matrix-${{ github.run_id }}
+          path: experimental/CollectiveX/matrix_full.json
+          if-no-files-found: error
+
+  # ---- sweep: ONE matrix cell per shard (the parent job with child jobs) ----
+  sweep:
+    needs: setup
+    if: ${{ fromJSON(needs.setup.outputs.n) > 0 }}
+    strategy:
+      fail-fast: false
+      max-parallel: 10            # don't saturate the ~20-runner fleet; cells queue as slots free
+      matrix: ${{ fromJSON(needs.setup.outputs.matrix) }}
+    # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental).
+    runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }}
+    timeout-minutes: 350
+    env:
+      CX_BENCH: ${{ matrix.backend }}
+      CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }}
+      CX_NODES: ${{ matrix.nodes }}
+      CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json
+      COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+      # Consolidated shards run a whole build-group (up to ~74 cases) + one from-source build in ONE
+      # slurm allocation, so the launcher's default 45-min --time is too short. 120 min gives headroom;
+      # the allocation releases early when the shard finishes, so short shards don't waste it.
+      CX_TIME: '120'
+      CX_FLASHINFER_UPGRADE: ${{ inputs.flashinfer_upgrade && '1' || '' }}
+      CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
+      CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          name: cxsweep-matrix-${{ github.run_id }}
+          path: experimental/CollectiveX
+      # Writes this cell's entry of matrix_full.json (including its cases) to the file named by the
+      # job-level CX_SHARD_FILE, which is relative to this step's working directory.
+      - name: Extract this shard's cases (stdlib only — no runner deps)
+        working-directory: experimental/CollectiveX
+        env:
+          SHARD_ID: ${{ matrix.id }}
+        run: |
+          set -euo pipefail
+          python3 - <<'PY'
+          import json
+          import os
+
+          # The id and output path come from the environment, so no ${{ }} value is spliced into
+          # Python source and the path cannot drift from the CX_SHARD_FILE the job exports.
+          shard_id = os.environ['SHARD_ID']
+          out_path = os.environ['CX_SHARD_FILE']
+
+          with open('matrix_full.json') as fh:
+              matrix = json.load(fh)
+          matches = [x for x in matrix['include'] if str(x['id']) == shard_id]
+          assert matches, f'shard {shard_id} not in matrix'
+          shard = matches[0]
+
+          # results/*.json is git-ignored, so the directory may be absent after a clean checkout.
+          os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
+          keys = ('id', 'sku', 'backend', 'nodes', 'deepep_v2', 'cases')
+          with open(out_path, 'w') as fh:
+              json.dump({k: shard[k] for k in keys}, fh)
+          print(f'shard {shard_id}:', len(shard['cases']), 'cases')
+          PY
+      - name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation)
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Shard summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true
+      - name: Upload shard results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: cxshard-${{ matrix.id }}-${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json   # glob skips the hidden .shard_*.json
+          if-no-files-found: warn
+
+  # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ----
+  aggregate:
+    needs: sweep
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          pattern: cxshard-*-${{ github.run_id }}
+          path: _shards
+          merge-multiple: true
+      - name: Aggregate shards -> one ndjson
+        working-directory: experimental/CollectiveX
+        run: |
+          set -euo pipefail
+          tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}"
+          python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson"
+          {
+            echo "## CollectiveX sweep aggregate (${tag})"
+            echo '```'
+            wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload aggregate
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }}
+          path: experimental/CollectiveX/results/aggregate/*.ndjson
+          if-no-files-found: warn
+
+  # After a successful aggregate, commit a source-run marker file
+  # (.github/collectivex-source-run.env) to the collectivex branch of InferenceX-app.
+  update-frontend-snapshot:
+    name: Update InferenceX-app snapshot
+    needs: aggregate
+    if: always() && needs.aggregate.result == 'success'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger CollectiveX snapshot update
+        env:
+          FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }}
+          SOURCE_RUN_ID: ${{ github.run_id }}
+          SOURCE_SHA: ${{ github.sha }}
+          SOURCE_WORKFLOW: ${{ github.workflow }}
+        run: |
+          set -euo pipefail
+          tmp="$(mktemp -d)"
+          trap 'rm -rf "$tmp"' EXIT
+          git clone --quiet --depth 1 --branch collectivex \
+            "https://x-access-token:${FRONTEND_PAT}@github.com/SemiAnalysisAI/InferenceX-app.git" \
+            "$tmp/app"
+          cd "$tmp/app"
+          git config user.name "InferenceX Data Bot"
+          git config user.email "actions@users.noreply.github.com"
+
+          # Sweeps for different backends use different concurrency groups, so two runs can reach
+          # this step together and the slower push is rejected as non-fast-forward. Both runs
+          # rewrite the same file, so a rebase would conflict; instead each attempt re-syncs to the
+          # remote tip, rewrites the marker and pushes again.
+          for attempt in 1 2 3; do
+            git fetch --quiet --depth 1 origin collectivex
+            git reset --quiet --hard FETCH_HEAD
+            mkdir -p .github
+            {
+              echo "source_run_id=${SOURCE_RUN_ID}"
+              echo "source_sha=${SOURCE_SHA}"
+              echo "source_workflow=${SOURCE_WORKFLOW}"
+              echo "source_run_url=https://github.com/SemiAnalysisAI/InferenceX/actions/runs/${SOURCE_RUN_ID}"
+              echo "triggered_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+            } > .github/collectivex-source-run.env
+
+            git add .github/collectivex-source-run.env
+            if git diff --cached --quiet; then
+              echo "CollectiveX source-run marker is already current."
+              exit 0
+            fi
+            git commit --quiet -m "chore: trigger CollectiveX data update for ${SOURCE_RUN_ID}"
+            if git push origin HEAD:collectivex; then
+              exit 0
+            fi
+            echo "push attempt ${attempt} was rejected; re-syncing and retrying" >&2
+            sleep $((attempt * 5))
+          done
+          echo "::error::could not push the CollectiveX source-run marker after 3 attempts"
+          exit 1
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
new file mode 100644
index 000000000..e30004ffc
--- /dev/null
+++ b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,22 @@
+# in-container nccl-tests build cache
+.nccl-tests/
+# python
+__pycache__/
+*.pyc
+# generated run artifacts: captured env embeds hostnames / GPU UUIDs / NIC GUIDs,
+# so keep results out of git (CI uploads them as workflow artifacts instead).
+# Sanitized headline numbers live in CONTAINERS.md.
+results/*.json
+results/plots/
+results/raw_*.txt
+results/raw_*.txt.stderr
+# superseded SSH-provenance result JSONs moved aside so plot_ep's recursive glob
+# won't double-load them; same hostname/UUID sensitivity as results/.
+_ssh_v4_archive/
+# running local-only reflection log (not a committed artifact)
+notes.md
+goal.md
+# superseded seeded-runtime GHA results (canonical counterpart exists); kept out of the plot glob
+_seeded_archive/
+# newest-good-per-config kept in results/; superseded runs moved here (out of the plot glob)
+_superseded/
diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
new file mode 100644
index 000000000..8a8bbf56e
--- /dev/null
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -0,0 +1,75 @@
+# CollectiveX — container & library versions
+
+One **multi-arch, digest-pinned** container is used for all NVIDIA SKUs, so B200
+(x86_64) and GB200 (aarch64) share a single reference and the cross-vendor
+comparison is truly same-image. Set in `runtime/common.sh` (`cx_default_image`).
+
+## Default container (all NVIDIA SKUs)
+
+- **Image:** import by tag **`lmsysorg/sglang:v0.5.11-cu130`** (multi-arch OCI index). Expected index digest, recorded for provenance/verification: `sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975`.
+- **Multi-arch manifest list:** linux/amd64 + linux/arm64; `enroot import` on each host pulls the matching arch.
+- **Import by TAG, not digest.** enroot builds its anonymous Docker Hub token scope from the *tag* and succeeds (no creds needed — same as the serving launchers). A bare `repo@sha256:` ref makes enroot prompt for a password and **hang** in non-interactive CI; a combined `tag@sha256:` ref 400s. `cx_ensure_squash` therefore imports by tag with `</dev/null` (a missing token fails fast instead of hanging). First import is multi-GB (~minutes); subsequent runs reuse the staged squash.
+- **Why v0.5.11-cu130 (chosen):** it's the newest cu130 release **pre-staged on BOTH clusters** — B200 `/home/sa-shared/containers/` (amd64 squash) and GB200 `/mnt/lustre01/users-public/sa-shared/` (arm64 squash), same filename — so neither side imports at all. (Shared cu130 multi-arch squashes across both clusters: v0.5.8.post1, v0.5.9, v0.5.11 — v0.5.11 is newest.) `v0.5.12-cu130` is staged on B200 but **not** GB200: its 62 layers overflow enroot's overlay-based squash creation on the GB200 kernel (`enroot-mksquashovlfs: failed to mount overlay … Invalid argument`), so it can't be the shared default.
+- **DeepEP: NOT bundled** here → `run_in_container.sh` builds it via `rebuild-deepep` at job setup (CX_BENCH=deepep). The NCCL path needs no DeepEP.
+- **nccl-tests build:** in-container (login nodes have no `nvcc`), `CX_NCCL_HOME=/usr` (system `nccl.h` in `/usr/include`), `CX_CUDA_HOME=/usr/local/cuda`. cu130 lineage ⇒ CUDA 13; confirm exact NCCL/torch on first run and append below.
+
+## Audited reference (cu130 lineage)
+
+Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.11-cu130` should match closely (same cu130 base); reconfirm on first run:
+
+| Component | Version |
+|---|---|
+| OS / arch | Ubuntu 24.04.3, aarch64 |
+| CUDA (`nvcc`) | 13.0 (V13.0.88) |
+| NCCL (system `/usr/include/nccl.h`) | 2.28.3; torch-bundled 2.27.7 |
+| PyTorch | 2.9.1+cu130 |
+| DeepEP | bundled in *that* image; **not** in the multi-arch default |
+| NVSHMEM | `libnvshmem_host.so.3` present |
+| OpenMPI / gcc / make | 4.1.6 / 13.3.0 / 4.3 |
+| GPU / driver | GB200, 580.126.20 |
+
+**Version caveat:** the nccl-tests binary links **system NCCL** (2.28.x), while torch/DeepEP use the **bundled** NCCL (2.27.x). Record both in provenance (env_capture does); don't compare an nccl-tests curve against a DeepEP run as if NCCL were identical.
+
+## Bundled-DeepEP reference images (not the default)
+
+If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch image, these arch-specific images bundle it (pin by digest):
+
+- B200 (amd64): `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b` (pre-staged on B200)
+- GB200 (arm64): `lmsysorg/sglang:deepseek-v4-grace-blackwell@sha256:4f583347d7ff08aef7e16dbb4985b2a7c147ff49a0c261d5e27b8f5f41719368` (staged on GB200 Lustre)
+
+Select via `CX_IMAGE=…@sha256:…` on the launch script.
+
+## AMD container (MI355X) — MoRI EP
+
+AMD CDNA4 cannot run the CUDA multi-arch image; MI355X uses a ROCm image that
+bundles **MoRI** (AMD's EP dispatch/combine library). Set in `cx_default_image`
+for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
+
+- **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image.
+- **MoRI:** bundled in-image (build tag `mori-0227`). `tests/ep_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
+- **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
+- **Transport:** intra-node **XGMI** (8× MI355X). Two backends wired: `CX_BENCH=mori` (MoRI EP dispatch/combine) and `CX_BENCH=nccl` (collective primitives via **rccl-tests**, the ROCm nccl-tests fork — built in-container with `make` against `/opt/rocm`/`amdclang++`/`librccl`; same `<op>_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged).
+- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `tests/ep_mori.py`:
+  - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here.
+  - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now.
+  - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `tests/ep_mori.py`'s `finalize()` hard-exits after writing results to avoid it.
+
+  Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image.
+
+## Cluster access / QOS
+
+- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account.
+- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there).
+
+## First real results (Milestone-0 spike, on the DeepSeek-V4 images)
+
+nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw:
+
+| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) |
+|---|---|---|
+| all_reduce | 835 GB/s | 689 GB/s |
+| all_gather | 653 | 658 |
+| reduce_scatter | 667 | 661 |
+| alltoall | 638 | 666 |
+
+(B200 vs GB200 carry distinct `comparison_key`s by topology-class, so they are labelled-distinct, not silently merged. Re-run on the multi-arch default to refresh under one image.)
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
new file mode 100644
index 000000000..580a0399c
--- /dev/null
+++ b/experimental/CollectiveX/README.md
@@ -0,0 +1,128 @@
+# CollectiveX
+
+Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch
+adapters** (InferenceX-style `launch_<sku>.sh`) run **any benchmark** — selected
+by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions
+workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline
+already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL).
+
+> Experimental: WIP, not an official InferenceMAX result. All logic stays under
+> `experimental/CollectiveX/`; the only file outside is the orchestration-only
+> workflow.
+
+## Files
+
+| File | Role |
+|---|---|
+| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) |
+| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) |
+| `tests/run_ep.py` | EP dispatch/combine entrypoint (torchrun): source-tokens-per-rank sweep, dispatch & combine timed **separately** |
+| `tests/ep_harness.py` | shared EP harness: token ladder, separated timing, correctness gate, doc emission (stdlib top) |
+| `tests/ep_deepep.py`, `tests/ep_mori.py` | per-backend adapters (DeepEP / MoRI) implementing the harness protocol |
+| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
+| `runtime/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
+| `runtime/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) over `CX_PHASE` |
+| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) |
+| `CONTAINERS.md` | the pinned multi-arch container + audited library versions |
+| `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
+| `tests/fixtures/` | captured nccl-tests output for offline parser checks |
+
+## Run
+
+### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`)
+
+- **push** to the `collectivex` branch touching `experimental/CollectiveX/**` (or the workflow
+  file) → the **MI355X MoRI** EP dispatch/combine sweep, **one job per phase** (decode +
+  prefill) via a matrix (lands on free `mi355x-amds` runners).
+- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode /
+  mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl`
+  on MI355X runs rccl-tests), `phase` (decode / prefill / **both** → a job each),
+  `tokens_ladder`, `dispatch_dtype`, ops, sizes, ngpus. Lands on that SKU's
+  self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. For EP results
+  across all SKUs, dispatch once per `sku` with `phase=both`.
+
+Each job renders a results table to the **GitHub Actions job summary** (via
+`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs
+as an artifact. (The workflow only fires once the branch is pushed to GitHub.)
+
+### Directly on a cluster login node
+
+```bash
+# benchmark is selected by CX_BENCH (default nccl)
+bash experimental/CollectiveX/launchers/launch_gb200-nv.sh                 # GB200, NCCL primitives
+CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild)
+bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh               # B200 8× NVLink
+bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh         # B200 2-node, cross-IB
+bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh                # MI355X 8× XGMI, MoRI EP (CX_BENCH=mori, default)
+CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh   # MI355X primitives via rccl-tests
+```
+
+Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
+`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible
+staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate
+nothing). EP (deepep/mori) adds `CX_PHASE` (decode|prefill|both), `CX_TOKENS_LADDER`
+(e.g. `"1 2 4 8 16 32 64 128"`), `CX_HIDDEN`/`CX_TOPK`/`CX_EXPERTS`,
+`CX_DISPATCH_DTYPE`, `CX_NUM_EP_GROUPS`. Results land in `experimental/CollectiveX/results/`.
+
+### Offline (no GPU) — verify the parser/JSON pipeline
+
+```bash
+python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \
+  --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json
+python3 env_capture.py            # prints a (degraded, off-GPU) env record
+python3 plot.py --results-dir results --out-dir results/plots   # needs matplotlib
+```
+
+## Container
+
+One **multi-arch** image for all NVIDIA SKUs, imported by tag
+`lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…`
+recorded for provenance). Imported by tag, not digest — enroot's anonymous
+Docker Hub auth needs a tag, and a bare digest ref hangs in CI. See
+`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the bundled-DeepEP
+DeepSeek-V4 fallback images.
+
+## How it runs (confirmed against the live clusters)
+
+- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if
+  missing) → `srun --container-image=… --container-mounts=<repo>:/ix` → in-container
+  `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account
+  `benchmark`.
+- **AMD MI355X** (`launch_mi355x-amds.sh`, MoRI / `CX_BENCH=mori`) diverges: partition
+  `compute`, no account, pyxis `--container-writable --container-remap-root`, and a
+  **node-local** squash (`/var/lib/squash`) imported via `srun` on the allocated node
+  (not the login node). Workspace is bind-mounted directly (no `CX_STAGE_DIR`).
+- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in
+  `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node
+  adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`).
+- The sglang image installs editable under `/workspace`, so the repo is mounted at
+  **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR`
+  rsyncs the tree to Lustre first.
+- Every result embeds an `env_capture` record and a `comparison_key`; topology
+  class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled
+  distinct, never silently overlaid.
+
+## Status & known risks
+
+- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed)
+  — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default;
+  validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9).
+- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds
+  it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
+  `tests/ep_deepep.py` follows the documented normal-mode API — validate against
+  the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
+- **MoRI / MI355X** (`tests/ep_mori.py` + `launch_mi355x-amds.sh`) is **validated on
+  hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip).
+  It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer`
+  zero-copy path, `expected = input × #unique-destination-ranks`). Three
+  ionic_rdma-fabric constraints are baked in (see `CONTAINERS.md`): a 2 GiB heap
+  (the NICs cap RDMA MRs at ~4 GiB), a bounded `max_num_inp_token_per_rank`, and a
+  hard-exit past MoRI's buggy shmem teardown. The ROCm image isn't digest-pinned yet.
+- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a
+  compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container
+  or srt-slurm. CX_BENCH=nccl only for now.
+- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep
+  partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open.
+
+Once the multi-arch image is validated end-to-end, freeze the schema from the
+artifacts (plan: "Freeze the contract").
diff --git a/experimental/CollectiveX/aggregate_results.py b/experimental/CollectiveX/aggregate_results.py
new file mode 100644
index 000000000..3771d17c0
--- /dev/null
+++ b/experimental/CollectiveX/aggregate_results.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""CollectiveX — result aggregator (the end-of-sweep collector).
+
+The sweep workflow (collectivex-sweep.yml) fans out one matrix CELL per SHARD
+(platform × backend × mode × resource), each cell sweeping its cases in a single
+allocation and emitting a handful of per-case result JSONs. Instead of leaving
+thousands of individual files scattered across the repo, this aggregator COLLECTS
+every shard's results into ONE compact line-delimited file:
+
+    results/aggregate/collectivex_ep.ndjson     # one result doc per line
+
+That single artifact is the deliverable the plotter + the app read; the per-case
+JSONs stay inside the run as transient shard intermediates. Within a shard, a
+config that was re-run keeps only its NEWEST usable docs (up to --keep-per-key, default 3;
+newest generated_at with publication_status/status in official|comparable-experimental|valid), with
+genuinely-failed configs preserved when they have no usable counterpart — the same
+hygiene prune_results.py applies, folded into the merge so the aggregate is already
+canonical.
+
+  python3 aggregate_results.py --in-dir <shards_root> --out results/aggregate/collectivex_ep.ndjson
+  python3 aggregate_results.py --in-dir results --explode results   # ndjson -> per-doc (for the plotter)
+
+Stdlib only.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+
+USABLE = {"official", "comparable-experimental", "valid"}
+
+
+def _key(d: dict) -> str:
+    """Config identity for newest-per-config grouping (mirrors prune_results._doc_key)."""
+    top = d.get("comparison_key")
+    if top:
+        return str(top)
+    group_keys = sorted(str(g["comparison_key"]) for g in d.get("groups", []) if g.get("comparison_key"))
+    if group_keys:
+        return "|".join(group_keys)
+    return "|".join(str(d.get(f, "")) for f in ("family", "runner", "backend", "phase", "measurement_contract"))
+
+
+def _usable(d: dict) -> bool:
+    return (d.get("publication_status") or d.get("status")) in USABLE  # falsy publication_status -> use status
+
+
+def _iter_docs(in_dir: str):
+    """Yield (source_path, doc) for every result doc under in_dir: per-file *.json and
+    line-delimited *.ndjson (so aggregates can be re-merged idempotently). Files named env_* or
+    analysis.json are skipped; unreadable/unparseable input and non-object JSON are ignored."""
+    for root, _dirs, files in os.walk(in_dir):
+        for f in files:
+            if f.startswith("env_") or f == "analysis.json" or not f.endswith((".json", ".ndjson")):
+                continue
+            p = os.path.join(root, f)
+            try:
+                with open(p) as fh:          # context manager: no handle leaked per file walked
+                    texts = [ln for ln in fh if ln.strip()] if f.endswith(".ndjson") else [fh.read()]
+            except (OSError, UnicodeDecodeError):
+                continue
+            for text in texts:
+                try:
+                    doc = json.loads(text)
+                except (ValueError, RecursionError):   # best-effort: skip a malformed line/file
+                    continue
+                if isinstance(doc, dict):        # callers use doc.get(); drop lists/scalars
+                    yield p, doc
+
+
+def aggregate(in_dir: str, keep_per_key: int = 3) -> list:
+    """Collect every result doc under in_dir, grouped by _key(); per config keep the newest
+    keep_per_key usable docs, or — if none is usable — only the newest record (failed case)."""
+    def _stamp(d: dict) -> str:
+        return str(d.get("generated_at") or "")    # null/missing timestamps sort oldest, never raise
+
+    groups: dict = {}
+    for _src, d in _iter_docs(in_dir):
+        groups.setdefault(_key(d), []).append(d)
+    out = []
+    for docs in groups.values():
+        newest_first = sorted(docs, key=_stamp, reverse=True)   # stable: ties keep walk order
+        usable = [d for d in newest_first if _usable(d)]
+        out.extend(usable[:keep_per_key] if usable else newest_first[:1])
+    return out
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX result aggregator")
+    ap.add_argument("--in-dir", default="results", help="root to walk for shard result files")
+    ap.add_argument("--out", default="results/aggregate/collectivex_ep.ndjson")
+    ap.add_argument("--keep-per-key", type=int, default=3)
+    ap.add_argument("--explode", metavar="DIR",
+                    help="instead of merging, write each ndjson doc in --in-dir back to a per-doc "
+                         "JSON under DIR (so the existing plotter glob can read an aggregate)")
+    a = ap.parse_args()
+
+    if a.explode:  # explode mode: write one JSON file per doc yielded by _iter_docs, then exit
+        os.makedirs(a.explode, exist_ok=True)
+        n = 0
+        for _src, d in _iter_docs(a.in_dir):
+            name = (d.get("artifact_name") or  # fallback name: runner_backend-or-op_phase_generated_at
+                    f"{d.get('runner','x')}_{d.get('backend',d.get('op','x'))}_"
+                    f"{d.get('phase','na')}_{d.get('generated_at','')}".replace(":", "-"))  # ':' -> '-' in the fallback only
+            with open(os.path.join(a.explode, f"{name}.json"), "w") as fh:  # mode "w": an equal name overwrites
+                json.dump(d, fh)
+            n += 1
+        print(f"explode: wrote {n} per-doc JSON to {a.explode}")
+        return 0
+
+    docs = aggregate(a.in_dir, a.keep_per_key)  # merge mode (default)
+    os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)  # "." when --out has no directory part
+    with open(a.out, "w") as fh:
+        for d in docs:
+            fh.write(json.dumps(d, separators=(",", ":")) + "\n")  # compact JSON, one doc per line
+    skus = sorted({str(d.get("runner", "?")).split("_")[0].split("-")[0] for d in docs})  # runner text before first '_' then '-'
+    backs = sorted({str(d.get("backend") or d.get("op") or "?") for d in docs})
+    print(f"aggregate: {len(docs)} docs -> {a.out}  (SKUs={skus} backends={backs})")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/analyze_ep.py b/experimental/CollectiveX/analyze_ep.py
new file mode 100644
index 000000000..e53497087
--- /dev/null
+++ b/experimental/CollectiveX/analyze_ep.py
@@ -0,0 +1,435 @@
+#!/usr/bin/env python3
+"""CollectiveX operating-envelope analysis (goal Part 2 'operating-envelope outputs' + Part 3
+'regression/decision outputs'). Post-processes result JSONs (v3 flat or v4 nested) into the
+decision-facing summaries, comparing ONLY matching (workload, topology, contract, backend,
+resource) cells:
+
+  routing-skew penalty     zipf* vs matched uniform — p50/p99 dispatch amplification
+  LL-to-normal crossover   token count where normal becomes faster than LL (p50 and p99)
+  topology penalty         EP4 vs EP8 (and placement, when present) latency penalty
+  strong/weak scaling      fixed-global-tokens and fixed-tokens/rank efficiency across EP
+  resource marginal eff.   Δlatency per Δcomm-fraction (needs a resource ladder; reports n/a otherwise)
+  pareto + recommendations lowest-latency / lowest-resource configs per (sku, phase)
+
+Pure stdlib; reads the same JSONs the plotter does. Honest about missing cells (prints n/a with
+the reason) rather than inventing comparisons.
+
+  python3 analyze_ep.py --results-dir results --out analysis.json
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+
+def _p(r, op, pct):
+    """Look up one latency percentile: r[op][pct] (v4 nested) else r["<op>_us_<pct>"] (v3 flat)."""
+    nested = r.get(op)
+    flat_name = f"{op}_us_{pct}"
+    return nested.get(pct) if isinstance(nested, dict) else r.get(flat_name)
+
+
+def load(results_dir):
+    """Load MoE result docs under results_dir (recursive *.json, env_* skipped) into flat series
+    dicts keyed for matching; unreadable, non-object, non-MoE and row-less files are skipped."""
+    series = []
+    for f in sorted(glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)):
+        if os.path.basename(f).startswith("env_"):
+            continue
+        try:
+            with open(f) as fh:                     # close the handle even when parsing fails
+                d = json.load(fh)
+        except (ValueError, OSError):               # malformed JSON / bad encoding / unreadable file
+            continue
+        if not isinstance(d, dict) or d.get("family") != "moe" or not d.get("rows"):
+            continue                                # non-object JSON, non-MoE family, or no rows
+        sh, v, rp = d.get("shape") or {}, d.get("validity") or {}, d.get("resource_profile") or {}
+        series.append({
+            "sku": (d.get("runner") or "?").split("_")[0].split("-")[0],
+            "ep": d.get("ep_size"), "phase": d.get("phase"), "mode": d.get("mode", "normal"),
+            "dtype": sh.get("dispatch_dtype"), "contract": d.get("measurement_contract"),
+            "routing": (sh.get("routing") or "?") + ("+eplb" if (d.get("eplb") or {}).get("enabled") else ""),
+            "topo": d.get("topology_class"), "resource": d.get("resource_mode", "tuned"),
+            # placement + publication/anomaly state (goal P2 placement penalty / P2-o LL gating).
+            "placement": (d.get("placement") or {}).get("kind", "packed"),
+            "pub": d.get("publication_status") or "legacy",
+            "anomaly_free": v.get("anomaly_free", True),
+            "hidden": sh.get("hidden"), "topk": sh.get("topk"), "experts": sh.get("experts"),
+            # resource-Pareto axis (immediate P2): achieved comm-fraction + class; fixed-kernel
+            # (DeepEP LL) is EXCLUDED from Pareto (it is not a normalized resource-constrained run).
+            "resource_class": rp.get("resource_class"), "achieved_fraction": rp.get("achieved_fraction"),
+            "pareto_eligible": rp.get("pareto_eligible"), "fixed_kernel": rp.get("fixed_kernel", False),
+            "rows": {r["tokens_per_rank"]: r for r in d["rows"]},
+        })
+    return series
+
+
+def resource_pareto(series):
+    """latency vs achieved comm-resource fraction (immediate P2 'resource Pareto sweeps'). Per
+    (sku,phase,dtype,T): the (achieved_fraction -> dispatch p50/p99) curve across resource points
+    (normalized sm-fraction ladder + tuned/default anchors), EXCLUDING fixed-kernel (LL) runs which
+    are not normalized resource-constrained. Reports the points + marginal efficiency Δlatency/Δfrac
+    so the resource/latency trade-off (more comm SMs -> lower latency, with diminishing returns) is
+    explicit. Needs >=2 distinct fractions at a matched cell; reports per-cell curves where present."""
+    by = defaultdict(dict)   # (sku,phase,dtype,T) -> {achieved_fraction rounded to 4 dp: (p50,p99,resource_class)}
+    for s in series:
+        if s["mode"] != "normal" or s["routing"] != "uniform" or s["contract"] != "layout-and-dispatch-v1":
+            continue
+        if s.get("fixed_kernel"):
+            continue                                   # exclude fixed-kernel from the Pareto
+        af = s.get("achieved_fraction")
+        if af is None:
+            continue
+        for T, r in s["rows"].items():
+            p50, p99 = _p(r, "dispatch", "p50"), _p(r, "dispatch", "p99")
+            if p50:                                    # skip a missing/zero p50; a missing p99 is stored as 0
+                by[(s["sku"], s["phase"], s["dtype"], T)][round(af, 4)] = (round(p50, 1),
+                                                                           round(p99 or 0, 1), s["resource_class"])
+    out = []
+    for (sku, phase, dtype, T), pts in by.items():
+        if len(pts) < 2:
+            continue                                   # need >=2 fractions for a Pareto curve
+        fr = sorted(pts)
+        curve = [{"achieved_fraction": f, "dispatch_p50": pts[f][0], "dispatch_p99": pts[f][1],
+                  "resource_class": pts[f][2]} for f in fr]
+        # marginal efficiency between adjacent points: Δlatency per +0.1 comm-fraction (negative = faster).
+        marg = []
+        for a, b in zip(fr, fr[1:]):
+            dlat, dfr = pts[b][0] - pts[a][0], b - a
+            if dfr > 0:
+                marg.append({"from_frac": a, "to_frac": b, "us_per_0.1frac": round(dlat / dfr * 0.1, 2)})
+        out.append({"sku": sku, "phase": phase, "dtype": dtype, "T": T,
+                    "n_points": len(fr), "curve": curve, "marginal": marg})
+    return out
+
+
+def model_envelope(series, here):
+    """Map each model-derived workload (configs/workloads.yaml under `here`) onto the SYNTHETIC
+    measured envelope (goal P2 "model workload summaries"). A model whose (hidden,topk,experts)
+    matches a measured uniform/normal series is 'measured-via-proxy', otherwise 'projected'.
+    Returns one dict per model, or a single {"note": ...} entry if the registry cannot be loaded."""
+    try:
+        import yaml
+        with open(os.path.join(here, "configs", "workloads.yaml")) as fh:
+            models = (yaml.safe_load(fh) or {}).get("model_derived") or {}   # empty file parses to None
+    except Exception as exc:
+        return [{"note": f"workloads.yaml unreadable: {exc!r}"}]
+    measured = {}
+    for s in series:
+        if s["hidden"] and s["routing"] == "uniform" and s["mode"] == "normal":
+            measured.setdefault((s["hidden"], s["topk"], s["experts"]), []).append(s["sku"])
+    out = []
+    for name, m in models.items():
+        dims = (m.get("hidden"), m.get("topk"), m.get("routed_experts"))
+        skus = measured.get(dims)
+        out.append({"model": name, "hidden": dims[0], "topk": dims[1], "routed_experts": dims[2],
+                    "dispatch_dtype": m.get("dispatch_dtype"), "combine_dtype": m.get("combine_dtype"),
+                    "kind": m.get("kind"), "verify": m.get("verify"), "envelope_placement": "measured-via-proxy" if skus else "projected",
+                    "measured_on": sorted(set(skus)) if skus else [],
+                    "note": ("dims match the measured synthetic envelope — read its curve directly"
+                             if skus else "no run at these dims — projected onto the synthetic envelope")})
+    return out
+
+
+def _key(s, *fields):  # match-key: the values of `fields` pulled from series dict `s`, in order
+    return tuple(map(s.__getitem__, fields))
+
+
+def skew_penalty(series):
+    """zipf* vs matched uniform (same sku/ep/phase/mode/dtype/contract): dispatch p50/p99
+    amplification at each shared T. A T is skipped unless both uniform percentiles are non-zero
+    (they are the divisors) and both zipf percentiles are present (None cannot be divided)."""
+    match = ("sku", "ep", "phase", "mode", "dtype", "contract")
+    base = {_key(s, *match): s for s in series if s["routing"] == "uniform"}
+    out = []
+    for s in series:
+        b = base.get(_key(s, *match)) if s["routing"].startswith("zipf") else None
+        if not b:
+            continue
+        for T in sorted(set(s["rows"]) & set(b["rows"])):
+            zp, up = _p(s["rows"][T], "dispatch", "p50"), _p(b["rows"][T], "dispatch", "p50")
+            zq, uq = _p(s["rows"][T], "dispatch", "p99"), _p(b["rows"][T], "dispatch", "p99")
+            if up and uq and zp is not None and zq is not None:
+                out.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"], "routing": s["routing"],
+                            "T": T, "p50_amplification": round(zp / up, 3), "p99_amplification": round(zq / uq, 3)})
+    return out
+
+
+def ll_crossover(series):
+    """Token count where normal becomes faster than LL (per sku,ep,dtype). Two variants, gated
+    differently (goal P2-o "gate LL crossover on valid measured roundtrip"):
+      * op='dispatch' -> ISOLATED-KERNEL crossover (always allowed; clearly labelled isolated).
+      * op='roundtrip' -> MEASURED-roundtrip crossover, EXCLUDED when the LL series carries an
+        unresolved timing anomaly (the open LL-FP8 case) so a suspect roundtrip can't set it."""
+    out = []
+    norm = {_key(s, "sku", "ep", "dtype"): s for s in series          # op-independent: built once
+            if s["mode"] == "normal" and s["routing"] == "uniform"
+            and s["contract"] == "layout-and-dispatch-v1"}
+    for op in ("dispatch", "roundtrip"):
+        for s in series:
+            if s["mode"] != "ll" or s["routing"] != "uniform":
+                continue
+            n = norm.get(_key(s, "sku", "ep", "dtype"))
+            if not n:
+                continue
+            gated = (op == "roundtrip" and not s.get("anomaly_free", True))
+            for stat in ("p50", "p99"):
+                cross = None
+                if not gated:
+                    for T in sorted(set(s["rows"]) & set(n["rows"])):
+                        ll, nm = _p(s["rows"][T], op, stat), _p(n["rows"][T], op, stat)
+                        if ll and nm and nm < ll:
+                            cross = T
+                            break
+                out.append({"sku": s["sku"], "ep": s["ep"], "dtype": s["dtype"], "stat": stat,
+                            "basis": "isolated-kernel" if op == "dispatch" else "measured-roundtrip",
+                            "normal_faster_at_T": ("excluded-ll-roundtrip-anomaly" if gated
+                                                   else (cross if cross is not None else "never-in-range"))})
+    return out
+
+
+def placement_penalty(series):
+    """Cost of each non-packed placement relative to 'packed' (goal P2 topology-penalty), at
+    matched (sku, phase, dtype, ep, routing) and shared T: absolute and percentage dispatch-p50
+    delta plus both cross_domain_fraction values, so the penalty can be read against routing
+    locality. A group with no 'packed' series, or with a single placement, yields no rows."""
+    groups = defaultdict(dict)
+    for ser in series:
+        if ser["mode"] != "normal" or ser["contract"] != "layout-and-dispatch-v1":
+            continue
+        cell = (ser["sku"], ser["phase"], ser["dtype"], ser["ep"], ser["routing"])
+        groups[cell][ser["placement"]] = ser
+    rows_out = []
+    for (sku, phase, dtype, ep, routing), by_place in groups.items():
+        if len(by_place) < 2 or "packed" not in by_place:
+            continue
+        packed = by_place["packed"]
+        for kind, other in ((k, v) for k, v in by_place.items() if k != "packed"):
+            for T in sorted(set(other["rows"]) & set(packed["rows"])):
+                ref_row, alt_row = packed["rows"][T], other["rows"][T]
+                ref, alt = _p(ref_row, "dispatch", "p50"), _p(alt_row, "dispatch", "p50")
+                if ref and alt:
+                    rows_out.append({
+                        "sku": sku, "phase": phase, "dtype": dtype, "ep": ep, "routing": routing,
+                        "placement": kind, "T": T, "packed_p50": round(ref, 1),
+                        f"{kind}_p50": round(alt, 1), "abs_penalty_us": round(alt - ref, 1),
+                        "penalty_pct": round(100 * (alt - ref) / ref, 1),
+                        "cross_domain_frac_packed": (ref_row.get("locality") or {}).get("cross_domain_fraction"),
+                        "cross_domain_frac_other": (alt_row.get("locality") or {}).get("cross_domain_fraction")})
+    return rows_out
+
+
+def topology_penalty(series):
+    """Dispatch-p50 change from the smallest to the largest EP degree measured for a given
+    (sku, phase, dtype), at matched tokens/rank (a scaling/topology cost)."""
+    per_group = defaultdict(dict)
+    for ser in series:
+        if ser["routing"] != "uniform" or ser["mode"] != "normal" or ser["contract"] != "layout-and-dispatch-v1":
+            continue
+        per_group[(ser["sku"], ser["phase"], ser["dtype"])][ser["ep"]] = ser
+    rows_out = []
+    for (sku, phase, dtype), by_ep in per_group.items():
+        if len(by_ep) >= 2:
+            lo, hi = min(by_ep), max(by_ep)
+            for T in sorted(set(by_ep[lo]["rows"]) & set(by_ep[hi]["rows"])):
+                a, b = _p(by_ep[lo]["rows"][T], "dispatch", "p50"), _p(by_ep[hi]["rows"][T], "dispatch", "p50")
+                if a and b:
+                    rows_out.append({"sku": sku, "phase": phase, "dtype": dtype, "T": T,
+                                     f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
+                                     "penalty_pct": round(100 * (b - a) / a, 1)})
+    return rows_out
+
+
+def scaling(series):
+    """Raw points for scaling plots, for groups measured at two or more EP degrees. Every point
+    appears under both 'weak' (read at fixed tokens/RANK) and 'strong' (fixed GLOBAL tokens)."""
+    per_group = defaultdict(dict)
+    for ser in series:
+        if ser["routing"] != "uniform" or ser["mode"] != "normal" or ser["contract"] != "layout-and-dispatch-v1":
+            continue
+        per_group[(ser["sku"], ser["phase"], ser["dtype"])][ser["ep"]] = ser
+    result = {"strong": [], "weak": []}
+    for (sku, phase, _dtype), by_ep in per_group.items():
+        for ep, ser in (by_ep.items() if len(by_ep) >= 2 else ()):
+            for T, row in ser["rows"].items():
+                lat = _p(row, "dispatch", "p50")
+                if lat:
+                    result["weak"].append({"sku": sku, "phase": phase, "ep": ep, "tokens_per_rank": T,
+                                           "global_tokens": T * ep, "dispatch_p50": round(lat, 1)})
+                    result["strong"].append({"sku": sku, "phase": phase, "ep": ep, "global_tokens": T * ep,
+                                             "tokens_per_rank": T, "dispatch_p50": round(lat, 1)})
+    return result
+
+
+def scaling_efficiency(series):
+    """Scaling efficiency between the smallest and largest EP of each (sku, phase, dtype).
+    weak = fixed tokens/rank (ideal: flat latency); strong = fixed GLOBAL tokens (ideal: latency
+    falls ~lo/hi). Efficiency = ideal/observed (1.0 = ideal, >1 = better than ideal)."""
+    out = {"weak": [], "strong": []}
+    by = defaultdict(dict)
+    for s in series:
+        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
+            by[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
+    for k, eps in by.items():
+        if len(eps) < 2:
+            continue
+        lo, hi = min(eps), max(eps)
+        rows_lo, rows_hi = eps[lo]["rows"], eps[hi]["rows"]
+        for T in sorted(set(rows_lo) & set(rows_hi)):
+            a, b = _p(rows_lo[T], "dispatch", "p50"), _p(rows_hi[T], "dispatch", "p50")
+            if a and b:
+                out["weak"].append({"sku": k[0], "phase": k[1], "tokens_per_rank": T,
+                                    f"ep{lo}": round(a, 1), f"ep{hi}": round(b, 1),
+                                    "weak_efficiency": round(a / b, 3)})  # >1 = larger EP faster
+        for Tlo in sorted(rows_lo):
+            # Pair only exact matches: flooring gt/hi would compare different global token counts.
+            Thi, rem = divmod(Tlo * lo, hi)
+            if rem or Thi not in rows_hi:
+                continue
+            a, b = _p(rows_lo[Tlo], "dispatch", "p50"), _p(rows_hi[Thi], "dispatch", "p50")
+            if a and b:
+                out["strong"].append({"sku": k[0], "phase": k[1], "global_tokens": Tlo * lo,
+                                      f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
+                                      "strong_efficiency": round(a * (lo / hi) / b, 3)})
+    return out
+
+
+def regressions(series, baseline_series, thresh=0.10):
+    """Cells whose current p50/p99 exceeds baseline * (1 + thresh). Series are paired on
+    (sku, ep, phase, mode, dtype, contract, routing) and compared only at shared T."""
+    ident = ("sku", "ep", "phase", "mode", "dtype", "contract", "routing")
+    baseline = {_key(ref, *ident): ref for ref in baseline_series}
+    cells = [(op, stat) for op in ("dispatch", "combine", "roundtrip") for stat in ("p50", "p99")]
+    flagged = []
+    for cur_s in series:
+        ref_s = baseline.get(_key(cur_s, *ident))
+        if ref_s:
+            for T in sorted(set(cur_s["rows"]) & set(ref_s["rows"])):
+                for op, stat in cells:
+                    cur, base = _p(cur_s["rows"][T], op, stat), _p(ref_s["rows"][T], op, stat)
+                    if cur and base and cur > base * (1 + thresh):
+                        flagged.append({"sku": cur_s["sku"], "ep": cur_s["ep"], "phase": cur_s["phase"],
+                                        "routing": cur_s["routing"], "T": T, "op": op, "stat": stat,
+                                        "baseline": round(base, 1), "current": round(cur, 1),
+                                        "regression_pct": round(100 * (cur - base) / base, 1)})
+    return flagged
+
+
+def distribution_summary(series, results_dir):
+    """Distribution-sensitivity summary (goal P2 "distribution-sensitivity summaries"). Contains:
+    'sensitivity' — per-group worst-distribution ratio, best case and EPLB recovery taken from
+    tests/sensitivity.py ('sensitivity_error' holds repr(exc) if that import/analysis fails);
+    'balanced_high_fanout_penalty'; and two fixed 'blocked-on-quant-combine' placeholders."""
+    summary = {"note": "ratios = p99(distribution) / p99(uniform) at matched tokens/rank"}
+    # worst / zipf / EPLB recovery come straight from tests/sensitivity.py.
+    try:
+        import sys as _sys
+        _sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "tests"))
+        import sensitivity as _sens
+        groups = _sens.analyze(results_dir)["groups"]
+        summary["sensitivity"] = [{"sku": g["sku"], "backend": g["backend"], "phase": g["phase"],
+                                   "worst": g["worst_distribution"],
+                                   "worst_ratio": g["distribution_sensitivity_ratio"],
+                                   "best_case": g["best_case_ratio"], "eplb_recovery": g["eplb_recovery"],
+                                   "per_distribution": g["per_distribution"]} for g in groups
+                                  if g["distribution_sensitivity_ratio"] is not None]
+    except Exception as exc:
+        summary["sensitivity"] = []
+        summary["sensitivity_error"] = repr(exc)
+    # balanced (high-fanout) penalty: balanced p99 / uniform p99 (a distinct stressor from zipf).
+    base = {_key(s, "sku", "ep", "phase", "mode", "dtype", "contract"): s
+            for s in series if s["routing"] == "uniform"}
+    bal = []
+    for s in series:
+        if s["routing"] != "balanced":
+            continue
+        b = base.get(_key(s, "sku", "ep", "phase", "mode", "dtype", "contract"))
+        if not b:
+            continue
+        for T in sorted(set(s["rows"]) & set(b["rows"])):
+            up, bp = _p(b["rows"][T], "dispatch", "p99"), _p(s["rows"][T], "dispatch", "p99")
+            if up and bp:
+                bal.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"], "T": T,
+                            "balanced_p99_penalty": round(bp / up, 3)})
+    summary["balanced_high_fanout_penalty"] = bal
+    # activation / quant-combine distribution penalties: only meaningful under a quantized combine
+    # (bf16 is value-independent). Recorded as blocked until PR311 lands (goal P2 — kept honest).
+    summary["activation_profile_penalty"] = {
+        "status": "blocked-on-quant-combine",
+        "note": "activation VALUE distribution is latency-neutral under bf16 combine; needs a "
+                "quantized (value-sensitive) combine kernel (ROCm/MoRI PR311) to measure"}
+    summary["quant_combine_penalty"] = {
+        "status": "blocked-on-quant-combine",
+        "note": "no quantized combine kernel wired (combine_quant_mode=none everywhere); the rig "
+                "(combine_quant_mode field + capability gate + suite) is ready for when it lands"}
+    return summary
+
+
+def recommendations(series):
+    """For each (sku, phase), the config with the lowest dispatch p99 at the headline token count:
+    T=64 when phase is 'decode', T=256 for any other phase. Groups with no usable row are omitted."""
+    grouped = defaultdict(list)
+    for ser in series:
+        grouped[(ser["sku"], ser["phase"])].append(ser)
+    picks = []
+    for (sku, phase), members in grouped.items():
+        T = 256 if phase != "decode" else 64
+        cands = []
+        for ser in members:
+            row = ser["rows"].get(T)
+            lat = _p(row, "dispatch", "p99") if row else None
+            if lat:
+                cands.append((lat, f"{ser['dtype']}/{ser['mode']}/{ser['contract']}/{ser['routing']}/{ser['resource']}", ser["ep"]))
+        if cands:
+            q, config, ep = sorted(cands)[0]
+            picks.append({"sku": sku, "phase": phase, "at_T": T,
+                          "lowest_p99_dispatch_us": round(q, 1), "config": config, "ep": ep})
+    return picks
+
+
+def main() -> int:
+    """CLI entry point: load results, build the report, print a summary, optionally write JSON."""
+    ap = argparse.ArgumentParser(description="CollectiveX operating-envelope analysis")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--baseline", help="dir of baseline results for regression detection")
+    ap.add_argument("--out")
+    a = ap.parse_args()
+    s = load(a.results_dir)
+    rep = {"n_series": len(s), "skew_penalty": skew_penalty(s), "ll_crossover": ll_crossover(s),
+           "topology_penalty": topology_penalty(s), "placement_penalty": placement_penalty(s),
+           "scaling": scaling(s), "scaling_efficiency": scaling_efficiency(s),
+           "model_envelope": model_envelope(s, os.path.dirname(os.path.abspath(__file__))),
+           "distribution_summary": distribution_summary(s, a.results_dir),
+           "resource_pareto": resource_pareto(s),
+           "recommendations": recommendations(s)}
+    if a.baseline:
+        rep["regressions"] = regs = regressions(s, load(a.baseline))
+        print(f"regressions vs baseline: {len(regs)} cell(s) > +10%")
+    print(f"loaded {len(s)} series")
+    sk = rep["skew_penalty"]
+    if sk:
+        worst = max(sk, key=lambda x: x["p99_amplification"])
+        print(f"skew penalty: {len(sk)} cells; worst p99 amplification {worst['p99_amplification']}x "
+              f"({worst['sku']} {worst['routing']} T{worst['T']})")
+    tp = rep["topology_penalty"]
+    if tp:
+        print(f"topology penalty (EP4->EP8): {len(tp)} cells; e.g. "
+              + ", ".join(f"{x['sku']} T{x['T']} {x['penalty_pct']:+}%" for x in tp[:3]))
+    rpar = rep["resource_pareto"]
+    print(f"resource-Pareto cells (>=2 fractions, fixed-kernel excluded): {len(rpar)}"
+          + (f"; e.g. {rpar[0]['sku']} T{rpar[0]['T']} {rpar[0]['n_points']} pts" if rpar else " (need an sm_fraction ladder)"))
+    print(f"LL crossover cells: {len(rep['ll_crossover'])}; recommendations: {len(rep['recommendations'])}")
+    for r in rep["recommendations"]:
+        print(f"  rec {r['sku']}/{r['phase']} @T{r['at_T']}: {r['lowest_p99_dispatch_us']}us via {r['config']}")
+    if a.out:
+        with open(a.out, "w") as fh:  # close (and flush) the report before announcing it
+            json.dump(rep, fh, indent=2)
+        print(f"wrote {a.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/cohort.py b/experimental/CollectiveX/cohort.py
new file mode 100644
index 000000000..96f31f322
--- /dev/null
+++ b/experimental/CollectiveX/cohort.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+"""CollectiveX publication-cohort builder + validator (goal Part 1: publication cohort manifests,
+official-cohort validation, source-SHA pinning; goal Part 2: EPLB mapping identity).
+
+A *publication cohort* is the set of result artifacts that are meant to be compared on ONE chart —
+e.g. the same workload + measurement contract + config across SKUs/backends. Unlike `comparison_key`
+(which gates a single curve and so INCLUDES topology/sku), a cohort deliberately lets sku / backend /
+topology VARY (those are the independent variable) while requiring everything that must be identical
+for the comparison to be fair to actually match:
+
+    cohort_key = (mode, phase, ep_size, resource_mode, comparison_class, measurement_contract,
+                  dispatch_dtype, kernel_gen, activation_profile, combine_quant_mode, trace_signature)
+
+For each cohort this tool emits a MANIFEST listing every member with its identity fingerprint
+(source SHA, workload id, image digest, backend version, schema version) and decides whether the
+cohort is OFFICIAL-eligible. A cohort is official only when every member is itself measurement-sound
+and the dimensions that MUST match across hardware do:
+
+  * one benchmark source SHA           (goal P1 "same benchmark source SHA"; --pin-sha enforces)
+  * non-null + identical workload_id   (goal P1 "non-null workload identity")
+  * identical trace_signature          (same realized routing bytes — by cohort_key construction)
+  * identical EPLB mapping_hash         (goal P2 "matching EPLB mapping identity") when EPLB is on
+  * no unresolved timing anomalies      (goal P1 anomaly gate)
+  * complete provenance per member      (image digest + git run)
+
+Rejected members are recorded WITH machine-readable reasons (goal P1 "store rejected artifacts with
+explicit rejection reasons") rather than silently dropped.
+
+  python3 cohort.py --results-dir results                      # summarize all cohorts
+  python3 cohort.py --results-dir results --require-official    # exit 3 unless an official cohort exists
+  python3 cohort.py --results-dir results --pin-sha --out results/cohorts.json
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import hashlib
+import json
+import os
+
+MIN_SAMPLES_OFFICIAL = 100
+
+
+def _backend_version(doc: dict) -> str:
+    """First truthy of deepep_commit / deepep_version / mori_commit in backend_provenance, else 'unknown'."""
+    prov = doc.get("backend_provenance") or {}
+    return next((prov[k] for k in ("deepep_commit", "deepep_version", "mori_commit") if prov.get(k)), "unknown")
+
+
+def fingerprint(doc: dict, path: str) -> dict:
+    """Flatten one result document into the identity record that cohort grouping, cohort ids and
+    the official-eligibility checks work from. Missing sections fall back to empty mappings."""
+    shape = doc.get("shape") or {}
+    workload = doc.get("workload") or {}
+    repro = doc.get("reproduction") or {}
+    eplb = doc.get("eplb") or {}
+    validity = doc.get("validity") or {}
+    rows = doc.get("rows", [])
+    return {
+        "file": os.path.basename(path),
+        "sku": (doc.get("runner") or "?").split("_")[0].split("-")[0],
+        "backend": doc.get("backend"), "mode": doc.get("mode"), "phase": doc.get("phase"),
+        "ep_size": doc.get("ep_size"), "resource_mode": doc.get("resource_mode"),
+        "comparison_class": doc.get("comparison_class"),
+        "measurement_contract": doc.get("measurement_contract"),
+        "dispatch_dtype": shape.get("dispatch_dtype"),
+        "kernel_gen": shape.get("kernel_gen") or ("v1" if doc.get("backend") == "deepep" else "n-a"),
+        "activation_profile": shape.get("activation_profile", "normal"),
+        "combine_quant_mode": (shape.get("quant") or {}).get("combine_quant_mode", "none"),
+        "trace_signature": workload.get("trace_signature") or (doc.get("routing_identity") or {}).get("trace_signature"),
+        "workload_id": workload.get("workload_id"),
+        "workload_source": workload.get("source"),
+        "source_sha": (repro.get("git_run") or {}).get("source_sha") or "",
+        "image_digest": repro.get("image_digest") or "",
+        "backend_version": _backend_version(doc),
+        "schema_version": doc.get("schema_version"),
+        "publication_status": doc.get("publication_status") or "legacy",
+        "anomaly_free": validity.get("anomaly_free", True),
+        "provenance_complete": validity.get("provenance_complete", False),
+        "eplb_enabled": bool(eplb.get("enabled")),
+        "eplb_mapping_hash": eplb.get("mapping_hash"),
+        "min_samples": min((r.get("samples_pooled", 0) for r in rows), default=0),
+        "correct": bool(rows) and all(r.get("correct") for r in rows),
+    }
+
+
+def cohort_key(fp: dict) -> tuple:
+    """The fields every member of one cohort shares, as a tuple. sku / backend / topology are
+    left out on purpose: they are the variables a cross-hardware chart compares."""
+    fields = ("mode", "phase", "ep_size", "resource_mode", "comparison_class", "measurement_contract",
+              "dispatch_dtype", "kernel_gen", "activation_profile", "combine_quant_mode", "trace_signature")
+    return tuple(fp[name] for name in fields)
+
+
+def cohort_id(members: list) -> str:
+    """16-hex-char SHA-256 prefix over the sorted per-member identity strings (sku, backend, source
+    SHA, workload id, image digest, backend version, schema version) — goal P1 cohort IDs."""
+    fields = ("sku", "backend", "source_sha", "workload_id", "image_digest", "backend_version", "schema_version")
+    parts = sorted("|".join(f"{m[f]}" for f in fields) for m in members)
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def evaluate_cohort(members: list, pin_sha: bool) -> dict:
+    """Split members into the OFFICIAL subset (accepted) + the rest (rejected, with reasons).
+    A non-canonical (wid=null / seeded-runtime) member is REJECTED from the official cohort but
+    does NOT block it — that is the point of recording rejections. official_eligible then depends
+    on the ACCEPTED subset being mutually consistent (one source SHA under --pin-sha, one workload_id,
+    one EPLB mapping), NOT on there being zero rejected members. A seeded run of the same config
+    shares the deterministic trace_signature, so it lands in the same cohort and is simply excluded."""
+    rejected, accepted = [], []
+    for m in members:
+        reasons = []                                  # PER-MEMBER gates only
+        # publication_status is machine-derived from ALL validity dims (correctness, workload
+        # identity, measurement + RESOURCE conformance, provenance, anomalies). Only an 'official'
+        # member belongs in an official cohort — this is the authoritative gate; the granular
+        # checks below just enrich the rejection reason (e.g. a resource-nonconforming MoRI run is
+        # 'diagnostic' and excluded here even though it is correct + canonical + provenance-complete).
+        if m["publication_status"] != "official":
+            reasons.append(f"publication_status={m['publication_status']} (official cohort needs 'official')")
+        if not m["correct"]:
+            reasons.append("a point failed correctness")
+        if not m["anomaly_free"]:
+            reasons.append("unresolved timing anomaly (not waived)")
+        if not m["workload_id"]:
+            reasons.append("workload_id is null (not canonical-serialized) — comparable-experimental, not official")
+        if m["workload_source"] != "canonical-serialized":
+            reasons.append(f"workload_source={m['workload_source']} (official needs canonical-serialized)")
+        if not m["provenance_complete"]:
+            reasons.append("provenance incomplete (image digest / git run missing)")
+        if m["min_samples"] < MIN_SAMPLES_OFFICIAL:
+            reasons.append(f"a point has <{MIN_SAMPLES_OFFICIAL} pooled samples")
+        (rejected if reasons else accepted).append({**m, "rejection_reasons": reasons})
+    # cross-member consistency over the ACCEPTED (would-be-official) subset.
+    a_shas = {m["source_sha"] for m in accepted if m["source_sha"]}
+    a_wids = {m["workload_id"] for m in accepted if m["workload_id"]}
+    a_maps = {m["eplb_mapping_hash"] for m in accepted if m["eplb_enabled"]}  # may hold None (hash missing)
+    a_eplb = any(m["eplb_enabled"] for m in accepted)
+    incoherent = []
+    if pin_sha and len(a_shas) > 1:
+        incoherent.append(f"accepted members span {len(a_shas)} source SHAs (--pin-sha requires one)")
+    if len(a_wids) > 1:
+        incoherent.append(f"accepted members span {len(a_wids)} workload_ids")
+    if a_eplb and len(a_maps) > 1:
+        incoherent.append(f"accepted members span {len(a_maps)} EPLB mapping_hashes")
+    official_eligible = len(accepted) >= 1 and not incoherent
+    return {
+        "cohort_id": cohort_id(members), "n_members": len(members),
+        "skus": sorted({m["sku"] for m in members}),
+        "official_skus": sorted({m["sku"] for m in accepted}),
+        "backends": sorted({m["backend"] for m in members if m["backend"]}),
+        "source_shas": sorted({m["source_sha"] for m in members if m["source_sha"]}),
+        "workload_ids": sorted({m["workload_id"] for m in members if m["workload_id"]}),
+        "official_source_shas": sorted(a_shas), "official_workload_ids": sorted(a_wids),
+        "eplb_mapping_hashes": sorted(a_maps, key=lambda h: (h is None, str(h))), "any_eplb": a_eplb,  # None-safe, None last
+        "official_eligible": official_eligible, "incoherent": incoherent,
+        "accepted": accepted, "rejected": rejected,
+    }
+
+
+def build(results_dir: str, pin_sha: bool) -> dict:
+    """Scan results_dir recursively for MoE result JSONs, group them by cohort_key and evaluate
+    each cohort. Skipped: env_* files, unreadable / non-JSON / non-object files, documents that
+    are not family 'moe' or have no rows, and legacy documents without publication_status."""
+    cohorts = {}
+    for f in sorted(glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)):
+        if os.path.basename(f).startswith("env_"):
+            continue
+        try:
+            with open(f) as fh:                        # close promptly; a tree can hold many files
+                doc = json.load(fh)
+        except (ValueError, OSError):                  # bad JSON / bad encoding / unreadable
+            continue
+        if not isinstance(doc, dict) or doc.get("family") != "moe" or not doc.get("rows"):
+            continue                                   # incl. top-level JSON lists/scalars
+        if "publication_status" not in doc:
+            continue                                   # legacy v3 — not cohort-eligible
+        fp = fingerprint(doc, f)
+        cohorts.setdefault(cohort_key(fp), []).append(fp)
+    key_names = ("mode", "phase", "ep_size", "resource_mode", "comparison_class", "measurement_contract",
+                 "dispatch_dtype", "kernel_gen", "activation_profile", "combine_quant_mode", "trace_signature")
+    out = [{**evaluate_cohort(members, pin_sha), "key": dict(zip(key_names, ck))}
+           for ck, members in cohorts.items()]
+    out.sort(key=lambda c: (not c["official_eligible"], -c["n_members"]))
+    return {"results_dir": results_dir, "pin_sha": pin_sha, "n_cohorts": len(out),
+            "n_official_eligible": sum(1 for c in out if c["official_eligible"]),
+            "cohorts": out}
+
+
+def to_markdown(report: dict) -> str:
+    """Render the report as a Markdown table, one row per cohort. The last column holds the first
+    rejection reason of the first rejected member, truncated to 48 characters."""
+    chunks = [f"### Publication cohorts ({report['n_cohorts']} cohorts, "
+              f"{report['n_official_eligible']} official-eligible; pin_sha={report['pin_sha']})\n\n"
+              "| cohort | contract | dtype·act·cq | EP | SKUs | backends | members | official | top rejection |\n"
+              "|---|---|---|---|---|---|---|---|---|\n"]
+    for c in report["cohorts"]:
+        k = c["key"]
+        reasons = c["rejected"][0]["rejection_reasons"] if c["rejected"] else []
+        top = (reasons[0] if reasons else "")[:48]
+        cfg = f"{k['dispatch_dtype']}·{k['activation_profile']}·{k['combine_quant_mode']}"
+        chunks.append(f"| `{c['cohort_id']}` | {(k['measurement_contract'] or '').replace('-v1','')} | {cfg} | "
+                      f"{k['ep_size']} | {','.join(c['skus'])} | {','.join(c['backends'])} | "
+                      f"{len(c['accepted'])}✓/{len(c['rejected'])}✗ | {'YES' if c['official_eligible'] else '—'} | {top} |\n")
+    return "".join(chunks)
+
+
+def main() -> int:
+    """CLI entry point. Exit status 3 when --require-official is set and no cohort qualifies, else 0."""
+    ap = argparse.ArgumentParser(description="CollectiveX publication-cohort builder/validator")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--pin-sha", action="store_true", help="require all members of an official cohort to share one source SHA")
+    ap.add_argument("--require-official", action="store_true", help="exit 3 unless at least one cohort is official-eligible")
+    ap.add_argument("--out", help="write the full cohort manifest JSON here")
+    a = ap.parse_args()
+    report = build(a.results_dir, a.pin_sha)
+    if a.out:
+        os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
+        with open(a.out, "w") as fh:  # close (and flush) the manifest before announcing it
+            json.dump(report, fh, indent=2, sort_keys=True)
+        print(f"wrote {a.out}")
+    print(to_markdown(report))
+    if a.require_official and report["n_official_eligible"] == 0:
+        print("FAIL: no official-eligible cohort (see rejection reasons above)")
+        return 3
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/configs/backends.yaml b/experimental/CollectiveX/configs/backends.yaml
new file mode 100644
index 000000000..c83d94fbc
--- /dev/null
+++ b/experimental/CollectiveX/configs/backends.yaml
@@ -0,0 +1,83 @@
+# CollectiveX backend registry (goal Part 2) — the single source of truth for backend
+# capability, replacing the data split between the adapters and tests/capability.py. Keep in
+# sync with ep_deepep.py / ep_mori.py SUPPORTED_* sets (capability.py mirrors this at runtime).
+schema_version: 1
+backends:
+  deepep:
+    vendor: nvidia
+    modes: [normal, ll]                 # ll is DECODE-ONLY (fixed num_max dispatch)
+    dtypes: [bf16, fp8]                  # DISPATCH-side precision
+    contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+    transports: [nvlink, mnnvl, rdma]
+    ep_max_intranode: 8                  # <=8 ranks = intranode NVL kernel (incl. MNNVL trays)
+    ep_min: 2
+    # combine path + distribution semantics (goal P2 "distribution + quant-combine constraints").
+    # bf16/none combine only (quantized combine reserved until a kernel is wired); honors any
+    # routing trace + EPLB; all activation profiles runnable (value-neutral under bf16).
+    combine_dtypes: [bf16]
+    quant_modes: [none]
+    routings: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, zipf-heavy,
+               hotspot-single, hotspot-moving, alternating-groups]
+    eplb: true
+    activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+    phase_constraints:
+      ll: {phases: [decode], max_tokens_per_rank: 128}   # LL is a fixed-num_max decode path
+    required_image: "lmsysorg/sglang:v0.5.11-cu130"
+    cap_token_per_rank: 4096             # 4 GiB NVL buffer holds ~4096 tok/rank at hidden=7168
+  uccl:
+    vendor: nvidia
+    modes: [normal, ll]                  # uccl.ep.Buffer is a DeepEP-API clone
+    dtypes: [bf16, fp8]                  # DISPATCH-side precision
+    contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+    transports: [nvlink, rdma]
+    ep_max_intranode: 8
+    ep_min: 2
+    combine_dtypes: [bf16]
+    quant_modes: [none]
+    routings: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, zipf-heavy,
+               hotspot-single, hotspot-moving, alternating-groups]
+    eplb: true
+    activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+    phase_constraints:
+      ll: {phases: [decode], max_tokens_per_rank: 128}
+    required_image: "lmsysorg/sglang:v0.5.11-cu130"
+    install: "pip install uccl nvidia-cuda-runtime-cu12 (cu12 runtime on LD_LIBRARY_PATH); see cx_build_uccl"
+    cap_token_per_rank: 4096
+  mori:
+    vendor: amd
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    transports: [xgmi, rdma]
+    ep_max_intranode: 8
+    ep_min: 2
+    combine_dtypes: [bf16]               # + fp8 when ROCm/MoRI PR311 quant_type combine lands
+    quant_modes: [none]                  # + the PR311 mode id once validated
+    routings: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, zipf-heavy,
+               hotspot-single, hotspot-moving, alternating-groups]
+    eplb: true
+    activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+    phase_constraints:
+      normal: {max_tokens_per_rank: 512}   # 2 GiB registerable heap cap at hidden=7168
+    required_image: "rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+    cap_token_per_rank: 512
+    fragility: "wedges (D-state) on sustained iters>=200 at T>=32; needs gradual ramp, low iters"
+  aiter:
+    vendor: amd
+    modes: [normal]
+    dtypes: [bf16, fp8]
+    contracts: [layout-and-dispatch-v1]
+    transports: [xgmi, rdma]
+    ep_max_intranode: 8
+    ep_min: 2
+    status: "scaffolded — adapter ep_aiter.py not yet implemented (capability declared, not validated)"
+    required_image: "rocm/sgl-dev (AITER CK MoE EP)"
+
+# 'all' resolves to a DEFINED per-vendor backend set (NOT the same across vendors).
+vendor_backends:
+  nvidia: [nccl, deepep, uccl]
+  amd: [rccl, mori]
+# Collective primitives (not EP dispatch/combine — phase/dtype/mode/contract N/A).
+collective_backends:
+  nccl: [nvidia]
+  rccl: [amd]
diff --git a/experimental/CollectiveX/configs/platforms.yaml b/experimental/CollectiveX/configs/platforms.yaml
new file mode 100644
index 000000000..a25fd97a8
--- /dev/null
+++ b/experimental/CollectiveX/configs/platforms.yaml
@@ -0,0 +1,118 @@
+# CollectiveX platform registry (goal Part 2). One entry per SKU: hardware capability is
+# separated from VALIDATED software capability (what we've actually run green on real HW).
+# scale_up_domain = #GPUs reachable over the intra-domain fabric before crossing a tier
+# (NVLink island / NVL72 MNNVL tray-group / XGMI). gpus_per_node bounds single-node EP.
+schema_version: 1
+platforms:
+  h100:
+    vendor: nvidia
+    arch: sm90
+    gpu: "H100 80GB HBM3"
+    gpus_per_node: 8
+    scale_up_domain: 8            # single 8-GPU NVLink island
+    transport_tiers: [nvlink, ib]
+    runner: h100-8x
+    launcher: launch_h100-dgxc-slurm.sh
+    ssh: "sa-shared@100.118.57.65"   # partition hpc-gpu-1, /mnt/nfs, exclude hpc-gpu-1-7
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      max_intranode_gpus: 8
+      internode: false            # not yet exercised for EP
+  h200:
+    vendor: nvidia
+    arch: sm90
+    gpu: "H200 143GB HBM3e"
+    gpus_per_node: 8
+    scale_up_domain: 8
+    transport_tiers: [nvlink, ib]
+    runner: h200-8x
+    launcher: launch_h200.sh
+    ssh: "sa-shared@100.78.55.80"    # partition main, /home NFS
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      max_intranode_gpus: 8
+      internode: false
+  b300:
+    vendor: nvidia
+    arch: sm100
+    gpu: "B300 SXM6 268GB"
+    gpus_per_node: 8
+    scale_up_domain: 8
+    transport_tiers: [nvlink, ib]
+    runner: b300-nv
+    launcher: launch_b300.sh
+    ssh: "sa-shared@100.101.13.83"   # partition batch_1, acct benchmark, /data, exclude b300-018
+    notes: "Blackwell drops clocks on tiny T -> per-point warm burst (warmup>=30). LL aborts."
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      modes: [normal]              # Blackwell LL aborts on this fabric -> normal-only
+      max_intranode_gpus: 8
+      internode: false
+  gb300:
+    vendor: nvidia
+    arch: sm100
+    gpu: "GB300 Grace-Blackwell (aarch64)"
+    gpus_per_node: 4              # NVL72 compute tray = 4 GPU/node
+    scale_up_domain: 72          # NVL72 MNNVL: one NVLink P2P domain spans the rack
+    transport_tiers: [mnnvl, ib]
+    runner: gb300-8x
+    launcher: _gb300_ep8.sh
+    ssh: "2-hop: sa-shared@100.92.114.46 -> im-gb300-login-02"  # batch_1, acct benchmark, /data
+    notes: "EP8 = 2 trays but INTRANODE NVLink path (MNNVL is one domain for <=8 ranks). deep_ep 1.1.0."
+    validated:
+      ep_degrees: [4, 8]
+      backends: [deepep]
+      max_intranode_gpus: 8        # <=8 ranks use the intranode NVL kernel even across 2 trays
+      internode: false             # internode-normal asserts out until >8 ranks (EP16+)
+  b200:
+    vendor: nvidia
+    arch: sm100
+    gpu: "B200 SXM 180GB"
+    gpus_per_node: 8
+    scale_up_domain: 8
+    transport_tiers: [nvlink, ib]
+    runner: b200-dgxc
+    launcher: launch_b200-dgxc.sh
+    ssh: ""                        # GHA self-hosted pool (sku=b200-dgxc); dispatch uses the runner label
+    notes: "B200 8x NVLink (sibling of B300, sm100). Single-node; normal-only (Blackwell LL aborts)."
+    validated:
+      ep_degrees: [8]
+      backends: [deepep]
+      modes: [normal]
+      max_intranode_gpus: 8
+      internode: false
+  gb200:
+    vendor: nvidia
+    arch: sm100
+    gpu: "GB200 Grace-Blackwell (aarch64)"
+    gpus_per_node: 4              # NVL72 compute tray = 4 GPU/node
+    scale_up_domain: 72          # NVL72 MNNVL one NVLink domain
+    transport_tiers: [mnnvl, ib]
+    runner: gb200-nv
+    launcher: launch_gb200-nv.sh
+    ssh: ""                        # GHA self-hosted pool (sku=gb200)
+    notes: "NVL72 sibling of GB300. EP4/EP8 intranode-NVL (<=8 ranks, MNNVL one domain); EP16/32/64 via the multi-tray nodes sweep."
+    validated:
+      ep_degrees: [4, 8]
+      backends: [deepep]
+      max_intranode_gpus: 8
+      internode: false
+  mi355x:
+    vendor: amd
+    arch: gfx950
+    gpu: "MI355X CDNA4 256 CU"
+    gpus_per_node: 8
+    scale_up_domain: 8           # single 8-GPU XGMI island
+    transport_tiers: [xgmi, rdma]
+    runner: mi355x-8x
+    launcher: launch_mi355x-amds.sh
+    ssh: "2-hop bastion -> mia1-vm-amd-prj3-slurm-001"  # partition compute, cpus-per-task=128
+    notes: "MoRI wedges (D-state) on sustained iters>=200 at T>=32; cap iters. 512-tok buffer cap. No LL/fp8."
+    validated:
+      ep_degrees: [8]
+      backends: [mori]
+      max_intranode_gpus: 8
+      internode: false
diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml
new file mode 100644
index 000000000..194f5f40c
--- /dev/null
+++ b/experimental/CollectiveX/configs/suites.yaml
@@ -0,0 +1,218 @@
+# CollectiveX named benchmark suites (goal Part 2). A suite binds workloads x platforms x
+# backends x modes x contracts x resource regimes x repetitions x required publication level.
+# generate_matrix.py resolves a suite against platforms.yaml/backends.yaml capabilities BEFORE
+# any GPU is allocated, omitting unsupported combinations with recorded reasons.
+schema_version: 1
+
+# HEADLINE DISTRIBUTION CONTRACT (goal Part 2 "define one headline distribution"). ONE routing
+# profile is the cross-hardware headline; every other distribution is a SENSITIVITY view, never a
+# peer headline dimension. plot_ep.py defaults to this (HEADLINE_DISTRIBUTION) and labels the
+# sensitivity section as "not the headline".
+headline_distribution:
+  routing: uniform
+  basis: synthetic            # synthetic | fitted | replayed — uniform is the controlled synthetic ref
+  rationale: >-
+    uniform is deterministic, controlled, and present on every SKU/backend, so it is the
+    apples-to-apples cross-hardware reference. balanced / zipf / zipf+eplb / hotspot* are
+    sensitivity views. Interim load-realism reference = zipf+eplb (skew + the production remedy);
+    long-term headline will be InferenceX TRACE-REPLAY (captured per-step serving routing) once a
+    replay loader lands — then `basis` becomes `replayed`.
+  sensitivity_distributions: [balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate,
+                              zipf-heavy, hotspot-single, hotspot-moving, alternating-groups]
+
+suites:
+  ep-smoke-v1:
+    description: "fast canary: one small point per platform/backend/mode/contract"
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, gb300, gb200, mi355x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    token_points: [8, 64]
+    trials: 1
+    required_publication: comparable-experimental
+
+  ep-nightly-v1:
+    description: "headline matrix: all three contracts, bf16+fp8, normal+LL, decode+prefill"
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, b300, b200, gb300, gb200, mi355x]
+    backends: [deepep, mori]
+    modes: [normal, ll]
+    dtypes: [bf16, fp8]
+    contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: official
+
+  ep-models-v1:
+    description: "model-shape envelope: real MoE dimensions, controlled routing"
+    workloads: [deepseek-v4, kimi-k2.x, qwen3.5, glm-5, minimax-m3]
+    platforms: [h100, h200, b300, b200, gb300, gb200, mi355x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [fp8, bf16]
+    contracts: [runtime-visible-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-scaling-v1:
+    description: "strong (fixed global tokens) + weak (fixed tokens/rank) scaling across EP degrees"
+    workloads: [ds-like-ref]
+    platforms: [gb300, gb200]            # the only SKUs with >1 validated EP degree (EP4 + EP8)
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    scaling: [strong, weak]
+    ep_degrees: [4, 8]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-topology-v1:
+    description: "placement sensitivity: packed vs striped vs adversarial on multi-domain SKUs"
+    workloads: [ds-like-ref]
+    platforms: [gb300, gb200]            # NVL72 tray boundary is the scale-up domain edge
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform, zipf]
+    placements: [packed, striped, adversarial]
+    resource_modes: [tuned]
+    ep_degrees: [8]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-distribution-sensitivity-v1:
+    description: "distribution robustness: ratio p99_worst / p99_headline(uniform) at ANCHOR tokens
+      only. NOT a chart dimension — collapses to one sensitivity number per (sku,backend,phase) via
+      tests/sensitivity.py. BF16/normal today; the value (activation) axis is added when the rig lands."
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, b300, b200, gb300, gb200, mi355x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    # headline = uniform; balanced-rank-local = min-comm best case; zipf-heavy/hotspot-single = worst.
+    routings: [uniform, balanced, balanced-rank-local, zipf, zipf-heavy, hotspot-single]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    # ANCHOR points only (not the full ladder) — the suite answers "how fragile", not "the curve".
+    token_points_decode: [1, 8, 32, 128]
+    token_points_prefill: [128, 512, 2048]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-routing-v1:
+    description: "routing-skew sensitivity + EPLB remedy"
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, b300, b200, gb300, gb200]
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform, balanced, zipf, zipf-mild, zipf-moderate, zipf-heavy, hotspot-single]
+    eplb: [false, true]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-activation-sensitivity-v1:
+    description: "activation-VALUE sensitivity: same trace under each value profile. Under bf16
+      combine the ratio is ~1.0 (value-independent) — the EXPECTED null result that also baselines
+      the rig for when a quantized (value-sensitive) combine lands. Diagnostic, never headline."
+    workloads: [ds-like-ref]
+    platforms: [h100, h200, b300, b200, mi355x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    # the activation value distributions (routing.ACTIVATION_PROFILES). normal = headline.
+    activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+    resource_modes: [tuned]
+    phases: [decode]
+    token_points: [1, 8, 32, 128]
+    trials: 3
+    required_publication: diagnostic
+
+  ep-quant-combine-sensitivity-v1:
+    description: "BLOCKED ON PR311 — quantized-combine distribution sensitivity (none/fp8/mxfp8).
+      The rig is ready (combine_quant_mode field + capability gate + comparison_key fold), but no
+      quantized combine kernel is wired, so this suite resolves to ZERO valid cases today (capability
+      rejects combine_quant_mode != none). Kept so the matrix lights up the moment the kernel lands."
+    workloads: [ds-like-ref]
+    platforms: [mi355x]
+    backends: [mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform, zipf, hotspot-single]
+    combine_quant_modes: [none, fp8, mxfp8]   # only 'none' resolves valid until PR311
+    resource_modes: [tuned]
+    phases: [decode]
+    trials: 3
+    required_publication: diagnostic
+
+  ep-placement-v1:
+    description: "placement matrix: packed vs striped vs adversarial. Single-node SKUs make these
+      identical (all same-node); meaningful once a multi-node EP cohort exists. analyze_ep computes
+      the packed-vs-striped topology penalty + locality attribution."
+    workloads: [ds-like-ref]
+    platforms: [gb300, gb200]            # NVL72 SKUs (tray boundary) = the only multi-domain SKUs here
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform, zipf]
+    placements: [packed, striped, adversarial]
+    resource_modes: [tuned]
+    ep_degrees: [8]
+    phases: [decode, prefill]
+    trials: 3
+    required_publication: comparable-experimental
+
+  ep-temporal-v1:
+    description: "temporal routing: a hot expert that MOVES across decode steps + expert groups that
+      ALTERNATE. One run per step (--routing-step); analyze across steps. Diagnostic sensitivity view."
+    workloads: [ds-like-ref]
+    platforms: [h100, h200]
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [hotspot-moving, alternating-groups]
+    routing_steps: [0, 1, 2, 3]
+    resource_modes: [tuned]
+    phases: [decode]
+    token_points: [8, 32, 128]
+    trials: 3
+    required_publication: diagnostic
+
+  ep-uneven-tokens-v1:
+    description: "uneven source-token allocation: per-rank token counts vary (global may not divide
+      EP); includes the empty-source-rank case. Records source_token_stats (min/mean/max/CV)."
+    workloads: [ds-like-ref]
+    platforms: [h100, h200]
+    backends: [deepep]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    uneven_tokens: [none, linear, empty-rank]
+    resource_modes: [tuned]
+    phases: [decode]
+    token_points: [8, 32, 128]
+    trials: 3
+    required_publication: diagnostic
diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml
new file mode 100644
index 000000000..1612c773d
--- /dev/null
+++ b/experimental/CollectiveX/configs/workloads.yaml
@@ -0,0 +1,146 @@
+# CollectiveX workload registry (goal Part 2). Each workload references an IMMUTABLE canonical
+# manifest (tests/workload.py -> <workload_id>.npz + .manifest.json). Three kinds:
+#   synthetic       — controlled DeepSeek-like baseline (dims real, routing controlled)
+#   model-derived   — REAL model MoE dimensions with controlled routing (shape != routing behavior)
+#   trace-replay    — captured routing behavior (future; needs a captured trace)
+# Model dims marked verify=true must be confirmed against a checked-in model config before any
+# result built on them is promoted past 'comparable-experimental'.
+schema_version: 1
+
+synthetic:
+  ds-like-ref:
+    kind: synthetic
+    hidden: 7168
+    topk: 8
+    experts: 256
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    routings: [uniform, balanced, zipf]
+    note: "Controlled baseline used through v3/v4 (DeepSeek-V3-shaped)."
+
+model_derived:
+  # --- PINNED, NAMED model manifests (goal P1 "Add workload manifests"). The "-v1" suffix freezes
+  # the (hidden, topk, routed_experts) shape behind an immutable name so a published result can cite
+  # `kimi-k2-v1` and have it mean exactly these dims forever; if a future model rev changes a dim it
+  # gets a "-v2" manifest, never a silent edit here. These are the names referenced for model-shape
+  # coverage. The legacy unsuffixed entries below are kept for back-compat with existing suites.
+  # canonical workload_id folds (hidden, topk, routed_experts) -> identical bytes on every SKU.
+  deepseek-v3-v1:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    expert_alignment: 128
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: false           # DeepSeek-V3 EP serving shape; identical dims to the ds-like-ref baseline
+  deepseek-v4-v1:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    expert_alignment: 128
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: false           # matches the validated DSV3/V4 serving shape used on these clusters
+  minimax-m3-v1:
+    kind: model-derived
+    hidden: 6144
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: true
+  kimi-k2-v1:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 384
+    shared_experts: 1
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: true
+  qwen3.5-v1:
+    kind: model-derived
+    hidden: 4096
+    topk: 8
+    routed_experts: 128
+    shared_experts: 0
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    verify: true
+
+  # --- LEGACY unsuffixed entries (kept for back-compat with ep-models-v1 and analyze_ep envelope
+  # matching). Prefer the "-v1" names above for new work. deepseek-v4/minimax-m3/qwen3.5 mirror their
+  # "-v1" shapes exactly; kimi-k2.x == kimi-k2-v1; glm-5 has no "-v1" (not in the goal's manifest set).
+  deepseek-v4:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    expert_alignment: 128
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: false           # matches the validated DSV3/V4 serving shape used on these clusters
+  minimax-m3:
+    kind: model-derived
+    hidden: 6144
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: true
+  kimi-k2.x:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 384
+    shared_experts: 1
+    dispatch_dtype: fp8
+    combine_dtype: bf16
+    verify: true
+  glm-5:
+    kind: model-derived
+    hidden: 5120
+    topk: 8
+    routed_experts: 160
+    shared_experts: 1
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    verify: true
+  qwen3.5:
+    kind: model-derived
+    hidden: 4096
+    topk: 8
+    routed_experts: 128
+    shared_experts: 0
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    verify: true
+
+# decode vs prefill are workload METADATA, not just token-ladder aliases (goal Part 2). Each point
+# is ONE MoE layer · ONE step · a SINGLE dispatch+combine collective pair (NOT a whole model or
+# several concurrent layers). The harness emits this as `phase_profile` so a T=128 point launched
+# under "prefill" is never silently read as a decode point.
+phase_profiles:
+  decode:
+    token_ladder: [1, 2, 4, 8, 16, 32, 64, 128]
+    description: "one (or few) tokens per active sequence per step; routing varies step-to-step"
+    active_sequences: "one batch of active sequences"
+    tokens_per_iter: "1 (or few) per active sequence"
+    microbatch_distribution: "one decode step across the active sequences"
+    routing_variability: "varies step-to-step (use the temporal routing modes to model this)"
+    represents: "one MoE layer · one decode step · one dispatch+combine collective"
+  prefill:
+    token_ladder: [128, 256, 512, 1024, 2048, 4096]
+    description: "chunked-prefill: many tokens per sequence enter each MoE layer at once"
+    chunk_size: "the tokens/rank point IS the prefill chunk size entering the MoE layer"
+    tokens_entering_moe: "chunk_size * ep_size tokens enter one MoE layer at once"
+    request_mixture: "a single chunked-prefill chunk (no request-mix modelled yet)"
+    chunked_prefill_behavior: "one chunk per measured point"
+    represents: "one MoE layer · one prefill chunk · one dispatch+combine collective"
diff --git a/experimental/CollectiveX/docs/gated.md b/experimental/CollectiveX/docs/gated.md
new file mode 100644
index 000000000..5f69783b9
--- /dev/null
+++ b/experimental/CollectiveX/docs/gated.md
@@ -0,0 +1,332 @@
+# CollectiveX — gated items: implemented-where-possible, honest blockers otherwise
+
+This records goal.md items that are **not** completable as real GHA results on the available
+NVIDIA fleet today, with the *specific* blocker for each (empirically established, not assumed),
+plus what WAS done toward each. Scope: NVIDIA chips (H100, H200, B300; GB300 capacity-limited).
+
+The container all NVIDIA results run in is `lmsysorg/sglang:v0.5.11-cu130` (CUDA 13.0, NCCL 2.28.9,
+torch 2.11; pre-installed: deep_ep 1.2.1, flashinfer 0.6.8, nixl 1.0.1, nvshmem 3.4.5). Established
+by an in-container probe on the H200 cluster.
+
+## EP backends
+
+### NVIDIA NCCL EP — NOT represented by DeepEP V2; needs its own adapter
+Upstream `NVIDIA/nccl` now has a real `contrib/nccl_ep` implementation. It is an NCCL API extension for
+MoE dispatch/combine built on NCCL Device API LSA/GIN, and should be treated as its own backend surface,
+not as a synonym for DeepEP V2.
+
+CollectiveX currently keeps these surfaces separate:
+- **DeepEP V2**: `backend=deepep`, `shape.kernel_gen=v2`, `deepep_version=2.0.0+...`; this is DeepEP's
+  ElasticBuffer/dispatch/combine implementation using the NCCL Gin backend.
+- **`nccl-ep` baseline in this harness**: a portable token-shuffle implementation using
+  `torch.distributed.all_to_all_single` over NCCL/RCCL. This is useful as a host-orchestrated baseline,
+  especially cross-node, but it is **not** upstream `contrib/nccl_ep`.
+- **Upstream NCCL EP**: still needs a dedicated adapter/provenance label before CollectiveX can claim
+  native NCCL EP results. When wired, it must not overwrite either DeepEP V2 or the current
+  all-to-all baseline identity.
+
+So the correct comparison is not "NCCL EP = DeepEP V2". DeepEP V2 remains a relevant NCCL-Gin-backed
+comparison point, but native NCCL EP needs its own line in the backend/version matrix.
+
+### UCCL EP — DONE via vendored deep_ep_wrapper (was deferred; the bootstrap is now wired)
+`pip install uccl` (prebuilt cp312 wheel) + a cu12 CUDA runtime on `LD_LIBRARY_PATH` (the wheel is
+cu12 on a cu13 image) **builds and imports** — the C++ runtime `uccl.ep` loads (pkg-0.1.1), confirmed
+on H100 via GHA. BUT the DeepEP-compatible surface is **not** the low-level `uccl.ep.Buffer`: that
+constructor is `Buffer(rank, num_ranks, num_nvl_bytes, num_rdma_bytes, low_latency_mode, …)` — it does
+NOT take a torch ProcessGroup, and a no-bootstrap construction raises `TypeError: incompatible
+function arguments`. The DeepEP-identical `Buffer(group, …)` lives in UCCL's separate ~1900-line
+`deep_ep_wrapper` package (packaged AS `deep_ep`, so it collides with the container's real DeepEP).
+That wrapper's `__init__` runs a non-trivial bootstrap — `get_local_ipc_handle` / `get_local_device_id`
+exchanged via `dist.all_gather_object`, `runtime.sync(...)`, CPU `UcclProxy` setup
+(`get_cpu_proxies_meta`), and `connect_atomic_buffer` — entangled with UCCL's bench harness `init_dist`.
+The wrapper is cleanly vendorable (relative imports + only depends on `uccl.ep`), and that is now
+DONE: `cx_build_uccl` git-clones `uccl-project/uccl` at the wheel-matched tag and vendors
+`deep_ep_wrapper` under the non-colliding name `uccl_deepep`; `ep_uccl.py` imports its
+`Buffer(group, …)` and runs genuine UCCL dispatch/combine. **Validated: `correct=True`,
+`uccl_version=0.1.1`, intranode NVLink on h100/h200/b300/b200** (normal bf16+fp8 + LL). If the wrapper
+is ever absent the import falls back to the low-level `uccl.ep.Buffer`, which fails loudly (preserved
+failed-case) — never faked. Fresh full-sweep re-validation (post idempotent-build fix, which cured the
+old per-case-rebuild SIGABRT/timeout): **h200 = 426/426 correct incl LL-mode 32/32** (run 28535235520);
+**h100 = 394/394 correct in NORMAL mode** (run 28535226475) **but all 4 LL-mode cases HANG (rc=124, 900s
+timeout — 0/32)**. Since the identical UCCL LL code is 32/32 on h200 (same Hopper arch, same wheel), the
+h100 LL hang is an **h100-dgxc cluster limitation** (LL uses IBGDA-style low-latency proxies; the
+h100-dgxc fabric deadlocks them — consistent with the documented h100-dgxc cross-node IB wall below),
+NOT an arch or UCCL-code wall. Both SKUs also fail ONLY the `empty-rank` diagnostic (see empty-rank note
+below). Remaining gap: aarch64 GB200/GB300 (the from-source/proxy bootstrap doesn't come up — see the
+aarch64 wall below); uccl is x86-single-node so far.
+
+### NIXL — transfer DONE (container switch); device-EP blocked on UCX GPU Device API
+Two distinct things. **(1) NIXL host RDMA transfer** (`nixl_agent.register_memory / get_xfer_descs /
+initialize_xfer / transfer`) — the fabric dynamo uses for KV movement — is **WIRED + valid**
+(`tests/nixl_transfer.py`, `CX_BENCH=nixl`). It needed a **container switch** (the sglang multiarch
+image has no NIXL build deps): `cx_default_image` selects
+`nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13` for `CX_BENCH=nixl`. B300 run 28314858649:
+NIXL 0.10.1, UCX backend, 2 in-process agents — dtod-local **94 GB/s**, dtod-remote **24 GB/s** (dtoh/htod
+hit a NIC dmabuf `ibv_reg_mr Bad address` limit; GPU↔GPU is the KV-handoff path that matters).
+
+**(2) NIXL device-EP** (`examples/device/ep`, a DeepEP fork) — the from-source **meson** build. The
+container switch was the directive's exact ask ("switch containers and see if it fixes"), and it
+**CLEARED the documented Abseil 20220623 blocker**: the dynamo image ships **Abseil 20250814** (meson
+subproject) + meson/ninja/pybind11 3.0.2/cmake, and `meson setup` now SUCCEEDS (build-probe
+`cx_probe_nixl_ep`, run 28314858649 log). The next blocker is `UCX GPU Device API: NO` (the device-EP
+needs UCX's device-initiated GPU put/get API via `<ucp/api/device/ucp_device_impl.h>`). **Build attempt
+made:** `cx_probe_nixl_ep` now BUILDS UCX from source with `--with-cuda` and points pkg-config at it —
+but `meson setup` STILL reports `UCX GPU Device API : NO` (run 28320702204). So it is NOT a missing
+build flag: UCX's device API compiles in only with GPUDirect-Async / device-initiated-comm **driver +
+hardware** support (IBGDA/GDAKI), a base-platform capability absent here — not a container/build fix.
+`nixl_ep_cpp` therefore does not build; the adapter (mirroring `ep_deepep.py`) waits on a platform with
+that device-comm support. Evidenced terminal wall.
+
+### FlashInfer EP / TensorRT-LLM NVLink one-sided AllToAll — DONE on H100 + B300 (H200 runner gated)
+`flashinfer.comm.MoeAlltoAll` (which LIVES IN `flashinfer.comm.trtllm_moe_alltoall` — it IS the
+TRT-LLM "throughput backend" one-sided all-to-all, calling the same `moe_a2a_dispatch`/`moe_a2a_combine`
+kernels) builds its MNNVL symmetric workspace over the torch.distributed NCCL group via FlashInfer's
+`TorchDistBackend` (no MPI/mpi4py). The cross-rank symmetric buffer uses
+`CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR` → `pidfd_getfd` → **CAP_SYS_PTRACE** on x86_64. Empirically:
+- **H100 (`h100-dgxc`) + B300 (`b300`):** their enroot/pyxis runner containers **grant** the cap →
+  FlashInfer EP runs and is **official** (bf16 + the quant dispatch matrix below), decode + prefill.
+  This is the TRT-LLM NVLink one-sided AllToAll EP — the existing FlashInfer EP results ARE that path
+  (provenance `backend_lineage = flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll`).
+  - **H100 intermittent crash (open):** the MoeAlltoAll **construction** succeeds (cap granted), but
+    ~half of h100 flashinfer cases hit `torch.AcceleratorError: CUDA error: unspecified launch failure`
+    during dispatch/combine execution (run 28500524185: 21/38 cases; scattered across T/routing, the SAME
+    config both crashes AND passes → a genuine intermittent, NOT config/pidfd). NOT a per-case IPC reclaim
+    race either: a between-case `/dev/shm` drop + settle was tested (run 28522872429) and made it WORSE
+    (in-flight IPC corruption, 21→27 fails). So it's flashinfer MoE-kernel flakiness on Hopper — needs
+    compute-sanitizer on a live run to root-cause. Mitigations shipped: (1) each flashinfer case is
+    RETRIED up to `CX_FLASHINFER_RETRIES` (default 3) times in the shard loop, dropping the intermediate
+    failed-case record on a retry-success so the shard isn't polluted; (2) flashinfer is sweep-chunked
+    (`SLOW_MAX_CASES=12`, smaller than others so the retry budget stays within `--time`), bounded +
+    PARALLEL so a crash can't take a large shard down. **Retry MEASURED (run 28534841204, retry engaged
+    — 17 retries in the p3 shard alone): coverage 30/46 configs, 173/173 correct — up from the ~19-24
+    baseline but NOT the ~94% a clean-independent-50% model predicts.** The deadlock is severe (1470
+    completion-flag-timeout events in that run) and, crucially, CORRELATED within a container: once the
+    MNNVL barrier state degrades, retries in the same allocation keep timing out, so retry has
+    diminishing returns (one whole chunk, p1, passed cleanly while p0/p2/p3 degraded). Fuller coverage
+    would need a fresh container per retry (re-import cost) or much smaller chunks (more GHA jobs) — both
+    rejected for marginal gain; the real fix is live compute-sanitizer root-cause. Upgrade to 0.6.14 was
+    also tested (run 28530579787) and did NOT fix it (it was a vLLM-side fix), so bundled wheel + retry
+    is the shipped path. B300 + GB300 flashinfer are 100% clean (Blackwell), confirming Hopper-kernel.
+- **H200 (`h200-dgxc`) runner:** its container **denies** CAP_SYS_PTRACE, so `pidfd_getfd` fails at
+  MoeAlltoAll **construction** on every rank (`pidfd_getfd(...) errno 1: Operation not permitted`,
+  deterministic — NOT the h100 intermittent, so retry cannot help). This is a per-runner environment
+  limitation, NOT a code/hardware gap — the identical adapter is official on H100+B300. Not
+  harness-fixable: our launchers pass no `--container-cap-add`/cap flags (caps are the cluster's enroot
+  default — h100-dgxc grants it, h200-dgxc doesn't), enroot runs unprivileged so the cap isn't grantable
+  per-job, and `MoeAlltoAll` has **no non-MNNVL transport** to route around it (it IS the MNNVL one-sided
+  A2A). Documented rather than forcing a security-sensitive `--cap-add SYS_PTRACE` on that shared runner.
+- **aarch64 (GB200/GB300):** would use `CU_MEM_HANDLE_TYPE_FABRIC` (no pidfd); GB300 capacity-limited.
+
+## Precision matrix
+
+### MXFP8 / NVFP4 dispatch — DONE on FlashInfer EP; MXFP4 dispatch — gated (tile-padded SF)
+DeepEP (V1/V2) dispatch accepts **e4m3 fp8 only**. But FlashInfer's A2A is a **dtype-agnostic byte
+mover** taking `input_payloads` as a LIST, so a quantized dispatch moves `[q, scale_factor]` and
+dequants in `stage()` (UNTIMED preprocessing, cached so the roundtrip measures comm). Using FlashInfer's
+own quantize/dequantize kernels, `ep_flashinfer.py` now does **MXFP8** (`mxfp8_quantize`, e4m3 + e8m0
+block-32 — device dequant verified == `mxfp8_dequantize_host`) and **NVFP4** (`fp4_quantize` +
+`e2m1_and_ufp8sf_scale_to_float`, e2m1 + e4m3 block-16) dispatch, plus the three e4m3 fp8 scale-layouts.
+Coverage by arch (all `correct=True` end-to-end):
+- **e4m3 fp8 (×3) + mxfp8:** H100 **and** B300 (e4m3/e8m0 are Hopper-supported).
+- **nvfp4:** **B300 (Blackwell) only.** FP4 (e2m1) is a Blackwell-native tensor format; FlashInfer's
+  fp4 quantize/dequantize does NOT round-trip on Hopper sm90 (validated: nvfp4 `correct=True` on B300,
+  `correct=False` on H100). `capability.resolve` now gates nvfp4 to Blackwell (`ARCH_ONLY_DTYPES`), so a
+  Hopper nvfp4 dispatch is cleanly rejected rather than run-and-marked-invalid.
+- **MXFP4 dispatch — gated:** FlashInfer's `mxfp4_quantize` emits its scale factor in a **tile-padded
+  `[pad(T,128), H/32]` swizzled layout** with no `is_sf_swizzled_layout=False` option — it does NOT
+  factor as a per-token `[T, k]` tensor, so it can't be moved through the per-token A2A. (mxfp8 + nvfp4
+  both expose a linear per-token SF; mxfp4 alone does not.) The 4-bit MX format is covered in spirit by
+  nvfp4 (also 4-bit e2m1); mxfp4 specifically stays gated on the quantizer's SF layout.
+
+### Quantized combine OUTPUT (MXFP8 / NVFP4 combine) — DONE on B300 via flashinfer-main (container switch)
+Distinct from quantized *dispatch*: a quantized **combine** emits a non-bf16 reduced output. The bundled
+`flashinfer 0.6.8.post1` `moe_a2a_combine` had **no `output_dtype`**, and neither did 0.6.13 (latest
+PyPI) nor the cu130 nightly wheel (0.6.13.dev20260612) — `output_dtype`/`output_scales` landed on
+flashinfer **main** after those. So `cx_build_flashinfer_latest` BUILDS flashinfer main from source
+in-container (after a 7-layer version-coupling peel: cubin↔python↔jit-cache version checks, then
+`nvidia-cutlass-dsl` 4.5.2 for the CuTe `OperandMajorMode`, then **uninstalling** the stale precompiled
+cubin/jit-cache so `get_moe_alltoall_module()` JIT-compiles the 14-arg kernel fresh from main's csrc).
+- **MXFP8 combine — DONE on B300:** `combine(output_dtype=float8_e4m3fn, output_scales=uint8[T,H/32])` =
+  e4m3 + UE8M0 block-32 (the source-spec'd layout); dequant `e4m3 * 2^(e8m0-127)`. Valid, `correct=True`
+  ×8 (`backend_provenance.combine_quant=True`, `flashinfer_stack` captured). FP32-accum is the kernel's
+  internal reduce; scale-transport (e8m0) + tolerance-class (1.6e-1 vs bf16 5e-2) are exercised.
+- **NVFP4 combine — DONE on B300:** `output_dtype=uint8 (packed e2m1) + e4m3 vec-16 scales +
+  output_scalar_scale`; dequant via `e2m1_and_ufp8sf_scale_to_float` (the e4m3 scales viewed as uint8
+  ufp8). Valid, `correct=True` ×8 (Blackwell-native fp4, like nvfp4 dispatch).
+- **H100 combine — build-time-limited (NOT arch):** the ~70-min in-container flashinfer-main source
+  build exceeds the H100 runner's job budget (SIGTERM). B300's longer budget lets it land. A pre-staged
+  flashinfer-main wheel (one-time build) would remove the per-run rebuild; deferred.
+- **Direct-cast FP8 combine — kernel limit (evidenced, B300 run 28315037266):** ATTEMPTED via
+  `CX_QC_SCALE=scalar` (`output_dtype=float8_e4m3fn` + `output_scalar_scale`, NO per-block
+  `output_scales`). The kernel ASSERTS `Check failed: (output.dtype()==payload.dtype()) is false:
+  output_dtype without output_scales must match payload dtype` — i.e. an fp8 output REQUIRES per-block
+  `output_scales`; a scalar-only/unscaled direct-cast fp8 combine is **not a supported moe_a2a_combine
+  mode**. The SCALED mxfp8/nvfp4 outputs are the only fp8/fp4 combine paths. (Also confirmed the nightly
+  `flashinfer 0.6.13` wheel now carries `output_dtype` — the ~70-min main-source build is no longer
+  needed for combine-quant.) MoRI fp8_blockwise combine (AMD, PR311) remains a separate AMD path.
+
+## Topology and rack-scale
+
+### NVL72 rack-scale EP — DONE up to EP64 via FlashInfer-MNNVL; cross-node-over-IB DONE via nccl-ep
+**Within an NVL72 NVLink domain, EP8/16/32/64 are DONE.** The key: DeepEP's NVLink `Buffer(group,nvl,0)`
+is intranode-only (≤8 ranks, incl. MNNVL trays → GB300/GB200 EP8 over 2 trays via deepep), BUT
+**FlashInfer's MoeAlltoAll MNNVL symmetric workspace SPANS the whole NVL72 NVLink domain** — so
+`benchmark=flashinfer nodes=4/8/16` runs EP16/32/64 across 4/8/16 trays. Validated correct=True:
+GB300 EP8 (28319504164) + EP16 (28319809968); GB200 EP8 (28319793439, after porting the GB300 EP
+multi-srun path into launch_gb200-nv.sh — was nccl-only) + EP16 (28319971335) + EP64 (28319975631,
+ep_size=64/world=64). EP32 (both SKUs) re-dispatched after a workflow concurrency-group collision
+(the group omitted inputs.nodes — fixed). Bounded only by NVL72 tray CAPACITY, not the method.
+- **Cross-node over InfiniBand (H200 DONE via nccl-ep; H100 cluster WALLED).** Two layers had to fall:
+  (1) **Rendezvous:** torch's `env://` TCPStore *and* torchrun's elastic-agent store advertise the
+  rank-0 management-subnet NodeAddr, which is NOT reachable from a peer rank's enroot container net
+  namespace (900s connect timeout; runs 28325250919 / 28326334616). Solved with a shared-mount
+  **FileStore** (`CX_RDZV_FILE`) + a **local NGPUS-process spawn** (no torchrun elastic agent) — the PG
+  bootstraps through the shared file and NCCL then connects peers over IB. (2) **Data path:** the custom
+  one-sided RDMA backends do NOT survive cross-node — UCCL's `ibv_reg_mr` fails EINVAL → `free():
+  corrupted unsorted chunks` → SIGSEGV (run 28326528672, *after* the rendezvous now forms), DeepEP
+  normal-internode asserts out — because they need GPUDirect-RDMA peer-memory registration the cluster's
+  IB HCAs / container don't expose. The portable fix is a transport that host-stages gracefully:
+  **nccl-ep** (`tests/ep_nccl.py`), the NCCL `all_to_all_single` token-shuffle EP baseline. H200
+  nodes=2 / **world=16 over IB**, run 28327088942: **correct=True at every T(1→128)**, disp_p50
+  547–808µs, status=comparable-experimental (single-node world=8 validated first, run 28327013318).
+  (IBGDA/internode-DeepEP would be a faster one-sided path but needs the driver capability — gated;
+  nccl-ep is the validated, portable cross-node EP.)
+  **H100 cross-node — WALLED (correcting an earlier "same path covers H100" overclaim).** The h100
+  launcher gained the same `CX_NODES>1` FileStore-rendezvous block (ported from h200; committed), and the
+  2-node allocation + per-node container DO come up (run 28446105759: nodes hpc-gpu-1-0/1). But the
+  nccl-ep run reproducibly HANGS to the 900s timeout on BOTH decode and prefill, with no captured evidence
+  (the `timeout -k` kill pre-empts stderr) — the gloo+NCCL FileStore bringup that auto-detects the right
+  interface on the h200 fabric does not converge on the hpc-gpu-1 cluster (different inter-node
+  networking; no SSH to introspect the correct `GLOO/NCCL_SOCKET_IFNAME`). Not a systematic-matrix data
+  point either: `sweep_matrix` places h100 at `nodes=''` (single-node) only — cross-node ws16 was a
+  separate goal-182 demo. So h100 single-node EP (all backends @ ws8) is complete; cross-node ws16 stays a
+  cluster-bringup wall pending interface-level access to that cluster.
+- **Cross-node MI355X (goal 183, "if available") — via nccl-ep on RCCL.** MoRI's RDMA registration also
+  aborts cross-node (SIGABRT, run 28325251742, *after* the rendezvous master is correctly resolved) —
+  the AMD analogue of UCCL's GPUDirect-RDMA wall. nccl-ep runs on RCCL (identical `all_to_all_single`
+  API) over a 2-node MI355X allocation with the same FileStore rendezvous (the MI355X multi-srun gained
+  `CX_RDZV_FILE`; nccl-ep uses a pure rccl PG, sidestepping the gloo `connectFullMesh` 127.0.1.1 alias
+  too — and `nccl-ep` had to be added to the MI355X launcher's AMD-bench allowlist, else it silently
+  fell back to MoRI). **DONE:** MI355X nodes=2 / **world=16 over RoCE/IB**, run 28328718973,
+  **correct=True** T=1→8, disp_p50 345–431µs, status=comparable-experimental.
+- **DeepEP-hybrid on gb300 WORKS at EP4 AND EP8 (corrected twice); only UCCL aarch64 remains a wall.**
+  Per-backend re-validation (informed by upstream docs: NVIDIA HybridEP = the Megatron
+  `moe_flex_dispatcher_backend="hybridep"`, TMA-NVLink + IBGDA, **built for NVL72 rack-scale GB200/GB300**)
+  overturned the earlier blanket "uccl + deepep-hybrid fail at EP4 and EP8 on Grace-Blackwell" claim:
+  - **DeepEP-hybrid gb300 EP4 (single-tray) — WORKS.** EP4 sweep (run 28452161275): 30 valid docs,
+    **169/169 correct**, `max_rel_error=0.0`, `branch=hybrid-ep`.
+  - **DeepEP-hybrid gb300 EP8 (2-tray, MNNVL) — WORKS.** Run 28480519588: decode **8/8** + prefill **6/6**,
+    `ws=8 nodes=2 transport=mnnvl`, full T-ladder 128→4096 all `correct=True` (RT p50 374µs@T128 →
+    1404µs@T4096). NOT intranode-only (an earlier wrong claim): the only blocker was build PERSISTENCE —
+    `cx_build_deepep_hybrid` did `build_ext --inplace` under `/tmp/DeepEP_hybrid` + PYTHONPATH, but `/tmp`
+    does NOT survive across the EP8 multi-srun's separate srun steps (only the pyxis container rootfs does),
+    so the case-srun saw the bundled mainline `deep_ep` → `no attribute HybridEPBuffer`. Fixed by installing
+    into site-packages (`pip install`, persists — mirrors deepep-v2), build_ext fallback for EP4.
+  - **DeepEP-hybrid h100 + h200 (Hopper, EP8 single-node) — WORKS, 212/212 correct each** (runs
+    28535221873 / 28535231056, post idempotent-build fix): 43/44 cases valid across the `none` +
+    `linear` uneven-token distributions, decode+prefill ladders T=8→4096, all `correct=True`. The ONE
+    failing case (c043) is the `empty-rank` diagnostic (`ep-uneven-tokens-v1`, `required_publication:
+    diagnostic` — one rank gets ZERO tokens): HybridEP's `set_intra_node_buffers` → `hybrid_ep.cu:81
+    cudaDeviceSynchronize` raises `cudaErrorIllegalAddress` on Hopper (identical index c043 on BOTH
+    SKUs = deterministic-by-config, NOT the flashinfer intermittent nor accumulation). Not
+    retried/chunked: deterministic kernel limit, and the backend already has 212 correct points/SKU.
+  - **`empty-rank` is a CROSS-BACKEND Hopper diagnostic differentiator (not HybridEP-only).** The same
+    zero-token-rank case ALSO crashes **UCCL** on Hopper (h100 c073 rc=1, h200 c073) — so of the Hopper
+    EP backends, deepep-hybrid + uccl fail it while **mainline DeepEP HANDLES it** (verified control:
+    h100 mainline deepep empty-rank case c073 = valid doc, **3/3 correct**, zero failed records in the
+    shard). So the empty-rank diagnostic cleanly separates zero-token-rank-robust (mainline DeepEP) from
+    non-robust (HybridEP, UCCL) EP kernels. It's `required_publication: diagnostic`, one case per
+    backend, and flips those backends' GHA jobs to "failure" despite full data — judge by the failed-case
+    record + the 200+ correct points, not the job conclusion. Untested on Blackwell (b300/gb300 hybrid +
+    uccl suites are `uneven_tokens=none` only, so no Blackwell control exists for empty-rank).
+  - **UCCL aarch64 (gb300) — WALL (confirmed fresh, the one genuine aarch64 EP wall).** Run 28457032490:
+    `ModuleNotFoundError: No module named 'uccl.ep'` — the uccl EP extension does not import on aarch64
+    Grace-Blackwell (consistent with UCCL-EP docs: NVIDIA/AMD + EFA/IB/Broadcom, no aarch64/Grace). EP4+EP8.
+  LESSON: a failing run is not proof of a capability wall — both deepep-hybrid claims were wrong; the EP8
+  one was a build-env bug, not a hardware limit. Always check the library's actual support before walling.
+  Both backends work on x86 single-node (uccl b300=126/b200=124; deepep-hybrid h100=212/h200=212/b300=36,
+  43/44 cases on Hopper — only the empty-rank diagnostic crashes, see above). deepep
+  (bundled V1), deepep-v2 (from-source), flashinfer, nccl-ep, AND deepep-hybrid (EP4 + EP8) all run on
+  gb300, so the only unfillable gb300 cells are uccl (any EP).
+- **DeepEP V2 (from-source `kernel_gen=v2`): DONE on x86 + aarch64, EP4 AND rack EP8.** Genuine V2
+  (`deepep_version=2.0.0+af9a040`) builds on h100/h200/b300/b200 AND on aarch64 Grace-Blackwell — gb300
+  EP4 (run 28429220764) produced `kernel_gen=v2`/`2.0.0`, log "built deep_ep 2.0.0 … V2 ready". So aarch64
+  V2 is NOT a wall: wherever the EP4/single-node path runs (it calls `cx_build_deepep_v2` once in
+  `run_in_container`), V2 builds and runs. **Rack EP8 (gb200/gb300, 2 trays) — now DONE too**, after two
+  fixes (the earlier "deferred" note anticipated only the first): (1) the EP8 multi-srun launcher ran
+  `run_ep.py` over 8 ephemeral per-rank containers, BYPASSING `cx_build_deepep_v2` (so `deepep_v2=true`
+  silently ran bundled V1 and the doc `kernel_gen` was honestly `v1`). Fixed with `CX_BUILD_ONLY` +
+  a setup-srun that builds V2 ONCE PER NODE into a persistent `--container-name` every case-srun reuses.
+  (2) With V2 actually installed, EP8 then crashed `cudaErrorIllegalAddress` at `csrc/legacy/buffer.hpp`
+  across trays — NOT a hardware wall (bundled V1 runs 180 correct cross-tray EP8 docs, `ws8/nodes2/mnnvl`).
+  Upstream V2's `Buffer` ADDED `allow_mnnvl` (default **False**); when off, DeepEP itself sets
+  `NVSHMEM_DISABLE_MNNVL=1` and the legacy buffer falls onto the intranode-only CUDA-IPC peer path, which
+  faults across NVL72 trays. `tests/ep_deepep.py` now passes `allow_mnnvl=True` on both Buffer ctors when
+  `CX_ALLOW_MNNVL=1` (gated on `inspect` finding the param, so bundled-V1 + x86 single-node are unchanged);
+  the gb300 launcher exports it for the deepep EP8 case. **Validated:** gb300 EP8 run 28434764062 →
+  `kernel_gen=v2 / ws8 / nodes2 / transport=mnnvl / allow_mnnvl=True / mode=normal / correct=8/8`, roundtrip
+  p50 158→227µs (T=8→1024). `sweep_matrix` re-enables v2 at gb200/gb300 EP8. (gb200 launcher inherits the
+  same build-once + `CX_ALLOW_MNNVL` fix; pending a gb200 allocation to re-confirm.)
+
+## Other inference collectives (NVIDIA scope)
+
+- **All-reduce / all-gather (standardized NCCL):** DONE — real `family=nccl` results on H100/H200/B300,
+  rendered in the All-reduce/All-gather tabs.
+- **CPU↔GPU offload, copy-engine/SDMA, KV-cache transfer:** DONE — single-process memcpy-family benches
+  (`tests/offload_bench.py`, `copy_engine_bench.py`, `kv_cache_transfer.py`).
+- **Framework all-reduce — FlashInfer one-shot/two-shot DONE:** `allreduce_fw_bench.py` wires the real
+  `trtllm_allreduce_fusion` (pattern `kAllReduce`, `use_oneshot` True/False) over the TRT-LLM IPC
+  workspace — nccl baseline + flashinfer-oneshot + flashinfer-twoshot, all `correct=True` (one-shot
+  beats the NCCL ring in the small-message latency regime). **SGLang/vLLM/AITER custom-AR — now DONE**
+  by REPLICATING the framework's serving distributed-init (init_distributed_environment +
+  initialize_model_parallel) on the torchrun group and using the TP GroupCoordinator's
+  ca_comm.custom_all_reduce (the wrapper builds ca_comm only inside that init — a bare ctor skips it):
+  sglang H200 175 GB/s correct=True (run 28320404895); AITER MI355X 367.8 GB/s correct=True (run
+  28320579741, aiter.dist.parallel_state, ca_comm under device_communicator); vLLM via the
+  allreduce-fw-vllm CONTAINER SWITCH to vllm/vllm-openai + entering set_current_vllm_config(VllmConfig())
+  (its CustomAllreduce is a CustomOp asserting an active config), H200 correct=True (run 28320699661).
+  RL mesh-to-mesh + all-gather DP-attention→TP-MoE shapes: covered by the standardized sweeps.
+- **KV-cache backends:** raw memcpy + CPU-pinned WIRED; **NIXL WIRED** (`tests/nixl_transfer.py`, B300
+  via the dynamo-container switch — see the NIXL section above); **MoRI-IO WIRED**
+  (`tests/mori_io_transfer.py`, MI355X, `mori.io` IOEngine RDMA p2p). **MoonCake WIRED on NVIDIA**
+  (`tests/mooncake_transfer.py`, run_mooncake_suite pip-installs the engine; B300 35.4 GB/s via
+  `transfer_write_on_cuda`). **MoonCake on MI355X = ROCm wall (evidenced):** the engine initializes on
+  ROCm (`MOONCAKE_INIT … on rdma device rdma0`) but the pip wheel exposes NO `transfer_write_on_hip`
+  method (only the CUDA one) — `0 groups, status=invalid`, run 28342781762. A HIP transfer path would
+  need an upstream Mooncake ROCm build, not a container/flag fix.
+
+- **MI355X primitives (rccl-tests) tab:** the All-reduce/All-gather tabs render `family=nccl`; the AMD
+  equivalent is `rccl` (`CX_BENCH=nccl` → rccl-tests on the MI355X launcher). Repeated dispatches
+  (28340951946, 28342780904) failed in the runner *checkout/setup* step (exit 2/3, `EACCES` on a shared
+  `LOGS/agentic` dir + missing workspace) — the MI355X GHA runners are shared with the agentic
+  benchmark fleet, so the CollectiveX checkout collides intermittently. This is a runner-contention
+  infra flake, NOT an rccl-tests limitation; it lands when it gets a clean runner.
+
+## AMD / MI355X items — now ATTEMPTED via GHA (no longer "out of scope")
+The directive's container-switch + AMD-lift asks. All run via GHA on the MI355X MoRI image:
+- **FNUZ fp8 dispatch (MoRI) — VALIDATED (e4m3fnuz):** `dispatch_dtype=fp8` on the mori backend routes
+  MoRI's `quant_type=fp8_direct_cast` — the ROCm-native e4m3fnuz format (the self-introspecting adapter
+  found the valid set is `['none','fp8_direct_cast']`; the guessed `fp8_blockwise` is rejected by this
+  build). Required `use_external_inp_buf=True` (Fp8DirectCast asserts in zero-copy mode) + gating against
+  the e4m3fnuz consistency reference. MI355X run 28318788729: T=2/4/8 `correct=True`, max_rel **3e-4**,
+  disp_p99 ~45-70µs. The run's status=invalid is solely MoRI's forced-T=1 ramp point (a single-token
+  relErr-metric instability, rank-0 max_rel=3e-4 — not a comm error). Full 5-run resolution chain (each
+  peeling one layer via the GHA log alone — no SSH) in notes.md.
+- **AMD SDMA copy path:** `copy_engine_bench.py` no longer refuses on ROCm — the off-SM DMA path IS the
+  SDMA engine; labeled `copy_engine_kind=sdma` / `accelerator=rocm` (vs NVIDIA `copy-engine`). The
+  non-interference probe characterizes SDMA-vs-CU interference (pynvml absent → graceful fallback).
+- **MoRI-IO KV backend:** `tests/mori_io_transfer.py` (above).
+- **MI355X cross-node EP (goal 183):** the custom-RDMA MoRI path aborts cross-node (SIGABRT,
+  GPUDirect-RDMA wall) — same class as UCCL on NVIDIA — so cross-node MI355X EP runs via **nccl-ep on RCCL**
+  (NCCL/RCCL `all_to_all_single`, host-staged over IB) with the shared-mount FileStore rendezvous. See
+  the rack-scale section above; single-node MI355X EP is covered by the MoRI sweep.
+
+## Operational note — do not delete ALL runs of a non-`main` workflow
+`collectivex-experimental.yml` lives ONLY on the `collectivex` branch (unlike `collectivex-sweep.yml`,
+which is also on `main`). GitHub keeps a workflow in the Actions registry only if it is on the default
+branch OR has at least one run. Deleting EVERY run of `collectivex-experimental.yml` therefore
+DE-REGISTERS it — `gh workflow run collectivex-experimental.yml --ref collectivex` then fails with
+"workflow not found on the default branch," and a caller that greps `gh`'s stdout for `github.com` will
+even misread the failed dispatch as success (the 404 URL matches). Re-register by pushing any change under
+`experimental/CollectiveX/**` (the `on: push` trigger creates a run). Robust fix: also add the workflow
+to `main` (as the sweep already is), so run-deletion can never de-register it.
diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md
new file mode 100644
index 000000000..41a246991
--- /dev/null
+++ b/experimental/CollectiveX/docs/methodology.md
@@ -0,0 +1,384 @@
+# CollectiveX EP benchmark — methodology mapping
+
+> Status: experimental (goal P2, "Methodology/reference docs"). This document explains
+> what the CollectiveX EP dispatch/combine harness reused from upstream test code, what it
+> deliberately changed, and the exact contracts a result must satisfy to be published. It is
+> grounded in the code as it stands: `tests/ep_harness.py`, `tests/ep_deepep.py`,
+> `tests/ep_mori.py`, `tests/reference_ep.py`, `tests/run_ep.py`, `validate_results.py`, and
+> `schemas/ep-result-v4.schema.json`. Where a claim cannot be verified from the repo it is
+> flagged inline rather than asserted.
+
+The shared design constraint behind everything below is the *fair-comparison contract* stated at
+the top of `ep_harness.py`: a single deterministic routing trace is generated once from a fixed
+seed over the **global** batch and is identical on every SKU; each rank materializes only its
+slice (`routing.rank_slice` / the `my_off:my_off+my_cnt` slice in `run_sweep`). Adapters never
+roll their own RNG. So "what was reused vs changed" always means: *reused the library's API call,
+changed the workload and the timing boundary so every backend runs the same problem under a named,
+machine-checkable measurement contract.*
+
+---
+
+## DeepEP tests/legacy: what was reused
+
+The DeepEP adapter (`tests/ep_deepep.py`) reuses DeepEP's **documented normal-mode and
+low-latency Python API directly**, the same surface its own intranode/internode test code drives:
+
+- **The buffer + dispatch/combine call sequence.** Normal mode constructs a single
+  `deep_ep.Buffer(group, num_nvl_bytes, 0)`, calls `buffer.get_dispatch_layout(topk_idx, experts)`,
+  then `buffer.dispatch(...)` and `buffer.combine(...)`. Low-latency mode uses
+  `Buffer(..., low_latency_mode=True, num_qps_per_rank=…)`, `low_latency_dispatch`, and
+  `low_latency_combine`. These are DeepEP's own entrypoints, not reimplementations.
+- **The correctness identity from DeepEP's intranode test.** A pure dispatch→combine round trip
+  with *no expert compute* reconstructs `x` scaled by the number of destination ranks each token
+  was sent to. The adapter's `expected()` encodes exactly this: `ref * ranks_per_token`, where
+  `ranks_per_token = is_token_in_rank.sum(dim=1)` (see the module docstring and `expected()`).
+  This is the same invariant DeepEP's `test_intranode` relies on.
+- **DeepEP's own comm-only timing boundary** is preserved as one of the offered contracts:
+  `cached-layout-comm-only-v1` hoists `get_dispatch_layout` out of the timed region (computed once
+  in `make_problem`, stored on `p.layout`), so the timed `dispatch()` is pure communication —
+  matching the boundary DeepEP's own benchmark uses.
+- **The fp8 per-token block-128 cast convention.** `deep_ep` 1.2.x ships no helper for this (its
+  `utils` is empty), so `_per_token_cast_to_fp8` / `_per_block_dequant` implement the exact
+  convention DeepEP's kernels expect (scales `[T, H//128]` float32, e4m3, `448.0` as e4m3 max).
+  This is faithful reuse of the kernel's data contract, not a new scheme.
+- **The LL QP convention** (one QP per local expert: `num_qps = experts // world_size`) and the
+  fixed `num_max_dispatch_tokens_per_rank` decode shape follow DeepEP's LL usage.
+
+## DeepEP tests/legacy: what was changed
+
+- **Workload: synthetic per-rank uniform random routing → one deterministic global trace.**
+  DeepEP's tests generate routing per rank locally. CollectiveX generates the routing **once over
+  the global batch** from a fixed seed (`routing.build_global_routing`) and hands each rank its
+  slice via `make_problem`, so DeepEP and MoRI provably run the *same* routed problem
+  (`make_problem` does no RNG — see the docstring: "materializes the harness-provided rank slice").
+- **Workload axes DeepEP's test does not sweep.** The harness drives a tokens-per-rank ladder
+  (decode `1..128`, prefill `128..4096`), and adds routing-distribution control (`uniform`,
+  `zipf*`, `hotspot-*`, `alternating-groups`, `balanced*`), temporal snapshots (`--routing-step`),
+  uneven per-rank source-token allocation (`--uneven-tokens`), EPLB replication
+  (`tests/eplb.py`), and structured placement metadata. None of these exist in the upstream test.
+- **Timing boundary made explicit and named.** DeepEP's bench implicitly measures comm-only;
+  CollectiveX requires the adapter to *declare* `SUPPORTED_CONTRACTS` and conform to whichever the
+  run requests — `layout-and-dispatch-v1` (layout timed *inside* dispatch),
+  `cached-layout-comm-only-v1` (DeepEP's own boundary), or `runtime-visible-v1` (fp8 cast +
+  recv-dequant moved *inside* the timed window). `run_ep.py` rejects an unsupported contract
+  rather than letting the backend silently pick one.
+- **Statistics.** Instead of a single timed loop, the harness pools `iters × trials`
+  (default `200 × 3 = 600`) samples with per-trial token-order shuffling, reduces **cross-rank MAX
+  per iteration before percentiling** (`median_i(max_r)`, not `max_r(median_i)`), and reports
+  p50/p90/p95/p99 with p99 as the headline. It also adds a separately *measured* round trip
+  (dispatch→stage→combine in one timed region) distinct from the `isolated_sum` of the two medians.
+- **Correctness oracle is independent.** DeepEP's test validates DeepEP against DeepEP's own
+  expected formula; CollectiveX additionally carries a backend-free oracle (`reference_ep.py`,
+  see below) so correctness is not "backend vs itself."
+- **Resource normalization.** The adapter can be restricted to a device-SM *fraction*
+  (`set_num_sms(round(sm_fraction · device_sms))`) so DeepEP and MoRI run at a comparable comm-unit
+  budget — an axis the upstream test does not model.
+
+> Note on "DeepEP `tests/legacy`": the plan references upstream DeepEP `tests/legacy` and a
+> "DeepEP legacy test parity" item (goal P1, still open). The current adapter follows DeepEP's
+> *documented normal/LL API*; a dedicated `tests/legacy` parity adapter is not yet implemented in
+> this repo, so claims here describe the API surface reuse, not a line-for-line legacy port.
+
+---
+
+## MoRI tests/python/ops: what was reused
+
+The MoRI adapter (`tests/ep_mori.py`) follows the upstream `ROCm/mori` `tests`/`examples`
+dispatch+combine path:
+
+- **The op construction and call sequence.** It builds `mori.ops.EpDispatchCombineConfig(...)` and
+  `mori.ops.EpDispatchCombineOp(config)`, then calls `op.dispatch(x, weights, scales, indices, …)`
+  and `op.combine(...)` — MoRI's own ops, with `block_num` / `warp_per_block` launch parameters as
+  in its examples.
+- **The shmem bring-up.** It registers the torch process group as `"default"` and calls
+  `mori.shmem.shmem_torch_process_group_init("default")`, mirroring MoRI's reference test setup
+  (`cpu:gloo,cuda:nccl` group with an explicit `device_id`, set up in `run_ep.py`).
+- **The zero-copy registered-combine-input buffer path.**
+  `op.get_registered_combine_input_buffer(...)` is filled in `stage()` — the same zero-copy path
+  the upstream example uses to place "expert outputs" before combine.
+- **The combine correctness identity.** MoRI's combine sums one copy per destination **rank**, so
+  with no expert compute `combined[i] ≈ x[i] × (#unique destination ranks among the token's topk
+  experts)`. `expected()` computes exactly this (`unique_pes` per token). This is the upstream
+  example's `expected = input × #unique-destination-ranks` reused verbatim in intent.
+- **int32 expert ids / the scale-tensor shape.** MoRI expects int32 indices and a real `(T, 0)`
+  fp8 scale tensor (because `scale_dim == 0`); the adapter honors both.
+
+## MoRI tests/python/ops: what was changed
+
+- **Workload: always-uniform → the shared global trace.** The reference test routes uniformly.
+  The adapter's `make_problem` now materializes the **harness-provided** rank slice, so MoRI honors
+  the requested routing distribution and runs the identical workload to the NVIDIA SKUs (docstring:
+  "it no longer always-uniform").
+- **Heap held at 2 GiB instead of the reference's hardcoded 6 GiB.** MoRI registers the *entire*
+  symmetric heap as one RDMA MR at init. On the MI355X ionic_rdma NICs a 6 GiB MR fails
+  (`RegisterRdmaMemoryRegion … EINVAL`); 2 GiB registers. The adapter sets
+  `MORI_SHMEM_HEAP_SIZE` (default `2G`) **before** `import mori`. The reference's 6 GiB is "exactly
+  why it can't run as-is here" (CONTAINERS.md).
+- **Bounded `max_num_inp_token_per_rank` → a real `buffer_cap`.** Capped at 512 tokens/rank at
+  hidden 7168 so dispatch/combine buffers fit the 2 GiB heap. The harness clamps the ladder to this
+  cap and **reports dropped points** rather than silently truncating (`token_ladder` returns
+  `dropped`).
+- **`combine_needs_redispatch = True`.** MoRI's `combine()` resets `recv_num`, so `total_recv`
+  must be read **before** combine, and the harness re-dispatches (untimed) before *each* timed
+  combine sample (`time_us(..., pre=prep)`). DeepEP reuses its handle, so it sets this `False`.
+- **Gradual cold-start ramp.** MoRI wedges on a cold dispatch that jumps straight to a large T, so
+  `needs_gradual_ramp = True` makes the harness approach max-T via a geometric ramp from 1 and
+  *not* shuffle token order. It also opts out of the Blackwell warm-burst (`wants_warm_burst =
+  False`) because a sustained burst wedges it.
+- **Hard-exit teardown.** MoRI's post-`shmem_finalize()` teardown asserts (`CheckStatusValid` →
+  SIGABRT). The adapter's `finalize()` flushes results and `os._exit()`s past it instead of
+  returning cleanly the way DeepEP does.
+- **Contract restriction.** MoRI computes its routing layout **inside** the dispatch kernel and it
+  cannot be hoisted, so it declares only `layout-and-dispatch-v1`. This is *why* cross-vendor
+  comparisons must use `layout-and-dispatch-v1` — it is the one contract both backends can honor.
+- **Resource budget floored, not normalized down.** MoRI deadlocks at T≥32 when `block_num` is
+  reduced to the normalized target (validated: 46 wedges, 80 completes), so the adapter floors
+  `block_num` at a functional minimum and **records that the target fraction was not reached**
+  (`block_num_floored = True`, `tuned_source = "normalized-floored"`). The harness reads this and
+  marks the result resource-nonconforming → demoted to `diagnostic` (see publication contract).
+
+> Note on the exact upstream path name: CONTAINERS.md and the plan refer to `ROCm/mori`
+> `tests`/`examples` and `tests/python/ops`. The adapter reproduces that dispatch+combine path's
+> API and expected-value formula; the precise upstream file/commit is captured at runtime via
+> `MORI_COMMIT` (else the image tag) into provenance rather than pinned in this doc.
+
+---
+
+## FlashInfer PR 3000 benchmark inspiration
+
+The project plan lists, under "Reference benchmark scripts to draw from": *"flashinfer PR #3000;
+ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`."* (`plan.md`). FlashInfer PR #3000 is named
+there as **methodological inspiration for the EP dispatch/combine benchmark shape** — i.e. one of
+the reference benchmark scripts whose structure informed how CollectiveX measures a single MoE
+dispatch+combine pair — alongside the MoRI and DeepEP test code described above.
+
+**What is verifiable from this repo:** PR #3000 is cited only as a reference script in `plan.md`.
+There is no FlashInfer adapter, import, or copied benchmark code in the tree today (a "FlashInfer
+EP paths" item remains open in goal.md P1, and FlashInfer is otherwise referenced only for combine
+precision via PRs #3643 / #3376).
+
+**What this doc does not assert:** I have **not** independently verified the contents of FlashInfer
+PR #3000 (its exact title, the kernel it benchmarks, or which specific measurement choices were
+borrowed) against the FlashInfer repository — that verification is outside what the CollectiveX
+codebase contains, and the PR number is recorded here as-cited. Treat the specific influence as
+"named as inspiration in the plan," not as a line-level provenance claim. If precise attribution is
+needed, confirm against `flashinfer-ai/flashinfer` PR #3000 directly before publishing.
+
+What CollectiveX's EP methodology demonstrably shares with a good EP micro-benchmark (whatever its
+origin): dispatch and combine are timed **separately**, each point is **one MoE layer / one step /
+one dispatch+combine collective pair** (not a whole model), the token-count is the swept x-axis,
+and percentiles come from many pooled iterations rather than a single timed loop.
+
+---
+
+## Why CollectiveX timing boundaries differ
+
+DeepEP's and MoRI's own benchmarks each measure *their* natural boundary, which makes their numbers
+non-comparable: DeepEP can hoist layout computation out of the timed region; MoRI computes layout
+*inside* its kernel and cannot. If each backend simply reported "dispatch latency" under its own
+convention, a DeepEP comm-only number would be compared against a MoRI layout-and-dispatch number
+as if they measured the same thing. CollectiveX therefore makes the boundary an **explicit, named,
+machine-checked contract** (review #3 in `ep_harness.py`): adapters declare `SUPPORTED_CONTRACTS`
+and `run_ep.py` rejects an unsupported request. There are three contracts.
+
+### `layout-and-dispatch-v1` — the cross-vendor common boundary
+Dispatch timing **includes** routing-layout generation. For DeepEP, `get_dispatch_layout` runs
+*inside* the timed `dispatch()` (`p.layout is None`). For MoRI, layout is computed inside the
+kernel and **cannot** be hoisted — so this is *the only contract MoRI can honor*, and hence the one
+both vendors share. The fp8 cast/dequant stays **outside** the timed window (cast in
+`make_problem`, dequant in `stage`), modelling a producer that hands the dispatcher already-quantized
+activations. **Use this for any DeepEP-vs-MoRI comparison.**
+
+### `cached-layout-comm-only-v1` — DeepEP's own boundary (DeepEP only, normal mode)
+Layout is computed **once, untimed** (in `make_problem`, stored on `p.layout`) so the timed
+`dispatch()` is **pure communication**. This reproduces DeepEP's own benchmark boundary and is
+useful for "how fast is the comm kernel alone," but it is **not** comparable to MoRI (which can't
+hoist layout) and is unavailable in LL mode: low-latency dispatch computes layout internally, so
+there is nothing to hoist, and `run_ep.py` rejects the combination.
+
+### `runtime-visible-v1` — the serving-realistic boundary (DeepEP only today)
+Dispatch starts from **what the runtime has right after routing** and **includes everything needed
+to make expert input consumable**: the per-token block-128 **fp8 cast moves inside** the timed
+window, plus layout, comm, and the recv-side **dequant to bf16** (`_per_block_dequant` inside
+`dispatch()`, after which `stage()` no-ops). Combine starts from bf16 expert outputs and ends when
+token outputs are consumable. This answers "what does the serving path actually pay," and the
+adapter records the boundary honestly via `fp8_in_timing` (true only under this contract for fp8).
+LL is runtime-visible *by construction* (its single kernel already times cast+layout+comm), so the
+flag only changes normal mode.
+
+### Boundaries shared across all three
+- **Combine excludes staging in every contract.** Placement of expert outputs (`stage()`) is
+  untimed for every backend — it stands in for the expert FFN write, which is not part of the
+  collective being measured.
+- **`isolated_sum` is a diagnostic, not a measurement.** It is the arithmetic SUM of the isolated
+  dispatch and combine percentiles. It **cannot** reveal shared sync, launch amortization, or
+  dispatch/combine overlap, so it must not be used for throughput or SLO capacity. The **measured
+  round trip** (`roundtrip`, one timed region over dispatch→stage→combine) is the real chained
+  latency, and it is the only basis for `roundtrip_tokens_per_second`.
+- **Cross-rank reduction order.** A collective finishes with its slowest rank, so each iteration's
+  latency is reduced **MAX across ranks first**, then percentiled.
+
+The contract name is part of the `comparison_key` and the schema enum, so two rows under different
+contracts are labelled distinct and never silently overlaid.
+
+---
+
+## Correctness contract definition
+
+"Correct" in CollectiveX has two layers: the **independent oracle** that defines the semantics, and
+the **runtime gate** that every sweep point must pass.
+
+### The independent oracle (`tests/reference_ep.py`)
+A from-scratch numpy model of MoE dispatch + combine, written **without** DeepEP or MoRI, used only
+for untimed validation — so the benchmark is never "validated against itself." Its model:
+
+- **Layout:** expert `e` lives on rank `e // experts_per_rank`.
+- **Dispatch:** token `t` selected for expert `e` contributes one copy of `x[t]` to
+  `(rank e//epr, expert e)`. `dispatch_plan()` enumerates every routed copy exactly once and
+  `validate_dispatch()` asserts each `(token, selected-expert)` maps to the **correct rank and
+  expert, exactly once** (duplicate `(token,expert)` pairs and out-of-range ranks are errors).
+- **Expert transform:** a deterministic per-expert factor `f_e = 1 + e/E`, **distinct per expert**,
+  so a copy routed to the *wrong* expert produces a wrong value (identity would hide mis-routing —
+  the self-test corrupts one expert id and asserts the oracle output changes).
+- **Combine:** `y[t] = Σ_k weights[t,k] · f_{e_k} · x[t]`, summed over the token's selected experts `e_k`,
+  output in **source-token order**. `validate_combine()` recomputes this two independent ways
+  (vectorizable reduction vs explicit per-copy accumulation) and asserts they agree — exercising
+  the reduction, the **gate-weighting**, the **source ordering**, and the
+  **multiple-experts-on-one-rank** case.
+- **Edge cases** (goal P3): empty rank, repeated destination rank, single-rank hotspot (all topk on
+  rank 0) are covered in the self-test; non-divisible global token counts are handled by callers.
+
+So the oracle's definition of correct is **exact destination rank/expert/token mapping (each routed
+copy once), plus the combine reduction with correct gate weights in correct source order.**
+
+### The runtime gate (in `ep_harness.run_sweep`)
+Per ladder point, each backend's `combine` output is compared to its `expected()` reference
+(DeepEP: `x · #destination-ranks`; MoRI: `x · #unique-destination-ranks`). The gate computes
+`max_rel = max_abs_error / max|expected|` and passes the point when `max_rel < tolerance`
+(bf16 `5e-2`; fp8 `1.25e-1`, looser because e4m3's 3 mantissa bits cap round-trip error — the
+tolerance is **recorded in the artifact** so the looser fp8 gate is explicit). A point is `correct`
+only if the local gate passes on **every** rank (MIN-reduced `local_ok`) **and** non-zero tokens
+were actually received (`recv_total > 0`) — so a silent no-op cannot pass.
+
+The artifact is honest about scope: `correctness.scope = "roundtrip-reconstruction-smoke-v1"` — it
+is a round-trip reconstruction plus non-silent-recv check at runtime, **not** a full per-token
+routing/ordering/padding proof at runtime (that exhaustive proof is what `reference_ep.py` provides
+off the hot path).
+
+### Workload identity (part of "did everyone run the same correct thing")
+Beyond per-point correctness, the sweep proves all ranks built the **same** global routing: each
+rank hashes its per-T routing hashes into a `trace_signature` and the harness MIN/MAX-reduces it;
+`workload_identity = "consistent-across-ranks"` only if all ranks agree. A mismatch means the ranks
+did **not** build identical routing, which (see below) makes the result `invalid`.
+
+---
+
+## Publication contract definition
+
+`publication_status` is **machine-derived** from a multi-dimensional `validity` record — no caller
+may hand-label a result `official`. The derivation lives in `ep_harness._derive_publication_status`
+and is **mirrored** in `validate_results.py:derive_publication_status`; the validator's core job is
+to confirm the recorded status equals this re-derivation (a mismatch = "validity tampered or
+stale", a hard error). The five tiers and their gates:
+
+### `failed`
+`execution_status != "complete"` — the sweep produced no rows. Nothing else is evaluated.
+
+### `invalid`
+Execution completed but a **fundamental soundness gate failed**: `semantic_correctness != "pass"`
+(a point failed the correctness gate), **or** `measurement_conformance != "conformant"`, **or**
+`workload_identity == "inconsistent"` (ranks did not run the same routing). An invalid result is
+not a usable measurement of anything.
+
+### `diagnostic`
+Measurement is **sound** (correct + consistent workload + conformant contract) but it is **not a
+fair cross-platform point**, for one of:
+- **Resource-nonconforming** — `resource_conformance` ends in `"nonconforming"` (e.g. MoRI's
+  floored `block_num`: it needed *more* comm units than the normalized target, so it isn't an
+  apples-to-apples resource point). Fixed-kernel paths (DeepEP LL: `low_latency_mode`) are
+  classified `not-applicable`, **not** a conformance failure, and are simply excluded from the
+  resource-Pareto comparison.
+- **A flagged timing anomaly** — `anomaly_free == false`. The harness flags
+  `roundtrip_gt_isolated_sum` (measured RT p99 > `threshold ×` isolated-sum p99, default 3×; the
+  open LL-FP8 case) and `roundtrip_lt_component_floor` (RT p50 < 0.95 × max(dispatch, combine) p50,
+  which violates chained-op sync semantics). Either demotes to `diagnostic` **unless explicitly
+  waived** via `--waive-anomaly` (which sets `anomaly_free = true`) *after* the cause is understood
+  and documented.
+- It is also the fallback for an otherwise-sound result that does not meet the higher bars.
+
+### `comparable-experimental`
+Measurement is sound (`semantic_correctness == pass`, `workload_identity` starts with
+`"consistent"`, `measurement_conformance == conformant`), resource-conforming, and anomaly-free —
+but it is **missing a publication requirement** (e.g. incomplete provenance, or a seeded-runtime
+workload rather than a canonical serialized one). This is the normal tier for a clean development or
+cross-vendor run that hasn't cleared the full official bar. It is comparable, just not "official."
+
+### `official`
+Everything `comparable-experimental` requires **plus both**:
+- `provenance_complete == true` — no `"unknown"` backend provenance, **and** a non-empty image
+  digest, **and** a GitHub run record with `run_id` + `source_sha` (assembled in `run_ep.py` from
+  `GITHUB_*` / `COLLECTIVEX_*` env). A bare local run can never be official.
+- `workload_source == "canonical-serialized"` — the run consumed pre-generated, checksum-verified
+  trace bytes (`--workload-dir`, `tests/workload.py`), so it is **provably** the same workload as
+  any other run consuming the same files (not just a same-seed regeneration).
+
+`validate_results.py` enforces additional **official-grade** gates on top of the derivation: a
+non-null `workload_id` and `trace_signature`, no unwaived anomalies, every point `correct`, and a
+minimum of `100` pooled samples per point (`MIN_SAMPLES_OFFICIAL`). It exits non-zero if any doc
+claims `official` but fails a gate, and (with `--require-official`) if any non-legacy doc is not
+official.
+
+### Cross-run identity (validator-only)
+Within a `comparison_key` (further grouped by `routing_step` and `uneven_tokens`, which change the
+realized workload but live in `reproduction`, not the key), the validator checks **per-T
+`routing_hash` agreement**: two runs at the same config and same T but **different routing bytes**
+are flagged as "not the same workload." It deliberately keys on per-T hashes (not the whole
+`trace_signature`) so a capped cross-vendor sweep (e.g. `1..16`) and a full headline sweep
+(`1..128`) of the same config are **not** falsely flagged — only a genuine same-T conflict is.
+
+### Other record types the validator preserves
+- **Legacy (v3, no `publication_status`)** docs load as `legacy-experimental` and are reported, not
+  failed.
+- **Preserved failed-case** records (`record_type == "failed-case"`, emitted by the runner on a
+  wedge/timeout/crash) are reported as preserved cases, **not** validation errors — the project
+  rule is "do not silently discard failed or incorrect results."
+
+## Collective suites: all-reduce / all-gather / framework AR — serving-use mapping
+
+The non-EP collective families map to specific inference-serving communication patterns:
+
+### All-reduce (`family=nccl` op=all_reduce + `family=allreduce-fw`)
+TP all-reduce of activations — the per-layer reduction across a tensor-parallel group after the
+attention/MLP matmuls. Two tiers measured in the SAME All-reduce tab so they are directly comparable:
+- **NCCL ring** (`run_nccl.py`, nccl-tests): the bandwidth-optimal baseline; wins at large messages.
+- **Framework custom AR** (`allreduce_fw_bench.py`): FlashInfer one-shot + two-shot via
+  `trtllm_allreduce_fusion` (pattern `kAllReduce`). One-shot is a single NVLink round that beats the
+  ring in the small-message latency-bound regime (the few-KiB..few-MiB activations a decode step
+  all-reduces); two-shot trades a second round for higher bandwidth as the message grows (and needs
+  `token_num > tp_size`). The crossover is exactly the decision this tab visualizes.
+
+### All-gather (`family=nccl` op=all_gather) — DP-attention → TP-MoE handoff
+In SGLang/DeepSeek-style serving, **data-parallel attention** runs each DP rank over its own token
+shard, then the hidden states are **all-gathered** before the **tensor-parallel MoE** so every TP
+rank sees the full token set for expert routing. The collected payload is `[total_tokens, hidden]`
+bf16. The standardized all-gather sweep is a geometric byte ladder that **spans the payload-size
+range of this handoff** (a few KiB per-rank shard up to the tens-of-MiB full-batch gather), so the
+latency/bandwidth curves in the All-gather tab cover the DP-attention→TP-MoE handoff sizes directly.
+
+**Named per-model handoff shapes.** The gathered payload is `total_tokens × hidden × 2` bytes (bf16).
+The table names the exact points for each model's EP shape (`hidden` from the `-v1` workload manifests),
+at a representative decode batch (256 tokens) and prefill chunk (4096 tokens), and the nearest covering
+point on the geometric all-gather byte ladder — so the named shapes are explicit, not just read off the
+sweep:
+
+| Model            | hidden | decode (256 tok) | prefill (4096 tok) | covered by all-gather sweep |
+|------------------|-------:|-----------------:|-------------------:|-----------------------------|
+| DeepSeek-V3/V4   |  7168  | 3.67 MB          | 58.7 MB            | yes (1 MiB–64 MiB band)     |
+| Kimi-K2          |  7168  | 3.67 MB          | 58.7 MB            | yes (1 MiB–64 MiB band)     |
+| MiniMax-M3       |  6144  | 3.15 MB          | 50.3 MB            | yes (1 MiB–64 MiB band)     |
+| Qwen3.5          |  4096  | 2.10 MB          | 33.6 MB            | yes (1 MiB–64 MiB band)     |
+
+All four models' decode and prefill handoffs land inside the standardized sweep's 1–64 MiB span, so the
+All-gather tab's measured latency/bandwidth at those byte points IS the per-model DP-attention→TP-MoE
+handoff cost (read the curve at the model's column value). The shapes are model-derived (hidden) ×
+serving-regime (token count); the byte ladder is dtype-agnostic so an fp8 handoff halves each figure.
diff --git a/experimental/CollectiveX/docs/references.md b/experimental/CollectiveX/docs/references.md
new file mode 100644
index 000000000..91f3a0918
--- /dev/null
+++ b/experimental/CollectiveX/docs/references.md
@@ -0,0 +1,154 @@
+# CollectiveX — learning / resource notes
+
+> Status: experimental (goal P2, "Add learning/resource notes"). These four arXiv papers are the
+> learning resources listed in `plan.md`. Each summary below was fetched from `arxiv.org/abs/<id>`
+> (titles/authors/dates taken from the live abstract page) and is then **mapped to the specific
+> CollectiveX benchmark dimensions it informs** — the metric, contract, capability axis, or
+> comparison the paper bears on.
+
+**Retrieval status (fetched 2026-06):**
+
+| arXiv ID | Title | Retrieved? | Note |
+|---|---|---|---|
+| 2511.15076 | GPU-Initiated Networking for NCCL | yes | clean fetch |
+| 2603.13606 | NCCL EP: Towards a Unified Expert Parallel Communication API for NCCL | yes | **ID looked future-dated (year "26") when it was listed in the plan; verify.** The page resolved to real content (submitted 13 Mar 2026 per the page), not a not-found error — recorded as retrieved, flagged for a sanity check of the ID/date before citing. |
+| 2512.19849 | UCCL-EP: Portable Expert-Parallel Communication | yes | clean fetch |
+| 2412.19437 | DeepSeek-V3 Technical Report | yes | clean fetch |
+
+All four resolved to genuine abstract pages. 2603.13606 is the only one flagged: its identifier
+(and the page's stated 13 March 2026 submission date) is forward-dated relative to when it was
+assigned in the plan, so although the fetch returned coherent NCCL-EP content, the ID should be
+double-checked against arXiv directly before it is used as a hard citation. Nothing below is
+fabricated; the one uncertainty is called out here.
+
+---
+
+## Summarize arXiv 2511.15076
+
+**GPU-Initiated Networking for NCCL** — Hamidouche, Bachan, Markthub, Gootzen, Agostini, Jeaugey,
+Shafi, Theodorakis, Gorentla Venkata (NVIDIA). Submitted 19 Nov 2025 (v2 24 Nov 2025).
+
+Describes NCCL 2.28's new **Device API**, focused on the **GPU-Initiated Networking (GIN)**
+component for network RDMA. The motivation is fine-grained, low-latency GPU-to-GPU communication
+for tightly coupled compute-communication workloads — explicitly Mixture-of-Experts — where the
+traditional host-initiated model's CPU coordination is overhead. GIN is a three-layer architecture:
+host-side setup APIs, device-side remote-memory operations callable from inside CUDA kernels, and a
+network plugin with dual semantics (GPUDirect Async Kernel-Initiated and a Proxy backend). The paper
+demonstrates GIN by integrating it with **DeepEP** and reports benchmark results, positioning GIN as
+combining low-latency device-initiated ops with NCCL's collective algorithms and production
+infrastructure.
+
+## Summarize arXiv 2603.13606
+
+> **Flagged ID — see retrieval table.** The arXiv identifier is forward-dated; the fetch returned
+> the content below (an NCCL-EP paper), but verify the ID/date before citing as authoritative.
+
+**NCCL EP: Towards a Unified Expert Parallel Communication API for NCCL** — Goldman, Boker,
+Sheraizin, Admoni, Polyakov, Bhattacharya, Yu, Sun, Theodorakis, Yin, Gootzen, Shafi, Ravid,
+Di Girolamo, Dinan, Li, Gorentla Venkata, Bloch (NVIDIA). Page states submitted 13 Mar 2026
+(v3 2 Apr 2026); 13 pages, 8 figures, 7 tables; cs.DC.
+
+Introduces **NCCL EP**, an MoE communication library built on NCCL's Device API (the GIN work
+above), offering unified `ncclEpDispatch` / `ncclEpCombine` primitives with **C and Python**
+interfaces. It has two modes: a **Low-Latency (LL)** mode for inference decode targeting small
+batches (the page quotes "1–128 tokens") over all-to-all RDMA+NVLink, and a **High-Throughput (HT)**
+mode for training and inference prefill targeting large batches ("4096+ tokens") using hierarchical
+communication that aggregates within NVLink domains before inter-node RDMA. It situates itself
+alongside DeepEP and Hybrid-EP, evaluates on an H100 cluster across multi-node configs (LL kernel
+results + end-to-end with vLLM), and aims to be a supported EP path on current and emerging NVIDIA
+platforms.
+
+## Summarize arXiv 2512.19849
+
+**UCCL-EP: Portable Expert-Parallel Communication** — Mao, Zhang, Cui, Huang, You, Chen, Xu, Gu,
+Shenker, Raiciu, Zhou, Stoica. Submitted 22 Dec 2025 (v2 22 Jan 2026).
+
+Targets the **portability** problem in EP: systems like DeepEP perform well but require tight
+GPU↔NIC coupling for GPU-initiated RDMA, so they don't run everywhere. **UCCL-EP** instead routes
+compact token commands through a **GPU–CPU control channel** where multithreaded CPU proxies issue
+the RDMA operations, and it **emulates ordering semantics using RDMA immediate data** for NICs that
+lack native support (e.g. AWS EFA). Implemented on **both NVIDIA and AMD** GPUs with EFA and
+Broadcom NICs, it reports up to **2.1× dispatch/combine throughput on EFA**, up to **40% higher
+SGLang token throughput**, and up to **45% higher DeepSeek-V3 training throughput on a 16-node
+AMD+Broadcom platform**.
+
+## Summarize arXiv 2412.19437
+
+**DeepSeek-V3 Technical Report** — DeepSeek-AI et al. (~200 authors). Submitted 27 Dec 2024
+(v2 18 Feb 2025).
+
+Describes **DeepSeek-V3**, a **Mixture-of-Experts** LLM with **671B total / 37B activated per
+token**, using **Multi-head Latent Attention (MLA)** and **DeepSeekMoE**, an **auxiliary-loss-free
+load-balancing** strategy, and a **multi-token-prediction** objective. Pre-trained on 14.8T tokens
+then SFT + RL; reported comparable to leading closed-source models at **2.788M H800 GPU-hours**, with
+stable training (no irrecoverable loss spikes / rollbacks) and public checkpoints. For CollectiveX
+the load-bearing details are the **MoE shape and the load-balancing approach**, not the end-to-end
+quality numbers.
+
+---
+
+## Map each paper to CollectiveX benchmark dimensions
+
+Each paper informs specific, concrete axes of the harness (`tests/ep_harness.py`,
+`tests/ep_deepep.py`, `configs/backends.yaml`, `schemas/ep-result-v4.schema.json`). The mapping:
+
+### 2511.15076 (GIN / NCCL Device API) → the DeepEP **kernel-generation axis** and the **runtime-visible** boundary
+- **`shape.kernel_gen` (v1 NVSHMEM vs v2 NCCL-GIN).** The harness already records DeepEP's kernel
+  generation as part of line identity (`kernel_gen` derived from `deepep_version`, folded into
+  `comparison_key`) precisely because DeepEP V2 moved its transport from NVSHMEM to the NCCL Device
+  API. This paper *is* the NCCL device-side RDMA (GIN) that the V2 path builds on — it is the
+  primary-source explanation for why a "DeepEPv2" run must never be conflated with a "DeepEP V1" run
+  (goal P1, "DeepEP version matrix"). Informs the `kernel_gen` field and the version-as-first-class-
+  axis requirement.
+- **`runtime-visible-v1` measurement contract.** GIN's thesis is removing CPU coordination so comm
+  is launched/issued from inside the kernel. That is exactly the cost-surface `runtime-visible-v1`
+  tries to capture (cast + layout + comm + recv-dequant inside the timed window). The paper
+  motivates why a serving-realistic boundary, not just comm-only, is worth measuring.
+- **`transport` axis** (`nvlink`/`mnnvl`/`rdma` in `backends.yaml`) — GIN is the RDMA device-path
+  whose latency the EP transports record.
+
+### 2603.13606 (NCCL EP) → the planned **NVIDIA NCCL EP adapter**, the **dispatch/combine API contract**, and **phase = decode/prefill**
+- **The open "NVIDIA NCCL EP" backend** (goal P1: *"Add adapter for `NVIDIA/nccl/contrib/nccl_ep`"*)
+  — this paper is the design of that very library (`ncclEpDispatch` / `ncclEpCombine`). It is the
+  reference for adding an `nccl-ep` entry to `configs/backends.yaml` and a third adapter beside
+  DeepEP and MoRI, to be compared against DeepEP normal/LL under `layout-and-dispatch-v1`.
+- **`mode` axis (normal vs ll) and `phase` (decode vs prefill).** NCCL EP's split into **LL
+  (1–128 tokens, decode)** and **HT (4096+ tokens, prefill/training)** lines up directly with the
+  harness's `DECODE_LADDER = [1..128]` / `PREFILL_LADDER = [128..4096]` and the `mode = ll|normal`
+  axis. It corroborates the decode/prefill token-regime modelling and the LL decode cap.
+- **`comparison_key` design.** NCCL EP, DeepEP, and Hybrid-EP being distinct libraries with the same
+  `dispatch`/`combine` surface is exactly the situation the `backend` field + provenance
+  (`backend name, fork, commit, API generation`) exist to disambiguate.
+
+### 2512.19849 (UCCL-EP) → **cross-vendor portability**, the planned **UCCL adapter**, and the **transport / resource axes**
+- **The open "UCCL EP" backend** (goal P1: *"Add UCCL backend adapter … Add cross-platform result
+  class"*) — this paper is that backend. It is the reference for a UCCL `backends.yaml` entry and a
+  capability declaration spanning **both NVIDIA and AMD** (the only paper here that is natively
+  cross-vendor, like CollectiveX itself).
+- **The whole cross-vendor comparison thesis.** UCCL-EP exists because DeepEP's GPU↔NIC coupling
+  isn't portable. CollectiveX's reason for being is comparing such EP libraries fairly *across
+  vendors* — and its mechanism (one deterministic shared routing trace, `layout-and-dispatch-v1` as
+  the common contract, topology-class in the `comparison_key` so NVIDIA and AMD are never silently
+  overlaid) is the apparatus needed to evaluate exactly this paper's portability-vs-performance
+  trade-off.
+- **`transport` axis + the CPU-proxy resource story.** UCCL-EP's CPU-proxy / RDMA-immediate-data
+  design adds transports (EFA, Broadcom) beyond `nvlink/xgmi`, and its CPU-side issue model is a
+  data point for the `resource_profile` vocabulary (comm units / where the work runs), which today
+  models SM/CU fractions.
+
+### 2412.19437 (DeepSeek-V3) → the **default benchmark shape**, **EPLB / routing-skew axis**, and **fp8 dispatch**
+- **The headline shape itself.** The harness defaults — `hidden = 7168`, `topk = 8`,
+  `experts = 256` (`add_common_args`), and the goal's "Default to DeepSeek V3 shape / EP8 / uniform
+  / BF16" — *are* DeepSeek-V3's MoE configuration. This paper is the source of the canonical shape
+  every official curve is reported at, and of the `deepseek-v3-v1` / `deepseek-v4-v1` workload
+  manifests (goal P1).
+- **EPLB and the routing-distribution axis.** DeepSeek-V3's **auxiliary-loss-free load balancing**
+  is the real-world counterpart to (a) the `--routing` skew distributions (`zipf*`, `hotspot-*`) the
+  harness stresses and (b) the **EPLB** expert-replication transform (`tests/eplb.py`,
+  `--eplb`/`--num-redundant-experts`) offered as the remedy for skew. The paper motivates *why*
+  load imbalance and its mitigation are first-class benchmark dimensions (`expert_load_cv`,
+  `rank_load_cv`, `hotspot_ratio`, the EPLB `imbalance_before/after` + `mapping_hash`).
+- **fp8 throughout.** DeepSeek-V3's fp8 training/inference underpins the `dispatch_dtype = fp8`
+  axis and the per-token block-128 fp8 scale convention in `ep_deepep.py`.
+- **Per-token activation rate.** "37B activated per token" is the MoE sparsity that makes
+  tokens-per-rank (not model size) the meaningful x-axis for a dispatch/combine micro-benchmark.
diff --git a/experimental/CollectiveX/docs/upstream_precision.md b/experimental/CollectiveX/docs/upstream_precision.md
new file mode 100644
index 000000000..62f96d66f
--- /dev/null
+++ b/experimental/CollectiveX/docs/upstream_precision.md
@@ -0,0 +1,54 @@
+# Upstream precision work — review + mapping to CollectiveX (goal P1 "Integrate precision-related upstream work")
+
+Reviews the three precision PRs named in goal.md and maps each onto CollectiveX's precision axes
+(`shape.dispatch_dtype`, `shape.quant.combine_input_dtype/combine_quant_mode`, the
+`combine_quant_in_timing` reproduction flag, and the `capability.py` / `backends.yaml` `combine_dtypes`
++ `quant_modes` sets). All three are MERGED upstream. CollectiveX already carries the *scaffold* for
+them (the combine-path axes default to bf16/none and are validated by `capability.resolve`), so each PR
+maps to a concrete, reserved mode id that slots in when the kernel is wired + hardware-available.
+
+## MoRI PR 311 — `feat(EP): FP8 blockwise quantization for IntraNode combine` (ROCm/mori, MERGED)
+- **What:** adds `QuantType::Fp8BlockwiseQuant` (Python `fp8_blockwise`) — a quant-aware FP8 combine for
+  the IntraNode EP path, replacing MoRI's old direct-cast (which truncated activations above the e4m3
+  range and degraded SGLang DeepSeek-R1 accuracy at high concurrency). Per-token per-block max-abs scale
+  on the quant side; per-block FMA dequant on recv. Block size = `hidden_dim / scale_dim`.
+- **Maps to:** the `combine_quant_mode` axis. CollectiveX's `ep_mori.py` / `capability.py` /
+  `backends.yaml` already reserve this ("`+ fp8 when the MoRI quant_type combine path (PR311) lands`").
+  The reserved mode id is now concrete: **`fp8_blockwise`** with `combine_input_dtype=fp8`,
+  per-block scale layout — exactly the CollectiveX `combine_quant_mode` + `scale_layout` fields.
+- **Scope:** AMD/MI355X (MoRI is the AMD backend). Out of scope for *NVIDIA chips*, but it is the
+  reference design for the quant-combine contract that the NVIDIA backends will mirror.
+
+## FlashInfer PR 3376 — `feat: add mxfp8 quant to moe a2a combine` (flashinfer-ai/flashinfer, MERGED)
+- **What:** `moe_a2a_combine` can directly output **MXFP8** — adds `output_dtype`, `output_scales`,
+  `sf_layout`; bumps `kMaxPayloads` for per-token quantization dispatch.
+- **Maps to:** `combine_quant_mode=mxfp8`, `combine_output_dtype=mxfp8`, `scale_layout=sf_layout`, and
+  `combine_quant_in_timing=true` (the quant is inside the combine kernel). This is the NVIDIA
+  quantized-combine path.
+
+## FlashInfer PR 3643 — `feat: add mxfp4/nvfp4 quant to moe a2a combine` (flashinfer-ai/flashinfer, MERGED)
+- **What:** follow-up to 3376; adds **MXFP4 / NVFP4** quant to `moe_a2a_combine`, plus
+  `output_scalar_scale: float = 1.0`.
+- **Maps to:** `combine_quant_mode ∈ {mxfp4, nvfp4}`, `combine_output_dtype ∈ {mxfp4, nvfp4}`. Together with
+  3376 this covers the goal's "NVFP4 combine" / "MXFP8 combine" precision-matrix rows, and (via the dispatch
+  side of the same kernel family) the "NVFP4/MXFP4/MXFP8 dispatch" rows.
+
+## Why these are not yet RUN on NVIDIA (see docs/gated.md)
+The FlashInfer combine quant (3376/3643) lives in `flashinfer.comm.moe_a2a_*` — the same MoE all-to-all
+that needs a **symmetric multi-process MNNVL workspace**. On x86_64 (H100/H200/B200) that needs
+`CAP_SYS_PTRACE`/pidfd (not granted in the enroot/pyxis container); on aarch64 (GB200/GB300) it uses
+CUDA FABRIC handles (would work; GB300 capacity-limited). So MXFP8/MXFP4/NVFP4 *combine* (and the fp4
+*dispatch* in the same family) are reachable on NVIDIA only once that container-capability/hardware
+blocker is resolved — they are not silently faked. DeepEP's own dispatch remains e4m3-fp8-only.
+
+## What CollectiveX did with this review
+- **Capability table:** the reserved mode ids are now named in `capability.py` / `backends.yaml`
+  comments (`fp8_blockwise` for mori; `mxfp8`/`mxfp4`/`nvfp4` for the flashinfer combine path) so a
+  future wiring is a one-line capability widening, not a redesign. They remain **rejected** by
+  `capability.resolve` today (not runnable → not claimed).
+- **Schema/labels:** `shape.quant.{combine_input_dtype,combine_quant_mode,combine_output_dtype,
+  scale_layout}` + `reproduction.combine_quant_in_timing` already exist (v4 schema), so a quantized-
+  combine result is a distinct, correctly-labelled comparison point the moment one is produced.
+- **Correctness tests:** deferred with the kernels — when a quant-combine path is wired, the
+  `reference_ep.py` oracle gains a tolerance class per `combine_quant_mode` (looser e4m3/fp4 bound),
+  mirroring the existing fp8-dispatch tolerance (1.25e-1 vs bf16 5e-2).
diff --git a/experimental/CollectiveX/env_capture.py b/experimental/CollectiveX/env_capture.py
new file mode 100644
index 000000000..b906a0497
--- /dev/null
+++ b/experimental/CollectiveX/env_capture.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — Layer-0 environment + topology capture.
+
+Emits a JSON document describing the node a collective benchmark ran on, so
+every result is provenance-tagged and a B200-vs-GB200 comparison is defensible.
+Standard library only (so it runs in any minimal container, and off-GPU it
+degrades gracefully instead of crashing). torch is used only if importable.
+
+Usage:
+    python env_capture.py --out results/env_b200-dgxc.json
+    python env_capture.py --redact --out env.json   # hash hostnames/IPs/UUIDs
+
+Importable:
+    from env_capture import capture_environment
+    env = capture_environment(redact=False)
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import platform
+import re
+import shutil
+import socket
+import subprocess
+import sys
+
+SCHEMA_VERSION = 1
+
+# Env vars worth recording — transport/tuning knobs that change what a
+# collective actually does (esp. the GB200 MNNVL flags vs B200).
+ENV_PREFIXES = ("NCCL_", "NVSHMEM_", "MC_", "UCX_", "SGLANG_DEEPEP", "DEEPEP_")
+ENV_EXACT = (
+    "CUDA_VISIBLE_DEVICES",
+    "CUDA_DEVICE_ORDER",
+    "SLURM_JOB_ID",
+    "SLURM_NNODES",
+    "SLURM_NTASKS",
+    "SLURM_JOB_PARTITION",
+    # Image identity — set by the launcher so the bundle records what ran.
+    "COLLECTIVEX_IMAGE",
+    "COLLECTIVEX_IMAGE_DIGEST",
+)
+
+
+def _run(cmd: list[str], timeout: int = 20) -> str | None:
+    """Execute ``cmd`` and hand back its stripped stdout.
+
+    Returns None when the executable is not found on PATH, when the process
+    times out or cannot be spawned, or when it exits with a nonzero status.
+    """
+    if shutil.which(cmd[0]) is not None:
+        try:
+            proc = subprocess.run(
+                cmd, check=False, timeout=timeout, text=True, capture_output=True
+            )
+        except (subprocess.TimeoutExpired, OSError):
+            proc = None
+        if proc is not None and proc.returncode == 0:
+            return proc.stdout.strip()
+    return None
+
+
+def _redact(value: str | None) -> str | None:
+    """Swap ``value`` for a short, stable SHA-256-derived token.
+
+    Falsy inputs (None, empty string) come back unchanged. Equal inputs always
+    produce the same ``redacted-<12 hex chars>`` token, so shared artifacts
+    stay joinable without exposing hostnames / IPs / GPU UUIDs / IB GUIDs.
+    """
+    if value:
+        digest = hashlib.sha256(value.encode()).hexdigest()
+        return "redacted-" + digest[:12]
+    return value
+
+
+def _gpus(redact: bool) -> dict:
+    """Inventory the GPUs reported by nvidia-smi.
+
+    When the nvidia-smi query yields nothing usable (``_run`` returns None),
+    the skeleton record (source/count None, empty device list) is returned.
+    Otherwise every CSV row with at least five fields becomes one device
+    entry; UUID and PCI bus id are hashed when ``redact`` is set.
+    """
+    csv_rows = _run(
+        [
+            "nvidia-smi",
+            "--query-gpu=name,uuid,memory.total,compute_cap,pci.bus_id",
+            "--format=csv,noheader,nounits",
+        ]
+    )
+    if csv_rows is None:
+        return {"source": None, "count": None, "devices": []}
+
+    def _mask(text: str) -> str | None:
+        return _redact(text) if redact else text
+
+    found = []
+    for row in csv_rows.splitlines():
+        fields = [field.strip() for field in row.split(",")]
+        if len(fields) >= 5:
+            gpu_name, gpu_uuid, mem, cap, bus_id = fields[:5]
+            found.append(
+                {
+                    "name": gpu_name,
+                    "uuid": _mask(gpu_uuid),
+                    # Non-numeric values are kept as the raw string.
+                    "memory_total_mib": int(mem) if mem.isdigit() else mem,
+                    "compute_capability": cap,
+                    "pci_bus_id": _mask(bus_id),
+                }
+            )
+    return {"source": "nvidia-smi", "count": len(found), "devices": found}
+
+
+def _driver_cuda() -> dict:
+    """Report the driver version and the CUDA version shown by nvidia-smi.
+
+    Either field is None when the corresponding nvidia-smi call yields nothing
+    (or, for CUDA, when the banner has no "CUDA Version:" entry).
+    """
+    versions = _run(
+        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
+    )
+    driver = None
+    if versions:
+        # Only the first output line is used.
+        driver = versions.splitlines()[0].strip()
+    # `nvidia-smi` (no args) prints the CUDA driver-API version in its header.
+    banner = _run(["nvidia-smi"])
+    match = re.search(r"CUDA Version:\s*([0-9.]+)", banner) if banner else None
+    cuda = match.group(1) if match else None
+    return {"driver_version": driver, "cuda_version": cuda}
+
+
+def _torch_info() -> dict:
+    """NCCL / torch build info — only if torch is importable in this env.
+
+    The result always has ``available``. When torch imports, ``torch_version``
+    and ``cuda_runtime`` are added; when ``torch.cuda.is_available()`` is true,
+    ``nccl_version``, ``device_count``, ``device_name`` and
+    ``compute_capability`` (both for device 0) follow. If one of the CUDA/NCCL
+    queries raises, ``error`` holds the exception repr and every field set
+    before the failure is kept.
+    """
+    info: dict = {"available": False}
+    try:
+        import torch  # type: ignore
+    except Exception:
+        # Any import failure is treated as "torch not available".
+        return info
+    info["available"] = True
+    info["torch_version"] = torch.__version__
+    try:
+        info["cuda_runtime"] = torch.version.cuda
+    except Exception:
+        info["cuda_runtime"] = None
+    try:
+        if torch.cuda.is_available():
+            nccl = torch.cuda.nccl.version()
+            # version() returns an int (e.g. 22304) or a tuple, depending on build.
+            info["nccl_version"] = (
+                ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl
+            )
+            info["device_count"] = torch.cuda.device_count()
+            info["device_name"] = torch.cuda.get_device_name(0)
+            cc = torch.cuda.get_device_capability(0)
+            info["compute_capability"] = f"{cc[0]}.{cc[1]}"
+    except Exception as exc:  # pragma: no cover - hardware dependent
+        # Best-effort: record the failure instead of aborting the capture.
+        info["error"] = repr(exc)
+    return info
+
+
+def _topology(redact: bool) -> dict:
+    """Capture `nvidia-smi topo -m` and derive a wiring fingerprint.
+
+    The fingerprint is the first 16 hex chars of a SHA-256 over every
+    link-type token (NV<n>, NODE, SYS, PIX, PXB, PHB, X) found anywhere in the
+    command output, in order of appearance. Device labels and whitespace do not
+    contribute; the intent is that nodes with the same wiring share a key even
+    if absolute device IDs differ. All fields are None when the command yields
+    nothing.
+    """
+    raw = _run(["nvidia-smi", "topo", "-m"])
+    if raw is None:
+        return {"source": None, "matrix": None, "fingerprint": None}
+    link_tokens = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", raw)
+    digest = hashlib.sha256(" ".join(link_tokens).encode()).hexdigest()
+    # The matrix can contain hostnames in some setups; redact wholesale.
+    matrix = "<redacted>" if redact else raw
+    return {
+        "source": "nvidia-smi topo -m",
+        "matrix": matrix,
+        "fingerprint": digest[:16],
+    }
+
+
+def _rdma(redact: bool) -> dict:
+    """RDMA/IB device presence — names only, hashed when ``redact`` is set.
+
+    Tries `ibv_devinfo -l` first and falls back to `ibstat -l` when the former
+    yields no output. Each tool is invoked at most once.
+
+    Returns ``{"available": bool, "devices": [name, ...]}``.
+    """
+    devices: list[str] = []
+    listing = _run(["ibv_devinfo", "-l"])
+    if listing:
+        # The first line is a count; device names follow, one per line.
+        devices = [ln.strip() for ln in listing.splitlines()[1:] if ln.strip()]
+    else:
+        # Run the fallback ONCE and reuse its output. Probing and then
+        # re-running the command doubled the subprocess cost and could crash
+        # with AttributeError if the second invocation returned None.
+        fallback = _run(["ibstat", "-l"])
+        if fallback:
+            devices = [ln.strip() for ln in fallback.splitlines() if ln.strip()]
+    return {
+        "available": bool(devices),
+        "devices": [_redact(d) if redact else d for d in devices],
+    }
+
+
+def _env_vars() -> dict:
+    """Collect the allow-listed environment variables, sorted by name.
+
+    A variable is kept when its name is in ENV_EXACT or starts with one of
+    ENV_PREFIXES. Values are recorded verbatim.
+    """
+    wanted = {
+        name: value
+        for name, value in os.environ.items()
+        if name in ENV_EXACT or name.startswith(ENV_PREFIXES)
+    }
+    return dict(sorted(wanted.items()))
+
+
+def capture_environment(redact: bool = False, timestamp: str | None = None) -> dict:
+    """Build the JSON-serializable environment/provenance record.
+
+    Args:
+        redact: hash the hostname, GPU UUIDs / PCI bus ids and RDMA device
+            names, and replace the topology matrix with a placeholder.
+        timestamp: stored as ``captured_at``; when falsy, the current local
+            time in ISO 8601 form (with UTC offset) is used instead.
+
+    NOTE(review): the ``env`` section is recorded verbatim even when ``redact``
+    is set — confirm none of the captured variables can carry hostnames/IPs.
+    """
+    hostname = socket.gethostname()
+    record: dict = {"schema_version": SCHEMA_VERSION}
+    record["captured_at"] = timestamp or _dt.datetime.now().astimezone().isoformat()
+    record["redacted"] = redact
+    record["host"] = _redact(hostname) if redact else hostname
+    record["platform"] = {
+        "system": platform.system(),
+        "release": platform.release(),
+        "machine": platform.machine(),  # x86_64 vs aarch64 (B200 vs GB200)
+        "python": sys.version.split()[0],
+    }
+    record["gpus"] = _gpus(redact)
+    record["driver"] = _driver_cuda()
+    record["torch"] = _torch_info()
+    record["topology"] = _topology(redact)
+    record["rdma"] = _rdma(redact)
+    record["env"] = _env_vars()
+    return record
+
+
+def main() -> int:
+    """CLI entry point: capture the environment and emit it as JSON.
+
+    With --out, the JSON is written to that path (parent directories are
+    created) and a one-line summary goes to stdout; without it, the JSON
+    itself is printed. Always returns 0.
+    """
+    parser = argparse.ArgumentParser(description="CollectiveX Layer-0 environment capture")
+    parser.add_argument("--out", help="write JSON here (default: stdout)")
+    parser.add_argument(
+        "--redact",
+        action="store_true",
+        help="hash hostnames / IPs / GPU UUIDs / IB GUIDs for shareable artifacts",
+    )
+    parser.add_argument(
+        "--timestamp",
+        help="ISO timestamp to stamp (default: now); pass one for reproducible bundles",
+    )
+    opts = parser.parse_args()
+
+    record = capture_environment(redact=opts.redact, timestamp=opts.timestamp)
+    payload = json.dumps(record, indent=2)
+    if not opts.out:
+        print(payload)
+        return 0
+
+    out_dir = os.path.dirname(os.path.abspath(opts.out))
+    os.makedirs(out_dir, exist_ok=True)
+    with open(opts.out, "w") as fh:
+        fh.write(payload + "\n")
+    # A one-line human summary to stdout (the JSON is the artifact).
+    machine = record["platform"]["machine"]
+    gpu_count = record["gpus"]["count"]
+    topo_fp = record["topology"]["fingerprint"]
+    print(f"env -> {opts.out} | machine={machine} gpus={gpu_count} topo_fp={topo_fp}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/generate_matrix.py b/experimental/CollectiveX/generate_matrix.py
new file mode 100644
index 000000000..dd9ecc045
--- /dev/null
+++ b/experimental/CollectiveX/generate_matrix.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""CollectiveX matrix generator (goal Part 2: capability planning, sharding, canaries).
+
+Reads configs/{suites,workloads,platforms,backends}.yaml, resolves a named suite into the FULLY
+VALIDATED set of (workload, platform, backend, mode, dtype, contract, routing, ep, phase) cases
+BEFORE any GPU is allocated — omitting unsupported combinations with a recorded reason. Then:
+  * groups compatible cases into SHARDS (same platform/nodes/placement/image/backend/mode/resource
+    -> one allocation runs many token points), and
+  * selects a CANARY per (platform, backend, mode, contract) to run before the full shard.
+
+  python3 generate_matrix.py --suite ep-nightly-v1 --out matrix.json
+  python3 generate_matrix.py --suite ep-smoke-v1            # prints summary + omissions
+
+Pure stdlib + PyYAML. 'all' as a backend resolves to the platform vendor's EP backend set.
+"""
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import os
+
+import yaml
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+
+
+def _load(name):
+    """Parse configs/<name> (next to this script) with yaml.safe_load and return the result."""
+    config_path = os.path.join(HERE, "configs", name)
+    with open(config_path) as stream:
+        parsed = yaml.safe_load(stream)
+    return parsed
+
+
+def resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase, platforms, backends,
+                 combine_quant_mode="none", placement="packed", activation_profile="normal", eplb=False):
+    """Return (ok, reason). Mirrors adapter SUPPORTED_* + platform/backend registry limits, including
+    the combine-quant / routing / EPLB / activation distribution constraints (goal P2-m).
+
+    Args:
+        plat, beng: keys into platforms["platforms"] / backends["backends"].
+        mode, dtype, contract, routing, ep, phase: the case's axis values.
+        platforms, backends: parsed platforms.yaml / backends.yaml documents.
+        combine_quant_mode, activation_profile, eplb: optional distribution axes, checked against
+            the backend's quant_modes / activation_profiles / eplb entries.
+        placement: accepted for signature parity with the matrix axes; no constraint is applied
+            to it here.
+
+    Returns:
+        (True, "ok") when the case is runnable, else (False, <human-readable reason>).
+
+    A platform with no ``validated`` block (or one lacking ``ep_degrees``) yields an omission
+    reason rather than a KeyError/TypeError, so one incomplete registry entry cannot abort the
+    whole matrix resolution.
+    """
+    p = platforms["platforms"].get(plat)
+    b = backends["backends"].get(beng)
+    if p is None:
+        return False, f"unknown platform {plat}"
+    if b is None:
+        return False, f"unknown backend {beng}"
+    if b["vendor"] != p["vendor"]:
+        return False, f"{beng} is {b['vendor']}, {plat} is {p['vendor']}"
+    if mode not in b["modes"]:
+        return False, f"{beng} has no mode {mode}"
+    # Read the validation block defensively ONCE; every check below goes through it.
+    validated = p.get("validated") or {}
+    pm = validated.get("modes")
+    if pm and mode not in pm:
+        return False, f"{plat} validated modes={pm} (got {mode})"   # e.g. B300 LL aborts -> normal-only
+    if dtype not in b["dtypes"]:
+        return False, f"{beng} has no dtype {dtype}"
+    if contract not in b["contracts"]:
+        return False, f"{beng} has no contract {contract}"
+    ep_degrees = validated.get("ep_degrees") or []
+    if ep not in ep_degrees:
+        return False, f"{plat} EP{ep} not validated (have {ep_degrees})"
+    # No recorded intranode limit -> nothing to compare against, so the internode gate is skipped.
+    max_intranode = validated.get("max_intranode_gpus")
+    if max_intranode is not None and ep > max_intranode and not validated.get("internode"):
+        return False, f"{plat} EP{ep} needs internode (not validated)"
+    pc = (b.get("phase_constraints") or {}).get(mode)
+    if pc and pc.get("phases") and phase not in pc["phases"]:
+        return False, f"{beng} mode={mode} is {pc['phases']}-only (got {phase})"
+    if contract == "cached-layout-comm-only-v1" and mode == "ll":
+        return False, "cached-layout meaningless for LL"
+    # combine-quant / distribution constraints (goal P2-m). Default none/packed/normal reproduce
+    # today; the quant-combine suite's fp8/mxfp8 modes are REJECTED here (no kernel wired) so it
+    # resolves to zero valid cases until PR311 lands.
+    if combine_quant_mode not in b.get("quant_modes", ["none"]):
+        return False, f"{beng} quant_modes={b.get('quant_modes', ['none'])} (got {combine_quant_mode}) — not wired"
+    # A backend without a `routings` list accepts any routing.
+    if routing not in b.get("routings", [routing]):
+        return False, f"{beng} does not support routing {routing}"
+    if eplb and not b.get("eplb", False):
+        return False, f"{beng} does not support EPLB"
+    if activation_profile not in b.get("activation_profiles", ["normal"]):
+        return False, f"{beng} does not support activation_profile {activation_profile}"
+    return True, "ok"
+
+
+def expand_backends(spec, plat, platforms, backends):
+    """Turn one suite backend spec into a list of backend names.
+
+    A list is returned as-is and any other value except 'all' is wrapped in a one-element list.
+    'all' expands to the platform vendor's entries in backends["vendor_backends"], keeping only
+    names that are also registered under backends["backends"] (goal: do NOT skip capability).
+    """
+    if spec == "all":
+        registry = backends["backends"]
+        vendor = platforms["platforms"][plat]["vendor"]
+        return [name for name in backends["vendor_backends"][vendor] if name in registry]
+    if isinstance(spec, list):
+        return spec
+    return [spec]
+
+
+def generate(suite_name):
+    """Resolve a named suite into its validated case matrix.
+
+    Loads configs/{suites,platforms,backends,workloads}.yaml, expands every axis of the suite with
+    itertools.product, and classifies each combination via resolve_case.
+
+    Returns a dict with: suite, required_publication, pin_source_sha, headline_distribution,
+    n_cases, n_omitted, cases, omitted (each carrying a "reason"), shards and canaries.
+
+    Raises SystemExit when ``suite_name`` is not defined in suites.yaml.
+    """
+    suites_doc = _load("suites.yaml")     # parsed once; reused for headline_distribution below
+    suites = suites_doc["suites"]
+    platforms = _load("platforms.yaml")
+    backends = _load("backends.yaml")
+    workloads = _load("workloads.yaml")   # not otherwise used here; still loaded so a missing/invalid file fails early
+    if suite_name not in suites:
+        raise SystemExit(f"unknown suite {suite_name}; have {sorted(suites)}")
+    s = suites[suite_name]
+    phases = s.get("phases", ["decode"])
+    routings = s.get("routings", ["uniform"])
+    resource_modes = s.get("resource_modes", ["tuned"])
+    # optional distribution axes (default to today's single value when the suite omits them).
+    cqms = s.get("combine_quant_modes", ["none"])
+    placements = s.get("placements", ["packed"])
+    activations = s.get("activation_profiles", ["normal"])
+    eplbs = s.get("eplb", [False])                 # ep-routing-v1 sweeps [false, true]
+    steps = s.get("routing_steps", [0])            # ep-temporal-v1 sweeps the snapshot index
+    unevens = s.get("uneven_tokens", ["none"])     # ep-uneven-tokens-v1 sweeps the allocation
+    # Accept a scalar `backends: all` as well as a list: iterating a bare string would walk its
+    # characters and turn every letter into an "unknown backend" omission.
+    backend_specs = s["backends"]
+    if not isinstance(backend_specs, list):
+        backend_specs = [backend_specs]
+    cases, omitted = [], []
+    for plat in s["platforms"]:
+        bset = []
+        for bspec in backend_specs:
+            bset += expand_backends(bspec, plat, platforms, backends)
+        for beng in sorted(set(bset)):
+            eps = s.get("ep_degrees") or platforms["platforms"][plat]["validated"]["ep_degrees"]
+            for (wl, mode, dtype, contract, routing, ep, phase, rmode, cqm, placement, act,
+                 eplb, step, uneven) in itertools.product(
+                    s["workloads"], s["modes"], s.get("dtypes", ["bf16"]), s["contracts"],
+                    routings, eps, phases, resource_modes, cqms, placements, activations,
+                    eplbs, steps, unevens):
+                ok, reason = resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase,
+                                          platforms, backends, combine_quant_mode=cqm,
+                                          placement=placement, activation_profile=act, eplb=eplb)
+                rec = {"workload": wl, "platform": plat, "backend": beng, "mode": mode,
+                       "dtype": dtype, "contract": contract, "routing": routing, "ep": ep,
+                       "phase": phase, "resource_mode": rmode, "combine_quant_mode": cqm,
+                       "placement": placement, "activation_profile": act,
+                       "eplb": eplb, "routing_step": step, "uneven_tokens": uneven}
+                # Omitted records carry the rejection reason; valid cases do not.
+                (cases if ok else omitted).append({**rec, **({} if ok else {"reason": reason})})
+    # SHARDS: one allocation per (platform, backend, mode, resource, image) runs many points.
+    shards = {}
+    for c in cases:
+        img = backends["backends"][c["backend"]].get("required_image", "?")
+        key = (c["platform"], c["backend"], c["mode"], c["resource_mode"], img)
+        shards.setdefault(key, []).append(c)
+    shard_list = [{"platform": k[0], "backend": k[1], "mode": k[2], "resource_mode": k[3],
+                   "image": k[4], "cases": v} for k, v in shards.items()]
+    # CANARY: one representative case per (platform, backend, mode, contract) — the FIRST one
+    # encountered in the product order above (setdefault keeps the earliest entry).
+    # NOTE(review): that is the "smallest" case only if the suite lists its axes in ascending order.
+    canary = {}
+    for c in cases:
+        ck = (c["platform"], c["backend"], c["mode"], c["contract"])
+        canary.setdefault(ck, c)
+    # cohort-level source-SHA pinning (goal P2-n): record whether this suite REQUIRES all SKUs to
+    # use one benchmark source SHA (official runs) — cohort.py --pin-sha enforces it at validation.
+    # official suites pin by default; diagnostic/bring-up may mix.
+    pin = s.get("pin_source_sha", s.get("required_publication") == "official")
+    return {"suite": suite_name, "required_publication": s.get("required_publication"),
+            "pin_source_sha": pin,
+            "headline_distribution": (suites_doc.get("headline_distribution") or {}).get("routing"),
+            "n_cases": len(cases), "n_omitted": len(omitted),
+            "cases": cases, "omitted": omitted, "shards": shard_list,
+            "canaries": list(canary.values())}
+
+
+def main() -> int:
+    """CLI entry point: resolve --suite, print a summary plus de-duplicated omissions, and
+    write the full matrix as JSON when --out is given. Always returns 0."""
+    parser = argparse.ArgumentParser(description="CollectiveX matrix generator")
+    parser.add_argument("--suite", required=True)
+    parser.add_argument("--out")
+    opts = parser.parse_args()
+    matrix = generate(opts.suite)
+    n_shards = len(matrix["shards"])
+    n_canaries = len(matrix["canaries"])
+    print(f"suite={matrix['suite']} required={matrix['required_publication']}: "
+          f"{matrix['n_cases']} valid cases, {matrix['n_omitted']} omitted, "
+          f"{n_shards} shards, {n_canaries} canaries")
+    # Many omitted cases differ only in axes not shown below; report each distinct line once.
+    reported = set()
+    for entry in matrix["omitted"]:
+        signature = tuple(entry[field] for field in
+                          ("platform", "backend", "mode", "dtype", "contract", "reason"))
+        if signature in reported:
+            continue
+        reported.add(signature)
+        plat, beng, mode, dtype, contract, reason = signature
+        print(f"  OMIT {plat}/{beng}/{mode}/{dtype}/{contract}: {reason}")
+    if opts.out:
+        with open(opts.out, "w") as fh:
+            json.dump(matrix, fh, indent=2)
+        print(f"wrote {opts.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
new file mode 100644
index 000000000..e5e7ddeb6
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# CollectiveX — 2-node B200 SKU adapter (cross CX-7 InfiniBand spine), x86_64.
+#
+# The other half of the headline: the same primitives as single-node B200, but
+# spanning two nodes so the transport is InfiniBand rather than NVLink. Contrast
+# with GB200, where the 2-node-equivalent stays on NVL72 NVLink (MNNVL).
+#
+# Multi-node orchestration differs from single-node, so this adapter does NOT
+# use run_in_container.sh: it builds nccl-tests (MPI=1), runs each op across all
+# ranks (raw capture), then parses on the login node. Currently CX_BENCH=nccl
+# only (multi-node DeepEP/MNNVL is the srt-slurm follow-up).
+#
+# SPIKE CAVEATS: needs `srun --mpi=pmix` wired for pyxis and a compute-visible
+# checkout — set CX_STAGE_DIR to a shared FS (e.g. /home/sa-shared/cx-stage) if
+# the runner workspace is not cross-mounted to compute.
+#
+# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NODES(2)
+#   CX_GPUS_PER_NODE(8) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_OPS
+#   CX_MIN_BYTES(8) CX_MAX_BYTES(8G) CX_SRUN_MPI(pmix) CX_DRYRUN(0)
+#
+# Run: bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+CX_BENCH="${CX_BENCH:-nccl}"
+[ "$CX_BENCH" = "nccl" ] || cx_die "launch_b200-dgxc-slurm.sh supports CX_BENCH=nccl only (got '$CX_BENCH'); multi-node DeepEP is a follow-up"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc-slurm}"
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-8}"
+NODES="${CX_NODES:-2}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+TOPO="b200-nvlink-island+cx7-ib"
+WORLD=$((NODES * GPUS_PER_NODE))
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance (propagated via --export=ALL).
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+
+# op name -> nccl-tests binary.
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+# Resolve and validate the op list BEFORE staging or allocating: under `set -u`,
+# an unknown key in ${BIN[$op]} would otherwise abort the run loop with a bare
+# "unbound variable" error after the allocation has already been made.
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+for op in $OPS; do
+  [ -n "${BIN[$op]:-}" ] || cx_die "unsupported op '$op' in CX_OPS (supported: ${!BIN[*]})"
+done
+
+cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+          --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+# Cancel the allocation whenever the script exits; scancel errors are ignored.
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+              --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source runtime/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
+
+# 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS.
+for op in $OPS; do
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG) -> $raw"
+  # Best-effort: a failing op is logged and the loop continues with the next one.
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+       --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+       --export=ALL,NCCL_CUMEM_ENABLE=1 \
+       "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+       > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+
+  # 3) Parse on the login node (pure stdlib python; no container needed).
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$TOPO" --transport ib \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
new file mode 100644
index 000000000..6d0c31c11
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64).
+#
+# Thin adapter: handles B200-specific allocation/container, then hands off to
+# runtime/run_in_container.sh which runs whichever benchmark CX_BENCH selects
+# (nccl | deepep | all). Mirrors runners/launch_b200-dgxc.sh (salloc + enroot
+# squash + srun --container) with all model-serving stripped.
+#
+# Run from inside the InferenceX checkout on the B200 login node:
+#     bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh           # nccl (default)
+#     CX_BENCH=deepep bash .../launch_b200-dgxc.sh                          # DeepEP (rebuild)
+#
+# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(30)
+#   CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
+#   CX_DRYRUN(0)
+set -euo pipefail
+
+# Resolve paths relative to this script so it works from any working directory.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# Allocation/container settings; each CX_* knob overrides the default shown.
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}"
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# Exported so they are passed into the container step by `srun --export=ALL`
+# below (presumably read by run_in_container.sh — confirm there).
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance.
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+# Dry run stops here: image and stage paths are resolved, nothing is allocated.
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
+          --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+# Cancel the allocation whenever the script exits; scancel errors are ignored.
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+# NOTE(review): with `set -e`, a nonzero srun exits the script right here, so
+# cx_collect_results below is skipped on failure — confirm that is intended.
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b300-nv.sh b/experimental/CollectiveX/launchers/launch_b300-nv.sh
new file mode 100644
index 000000000..7f485480a
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300-nv.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# CollectiveX — adapter for the `b300-nv` GH runner pool. Those self-hosted
+# runners are named `b300-nv_NN`, and launch_${RUNNER_NAME%%_*}.sh selects the
+# script from the name's prefix, which lands on this file. The B300 settings
+# live in launch_b300.sh (the canonical/manual entry point); this file only
+# forwards to it so there is a single source of truth.
+set -euo pipefail
+launcher_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+exec bash "$launcher_dir/launch_b300.sh" "$@"
diff --git a/experimental/CollectiveX/launchers/launch_b300.sh b/experimental/CollectiveX/launchers/launch_b300.sh
new file mode 100644
index 000000000..422d045c2
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# CollectiveX — B300 single-node SKU adapter (8x B300 SXM6, NVLink island, x86_64, SM100).
+#
+# Thin adapter: B300-specific allocation/container, then hands off to
+# runtime/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors
+# launch_h200.sh; B300 differs in: partition `batch_1` with a REQUIRED account
+# (`benchmark`), and the compute-visible share is /data (10.3.26.100:/data) — NOT
+# /home and NOT the node-local /scratch, both invisible to compute nodes here. Both
+# the squash AND the staged repo MUST live on /data or pyxis fails "No such file".
+#
+# Run from inside the InferenceX checkout on the B300 login node:
+#     bash experimental/CollectiveX/launchers/launch_b300.sh            # nccl (default)
+#     CX_BENCH=deepep CX_PHASE=both bash .../launch_b300.sh             # DeepEP, decode+prefill
+#
+# Env knobs: CX_PARTITION(batch_1) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(45)
+#   CX_EXCLUDE_NODES(b300-018) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH
+#   CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+# Resolve paths relative to this script so it works from any working directory.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b300}"
+PARTITION="${CX_PARTITION:-batch_1}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"   # B300 scheduler REQUIRES a valid account/partition combo
+# NOTE: `:-` also applies the default when CX_EXCLUDE_NODES is set but empty,
+# so an empty value cannot be used to disable the exclusion.
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-b300-018}"  # known-bad node (per the serving launcher)
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b300)}"
+# Squash and stage default to /data, the compute-visible share (see header).
+SQUASH_DIR="${CX_SQUASH_DIR:-/data/sa-shared/containers}"
+export CX_STAGE_DIR="${CX_STAGE_DIR:-/data/sa-shared/cx_stage}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# Exported so they are passed into the container step by `srun --export=ALL`
+# below (presumably read by run_in_container.sh — confirm there).
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b300-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+# Dry run stops here: image and stage paths are resolved, nothing is allocated.
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \
+          --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+# Cancel the allocation whenever the script exits; scancel errors are ignored.
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+# NOTE(review): with `set -e`, a nonzero srun exits the script right here, so
+# cx_collect_results below is skipped on failure — confirm that is intended.
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
new file mode 100644
index 000000000..6a754f5bf
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -0,0 +1,226 @@
+#!/usr/bin/env bash
+# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray.
+#
+# Two paths, selected by CX_NODES:
+#   * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to
+#     run_in_container.sh (CX_BENCH = nccl | deepep | all), -g 4.
+#   * CX_NODES>1: multi-node over the NVL72 NVLink fabric (MNNVL), e.g. CX_NODES=2
+#     = 8 GPU. CX_BENCH=nccl builds nccl-tests (MPI=1), runs each op across all ranks
+#     via `srun --mpi=pmix` (1 GPU/rank), parses on the login node — the same shape as
+#     single-/multi-node B200 (NVLink island / CX-7 IB), but all on NVL72 NVLink; validated
+#     8-GPU (2 trays) on-node. Any other CX_BENCH takes the EP path (run_ep.py, 1 GPU/rank).
+#
+# Run from inside the InferenceX checkout on the GB200 login node:
+#     bash experimental/CollectiveX/launchers/launch_gb200-nv.sh             # 4 GPU, nccl
+#     CX_NODES=2 bash .../launch_gb200-nv.sh                                  # 8 GPU MNNVL
+#     CX_BENCH=deepep bash .../launch_gb200-nv.sh                             # 4 GPU, DeepEP
+#
+# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NODES(1)
+#   CX_GPUS_PER_NODE(4) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH
+#   CX_OPS CX_MIN_BYTES CX_MAX_BYTES CX_SRUN_MPI(pmix) CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-gb200-nv}"
+PARTITION="${CX_PARTITION:-batch}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-4}"          # NVL72 compute tray = 4 GPU/node
+NODES="${CX_NODES:-1}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+WORLD=$((NODES * GPUS_PER_NODE))
+
+export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS"
+export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+# Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded.
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# ----------------------------------------------------------------------------
+if [ "$NODES" -le 1 ]; then
+  # Single tray (4 GPU): generic dispatcher, -g N single process.
+  export CX_NGPUS="$GPUS_PER_NODE"
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \
+            --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+  [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+  cx_log "JOB_ID=$JOB_ID"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  srun --jobid="$JOB_ID" \
+    --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+    --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+    --no-container-entrypoint --export=ALL \
+    bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
+  exit 0
+fi
+
+# ----------------------------------------------------------------------------
+# Multi-node MNNVL over the NVL72 NVLink fabric. CX_BENCH=nccl -> nccl-tests across WORLD ranks
+# (build MPI=1, srun --mpi=pmix, parse on login). Any EP backend (deepep/uccl/flashinfer) -> the
+# EP multi-srun path ported from launch_gb300-nv.sh: run_ep.py across WORLD srun tasks (1 GPU/rank,
+# per-rank RANK/LOCAL_RANK from SLURM_*), intranode NVLink across <=8 MNNVL ranks. One config/dispatch.
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+          --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+              --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"
+
+# EP backends (deepep/uccl/flashinfer): run run_ep.py across WORLD srun tasks over MNNVL, then exit
+# (the nccl-tests path below is nccl-only). Mirrors launch_gb300-nv.sh's shard-aware EP8 path.
+if [ "$CX_BENCH" != "nccl" ]; then
+  MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N)" | head -1)"; MP=29553
+  mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+  # Source the hybrid-ep build env if the build-once wrote it (build_ext --inplace PYTHONPATH/LD_LIBRARY_PATH
+  # are process-local and don't cross srun steps; the file persists in the named container). No-op otherwise.
+  WRAP='[ -f /tmp/.cx_hybrid_env ] && . /tmp/.cx_hybrid_env; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+  # Build from-source kernels (DeepEP V2 / flashinfer-quant-combine) ONCE PER NODE into a persistent
+  # named container, then every case-srun reuses it (build visible to all WORLD ranks). Mirrors the
+  # proven launch_gb300-nv.sh EP8 path: without this, the multi-srun ran ephemeral per-rank containers
+  # that bypassed the build hooks (deepep_v2 silently ran bundled V1, quant-combine ran cq=none).
+  CNAME="cxep_${JOB_ID}"
+  CMOUNT=(--container-mounts="$MOUNT_SRC:$MOUNT_DIR" --no-container-mount-home
+          --container-workdir="$MOUNT_DIR/experimental/CollectiveX" --no-container-entrypoint)
+  cx_log "EP setup: build into named container $CNAME per node (deepep_v2=${CX_DEEPEP_V2:-} combine=${CX_COMBINE_DTYPE:-bf16})"
+  srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --container-name="$CNAME" \
+    --container-image="$SQUASH_FILE" "${CMOUNT[@]}" --export=ALL,CX_BUILD_ONLY=1 \
+    bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" </dev/null 2>&1 | tail -15 \
+    || cx_log "WARN: EP build-only step returned nonzero (see above)"
+
+  # Per-rank env. deepep V2 spans NVL72 trays only with allow_mnnvl=True (else DeepEP sets
+  # NVSHMEM_DISABLE_MNNVL=1 -> intranode-IPC path -> illegal address cross-tray); CX_ALLOW_MNNVL=1 makes
+  # tests/ep_deepep.py pass it (gated on the param existing, so bundled V1 is unchanged). flashinfer rides
+  # NCCL's MNNVL transport. (gb200 validation pending an allocation; identical to gb300 run 28434764062.)
+  EP_EXPORTS="ALL,MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1,MC_FORCE_MNNVL=1"
+  [ "$CX_BENCH" = "deepep" ] && EP_EXPORTS="$EP_EXPORTS,CX_ALLOW_MNNVL=1"
+
+  # SWEEP (CX_SHARD_FILE set): one '|'-delimited line per shard case so the rack-scale EP path sweeps
+  # EVERY case (parity with single-node). MANUAL: one line per phase from the :-defaulted CX_* env.
+  cx_ep_cases() {
+    # Prints one '|'-delimited, 15-field case line per EP run. CX_SHARD_FILE is workflow-relative
+    # (results/.shard_<id>.json, relative to experimental/CollectiveX) while this runs on the SUBMIT HOST
+    # with cwd=repo root, so retry it under $CX_DIR; otherwise only the ONE default case would run.
+    local shard="${CX_SHARD_FILE:-}"
+    if [ -n "$shard" ] && [ ! -f "$shard" ] && [ -f "$CX_DIR/$shard" ]; then shard="$CX_DIR/$shard"; fi
+    if [ -z "$shard" ] || [ ! -f "$shard" ]; then
+      local phase_list="${CX_PHASE:-decode}" phase
+      if [ "$phase_list" = both ]; then phase_list="decode prefill"; fi
+      for phase in $phase_list; do
+        printf '%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
+          "$phase" "${CX_DISPATCH_DTYPE:-bf16}" "${CX_MODE:-normal}" \
+          "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" "${CX_ROUTING:-uniform}" "${CX_EPLB:+1}" \
+          "${CX_RESOURCE_MODE:-tuned}" "${CX_ACTIVATION_PROFILE:-normal}" "${CX_PLACEMENT:-packed}" \
+          "${CX_ROUTING_STEP:-0}" "${CX_UNEVEN_TOKENS:-none}" "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" \
+          "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+      done
+      return 0
+    fi
+    # '|' rather than tab: tab is IFS-whitespace, so `read` would merge consecutive tabs and drop empty
+    # fields (e.g. a false eplb), shifting every later column. No case field contains '|'.
+    python3 - "$shard" <<'PY'
+import json, sys
+cases = json.load(open(sys.argv[1])).get("cases", [])
+# (key, default) pairs in the column order the consuming `read` expects; eplb is emitted as "1"/"".
+FIELDS = [("phase", "decode"), ("dtype", "bf16"), ("mode", "normal"), ("contract", "layout-and-dispatch-v1"),
+          ("routing", "uniform"), ("eplb", ""), ("resource_mode", "tuned"), ("activation_profile", "normal"),
+          ("placement", "packed"), ("routing_step", "0"), ("uneven_tokens", "none"), ("hidden", "7168"),
+          ("topk", "8"), ("experts", "256"), ("ladder", "")]
+for case in cases:
+    row = {k: (dv if case.get(k) in (None, "") else str(case[k])) for k, dv in FIELDS}
+    row["eplb"] = "1" if case.get("eplb") else ""
+    print("|".join(row[k] for k, _ in FIELDS))
+PY
+  }
+
+  # Run each case as one WORLD-task srun step in the named container. This script sets
+  # `set -euo pipefail`, so a bare failing `srun | tail` would exit the launcher on the first bad case
+  # (skipping the rc log, the remaining cases and cx_collect_results); `|| rc=$?` records it instead.
+  ci=0
+  while IFS='|' read -r ph dtype mode contract routing eplb rmode act placement rstep uneven hidden topk experts lad; do
+    [ -n "$ph" ] || continue
+    ci=$((ci+1)); rc=0
+    out="results/${RUNNER_NAME}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")_${dtype}_${mode}.json"
+    cx_log "EP${WORLD}[$ci] $ph $CX_BENCH $dtype/$mode/$contract routing=$routing eplb=${eplb:-} rmode=$rmode act=$act plc=$placement"
+    # shellcheck disable=SC2086
+    timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$WORLD" \
+      --ntasks-per-node="$GPUS_PER_NODE" --container-name="$CNAME" "${CMOUNT[@]}" --export="$EP_EXPORTS" \
+      bash -c "$WRAP" _ --backend "$CX_BENCH" --phase "$ph" --dispatch-dtype "$dtype" \
+        --mode "$mode" --measurement-contract "$contract" --routing "$routing" ${eplb:+--eplb} --resource-mode "$rmode" \
+        --activation-profile "$act" --placement "$placement" --routing-step "$rstep" --uneven-tokens "$uneven" \
+        --tokens-ladder "$lad" --hidden "$hidden" --topk "$topk" --experts "$experts" --warmup "${CX_WARMUP:-32}" --iters "${CX_ITERS:-200}" \
+        --trials "${CX_TRIALS:-3}" --seed "${CX_SEED:-67}" --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" \
+        --transport "$CX_TRANSPORT" \
+        ${CX_COMBINE_DTYPE:+--combine-dtype "$CX_COMBINE_DTYPE"} ${CX_COMBINE_QUANT_MODE:+--combine-quant-mode "$CX_COMBINE_QUANT_MODE"} \
+        --out "$out" </dev/null 2>&1 | tail -8 || rc=$?
+    cx_log "EP${WORLD}[$ci] $ph rc=$rc"
+  done < <(cx_ep_cases)
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — EP artifacts under $CX_DIR/results/"
+  exit 0
+fi
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \
+     --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" </dev/null \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source runtime/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS.
+for op in $OPS; do
+  # ${BIN[$op]} on an unknown op would trip `set -u` mid-sweep (job cancelled, nothing collected).
+  [ -n "${BIN[$op]:-}" ] || { cx_log "WARN: unknown op '$op' (no nccl-tests binary mapped) — skipped"; continue; }
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG, MNNVL) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+       --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+       --export=ALL,NCCL_CUMEM_ENABLE=1,NCCL_MNNVL_ENABLE=1,MC_FORCE_MNNVL=1 </dev/null \
+       "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-2G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+       > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+  # 3) Parse on the login node (pure stdlib; no container needed).
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb300-nv.sh b/experimental/CollectiveX/launchers/launch_gb300-nv.sh
new file mode 100644
index 000000000..41d08bbb9
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb300-nv.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# CollectiveX — GB300 (NVL72 Grace-Blackwell, aarch64) GHA launcher. Lands on the gb300-nv
+# self-hosted runner (on the im-gb300-login-02 slurm login) and runs the chosen EP config.
+#
+# Two paths by CX_NODES:
+#   CX_NODES<=1 (EP4): single NVL72 tray, 4 GPU. Hands off to run_in_container.sh (torchrun -g 4).
+#   CX_NODES==2 (EP8): 2 trays, 8 GPU over the MNNVL NVLink domain. run_in_container's single-node
+#     torchrun can't span nodes, so this path runs run_ep.py DIRECTLY across 8 srun tasks (1 rank
+#     each), per-rank RANK/LOCAL_RANK from SLURM_*, MASTER_ADDR=first node — the intranode NVLink
+#     path works across <=8 ranks on MNNVL (no internode/NVSHMEM). One CX_* config per dispatch.
+#
+# Env: CX_NODES(2) CX_PARTITION(batch_1) CX_ACCOUNT(benchmark) CX_BENCH(deepep) CX_PHASE + the
+#   CX_DISPATCH_DTYPE/CX_MODE/CX_MEASUREMENT_CONTRACT/CX_ROUTING/CX_EPLB/CX_TOKENS_LADDER knobs.
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"; REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+PARTITION="${CX_PARTITION:-batch_1}"; ACCOUNT="${CX_ACCOUNT:-benchmark}"
+NODES="${CX_NODES:-2}"; GPN="${CX_GPUS_PER_NODE:-4}"
+NGPUS="${CX_NGPUS:-$((NODES*GPN))}"; TIME_MIN="${CX_TIME:-90}"
+# CX_IMAGE is a docker TAG, not a squash path: cx_ensure_squash mangles the tag to
+# <repo>_<tag>.sqsh and finds the pre-staged squash by THAT name (the same convention
+# H200/B300 use). Passing a .sqsh PATH here made it try `enroot import docker://<path>`
+# -> "Invalid image reference", then pyxis "No such file or directory" on the mangled
+# target. The pre-staged file is /data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh,
+# which is exactly the mangled name of this tag, so it resolves with no re-import.
+IMAGE="${CX_IMAGE:-$(cx_default_image gb300)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/data/sa-shared/containers}"
+export CX_STAGE_DIR="${CX_STAGE_DIR:-/data/sa-shared/cx_stage}"
+export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}"
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+RUNNER="gb300-${NGPUS}x"
+export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="gb300-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_BENCH="${CX_BENCH:-deepep}" CX_NGPUS="$NGPUS"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+
+cx_log "GB300 runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH phase=${CX_PHASE:-decode}"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "$CX_STAGE_DIR")"
+[ "${CX_DRYRUN:-0}" = "1" ] && { cx_log "DRYRUN"; exit 0; }
+command -v salloc >/dev/null || cx_die "salloc not found"
+
+if [ "$NODES" -le 1 ]; then   # ---- EP4: single tray, run_in_container (torchrun -g 4) ----
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPN" --exclusive \
+            --time="$TIME_MIN" --job-name="$RUNNER")"
+  [ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:/ix" \
+    --no-container-mount-home --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \
+    --export=ALL bash /ix/experimental/CollectiveX/runtime/run_in_container.sh
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"; exit 0
+fi
+
+# ---- EP8: 2 trays, run_ep.py directly across 8 ranks (no torchrun; MNNVL intranode path) ----
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" --gres=gpu:"$GPN" \
+          --ntasks-per-node="$GPN" --exclusive --time="$TIME_MIN" --job-name="$RUNNER")"
+[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N)" | head -1)"; MP=29551
+mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+# Source the hybrid-ep build env if the build-once wrote it (deepep-hybrid: build_ext --inplace +
+# PYTHONPATH/LD_LIBRARY_PATH are process-local and don't cross srun steps; the file persists in the
+# named container). No-op for other backends (file absent).
+WRAP='[ -f /tmp/.cx_hybrid_env ] && . /tmp/.cx_hybrid_env; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+# From-source kernels (DeepEP V2 / flashinfer quant-combine) cannot be built in the per-rank multi-srun
+# (8 separate ephemeral containers). Build them ONCE PER NODE into a PERSISTENT named container, then
+# every case-srun REUSES it (--container-name, no re-import) so the build is visible to all 8 ranks.
+# Brings the EP8 rack path to parity with EP4 (run_in_container builds once + reuses). Mounts re-apply
+# per srun-step (not persisted in the container fs), so each srun still passes "${CMOUNT[@]}".
+CNAME="cxep8_${JOB_ID}"
+CMOUNT=(--container-mounts="$MOUNT_SRC:/ix" --no-container-mount-home
+        --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint)
+cx_log "EP8 setup: build into named container $CNAME per node (deepep_v2=${CX_DEEPEP_V2:-} combine=${CX_COMBINE_DTYPE:-bf16})"
+srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 \
+  --container-name="$CNAME" --container-image="$SQUASH_FILE" "${CMOUNT[@]}" --export=ALL,CX_BUILD_ONLY=1 \
+  bash /ix/experimental/CollectiveX/runtime/run_in_container.sh </dev/null 2>&1 | tail -15 \
+  || cx_log "WARN: EP8 build-only step returned nonzero (see above)"
+
+# The EP8 case list as '|'-separated arg-lines. SWEEP (CX_SHARD_FILE set): one line per shard case,
+# so the rack-scale EP8 path sweeps EVERY case of its shard (parity with run_in_container's single-
+# node SHARD loop) instead of the old single CX_* config. MANUAL (no shard file): one line per phase
+# from the CX_* env — every field is :-defaulted so set -u never trips on an unset knob (the old bug:
+# bare $CX_DISPATCH_DTYPE here was unbound under sweep, crashing the whole job on its first line).
+cx_ep8_cases() {
+  # Prints one '|'-delimited, 15-field case line per EP8 run. CX_SHARD_FILE is workflow-relative
+  # (results/.shard_<id>.json, relative to experimental/CollectiveX) while this runs on the SUBMIT HOST
+  # with cwd=repo root, so retry it under $CX_DIR; otherwise only the ONE default case would run.
+  local shard="${CX_SHARD_FILE:-}"
+  if [ -n "$shard" ] && [ ! -f "$shard" ] && [ -f "$CX_DIR/$shard" ]; then shard="$CX_DIR/$shard"; fi
+  if [ -z "$shard" ] || [ ! -f "$shard" ]; then
+    local phase_list="${CX_PHASE:-decode}" phase
+    if [ "$phase_list" = both ]; then phase_list="decode prefill"; fi
+    for phase in $phase_list; do
+      printf '%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
+        "$phase" "${CX_DISPATCH_DTYPE:-bf16}" "${CX_MODE:-normal}" \
+        "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" "${CX_ROUTING:-uniform}" "${CX_EPLB:+1}" \
+        "${CX_RESOURCE_MODE:-tuned}" "${CX_ACTIVATION_PROFILE:-normal}" "${CX_PLACEMENT:-packed}" \
+        "${CX_ROUTING_STEP:-0}" "${CX_UNEVEN_TOKENS:-none}" "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" \
+        "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+    done
+    return 0
+  fi
+  # '|' rather than tab: tab is IFS-whitespace, so `read` would merge consecutive tabs and drop empty
+  # fields (e.g. a false eplb), shifting every later column. No case field contains '|'.
+  python3 - "$shard" <<'PY'
+import json, sys
+cases = json.load(open(sys.argv[1])).get("cases", [])
+# (key, default) pairs in the column order the consuming `read` expects; eplb is emitted as "1"/"".
+FIELDS = [("phase", "decode"), ("dtype", "bf16"), ("mode", "normal"), ("contract", "layout-and-dispatch-v1"),
+          ("routing", "uniform"), ("eplb", ""), ("resource_mode", "tuned"), ("activation_profile", "normal"),
+          ("placement", "packed"), ("routing_step", "0"), ("uneven_tokens", "none"), ("hidden", "7168"),
+          ("topk", "8"), ("experts", "256"), ("ladder", "")]
+for case in cases:
+    row = {k: (dv if case.get(k) in (None, "") else str(case[k])) for k, dv in FIELDS}
+    row["eplb"] = "1" if case.get("eplb") else ""
+    print("|".join(row[k] for k, _ in FIELDS))
+PY
+}
+
+# Per-rank env for the EP8 case sruns. flashinfer-combine rides NCCL's MNNVL transport (validated:
+# cq=fp8/nvfp4 @ ws8). DeepEP V2's Buffer gates multi-tray NVLink behind allow_mnnvl, which defaults
+# False -> DeepEP then sets NVSHMEM_DISABLE_MNNVL=1 and the legacy buffer takes the intranode-only CUDA-IPC
+# peer path, faulting across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). CX_ALLOW_MNNVL=1
+# makes tests/ep_deepep.py pass allow_mnnvl=True so the NVL buffer spans both trays over the fabric API.
+# Bundled V1's Buffer predates the param (its NVL buffer already spans MNNVL) -> the harness drops the kwarg.
+EP8_EXPORTS="ALL,MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1"
+[ "$CX_BENCH" = "deepep" ] && EP8_EXPORTS="$EP8_EXPORTS,CX_ALLOW_MNNVL=1"
+
+ci=0
+while IFS='|' read -r ph dtype mode contract routing eplb rmode act placement rstep uneven hidden topk experts lad; do
+  [ -n "$ph" ] || continue
+  ci=$((ci+1))
+  out="results/${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")_${dtype}_${mode}.json"
+  cx_log "EP8[$ci] $ph $CX_BENCH $dtype/$mode/$contract rt=$routing eplb=${eplb:-} combine=${CX_COMBINE_DTYPE:-bf16}/${CX_COMBINE_QUANT_MODE:-none}"
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$NGPUS" \
+    --ntasks-per-node="$GPN" --container-name="$CNAME" "${CMOUNT[@]}" \
+    --export="$EP8_EXPORTS" \
+    bash -c "$WRAP" _ --backend "$CX_BENCH" --phase "$ph" --dispatch-dtype "$dtype" \
+      --mode "$mode" --measurement-contract "$contract" \
+      --routing "$routing" ${eplb:+--eplb} --resource-mode "$rmode" \
+      --activation-profile "$act" --placement "$placement" --routing-step "$rstep" --uneven-tokens "$uneven" \
+      --tokens-ladder "$lad" --hidden "$hidden" --topk "$topk" \
+      --experts "$experts" --warmup "${CX_WARMUP:-32}" --iters "${CX_ITERS:-200}" \
+      --trials "${CX_TRIALS:-3}" --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$CX_TOPO" \
+      --transport "$CX_TRANSPORT" \
+      ${CX_COMBINE_DTYPE:+--combine-dtype "$CX_COMBINE_DTYPE"} ${CX_COMBINE_QUANT_MODE:+--combine-quant-mode "$CX_COMBINE_QUANT_MODE"} \
+      --out "$out" </dev/null 2>&1 | tail -8
+  cx_log "EP8[$ci] $ph rc=${PIPESTATUS[0]}"
+done < <(cx_ep8_cases)
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
diff --git a/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
new file mode 100644
index 000000000..2a35340a8
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# CollectiveX — H100 (DGX Cloud Slurm) single-node SKU adapter (8x H100, NVLink
+# island, x86_64, SM90). Matches the GH self-hosted runner name `h100-dgxc-slurm_NN`
+# (runner.name prefix -> this script via launch_${RUNNER_NAME%%_*}.sh).
+#
+# Thin adapter mirroring launch_b200-dgxc.sh, with this cluster's own defaults
+# (partition hpc-gpu-1, account customer, squash on the compute-visible /mnt/nfs);
+# allocates, then hands off to run_in_container.sh (CX_BENCH = nccl | deepep | all).
+# The DeepEP path runs the full FP8 + low-latency matrix (validated on 8x H100).
+#
+# !!! First on-runner run = validation (no direct SSH to this cluster at authoring).
+# If pyxis fails "No such file" the share is not compute-visible — set CX_SQUASH_DIR
+# + CX_STAGE_DIR to a compute-visible FS (cf. hpc-gpu-1 needing /mnt/nfs).
+#
+# Env knobs: CX_PARTITION(hpc-gpu-1) CX_ACCOUNT(customer) CX_EXCLUDE_NODES(hpc-gpu-1-7) CX_NGPUS(8)
+#   CX_NODES(1) CX_TIME(45) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# Cluster identity from runners/launch_h100-dgxc-slurm.sh (the serving launcher):
+# partition hpc-gpu-1, account customer, known-bad node hpc-gpu-1-7 excluded. This
+# is the SAME cluster validated over SSH. CRITICAL: /home is login-local (not
+# compute-visible) — the squash MUST live on /mnt/nfs; the GH runner workspace is
+# already on /mnt/nfs (compute-visible) so the checkout mounts directly (no staging).
+RUNNER_NAME="${RUNNER_NAME:-h100-dgxc-slurm}"
+PARTITION="${CX_PARTITION:-hpc-gpu-1}"
+ACCOUNT="${CX_ACCOUNT:-customer}"
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-hpc-gpu-1-7}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"
+IMAGE="${CX_IMAGE:-$(cx_default_image h100)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/nfs/sa-shared/containers}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="h100-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# ---- Cross-node H100 EP (goal 182): mirrors launch_h200.sh. Allocate N nodes, ONE container task per
+# node; run_in_container builds the backend per node then spawns NGPUS local ranks rendezvousing via a
+# FileStore on the shared mount (CX_RDZV_FILE) — deliberately AVOIDS torchrun (its elastic-agent TCPStore
+# at the management-subnet NodeAddr is unreachable from a peer's enroot container net namespace). nccl-ep
+# is the validated portable cross-node EP (all_to_all_single, host-stages); custom-RDMA backends hit the
+# GPUDirect-RDMA wall. /mnt/nfs is compute-visible so the FileStore is shared across nodes.
+if [ "${CX_NODES:-1}" -gt 1 ]; then
+  NODES="${CX_NODES}"
+  cx_log "H100 CROSS-NODE EP: nodes=$NODES world=$((NODES*NGPUS)) bench=$CX_BENCH (IB; FileStore rdzv)"
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \
+            --nodes="$NODES" --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+  [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID (multi-node) from salloc"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N)]"
+  export CX_TOPO="h100-multinode-ib" CX_TRANSPORT="rdma"
+  # FileStore rendezvous file on the shared mount (same underlying file on every node); fresh per job.
+  RDZV="$MOUNT_DIR/experimental/CollectiveX/.rdzv_${JOB_ID}"
+  rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true
+  srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 \
+    --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+    --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+    --no-container-entrypoint \
+    --export=ALL,CX_NNODES="$NODES",CX_RDZV_FILE="$RDZV" \
+    bash -c 'export CX_NODE_RANK=${SLURM_NODEID:-0}; exec bash "$0"' \
+      "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" || cx_log "WARN: cross-node H100 EP rc=$?"
+  rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — cross-node H100 EP artifacts under results/"
+  exit 0
+fi
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \
+          --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_h200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h200-dgxc-slurm.sh
new file mode 100755
index 000000000..9dd862987
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h200-dgxc-slurm.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# The H200 GHA self-hosted runner is named h200-dgxc-slurm_NN, so the workflow's
+# launch_${RUNNER_NAME%%_*}.sh convention resolves to THIS name. Thin alias to the real
+# H200 adapter (launch_h200.sh) — no logic here, just the name the runner expects.
+exec bash "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/launch_h200.sh" "$@"
diff --git a/experimental/CollectiveX/launchers/launch_h200.sh b/experimental/CollectiveX/launchers/launch_h200.sh
new file mode 100644
index 000000000..72f34b69a
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h200.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# CollectiveX — H200 single-node SKU adapter (8x H200, NVLink island, x86_64, SM90).
+#
+# Thin adapter: H200-specific allocation/container, then hands off to
+# runtime/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors
+# launch_b200-dgxc.sh; H200 differs in: partition `main` (14x 8-GPU H200 nodes),
+# NO account (open scheduler), home is shared NFS (compute-visible, so no
+# CX_STAGE_DIR), and the squash is resolved under CX_SQUASH_DIR by cx_ensure_squash.
+#
+# Run from inside the InferenceX checkout on the H200 login node:
+#     bash experimental/CollectiveX/launchers/launch_h200.sh             # nccl (default)
+#     CX_BENCH=deepep CX_PHASE=both bash .../launch_h200.sh              # DeepEP, decode+prefill
+#
+# Env knobs: CX_PARTITION(main) CX_ACCOUNT() CX_NGPUS(8) CX_TIME(45) CX_IMAGE CX_IMAGE_DIGEST CX_NODES(1)
+#   CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH(nccl) CX_NCCL_HOME(/usr) CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # directory holding this launcher
+CX_DIR="$(cd "$HERE/.." && pwd)"   # parent of the launcher directory
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"   # two levels above CX_DIR
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-h200}"   # fallback when RUNNER_NAME is unset; also used as the Slurm job name
+PARTITION="${CX_PARTITION:-main}"            # H200 cluster's only partition (sinfo: main*)
+ACCOUNT="${CX_ACCOUNT:-}"            # H200 scheduler is open; no account needed
+NGPUS="${CX_NGPUS:-8}"               # passed to salloc as --gres=gpu:N
+TIME_MIN="${CX_TIME:-45}"            # salloc --time; generous to leave room for an enroot import of the image
+IMAGE="${CX_IMAGE:-$(cx_default_image h200)}"
+# This cluster's /home is shared NFS and IS compute-visible (confirmed on login-0:
+# the GHA runners live under /home/sa-shared/gharunners and the sglang image is
+# pre-staged at /home/sa-shared/containers). The h100-dgxc sibling is the opposite
+# (/home login-local, /mnt/nfs is the share) — /mnt/nfs does NOT exist here, so the
+# old /mnt/nfs default failed the GHA runner at "mkdir /mnt/nfs: Permission denied".
+# The checkout already lives on the compute-visible NFS, so mount it directly: no
+# staging (CX_STAGE_DIR empty). Override CX_STAGE_DIR only from a login-local checkout.
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+export CX_STAGE_DIR="${CX_STAGE_DIR:-}"
+MOUNT_DIR=/ix   # where the checkout (MOUNT_SRC) is mounted inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"   # UTC timestamp, exported below as CX_TS
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="h200-nvlink-island" CX_TRANSPORT="nvlink"   # single-node labels; the cross-node branch re-exports both
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ${ACCOUNT:+account=$ACCOUNT }ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi   # squash/stage steps above have already run
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# ---- Cross-node H100/H200 EP (goal 182): allocate N nodes, run ONE container task per node, and let
+# run_in_container build uccl (per node) then spawn its NGPUS local ranks rendezvousing via a FileStore
+# on the shared mount (CX_RDZV_FILE). This deliberately AVOIDS torchrun: torchrun's elastic agent runs
+# its OWN cross-node TCPStore at --master-addr, unreachable from a peer's enroot container net namespace
+# (the management-subnet NodeAddr is not in the container's net view — the prior torchrun attempt timed
+# out 900s at exactly that bootstrap, while the FileStore path got past it). The build MUST be in-
+# container per node (uccl is pip-installed, not in the image), so one-container-per-node — NOT multi-
+# srun-per-rank — is required: separate per-rank containers are ephemeral and would each lack uccl.
+# UCCL EP is internode-native (RDMA/IB); DeepEP normal-internode asserts out. Repo on compute-vis NFS.
+if [ "${CX_NODES:-1}" -gt 1 ]; then
+  NODES="${CX_NODES}"
+  cx_in_ctr="$MOUNT_DIR/experimental/CollectiveX"; cx_on_host="$MOUNT_SRC/experimental/CollectiveX"
+  cx_log "H200 CROSS-NODE EP: nodes=$NODES world=$((NODES*NGPUS)) bench=$CX_BENCH (IB; UCCL internode-native; FileStore rdzv)"
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --nodes="$NODES" \
+            --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+  if [ -z "$JOB_ID" ]; then cx_die "could not resolve allocated JOB_ID (multi-node) from salloc"; fi
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N)]"
+  export CX_TOPO="h200-multinode-ib" CX_TRANSPORT="rdma"
+  # Rendezvous file: addressed by its in-container path for the ranks, removed via its host path here.
+  RDZV="$cx_in_ctr/.rdzv_${JOB_ID}"
+  rm -f "$cx_on_host/.rdzv_${JOB_ID}" 2>/dev/null || true
+  # One container task per node; each task derives CX_NODE_RANK from its own SLURM_NODEID.
+  xnode_srun=(srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1
+    --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+    --no-container-mount-home --container-workdir="$cx_in_ctr" --no-container-entrypoint
+    --export=ALL,CX_NNODES="$NODES",CX_RDZV_FILE="$RDZV"
+    bash -c 'export CX_NODE_RANK=${SLURM_NODEID:-0}; exec bash "$0"' "$cx_in_ctr/runtime/run_in_container.sh")
+  "${xnode_srun[@]}" || cx_log "WARN: cross-node H200 EP rc=$?"
+  rm -f "$cx_on_host/.rdzv_${JOB_ID}" 2>/dev/null || true
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — cross-node H200 EP artifacts under results/"
+  exit 0
+fi
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --gres=gpu:"$NGPUS" \
+          --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
new file mode 100644
index 000000000..7be963cfb
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -0,0 +1,190 @@
+#!/usr/bin/env bash
+# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine.
+#
+# AMD counterpart to the NVIDIA adapters. Differs from them in ways taken from
+# the real runners/launch_mi355x-amds.sh:
+#   * partition `compute`, no --account (cluster default), --cpus-per-task=128,
+#     and known-bad nodes excluded;
+#   * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on
+#     the allocated node (not on the login node like the shared-FS NVIDIA path);
+#   * pyxis flags --container-writable --container-remap-root for the ROCm image.
+# AMD backends: CX_BENCH=mori (MoRI EP dispatch/combine, default), nccl (collective primitives
+# via rccl-tests, the ROCm nccl-tests fork), or any other bench the CX_BENCH case below accepts.
+#
+# !!! NOT yet validated on hardware (no MI355X cluster access at authoring time).
+# Treat the first on-runner run as validation — like the DeepEP path was on GB200.
+#
+# Run from inside the InferenceX checkout on the MI355X login node:
+#     bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+#
+# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(60) CX_IMAGE CX_BENCH(mori) CX_NODES(1) CX_NODELIST
+#   CX_SQUASH_DIR(/var/lib/squash) CX_LOCK_DIR(/tmp) CX_EXCLUDE_NODES CX_RUN_TIMEOUT(1800) CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # directory holding this launcher
+CX_DIR="$(cd "$HERE/.." && pwd)"   # parent of the launcher directory
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"   # two levels above CX_DIR
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}"   # fallback when RUNNER_NAME is unset; also used as the Slurm job name
+PARTITION="${CX_PARTITION:-compute}"
+NGPUS="${CX_NGPUS:-8}"   # passed to salloc as --gres=gpu:N
+TIME_MIN="${CX_TIME:-60}"   # generous: a cold enroot import of the large ROCm image
+IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}"   # node-local on MI355X
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"   # passed as --exclude when no node pin is set
+# Optional node pin. The node-local squash is only staged on some nodes, and on
+# others /var/lib/squash isn't writable (cold-import fails). Pin CI to nodes that
+# already hold the squash via CX_NODELIST (overrides the exclude list).
+NODELIST="${CX_NODELIST:-}"
+MOUNT_DIR=/ix   # where the checkout (MOUNT_SRC) is mounted inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"   # UTC timestamp: exported as CX_TS and used in cross-node result names
+
+# AMD backends/benches wired on MI355X (ROCm/CDNA4):
+#   mori        — MoRI EP dispatch/combine (the AMD EP backend)
+#   nccl        — collective primitives via rccl-tests (the ROCm nccl-tests fork)
+#   kv-cache    — KV block transfer (HIP memcpy family; capability allows amd)
+#   rl-mesh     — RL trainer<->generator mesh (torch.distributed -> RCCL on ROCm)
+#   allreduce-fw— framework all-reduce (RCCL baseline; the flashinfer one/two-shot impls are
+#                 NVIDIA-only and self-skip on the ROCm image, leaving a valid RCCL-baseline curve)
+#   copy-engine — off-SM DMA copy vs CU-kernel copy; on ROCm the DMA path IS the SDMA engine
+#                 (the AMD SDMA path), labeled copy_engine_kind=sdma in the result
+#   mori-io     — MoRI-IO RDMA p2p transfer engine (mori.io; AMD analog of NIXL) GPU0<->GPU1
+# Default mori; honor an explicit CX_BENCH within this set (the case below also accepts nccl-kv and
+# mooncake). NVIDIA-only EP backends (deepep/uccl/flashinfer/deepep-hybrid/offload/nixl) fall back to mori
+# (capability also rejects them on amd, so a dispatch of those to mi355x is a no-op the validator catches first).
+# nccl-ep IS supported on AMD: it is pure torch.distributed all_to_all_single over RCCL (the
+# cross-node EP path that host-stages where MoRI's custom RDMA aborts — goal 183).
+export CX_BENCH="${CX_BENCH:-mori}"
+case "$CX_BENCH" in
+  mori|nccl-ep|nccl|kv-cache|rl-mesh|allreduce-fw|copy-engine|mori-io|nccl-kv|mooncake) ;;   # accepted unchanged
+  *) cx_log "mi355x: CX_BENCH='$CX_BENCH' is NVIDIA-only / unsupported on AMD; using mori"; export CX_BENCH=mori ;;   # logged, then replaced by mori
+esac
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi"   # single-node labels; the cross-node run passes its own on the run_ep.py command line
+# MI355X is a shared cluster with slow cold enroot imports + node contention; the default 900s
+# per-phase wall-clock guard is too tight here (MoRI prefill at large T + a busy node times out).
+# Raise to 1800s (fits inside the 60-min salloc). Override with CX_RUN_TIMEOUT.
+export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH image=$IMAGE"
+# AMD workspace is compute-visible (the serving launcher bind-mounts it directly),
+# so no staging; the node-local squash is handled via srun below.
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+# Squash key = image reference with every / : @ # replaced by _. The sed delimiter must NOT be one
+# of those characters: a delimiter inside the bracket expression can end the regex early (GNU sed).
+SQUASH_KEY="$(printf '%s' "$IMAGE" | sed 's|[/:@#]|_|g')"
+SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh"
+# Lock lives in a guaranteed-writable per-node dir, NOT next to the squash: on some nodes
+# /var/lib/squash is root/admin-owned, so a sibling .lock can't be created (flock -> "Bad file
+# descriptor"). CX_LOCK_DIR overrides; the lock only serializes concurrent imports on one node.
+LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock"
+cx_log "squash(node-local)=$SQUASH_FILE  lock=$LOCK_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+# ---- Cross-node MI355X EP (goal 183): MoRI is RDMA-native (ionic_rdma) — it registers a symmetric
+# heap per rank and dispatches/combines over RDMA, so it spans nodes natively. CX_NODES>1 allocates
+# N nodes (pinned to the warm-squash nodes via CX_NODELIST so no cold import), imports the squash on
+# each, then multi-sruns run_ep across NODES*8 ranks (1 GPU/rank, RANK/LOCAL_RANK from SLURM_*) — the
+# same multi-srun shape the GB300 EP8 path uses. Reduced timing (MoRI wedges under sustained load).
+if [ "${CX_NODES:-1}" -gt 1 ]; then
+  NODES="${CX_NODES}"; WORLD=$((NODES * NGPUS))
+  cx_log "MI355X CROSS-NODE EP: nodes=$NODES world=$WORLD bench=$CX_BENCH (MoRI RDMA internode)"
+  if [ -n "$NODELIST" ]; then
+    JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --nodelist="$NODELIST" --nodes="$NODES" --gres=gpu:"$NGPUS" \
+              --ntasks-per-node="$NGPUS" --exclusive --cpus-per-task=16 --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+  else
+    JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --nodes="$NODES" --gres=gpu:"$NGPUS" \
+              --ntasks-per-node="$NGPUS" --exclusive --cpus-per-task=16 --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+  fi
+  [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID (multi-node) from salloc"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]"
+  # import the squash on EVERY allocated node (1 task/node). Locking is best-effort; only the lock-open error is muted (the braces keep 2>/dev/null from sticking to the shell, so enroot import errors stay visible).
+  srun --jobid="$JOB_ID" --ntasks-per-node=1 bash -c "
+    mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true
+    { exec 9>\"$LOCK_FILE\"; } 2>/dev/null; flock -w 600 9 2>/dev/null || true
+    unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1 && echo \"squash present: $SQUASH_FILE\" \
+      || { rm -f \"$SQUASH_FILE\"; enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" </dev/null; }
+  " || cx_log "WARN: multi-node squash import had issues on a node"
+  # MASTER_ADDR must be the rank-0 node's ROUTABLE IP, not its hostname: MI355X /etc/hosts aliases
+  # the hostname to 127.0.1.1 (loopback), which made gloo rendezvous fail "connect refused
+  # remote=[127.0.1.1]". scontrol NodeAddr gives the routable address; fall back to hostname (the `|| true` keeps a no-match grep from aborting under errexit+pipefail before the fallback runs).
+  _mn="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N)" | head -1)"
+  MA="$(scontrol show node "$_mn" 2>/dev/null | grep -oE 'NodeAddr=[^ ]+' | head -1 | cut -d= -f2 || true)"; [ -z "$MA" ] && MA="$_mn"; MP=29557
+  cx_log "rendezvous master node=$_mn addr=$MA:$MP"
+  # FileStore rendezvous on the shared mount: nccl-ep (pure rccl PG, no gloo) inits via file:// and
+  # sidesteps BOTH the TCPStore master-addr reach AND the gloo connectFullMesh 127.0.1.1 alias. MoRI
+  # (gloo+nccl) still consumes MASTER_ADDR; run_ep.py prefers CX_RDZV_FILE when set (harmless for mori).
+  RDZV="$MOUNT_DIR/experimental/CollectiveX/.rdzv_${JOB_ID}"; rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true
+  phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill"
+  # source _xnode_net.sh inside each rank: pins GLOO/NCCL_SOCKET_IFNAME to the routable 10.x NIC so
+  # gloo's per-rank connectFullMesh advertises the reachable iface (not the 127.0.1.1 hostname alias).
+  WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; cd /ix/experimental/CollectiveX; source runtime/_xnode_net.sh 2>/dev/null || true; exec python3 tests/run_ep.py "$@"'
+  rc=0   # status of the last failing phase; a failed phase no longer aborts before the other phases and result collection
+  for ph in $phases; do
+    out="results/${RUNNER_NAME}_${CX_BENCH}_${ph}_${TS}.json"; ph_rc=0
+    # shellcheck disable=SC2086
+    timeout -k 30 "${CX_RUN_TIMEOUT:-1800}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$WORLD" \
+      --ntasks-per-node="$NGPUS" --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+      --container-writable --container-remap-root --no-container-mount-home \
+      --container-workdir="$MOUNT_DIR/experimental/CollectiveX" --no-container-entrypoint \
+      --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",CX_RDZV_FILE="$RDZV" \
+      bash -c "$WRAP" _ --backend "$CX_BENCH" --phase "$ph" --tokens-ladder "${CX_TOKENS_LADDER:-1 2 4 8}" \
+        --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
+        --measurement-contract layout-and-dispatch-v1 --routing "${CX_ROUTING:-uniform}" \
+        --iters "${CX_ITERS:-8}" --trials "${CX_TRIALS:-1}" --warmup "${CX_WARMUP:-4}" --seed 67 \
+        --runner "$RUNNER_NAME" --topology-class mi355x-multinode-rdma --transport rdma --out "$out" </dev/null 2>&1 | tail -12 || ph_rc="${PIPESTATUS[0]}"
+    cx_log "cross-node $ph rc=$ph_rc"; [ "$ph_rc" -eq 0 ] || rc="$ph_rc"
+  done
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
+  cx_log "done — cross-node MI355X EP artifacts under results/"
+  exit "$rc"
+fi
+
+# Pin to specific nodes (CX_NODELIST) when set, else exclude the known-bad ones.
+if [ -n "$NODELIST" ]; then
+  cx_log "node pin: --nodelist=$NODELIST"
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --nodelist="$NODELIST" --gres=gpu:"$NGPUS" \
+            --exclusive --cpus-per-task=128 --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+else
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \
+            --exclusive --cpus-per-task=128 --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+fi
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+# Clear stray containers, then enroot-import to the node-local squash (flock,
+# </dev/null so a missing token can't hang). Both run on the allocated node.
+# shellcheck disable=SC2016  # $(...) must expand on the remote node, not here
+srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true
+srun --jobid="$JOB_ID" bash -c "
+  mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true
+  exec 9>\"$LOCK_FILE\" || { echo 'cannot open lock $LOCK_FILE' >&2; exit 1; }
+  flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; }
+  if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then
+    echo 'squash present: $SQUASH_FILE'
+  else
+    rm -f \"$SQUASH_FILE\"
+    enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" </dev/null
+  fi
+"
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --container-writable --container-remap-root --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them so the
+# next checkout on this runner is clean (mirrors the serving launcher).
+rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
new file mode 100644
index 000000000..d62bb7746
--- /dev/null
+++ b/experimental/CollectiveX/plan.md
@@ -0,0 +1,940 @@
+# CollectiveX — Plan
+
+> **How to read this.** This is the single canonical plan. It is **spike-first** and **scoped to `experimental/CollectiveX/`** on a branch — nothing in the production serving path changes until a promotion decision is made later. Part 1 is background (what CollectiveX is, reconstructed from team discussion). Part 2 is the implementation plan. Where this plan says "now," it means the Milestone 0 spike; "later" items (GitHub workflow, database, app frontend) are deliberately deferred. All repository references (runners, launchers, workflows, matrix logic, the `experimental/` charter) were verified against the live InferenceX repo — see References.
+
+---
+
+# Part 1 — Background
+
+## What it is
+
+CollectiveX is a benchmarking workstream under the InferenceX umbrella. It measures **collective communication** and **MoE dispatch/combine**, and performs **apples-to-apples, cross-vendor comparison of expert-parallel (EP) libraries** across NVIDIA and AMD (TPU later). The intended deliverables are an **OSS benchmark project** and a **public explainer article** — a credible cross-vendor collective benchmark plus the story around it.
+
+## Why
+
+Existing public benchmarks don't offer trustworthy, like-for-like collective/EP comparison across vendors. CollectiveX fills that gap by reusing InferenceX's runner and cluster infrastructure to produce reproducible, provenance-tagged results.
+
+## Current state
+
+- An initial MVP exists: it collected collective and kernel shapes and produced MoE dispatch/combine results on NVIDIA.
+- **Normal mode works; low-latency (LL) mode is blocked** on IBGDA enablement — a direct GPU↔NIC data-and-control path over PCIe that removes CPU coordination and simplifies MoE dispatch/combine collectives — which depends on cluster-networking work outside this project.
+- The main near-term enabler is NVIDIA networking / IBGDA; the AMD EP stack and AMD networking (Ultra Ethernet) are the cross-vendor counterpart.
+
+---
+
+# Part 2 — Implementation plan
+
+## Implementation status (built)
+
+The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that:
+
+- **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
+- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `runtime/run_in_container.sh`.
+- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
+- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `tests/ep_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `tests/ep_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`).
+
+This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
+
+## Scope and placement
+
+CollectiveX starts as an **experimental project on its own branch**, fully contained under `experimental/CollectiveX/`:
+
+```bash
+git switch main
+git pull --ff-only
+git switch -c collectivex
+mkdir -p experimental/CollectiveX
+```
+
+This matches the repository's intent: `experimental/` is explicitly non-core ("experimental WIP code that is mostly Claude Code generated… not intended for production use or as part of the official InferenceMAX results").
+
+For the experimental phase, **everything stays inside `experimental/CollectiveX/**`**. Do **not** modify:
+
+```text
+benchmarks/
+runners/
+utils/
+.github/configs/
+perf-changelog.yaml
+InferenceX-app
+```
+
+The only eventual exception is a minimal workflow dispatcher under `.github/workflows/` (because executable workflows must live there); all real CollectiveX logic, schemas, launchers, and processing stay under `experimental/CollectiveX/`.
+
+**This supersedes any notion of CollectiveX becoming a top-level InferenceX subsystem or extending the production serving matrix up front.** Promotion — into core InferenceX, into a dedicated repo, or into InferenceX-app's database/frontend — is an explicit *later* decision (Milestone 4), made only after the benchmark contract has stabilized on real hardware.
+
+### What InferenceX already gives us
+
+InferenceX's existing execution model is almost exactly the control plane CollectiveX needs:
+
+1. Generate and strictly validate a matrix on a GitHub-hosted runner.
+2. Fan jobs out to named or labelled self-hosted runners.
+3. Those listeners submit work to Slurm (or launch Docker locally).
+4. Normalize outputs.
+5. Upload artifacts.
+6. Aggregate and dispatch ingestion to the dashboard.
+
+`e2e-tests.yml` already divides generated configs into job families and invokes reusable single-node and multi-node workflows; `benchmark-tmpl.yml` cleans up resources, checks out the selected ref, **derives the launcher from the runner name**, launches the job, validates outputs, and uploads normalized results. Runner listeners live on cluster login/controller nodes while jobs run on compute nodes via Slurm; runner names/labels are load-bearing — the name prefix selects the launcher and exact names/SKU labels control scheduling.
+
+CollectiveX reuses all of this, but enters through **CollectiveX-specific launchers** rather than threading fake models through the serving launchers (see Cluster reuse).
+
+## Architecture
+
+Four planes, cleanly separated:
+
+- **Control plane:** scheduling, runners, cleanup, artifact movement, workflow metadata (reused from InferenceX).
+- **Benchmark plane:** collective semantics, backend invocation, correctness, timing.
+- **Data plane:** canonical result records, raw per-rank samples, topology and provenance.
+- **Presentation plane:** comparable subsets, charts, history, diagnostics.
+
+Data flow within the experimental directory:
+
+```text
+Portable shape definitions
+          +
+Backend definitions
+          +
+Target/cluster definitions
+          ↓
+CollectiveX matrix resolver
+          ↓
+Resolved shards
+          ↓
+Existing InferenceX self-hosted runner
+          ↓
+experimental/CollectiveX/launchers/*
+          ↓
+Backend adapter  (NCCL / RCCL / DeepEP / AITER / MoRI / …)
+          ↓
+Versioned result bundle
+          ↓
+Aggregator + regression checker
+          ↓
+Static experimental report   →  (later) InferenceX-app ingestion → Postgres → /collectives
+```
+
+### Target structure at promotion (Milestone 4)
+
+This packaged layout is the **promotion target**, not the spike. Milestone 0 uses the light layout in the rollout section below (`run_nccl.py` / `run_deepep.py` / `env_capture.py` / `plot.py` + flat `results/`); the structure here is what CollectiveX grows into *if* it is promoted out of `experimental/`.
+
+```text
+InferenceX/
+├── experimental/
+│   ├── README.md
+│   └── CollectiveX/
+│       ├── README.md
+│       ├── DESIGN.md
+│       ├── ROADMAP.md
+│       ├── pyproject.toml
+│       ├── Makefile
+│       │
+│       ├── src/
+│       │   └── collectivex/
+│       │       ├── __init__.py
+│       │       ├── cli.py
+│       │       ├── config/
+│       │       │   ├── models.py
+│       │       │   ├── loader.py
+│       │       │   ├── resolver.py
+│       │       │   └── matrix.py
+│       │       ├── benchmark/
+│       │       │   ├── harness.py
+│       │       │   ├── timing.py
+│       │       │   ├── correctness.py
+│       │       │   ├── routing.py
+│       │       │   └── metrics.py
+│       │       ├── backends/
+│       │       │   ├── base.py
+│       │       │   ├── fake.py
+│       │       │   ├── nccl_tests.py
+│       │       │   ├── rccl_tests.py
+│       │       │   ├── deepep.py
+│       │       │   └── framework_ep.py
+│       │       ├── cluster/
+│       │       │   ├── inventory.py
+│       │       │   ├── capabilities.py
+│       │       │   ├── environment.py
+│       │       │   └── launcher.py
+│       │       ├── results/
+│       │       │   ├── models.py
+│       │       │   ├── writer.py
+│       │       │   ├── aggregate.py
+│       │       │   ├── compare.py
+│       │       │   └── redact.py
+│       │       └── report/
+│       │           ├── build.py
+│       │           └── templates/
+│       │
+│       ├── configs/
+│       │   ├── suites/
+│       │   │   ├── smoke.yaml
+│       │   │   ├── primitives.yaml
+│       │   │   ├── moe-decode.yaml
+│       │   │   ├── moe-prefill.yaml
+│       │   │   └── full.yaml
+│       │   ├── shapes/
+│       │   │   ├── synthetic/
+│       │   │   └── traced/
+│       │   ├── backends/
+│       │   ├── targets/
+│       │   └── clusters.yaml
+│       │
+│       ├── launchers/
+│       │   ├── common.sh
+│       │   ├── launch_b200-dgxc.sh         # B200 single node
+│       │   ├── launch_b200-dgxc-slurm.sh   # B200 multinode
+│       │   └── launch_gb200-nv.sh          # GB200 NVL72
+│       │
+│       ├── schemas/
+│       │   ├── case-v1.schema.json
+│       │   ├── result-v1.schema.json
+│       │   ├── manifest-v1.schema.json
+│       │   └── environment-v1.schema.json
+│       │
+│       ├── scripts/
+│       │   ├── bootstrap.sh
+│       │   ├── run_suite.sh
+│       │   ├── run_shard.sh
+│       │   └── build_report.sh
+│       │
+│       ├── tests/
+│       │   ├── fixtures/
+│       │   ├── test_config.py
+│       │   ├── test_matrix.py
+│       │   ├── test_parsers.py
+│       │   ├── test_correctness.py
+│       │   └── test_comparability.py
+│       │
+│       └── docs/
+│           ├── BENCHMARK_CONTRACT.md
+│           ├── BACKEND_ADAPTER.md
+│           ├── SHAPE_REGISTRY.md
+│           ├── RESULT_FORMAT.md
+│           ├── FRONTEND.md
+│           └── PROMOTION_CRITERIA.md
+│
+└── .github/workflows/
+    └── collectivex-experimental.yml   # Added only when cluster CI begins (Milestone 2)
+```
+
+> Note: launcher names mirror the real runner-name prefixes. The spike adds the three NVIDIA launchers above; AMD (`launch_mi355x-amds.sh`) and others follow.
+
+## Benchmark model — keep four concepts separate
+
+CollectiveX needs its **own** schema. Do **not** reuse or extend the serving matrix, which is built around model / ISL / OSL / framework / TP / EP / concurrency and lives in `utils/matrix_logic/generate_sweep_configs.py`. Representing collectives with fake model names, `ISL=0`, or overloaded concurrency fields would create permanent technical debt. CollectiveX gets its own matrix logic (in the packaged layout, `src/collectivex/config/matrix.py`) — introduced with the workflow at Milestone 2, not the spike — rather than touching `utils/matrix_logic/generate_sweep_configs.py`.
+
+The model keeps four concepts independent:
+
+**Shape** — the logical communication workload:
+
+```text
+operation, message size, tokens per rank, hidden size, top-k,
+expert count, routing distribution, dtype, phase
+```
+
+**Backend** — the implementation under test:
+
+```text
+NCCL, RCCL, DeepEP, AITER, MoRI, framework-native EP, reference implementation
+```
+
+**Target** — where and how it runs:
+
+```text
+runner type, cluster, nodes, GPUs per node, rank placement,
+fabric, container image, transport capabilities
+```
+
+**Suite** — a curated selection of shape × backend × target combinations. Keeping these separate prevents copying the same DeepSeek/MiniMax shape into every NVIDIA and AMD configuration.
+
+### Portable definitions
+
+Shape:
+
+```yaml
+schema-version: 1
+shape-id: moe.decode.h7168.top8.e256.t64.uniform.v1
+
+kind: moe
+phase: decode
+operation: dispatch-combine
+
+shape:
+  tokens-per-rank: 64
+  hidden-size: 7168
+  top-k: 8
+  num-experts: 256
+  dispatch-dtype: fp8
+  combine-dtype: bf16
+  routing:
+    distribution: uniform
+    seed: 67
+  expert-alignment: 16
+```
+
+Backend:
+
+```yaml
+backend-id: deepep-normal
+backend: deepep
+mode: normal
+
+source:
+  repository: deepseek-ai/DeepEP
+  ref: pinned-commit
+
+settings:
+  async-overlap: false
+  num-comm-sms: standardized
+  qp-count: auto
+```
+
+Target:
+
+```yaml
+target-id: b200-dgxc-4n
+runner-type: b200-multinode
+cluster-id: b200-dgxc
+
+resources:
+  nodes: 4
+  gpus-per-node: 8
+  exclusive: true
+
+placement:
+  ranks-per-node: 8
+  rank-order: contiguous
+
+capabilities:
+  rdma: true
+  ibgda: experimental
+  nvshmem: true
+```
+
+Suite:
+
+```yaml
+suite-id: moe-decode-smoke
+
+shapes:
+  - moe.decode.h7168.top8.e256.t64.uniform.v1
+
+backends:
+  - deepep-normal
+  - deepep-low-latency
+
+targets:
+  - b200-dgxc-2n
+
+measurement:
+  warmup-iterations: 20
+  measured-iterations: 200
+  trials: 3
+  correctness: full
+```
+
+### Case identity
+
+A **case** is one immutable, versioned point; its natural key composes three of the four concepts (backend, shape, target):
+
+```text
+case-id = <backend-id> __ <shape-id> __ <target-id>
+e.g.  deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n
+      nccl__allreduce.fp16.logsweep.v1__b200-dgxc-2n
+```
+
+A shape must never silently change; a newly extracted distribution gets a new versioned `shape-id`.
+
+**Required shape fields — primitives:** operation; logical element count; datatype; input/output bytes; in-place vs out-of-place; reduction op (where applicable); world size; rank placement; host-driven vs device-driven launch; blocking/synchronization semantics.
+
+**Required shape fields — MoE (additional):** tokens per rank; hidden size; top-k; number of experts; EP size; dispatch and combine dtypes; routing distribution; expert alignment/padding; capacity constraints; quantization scale representation; cached vs recomputed routing layout; communication-SM count; async-overlap mode. DeepEP shows why these must be first-class — its interface takes tokens/rank, hidden size, top-k, expert count, FP8 mode and comm-SM settings, and exposes async dispatch/combine.
+
+### Shape registry
+
+Two independent shape sources:
+
+**Synthetic** — for continuous curves and hardware characterization (logarithmic byte sweep for primitives; token-count sweep for MoE; EP-scaling sweep; uniform and controlled-skew routing; intranode and internode placements; decode-oriented and prefill-oriented regimes). Don't build every Cartesian combination; define named suites (`primitive-latency-v1`, `primitive-bandwidth-v1`, `moe-decode-v1`, `moe-prefill-v1`, `moe-skew-v1`, `scaleout-v1`).
+
+**Trace-derived** — extracted from real InferenceX runs/profiles:
+
+```text
+models/deepseek-v4/decode/<shape-id>
+models/minimax-m3/decode/<shape-id>
+models/kimi-k2.7/prefill/<shape-id>
+```
+
+Each traced shape retains: source workflow run; model/config; phase; layer/layer-group; observed token histogram; routing skew; concurrent collective count; framework version; extraction-tool version. InferenceX already has a targeted profiling workflow (`profile.yml`) with optional MoE debug output and a separate trace-storage path — a natural source for real shapes rather than only guessed synthetic inputs.
+
+## Benchmark layers and comparison classes
+
+| Layer | Purpose | Examples |
+|---|---|---|
+| **L0 Environment** | Prove the cluster is benchmarkable | topology, NIC/GPU state, peer access, RDMA, IBGDA capability, version capture |
+| **L1 Primitive collectives** | Characterize the raw communication substrate | send/recv, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv |
+| **L2 MoE communication** | Compare real EP libraries | dispatch, combine, dispatch+combine round trip, normal and low-latency modes |
+| **L3 Integrated pipelines** | Communication in realistic operator sequences | route → permute → dispatch → grouped GEMM → combine → unpermute |
+| **L4 E2E correlation** | Explain InferenceX serving performance | isolated CollectiveX result linked to the corresponding InferenceX run/profile |
+
+The MVP concentrates on **L1 and L2**. L3 overlaps OperatorX and comes after the contracts are stable; L4 is the eventual tie-back to serving.
+
+**L0 — Environment validation** (before measuring anything): GPU count/identity; GPU/NIC topology; CUDA/ROCm version; driver version; NCCL/RCCL version; RDMA device visibility; peer-access matrix; IBGDA/SHMEM capability; container digest; clock/power state; selected network interfaces. A failed probe yields one clear `environment-invalid` result, not dozens of misleading backend failures.
+
+**L1 — Primitives:** send/receive, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv. Use vendor test programs where possible rather than rewriting primitives. Measure two regions separately: latency (bytes→low KiB) and bandwidth (MiB→GiB).
+
+**L2 — MoE collectives:** dispatch, combine, dispatch+combine. Dimensions: tokens/rank, hidden size, top-k, expert count, EP size, dispatch dtype, combine dtype, routing skew, normal vs low-latency, comm-SM count, node count.
+
+### Three comparison classes
+
+Every result is tagged with exactly one comparison class, and results from different classes must never be silently mixed on one chart:
+
+| Class | Meaning |
+|---|---|
+| `standardized` | Matched logical shape **and** fixed resource budget — same shape, topology, dtype, correctness contract, allowed comm-SMs, and timing boundaries. The main apples-to-apples comparison. |
+| `backend-optimized` | Same logical output, but each library uses its recommended comm-SMs / protocols / QP count / buffer sizing / graph capture / tuning. Answers "what is the best each stack can do?" |
+| `framework-integrated` | The actual path used by SGLang / vLLM / TensorRT-LLM / Dynamo. Connects to InferenceX; not a pure microbenchmark. |
+
+### Comparability key
+
+Every result gets a machine-generated comparison key; rows with different keys are not connected on the same curve by default:
+
+```text
+operation, shape ID, dtype, world size, node count, rank placement,
+routing distribution, comparison class, measurement contract version, topology class
+```
+
+## Measurement and correctness
+
+### Timing boundaries
+
+Record separately — never report one latency that sometimes includes JIT and sometimes doesn't:
+
+```text
+1. communicator creation
+2. buffer allocation and registration
+3. first invocation / JIT
+4. warmed steady-state invocation
+5. host launch time
+6. GPU completion time
+7. optional end-to-end framework-visible time
+```
+
+Per measured iteration: synchronize before starting (unless explicitly testing queued execution); use GPU events for device duration and host monotonic time for API/launch duration; retain per-rank measurements; aggregate only after rank-level data is stored; report the **slowest rank** as well as the average.
+
+### Correctness as a hard gate
+
+A result is `valid` only after correctness passes. A fast result that fails correctness stays visible as `invalid` — never silently dropped.
+
+Primitive checks: deterministic input; expected reduction result; guard regions around buffers; in-place and out-of-place checks; dtype-specific tolerances.
+
+MoE checks: token conservation; correct expert assignment; correct routing weights; valid permutation metadata; dispatch output vs reference; combine output vs reference; no padded-token leakage; deterministic routing hash.
+
+Failed results remain in artifacts, e.g.:
+
+```json
+{
+  "status": "invalid",
+  "correctness_passed": false,
+  "error": "combine result exceeded bf16 tolerance"
+}
+```
+
+### Routing distributions
+
+At minimum: uniform; single-hot/worst-case concentration; Zipf-like skew; bounded imbalance; replayed real histogram. Store the routing seed and the generated assignment hash.
+
+### Metrics
+
+| Category | Metrics |
+|---|---|
+| Latency | p50, p90, p95, p99, min, max |
+| Rank behavior | slowest-rank latency, rank spread, coefficient of variation |
+| Primitive throughput | algorithm bandwidth, bus bandwidth, effective bytes/s |
+| MoE throughput | tokens/s, logical payload GB/s, dispatch and combine separately |
+| Efficiency | bandwidth relative to declared topology bottleneck |
+| Host overhead | API launch time, CPU utilization where available |
+| GPU overhead | communication SM count, GPU active time, optional power |
+| Memory | persistent buffer bytes, peak temporary bytes |
+| Overlap | standalone comm, standalone compute, overlapped duration, overlap efficiency |
+| Reliability | initialization failures, hangs, retries, correctness failures |
+| Provenance | all software, image, driver, firmware and topology identifiers |
+
+### Bandwidth definitions
+
+NCCL `algbw`/`busbw` are stored but not treated as universal (NCCL applies operation-specific correction factors). MoE libraries often report **logical bottleneck bandwidth** (may include local-rank traffic or exclude metadata/padding; DeepEP explicitly publishes logical bandwidth). Store separate fields, and use `null` rather than a deceptive inference when a backend can't expose physical bytes:
+
+```text
+logical_payload_bytes
+allocated_payload_bytes
+estimated_link_bytes
+metadata_bytes
+padding_bytes
+```
+
+## Result and artifact format
+
+Each shard emits a versioned bundle:
+
+```text
+output/
+├── manifest.json
+├── cases.json
+├── results.jsonl
+├── rank-samples.jsonl.gz
+├── summary.json
+├── environment/
+│   ├── gpu.json
+│   ├── network.json
+│   ├── topology.json
+│   └── software.json
+├── raw/
+│   ├── stdout.log
+│   ├── stderr.log
+│   └── backend-output/
+├── commands/
+│   └── reproduce.sh
+└── profiles/
+```
+
+**Manifest** (invariant run-level metadata): schema version; workflow run + attempt; source SHA/ref; cluster ID; runner; Slurm job ID; node count; topology fingerprint; image digest; backend commit/build; start/end timestamps; redaction version.
+
+**Result row:**
+
+```json
+{
+  "schema_version": 1,
+  "case_id": "deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n",
+  "status": "valid",
+  "trial": 1,
+  "backend": "deepep",
+  "mode": "normal",
+  "comparison_class": "standardized",
+  "metrics": {
+    "latency_us_p50": 0,
+    "latency_us_p99": 0,
+    "slowest_rank_us_p50": 0,
+    "logical_bandwidth_gbps": 0,
+    "tokens_per_second": 0,
+    "rank_spread_pct": 0,
+    "persistent_buffer_bytes": 0
+  },
+  "correctness": { "passed": true, "max_abs_error": 0, "max_rel_error": 0 }
+}
+```
+
+Use an explicit `schema_version` from the beginning — do not repeat the app's historical need to infer schema version from whether a field happens to exist.
+
+## Backend adapters
+
+Each adapter implements a small contract:
+
+```python
+class CollectiveBackend:
+    def probe(self, environment) -> CapabilityReport: ...
+    def prepare(self, case, workdir) -> PreparedCommand: ...
+    def run(self, prepared, launcher) -> RawRun: ...
+    def parse(self, raw_run) -> list[RankSample]: ...
+    def validate(self, case, raw_run) -> CorrectnessReport: ...
+    def describe(self) -> BackendProvenance: ...
+```
+
+**Tier 0 — communication baselines:** NVIDIA `nccl-tests`, ROCm `rccl-tests`, optionally PyTorch distributed as a common-API baseline. Don't rewrite primitives from scratch — `nccl-tests` already supports multi-node, warmups, correctness checking (`-c 1`), per-rank aggregation, device-driven implementations, and separate CPU-time reporting. *(Confirm whether the installed build emits JSON; if not, parse the text table.)*
+
+**Tier 1 — MoE dispatch/combine:** upstream DeepEP, ROCm DeepEP, and the NVIDIA/AMD EP paths already used by the InferenceX serving stacks. **Version pins are first-class.** Upstream DeepEP V2 changed NVSHMEM→NCCL, unified high-throughput and low-latency APIs, changed buffer behavior, and removed a previous zero-SM LL mode; ROCm's port differs in maturity, NIC variants, and rocSHMEM dependencies. DeepEP is **built at job setup** (via `rebuild-deepep.sh`, resolved by srt-slurm), not shipped in the image — its build time and `aarch64` (GB200) feasibility are tracked spike risks. A chart labelled only "DeepEP" is therefore ambiguous — store:
+
+```text
+backend name, upstream/fork, git commit, API generation,
+transport backend, build flags, runtime library versions, container digest
+```
+
+**Tier 2 — additional optimized stacks (later):** MSCCL++, AITER comm/fusion paths, MoRI/Pollara, NVSHMEM/rocSHMEM microbenchmarks, framework-native fused collectives.
+
+## Rollout — spike-first
+
+**Spike-first.** No schema, Pydantic model, or comparison contract is frozen until one real, correctness-gated number exists on real hardware. The first milestone is a single end-to-end spike on **two NVIDIA topologies, B200 and GB200**, chosen because they exercise the two transport regimes that matter: B200 is an 8-GPU NVLink island with CX-7 InfiniBand between nodes; GB200 is an NVL72 multi-node-NVLink (MNNVL) domain. Running the same collective across both is itself the first headline result, and it forces the provenance and comparison-class machinery to be real from line one. The schema is the spike's *output*, extracted from the artifacts it produces — not its input. AMD and all platform work (workflow, DB, frontend) follow.
+
+### Milestone 0 — NVIDIA B200 + GB200 spike
+
+One milestone, NVIDIA-only, end to end. This collapses the former "design contract," "CPU framework," "primitive NVIDIA baseline," and the NVIDIA half of "MoE MVP" into a single vertical slice that produces real numbers on real fabric.
+
+Scaffolding — deliberately light, matching `experimental/` convention (bare scripts + flat JSON + a plot; no package / Pydantic / JSON-schemas yet — those arrive at the contract freeze):
+
+```text
+experimental/CollectiveX/
+  README.md
+  run_nccl.py        # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON)
+  tests/run_ep.py    # EP dispatch/combine sweep (DeepEP/MoRI); dispatch & combine timed separately
+  env_capture.py     # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json
+  plot.py            # matplotlib, like token_position_decode_slo/*/plot_*.py
+  launchers/
+    common.sh
+    launch_b200-dgxc.sh         # B200 single node  (b200-dgxc runner → 8-GPU NVLink island, x86_64)
+    launch_b200-dgxc-slurm.sh   # B200 multinode    (b200-multinode runner → CX-7 IB spine)
+    launch_gb200-nv.sh          # GB200             (gb200 runner → NVL72 MNNVL, aarch64, 4 GPU/node)
+  results/*.json     # flat, hand-verifiable
+```
+
+Reuse existing patterns rather than reinventing them: `experimental/dsv32/bench.py` for `torch.cuda.Event` timing and stdout environment capture, and `experimental/token_position_decode_slo/glm-5/{bmk_*_sbatch.sh,plot_sla_frontier.py}` for Slurm orchestration + plotting. Mirror the runner→launcher routing convention (`bash ./launchers/launch_${RUNNER_NAME%%_*}.sh`) so the runner name selects the CollectiveX launcher the same way it selects the serving launcher.
+
+**DeepEP is not prebuilt in any image.** The serving recipes build it at job setup via `setup_script: rebuild-deepep.sh` (resolved by srt-slurm; see `benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`). The spike reuses that same rebuild path — on B200 (x86_64) first. Pin images by digest from `.github/configs/nvidia-master.yaml`: B200 `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b`; GB200 `lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc` (an unpinned nightly today — capture its digest before relying on it).
+
+What it measures:
+
+```text
+Primitives (stock nccl-tests, -c 1 for correctness) — on BOTH B200 and GB200:
+  all-reduce, all-gather, reduce-scatter, all-to-all
+  latency regime (bytes→KiB) and bandwidth regime (MiB→GiB)
+  B200  : 8 GPU/node (x86_64); 1 node (NVLink island) and 2 nodes (cross CX-7 IB)
+  GB200 : 4 GPU/node (aarch64); 1 node and 2+ nodes — all still inside the NVL72 NVLink (MNNVL) domain
+
+MoE (DeepEP, normal mode only — LL mode is the known-broken/blocked path, out of scope):
+  one decode-shaped dispatch+combine: tokens-per-rank=64, hidden=7168,
+  top-k=8, experts=256, dispatch fp8
+  correctness: token conservation + combine vs a reference implementation
+  B200 (x86_64) first; GB200 DeepEP is a fast-follow once the aarch64 rebuild-deepep path is proven
+```
+
+The headline is the **same NCCL primitive shape on both topologies**: B200's 2-node path crosses CX-7 InfiniBand, while GB200's stays on NVL72 NVLink (MNNVL). That IB-vs-MNNVL contrast at a matched logical shape is the result worth publishing. (nccl-tests and DeepEP must be built for `aarch64` on GB200 — the reason DeepEP is B200-first.)
+
+Provenance captured on every row from the first run — non-negotiable even in a spike, because it is what makes the B200-vs-GB200 number defensible:
+
+```text
+topology-class       b200-nvlink-island(+cx7-ib)  |  gb200-nvl72-mnnvl
+transport actually used   (NVLink / IB / NVSHMEM-IBGDA), derived from flags + measured behavior
+transport env set/recorded:
+  B200  : NCCL_CUMEM_ENABLE=1
+  GB200 : NCCL_CUMEM_ENABLE=1, NCCL_MNNVL_ENABLE=1, MC_FORCE_MNNVL=1
+  (also seen in serving: NCCL_P2P_LEVEL=NVL, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK)
+comm-SM count, QP count where applicable
+backend commit + API generation + build flags
+container digest, CUDA / driver / NCCL versions
+comparison-class tag (standardized where shape, dtype and SM budget match)
+```
+
+These flags come from validated GB200 serving recipes (`…/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`); MNNVL is GB200/GB300-only, which is exactly what makes the transport differ from B200.
+
+Output: a result bundle on disk (`manifest.json`, `results.jsonl`, `environment/`, `raw/`, `commands/reproduce.sh`). Hand-verify the first rows; do not build a generated Pydantic contract yet.
+
+Exit criteria:
+
+* real NCCL latency + bandwidth curves on **both** B200 and GB200, correctness-passed (the headline)
+* one DeepEP dispatch+combine number (normal mode) on **B200**, correctness-passed; GB200 DeepEP as the immediate fast-follow
+* every row carries topology-class, transport, comparison-class and full provenance
+* a B200-vs-GB200 side-by-side that the comparison key permits **and labels as topology-class-differing** — that labeled comparison is the intended result, not an accident
+* **only now** freeze the schema (`CollectiveCase` / `CollectiveResult` / manifest), extracted from these artifacts
+
+Explicitly out of scope for the spike: AMD, IBGDA low-latency mode, GitHub Actions, database, frontend, trace-derived shapes, and the fake backend as a deliverable (keep a trivial one only if it speeds offline tests).
+
+### Milestone 1 — AMD parity
+
+Bring the AMD side up against the schema the spike froze — not in parallel with it:
+
+```text
+RCCL-tests adapter (mirror the nccl-tests text-table parser)
+one AMD launcher (launch_mi355x-amds.sh)
+one AMD MoE dispatch/combine backend (DeepEP ROCm / AITER / MoRI)
+equivalent shapes + identical result contract
+first cross-vendor (NVIDIA vs AMD) comparison
+```
+
+Record the AMD transport stack (rocSHMEM, MoRI-IO / Pollara, NIC variant) with the same provenance rigor the spike established. An unlabeled "DeepEP" row compared across vendors is meaningless.
+
+### Milestone 2 — GitHub workflow
+
+Add (orchestration only; see GitHub workflow design below):
+
+```text
+collectivex-experimental.yml
+preflight
+canary
+matrix sharding
+artifact collection
+regression comparison
+static report artifact
+```
+
+Do not connect it to `perf-changelog.yaml`.
+
+### Milestone 3 — Trace-derived shapes
+
+Extract representative shapes from InferenceX profiles (DeepSeek V4, MiniMax M3, Kimi). Every traced shape must retain: source workflow run; source configuration; framework version; model phase; extraction-tool version; routing-histogram hash.
+
+### Milestone 4 — Promotion decision
+
+Only then decide whether to: keep CollectiveX permanently experimental; move it into core InferenceX; extract it into a dedicated repository; or integrate its data into InferenceX-app (database + `/collectives` frontend).
+
+### First PRs (the spike)
+
+The spike lands as a few small PRs, each producing something runnable — not a docs-and-schema PR:
+
+```text
+1. Scaffold + NCCL on B200 single node
+   run_nccl.py (text-table parser), env_capture.py, plot.py,
+   launchers/launch_b200-dgxc.sh, results/*.json
+   → lands when it emits a real all-reduce curve with provenance from an 8-GPU B200
+
+2. B200 multinode + GB200
+   launchers/launch_b200-dgxc-slurm.sh, launchers/launch_gb200-nv.sh
+   → lands when the same primitive runs on 2-node B200 (cross-IB) and on GB200 NVL72 (MNNVL),
+     each tagged with topology-class and transport (aarch64 build for GB200)
+
+3. DeepEP dispatch+combine — B200 first
+   tests/ep_deepep.py, routing generator + reference combine for correctness,
+   reusing rebuild-deepep at job setup
+   → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow
+
+4. Freeze the contract
+   extract the case / result / manifest schema from the bundles produced in 1–3;
+   add fixtures captured from real output — this is where the packaged structure begins
+```
+
+The first objective is a real, provenance-tagged, correctness-gated number on two NVIDIA topologies — the contract is the spike's output, not its foundation.
+
+## Cluster reuse and capability inventory
+
+### What to reuse
+
+Existing self-hosted runner registrations; exact runner labels; Slurm access from runner hosts; checkout and artifact patterns; resource-cleanup strategy; repository secrets; container caches where appropriate. The runner inventory (`.github/configs/runners.yaml`) already enumerates H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X fleets and groups such as `h200-multinode`, `b200-multinode`, individual nodes, etc. CollectiveX **reads** this file rather than duplicating runner names.
+
+### What not to reuse directly
+
+Do not call the serving launchers (`runners/launch_${RUNNER_NAME%%_*}.sh`) — they carry model-serving assumptions (model paths, framework setup, result naming). Mirror the **selection convention** with CollectiveX launchers instead:
+
+```bash
+bash experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh
+```
+
+Each CollectiveX launcher handles only: Slurm allocation; container image; mounts; network environment; rank launch; result copy-back; cleanup. There are **two launch paths**, mirroring the serving side: **single-node** B200 mirrors the `salloc … --gres=gpu:N --exclusive … && srun --container-image=<enroot squash>` pattern in `runners/launch_b200-dgxc.sh`; **multi-node** B200/GB200 drives **srt-slurm** (`srtctl apply -f <recipe>`), which already knows how to rebuild DeepEP and set the MNNVL env — so the CollectiveX GB200 launcher is a thin wrapper handing srt-slurm a CollectiveX recipe, not a from-scratch sbatch. (Later, common Slurm/container functions can be factored into a shared lib used by both systems.)
+
+> Runner-name subtlety to handle in `inventory.py`: one physical cluster can appear under multiple prefixes — `b200-dgxc_NN` routes to `launch_b200-dgxc.sh` (single-node) while `b200-dgxc-slurm_N` (label `b200-multinode`) routes to `launch_b200-dgxc-slurm.sh`. One fabric domain can therefore span several runner labels.
+
+### Capability overlay
+
+`inventory.py` loads `../../../.github/configs/runners.yaml` and combines it with a CollectiveX capability overlay — one source of truth for runner names, CollectiveX metadata kept isolated:
+
+```yaml
+b200-multinode:
+  launcher: b200-dgxc-slurm
+  vendor: nvidia
+  hardware: b200
+  topology-class: b200-nvlink-cx7
+  fabric-domain: b200-dgxc-main
+  gpus-per-node: 8
+  arch: x86_64
+  max-nodes: 16
+  scheduler: slurm
+  container-runtime: enroot-pyxis
+  capabilities:
+    nccl: true
+    deepep: true                # built at job setup via rebuild-deepep, not prebuilt
+    rdma: true
+    nvshmem: true
+    ibgda: experimental         # capability present ≠ currently validated
+  scheduling:
+    exclusive-nodes: true
+    max-parallel-shards: 1
+
+gb200:
+  launcher: gb200-nv
+  vendor: nvidia
+  hardware: gb200
+  topology-class: gb200-nvl72-mnnvl
+  gpus-per-node: 4              # NVL72 compute tray
+  arch: aarch64                 # nccl-tests + DeepEP must build for aarch64
+  scheduler: srt-slurm
+  transport-env: { NCCL_CUMEM_ENABLE: 1, NCCL_MNNVL_ENABLE: 1, MC_FORCE_MNNVL: 1 }
+  capabilities:
+    nccl: true
+    deepep: true                # rebuilt at setup; aarch64 path is a tracked risk
+    mnnvl: true                 # GB200/GB300 only
+    ibgda: experimental
+```
+
+`fabric-domain` is essential: two jobs on separate compute nodes may still contend for the same leaf/spine network, so **GitHub concurrency is keyed by fabric domain, not GPU SKU**. The inventory distinguishes hardware capability, software currently installed, and feature state (known-good vs experimental vs temporarily broken) — IBGDA support and "IBGDA low-latency currently validated" are different properties.
+
+**Operational coexistence with the serving sweep.** `b200-multinode` has only three runners (`b200-dgxc-slurm_7/8/9`), **shared with the production serving sweeps**, and srt-slurm allocations are long. Exclusive nodes + `max-parallel-shards: 1` + fabric-domain serialization together mean that CollectiveX and the serving sweep contend for the same scarce runners. Decide the scheduling/coexistence policy (off-hours windows? a dedicated runner?) before enabling any recurring CollectiveX suite, rather than discovering the contention in CI.
+
+## GitHub workflow design (Milestone 2)
+
+When cluster CI begins, add one small orchestration-only file — `.github/workflows/collectivex-experimental.yml` — with no benchmarking logic:
+
+```text
+validate → resolve matrix → preflight canaries → benchmark shards
+→ aggregate → compare against baseline → build static report → upload artifacts
+```
+
+Triggers while on the branch:
+
+```yaml
+on:
+  push:
+    branches: [ collectivex ]
+    paths:
+      - experimental/CollectiveX/**
+      - .github/workflows/collectivex-experimental.yml
+  pull_request:
+    paths:
+      - experimental/CollectiveX/**
+      - .github/workflows/collectivex-experimental.yml
+```
+
+Later, after a minimal dispatcher exists on `main`, add `workflow_dispatch` with inputs: `ref, suite, target, backend, shape, profile` (plus comparison class, mode — normal, LL, or both — and a dry-run flag).
+
+Jobs:
+
+1. **Validate** — install the package; validate all suite/shape/backend/cluster YAML; confirm runner references exist in `runners.yaml`; reject unknown fields; emit the resolved run plan as an artifact. (Match InferenceX's strict Pydantic practice — models reject extra fields.)
+2. **Compile and shard** — **do not** generate one job per benchmark point. Group cases by `cluster, node count, GPU placement, container image, backend build, transport mode, fabric domain, profiler requirement`. A shard runs many compatible points under one Slurm allocation (avoids thousands of matrix jobs, repeated communicator init, queue latency, repeated container import). Bounded runtime; record per-case failures unless the cluster itself is unhealthy.
+3. **Preflight** — confirm GPU count; validate peer access; enumerate NICs; test RDMA/device visibility; verify backend libraries; run a tiny correctness case; capture topology/software. A failed preflight marks the whole shard `environment-invalid` rather than manufacturing dozens of backend failures.
+4. **Canary** — for each `(cluster, backend, mode)` group, run one small representative case; launch the larger matrix only after it passes (mirrors InferenceX's canary-before-full-sweep).
+5. **Benchmark** (`collectivex-benchmark-tmpl.yml`) — run on the resolved runner label; unique Slurm job name from workflow/attempt/shard; exclusive nodes; serialize/limit by `fabric-domain`; call the CollectiveX launcher; upload results even on partial failure; always upload environment+logs; fail the job only after artifact creation.
+6. **Aggregate and regress** — validate every result against JSON schema; reject duplicate natural keys; merge rank samples and summaries; compute trial aggregates; compare against the most recent compatible baseline; publish a step summary; upload one `results_collectivex` bundle.
+7. **Dispatch ingestion** (only once promoted to feed the app) — repository-dispatch the InferenceX-app repo with `{ "benchmark-family": "collectivex", "run-id": "...", "run-attempt": "..." }`.
+
+Use a separate `collectivex-changelog.yaml`: a CollectiveX backend change must not trigger the expensive serving sweep through `perf-changelog.yaml`, and a serving change must not launch every collective suite.
+
+## Regression policy (Milestone 2+)
+
+A compatible baseline requires exact matches on: case ID; cluster ID; topology fingerprint (or approved topology class); backend; comparison class; normal/LL mode; node and rank placement; dtype and shape; measurement-contract version. **Do not compare "same GPU SKU" across materially different fabrics.**
+
+```text
+regression if:
+  correctness changed pass → fail
+  OR median latency degradation exceeds max(fixed floor, cluster noise threshold)
+  OR bandwidth degradation exceeds max(fixed floor, cluster noise threshold)
+```
+
+Derive each cluster's noise threshold from repeated baseline measurements via median absolute deviation — don't hard-code a universal 3% before knowing each fabric's noise. Retain failed, timed-out, and invalid results; reliability is part of the benchmark.
+
+## Reporting, database, and frontend
+
+**Now (spike / Milestone 2): a static, artifact-driven report.** Do not begin by changing InferenceX-app.
+
+```bash
+python -m collectivex.report --results output/aggregate.json --output output/report/
+```
+
+```text
+report/
+├── index.html
+├── data.json
+├── assets/
+└── runs/
+    └── <case-id>.html
+```
+
+Report views: **Overview** (supported clusters/backends, latest run, correctness failures, recent regressions, coverage matrix); **Primitive explorer** (latency / algbw / busbw / rank-spread vs payload size; single-node vs multinode); **MoE explorer** (dispatch & combine latency vs tokens/rank; tokens/s vs EP size; uniform vs skewed; normal vs LL; comm-SMs vs performance); **Case details** (exact shape, backend commit, container digest, topology fingerprint, environment, command, correctness report, rank-level distribution, raw logs). A **comparison warning** must visibly reject invalid comparisons:
+
+```text
+Not directly comparable:
+- different routing distribution
+- different topology class
+- different communication-SM budget
+- standardized versus backend-optimized mode
+```
+
+**Later (Milestone 4 / promotion into InferenceX-app):** add `/collectives` to the app (Next.js, React Query, raw API rows, client-side transforms, D3 charts; tab metadata/routing are centralized). Avoid a single global "CollectiveX score" at launch. Port the report views, plus Library Comparison, Scale-and-topology, and Historical-regression views, and a run-detail drawer. The frontend computes the `comparison-key` and refuses to connect rows with differing keys by default — **this guard matters more than any individual chart.**
+
+API routes (app):
+
+```text
+/api/v1/collectives
+/api/v1/collectives/availability
+/api/v1/collectives/history
+/api/v1/collectives/runs/:id
+/api/v1/collectives/artifacts/:id
+```
+
+Continue the app convention: API returns raw DB rows; the frontend does chart-specific transforms.
+
+**Database (app, later).** Do not put CollectiveX rows in `benchmark_results` (its identity is serving configs + ISL/OSL/concurrency). Reuse `workflow_runs`, then add:
+
+```sql
+collective_workloads(id, case_id, schema_version, family, operation, shape jsonb)
+collective_environments(id, cluster_id, hardware, topology_class, topology_hash, software jsonb, capabilities jsonb)
+collective_configs(id, workload_id, environment_id, backend, backend_version, comparison_class, mode, nodes, gpus_per_node, world_size, settings jsonb)
+collective_results(id, workflow_run_id, config_id, trial, date, status, metrics jsonb,
+                   latency_p50_us, latency_p99_us, logical_bandwidth_gbps, bus_bandwidth_gbps,
+                   tokens_per_second, rank_skew_pct, error)
+collective_artifacts(result_id, artifact_type, storage_url, metadata jsonb)
+collective_availability(date, hardware, cluster_id, backend, family, operation, mode)
+```
+
+Follow the app's hybrid design (JSONB for evolving metrics; indexed "hot" columns for common filters; idempotent ingestion; natural unique keys; denormalized date; latest-results materialized view). Keep raw per-rank samples in artifacts/object storage, not in Postgres.
+
+## Future expansions
+
+The spike de-risks the path to the actual deliverable — a public OSS collective benchmark and an explainer article. Expansion axes, roughly near → far, with dependencies:
+
+**Hardware breadth.** B300 / GB300 next (GB300 is also MNNVL, with known disagg KV-transfer wins) → H100 / H200 as a cheaper, more-available **InfiniBand baseline** ideal for characterizing per-fabric noise → AMD MI300X / MI325X / MI355X (this is Milestone 1) → TPU (far; a separate stack and toolchain).
+
+**Backend breadth.** Framework-native EP (the `framework-integrated` class — ties numbers back to the SGLang/vLLM serving paths) → MSCCL++, NVSHMEM / rocSHMEM microbenchmarks, AITER comm/fusion, MoRI / Pollara (AMD).
+
+**IBGDA low-latency mode.** The recurring strategic blocker and the original "LL is broken" story; gated on the NVIDIA SRE maintenance window for B200/B300. Highest narrative value — add as an experimental suite the moment it unblocks.
+
+**Scale-out.** 2 → 4 → 8 → 16 nodes; on GB200, intra-NVL72 vs cross-rack scaling-efficiency curves (where MNNVL ends and the inter-rack fabric begins).
+
+**L3 integrated operator path.** route → permute → dispatch → grouped-GEMM → combine → unpermute — the bridge to OperatorX.
+
+**L4 e2e correlation.** Link an isolated dispatch/combine number to the same shape's cost inside a real serving run via `profile.yml` traces — the "explain serving performance" payoff and the tie-back to the core product.
+
+**Trace-derived shapes (Milestone 3).** DeepSeek V4 / MiniMax M3 / Kimi token-histogram and routing-skew extraction, so the synthetic shapes are anchored to real workloads.
+
+**AMD Ultra Ethernet (UEC).** The AMD networking path; pairs with the MoRI / Pollara backends.
+
+**Productization (north star).** Static report → public OSS benchmark site + the explainer article; promotion into InferenceX-app (`/collectives` + Postgres + nightly suite + regression alerts) at Milestone 2 / 4.
+
+## Continuous benchmark — vision & scope
+
+Goal: a continuous benchmark that reproduces the spike automatically and grows into a credible cross-vendor EP/collective comparison. **Start with balanced DeepSeek shapes, intranode EP**, then venture to advanced cases. Target **≥1 EP library per platform** first — DeepEP on NVIDIA, MoRI on AMD.
+
+### EP library landscape
+- MoRI (AMD) — https://github.com/ROCm/mori
+- DeepEP / DeepEPv2 / Hybrid-EP — https://github.com/deepseek-ai/DeepEP (hybrid: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep)
+- NVIDIA NCCL EP — https://github.com/NVIDIA/nccl/tree/master/contrib/nccl_ep
+- UCCL — https://github.com/uccl-project/uccl
+- NVLink One-Sided AllToAll EP (mainly NVL72) — TensorRT-LLM blog18 (Optimizing MoE Communication with One-Sided AllToAll over NVLink)
+- NIXL EP — https://github.com/ai-dynamo/nixl/tree/main/examples/device/ep
+
+### Shapes & axes
+- **Classic DeepSeek V3:** hidden 7168, top-8, 256 routable experts.
+- **Prefill vs decode** (# tokens).
+- **Normal EP vs low-latency (LL) EP.**
+- **Dispatch precision:** NVFP4, MXFP4, MXFP8, BF16.
+- **Combine precision:** MXFP8, direct-cast FP8, BF16, NVFP4 — see MoRI #311, flashinfer #3643 / #3376.
+- **Balanced vs unbalanced vs EPLB.**
+- **Realistic shapes from InferenceX models** — collect hidden sizes / routing (Qwen3.5 has an unusual top-k).
+
+### Other inference collectives (later)
+- KV-cache transfer: MoRI-IO, NIXL, Mooncake; CPU↔GPU offload — `experimental/kvcache_transfer_DtoH_HtoD/benchmark.py`.
+- Low-latency one-shot / two-shot all-reduce (SGLang & vLLM in-tree kernels + AITER / FlashInfer variants) — e.g. sglang `sgl-kernel/csrc/allreduce/quick_all_reduce.cuh`.
+
+### Reference benchmark scripts to draw from
+- flashinfer PR #3000; ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`.
+
+### Learning resources
+- arXiv 2511.15076, 2603.13606, 2512.19849, 2412.19437.
+
+## Things not to do
+
+* Do not add collective fields to the existing serving matrix.
+* Do not make one GitHub Actions job per payload size.
+* Do not call all logical-bandwidth figures "bus bandwidth."
+* Do not compare different topology fingerprints as though GPU SKU were sufficient.
+* Do not silently discard failed or incorrect results.
+* Do not let a backend choose undocumented tuning parameters (in `standardized` mode).
+* Do not make low-latency mode the only reported result.
+* Do not publish one overall ranking before coverage and comparison contracts are stable.
+* Do not start with every EP library, TPU, UEC, and every model shape.
+* Do not store full raw rank samples indefinitely in Postgres.
+* Do not expose internal hostnames, paths, NIC GUIDs, IP addresses, or private image references in public artifacts.
+* Do not freeze the schema before the spike has produced a real artifact to freeze it from.
+
+## References (verified against the live InferenceX repo)
+
+- `experimental/README.md` — the non-core / "not official results" charter this project lives under.
+- `.github/configs/runners.yaml` — runner labels and exact names (H100…GB300, AMD MI3xx).
+- `.github/workflows/benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`, `profile.yml`, `speedbench-al.yml` — the `bash ./runners/launch_${RUNNER_NAME%%_*}.sh` selection convention.
+- `runners/launch_*.sh` — existing per-cluster launchers (`launch_b200-dgxc.sh`, `launch_b200-dgxc-slurm.sh`, `launch_gb200-nv.sh`, `launch_mi355x-amds.sh`, …).
+- `utils/matrix_logic/generate_sweep_configs.py`, `validation.py` — the serving matrix CollectiveX must **not** extend.
+- `.github/workflows/e2e-tests.yml`, `collect-results.yml` — the validate → fan-out → collect control plane being reused.
+- `perf-changelog.yaml` — the additions-only serving gate CollectiveX must **not** trigger.
+- NVIDIA Magnum IO NVSHMEM + GPUDirect Async (IBGDA): `https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/`
diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py
new file mode 100644
index 000000000..c24136ebc
--- /dev/null
+++ b/experimental/CollectiveX/plot.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — plot NCCL primitive curves, B200 vs GB200.
+
+Loads run_nccl.py result JSONs from results/, and for each operation draws two
+panels: latency-vs-size and bus-bandwidth-vs-size, overlaying one curve per
+(runner, topology-class, world-size). The B200(IB)-vs-GB200(MNNVL) contrast at
+a matched shape is the intended overlay and the spike's headline.
+
+Comparison guard (plan §Comparability): curves are only overlaid when they
+share op + dtype + comparison-class + measurement-contract. Anything else is
+reported as "not directly comparable" and skipped rather than silently mixed.
+
+    python plot.py --results-dir results --out-dir results/plots
+
+Requires matplotlib only (numpy is not imported). Run on a workstation/laptop
+over the JSON artifacts; no GPU needed.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+
+def _human(nbytes: int) -> str:
+    """Render a byte count with a binary unit suffix (B, KiB, MiB, GiB).
+
+    The value is divided by 1024 until it drops below 1024 or the largest unit
+    (GiB) is reached, then printed with no decimals, e.g. 1536 -> "2KiB".
+    """
+    units = ("B", "KiB", "MiB", "GiB")
+    last = len(units) - 1
+    idx = 0
+    value = nbytes
+    # Step up one unit at a time; GiB is the ceiling, so larger values stay in GiB.
+    while idx < last and not value < 1024:
+        value /= 1024
+        idx += 1
+    return f"{value:.0f}{units[idx]}"
+
+
+def load_nccl_results(results_dir: str) -> list[dict]:
+    """Load the NCCL result documents found directly inside ``results_dir``.
+
+    Every ``*.json`` file is parsed in sorted filename order. A file is kept only
+    when it holds a JSON object whose ``family`` is ``"nccl"`` and whose ``rows``
+    is non-empty; the source path is recorded on the document under ``_path``.
+
+    Unreadable, undecodable or malformed files are skipped, and so are files
+    whose top-level JSON value is not an object, so one stray artifact cannot
+    abort the whole plot run.
+
+    Args:
+        results_dir: directory to scan (not recursive).
+
+    Returns:
+        The accepted documents, in sorted path order.
+    """
+    docs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
+        try:
+            with open(path, encoding="utf-8") as fh:
+                doc = json.load(fh)
+        except (ValueError, OSError):
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            continue
+        if not isinstance(doc, dict):
+            # Valid JSON but not a result object (e.g. a bare list): nothing to plot.
+            continue
+        if doc.get("family") == "nccl" and doc.get("rows"):
+            doc["_path"] = path
+            docs.append(doc)
+    return docs
+
+
+def curve_label(d: dict) -> str:
+    """Build the legend text for one result doc: runner, topology class and world size."""
+    runner = d["runner"]
+    topology = d["topology_class"]
+    world_size = d["world_size"]
+    return " · ".join((f"{runner}", f"{topology}", f"ws{world_size}"))
+
+
+def overlay_signature(d: dict) -> tuple:
+    """Return the (op, dtype, comparison_class, measurement_contract) tuple of a doc.
+
+    Two curves may share a chart only when these tuples are equal. Topology and
+    world size are intentionally left out because they are the comparison axis.
+    ``op`` is required; the other three fields default to None when absent.
+    """
+    op = d["op"]
+    optional_fields = ("dtype", "comparison_class", "measurement_contract")
+    return (op,) + tuple(d.get(field) for field in optional_fields)
+
+
+def plot_op(op: str, docs: list[dict], out_dir: str) -> str | None:
+    """Draw the latency and bus-bandwidth panels for one op and save them as a PNG.
+
+    Args:
+        op: operation name; used in the titles and the output filename.
+        docs: result docs for this op (one curve each).
+        out_dir: directory for the PNG; created if missing.
+
+    Returns:
+        Path of the written ``nccl_<op>.png``, or None when ``docs`` is empty.
+    """
+    if not docs:
+        return None
+    # Comparison guard: group docs by overlay signature, plot only the largest group,
+    # and report every doc outside it instead of mixing it in silently.
+    groups = defaultdict(list)
+    for doc in docs:
+        groups[overlay_signature(doc)].append(doc)
+    main_sig = max(groups, key=lambda sig: len(groups[sig]))
+    for sig, members in groups.items():
+        if sig != main_sig:
+            for doc in members:
+                print(f"  [guard] skipping {curve_label(doc)} for op={op}: not directly "
+                      f"comparable (dtype/class/contract differs: {sig} vs {main_sig})")
+
+    fig, (ax_lat, ax_bw) = plt.subplots(1, 2, figsize=(14, 5))
+    line_style = {"linewidth": 2, "markersize": 4}
+    for doc in sorted(groups[main_sig], key=curve_label):
+        points = sorted(doc["rows"], key=lambda row: row["size_bytes"])
+        xs = [p["size_bytes"] for p in points]
+        latencies = [p["out_of_place"]["time_us"] for p in points]
+        bandwidths = [p["busbw_gbps"] for p in points]
+        name = curve_label(doc)
+        ax_lat.plot(xs, latencies, "o-", label=name, **line_style)
+        ax_bw.plot(xs, bandwidths, "o-", label=name, **line_style)
+
+    # Settings shared by both panels, then the per-panel axis label and title.
+    panels = (
+        (ax_lat, "Latency (µs, out-of-place)", f"{op}: latency vs size"),
+        (ax_bw, "Bus bandwidth (GB/s)", f"{op}: bus bandwidth vs size"),
+    )
+    for ax, _, _ in panels:
+        ax.set_xscale("log", base=2)
+        ax.set_xlabel("Message size (bytes)")
+        ax.grid(True, alpha=0.3)
+        ax.legend(fontsize=9)
+    ax_lat.set_yscale("log")
+    for ax, ylabel, title in panels:
+        ax.set_ylabel(ylabel)
+        ax.set_title(title)
+    fig.suptitle(
+        f"CollectiveX · {op} · dtype={main_sig[1]} · class={main_sig[2]}  "
+        f"(topology is the comparison axis)",
+        fontsize=11,
+    )
+    fig.tight_layout()
+    os.makedirs(out_dir, exist_ok=True)
+    png_path = os.path.join(out_dir, f"nccl_{op}.png")
+    fig.savefig(png_path, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+    return png_path
+
+
+def main() -> int:
+    """CLI entry point: load NCCL result docs, write one PNG per op.
+
+    Returns 0 when at least one plot was written, 1 when no result docs were
+    found or nothing could be plotted.
+    """
+    parser = argparse.ArgumentParser(description="CollectiveX primitive plots")
+    parser.add_argument("--results-dir", default="results")
+    parser.add_argument("--out-dir", default="results/plots")
+    parser.add_argument("--op", help="only plot this op")
+    opts = parser.parse_args()
+
+    docs = load_nccl_results(opts.results_dir)
+    if not docs:
+        print(f"no nccl result JSONs found in {opts.results_dir}/")
+        return 1
+
+    grouped = defaultdict(list)
+    for doc in docs:
+        grouped[doc["op"]].append(doc)
+
+    # Either the single requested op, or every op present in the results.
+    wanted = [opts.op] if opts.op else sorted(grouped)
+    written = 0
+    for op in wanted:
+        path = plot_op(op, grouped.get(op, []), opts.out_dir)
+        if not path:
+            continue
+        written += 1
+        print(f"wrote {path}  ({len(grouped[op])} curve(s))")
+    if written == 0:
+        print("nothing plotted")
+        return 1
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/plot_ep.py b/experimental/CollectiveX/plot_ep.py
new file mode 100644
index 000000000..e26b9d686
--- /dev/null
+++ b/experimental/CollectiveX/plot_ep.py
@@ -0,0 +1,1789 @@
+#!/usr/bin/env python3
+"""CollectiveX — render EP dispatch/combine sweeps to a self-contained HTML.
+
+Reads the family=moe result JSONs (tests/run_ep.py output) and emits ONE
+dependency-free HTML file (inline SVG, no CDN — opens offline) with:
+
+  * an interactive explorer: operation (dispatch | combine | round-trip) x
+    phase (decode | prefill) x x-axis (tokens/rank | global tokens) x y-axis
+    (latency | tokens/s | alg bandwidth), one colored line per SKU/backend/EP;
+  * a static small-multiples grid (phase x operation) of latency vs tokens/rank.
+
+Only source-tokens-per-rank varies along a line; everything else (backend, EP
+degree, phase, precision, top-k/experts/hidden, routing) is fixed and identifies
+the line — per the CollectiveX EP framework.
+
+    python3 plot_ep.py --results-dir results --out results/plots/collectivex_ep.html
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+import sys
+
+# SKU -> color (matches the matplotlib convention used for the NCCL plots).
+# Used as the provisional color of a series before per-config colors are assigned.
+COLORS = {"b200": "#1f77b4", "gb200": "#2ca02c", "mi355x": "#d62728",
+          "b300": "#9467bd", "gb300": "#8c564b", "h100": "#ff7f0e", "h200": "#e377c2"}
+
+# Per-SKU color FAMILIES: every (sku,backend,dtype,mode,resource) config gets its own
+# shade within its SKU's hue family, so lines are individually identifiable AND the SKU
+# is still readable at a glance (SKU-only coloring collided same-SKU configs into one).
+# Shades are picked by index modulo the family length, so a SKU with more configs than
+# shades reuses shades.
+SKU_FAMILY = {
+    "h100":  ["#ff7f0e", "#d6a72b", "#ffbb78", "#8c6d1f", "#e8a33d"],  # oranges / golds
+    "h200":  ["#e377c2", "#b04a8f", "#f4b6df"],                        # pinks
+    "b200":  ["#1f77b4", "#0d3d66", "#4a90d9", "#7fb2e0"],             # blues
+    "b300":  ["#9467bd", "#6b3fa0", "#c5b0d5", "#7b4fa0"],             # purples
+    "gb200": ["#2ca02c", "#1a661a", "#7bc77b"],                        # greens
+    "gb300": ["#8c564b", "#5e372f", "#c49c94"],                        # browns
+    "mi355x": ["#d62728", "#a30000", "#ff9896", "#e34a4a"],            # reds
+}
+PALETTE = ["#17becf", "#bcbd22", "#7f7f7f", "#393b79", "#637939"]      # fallback for unknown SKUs
+
+# MoE (hidden, top-k, routed-experts) -> human model name. Used to label the model-shape selector
+# + coverage + tooltips. DeepSeek-V3/V4 (7168/8/256) is the cross-hardware headline shape; the
+# others are official canonical results at additional model dims. An unlisted shape is labelled by
+# its dims (see model_name) so a new model is still selectable the moment its data lands.
+MODEL_NAMES = {
+    (7168, 8, 256): "DeepSeek-V3/V4",
+    (6144, 8, 256): "MiniMax-M3",
+    (7168, 8, 384): "Kimi-K2",
+    (4096, 8, 128): "Qwen3.5",
+    (7168, 8, 288): "DeepSeek-V3 (EPLB physical)",
+}
+
+
+def model_name(shape: dict) -> str:
+    """Return the registered model name for a result shape.
+
+    The lookup key is (hidden, topk, experts). A shape with no entry in
+    MODEL_NAMES is labelled by its dims as "shape <hidden>/<topk>/<experts>".
+    """
+    dims = (shape.get("hidden"), shape.get("topk"), shape.get("experts"))
+    known = MODEL_NAMES.get(dims)
+    if known:
+        return known
+    hidden, topk, experts = dims
+    return f"shape {hidden}/{topk}/{experts}"
+
+
+def _iter_docs(results_dir: str):
+    """Yield every result doc under results_dir: one per *.json file, AND one per line of each
+    *.ndjson (the consolidated aggregate written by aggregate_results.py). This lets the plot read
+    the single aggregate ndjson instead of thousands of individual JSONs — keeping results/ small
+    (the restructure goal). During a transition both may exist; delete the individuals once merged
+    so no doc is double-counted.
+
+    Both globs are recursive and visited in sorted path order, all *.json files first. Loading is
+    best-effort: an unreadable/undecodable file, a malformed *.json file, or a malformed ndjson
+    LINE is skipped (a bad line no longer discards the rest of its file). Only JSON objects are
+    yielded, because every consumer calls ``.get`` on the doc."""
+    json_pattern = os.path.join(results_dir, "**", "*.json")
+    for path in sorted(glob.glob(json_pattern, recursive=True)):
+        try:
+            # `with` closes the handle deterministically instead of leaking it to the GC.
+            with open(path) as fh:
+                doc = json.load(fh)
+        except (ValueError, OSError):
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            continue
+        if isinstance(doc, dict):
+            yield doc
+    ndjson_pattern = os.path.join(results_dir, "**", "*.ndjson")
+    for path in sorted(glob.glob(ndjson_pattern, recursive=True)):
+        try:
+            with open(path) as fh:
+                for line in fh:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        doc = json.loads(line)
+                    except ValueError:
+                        # One corrupt line must not hide the valid docs that follow it.
+                        continue
+                    if isinstance(doc, dict):
+                        yield doc
+        except (ValueError, OSError):
+            # Read/decoding failure while iterating the file: give up on this file only.
+            continue
+
+
+def load_series(results_dir: str, legacy: str = "all") -> list[dict]:
+    """Load family=moe result docs into plot-ready series dicts (one series per doc).
+
+    Args:
+        results_dir: directory scanned through _iter_docs for result docs.
+        legacy: "exclude" drops docs that have no publication_status field, "only" keeps
+            just those docs, any other value (default "all") keeps every doc.
+
+    Returns:
+        One dict per doc that has at least one usable row (a row needs non-zero dispatch and
+        combine p50). Each dict carries the line identity (ckey / label), provenance fields,
+        the normalized rows, and a color assigned per ckey within the SKU's hue family.
+
+    Optional blocks (shape, reproduction, routing_identity, workload, eplb) may be absent OR an
+    explicit JSON null; both are treated as empty so one sparse doc cannot abort the load.
+    """
+    series = []
+    for d in _iter_docs(results_dir):
+        if d.get("family") != "moe" or not d.get("rows"):
+            continue
+        # legacy = a v3 doc with no machine-derived publication_status. exclude -> v4-only main
+        # plot; only -> the legacy.html archive.
+        is_legacy = "publication_status" not in d
+        if (legacy == "exclude" and is_legacy) or (legacy == "only" and not is_legacy):
+            continue
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        rows = []
+        for r in d["rows"]:
+            # v4 carries nested {p50,p90,p95,p99} dicts for dispatch/combine/roundtrip/isolated_sum.
+            # Fall back to v3 flat *_us_p* (serial -> isolated_sum) so legacy docs still load.
+            def pcts(k, flat):
+                if isinstance(r.get(k), dict) and r[k].get("p50") is not None:
+                    o = dict(r[k])
+                    o.setdefault("p95", o.get("p90"))
+                    return o
+                p50 = r.get(f"{flat}_us_p50")
+                return {"p50": p50, "p90": r.get(f"{flat}_us_p90") or p50,
+                        "p95": r.get(f"{flat}_us_p95") or r.get(f"{flat}_us_p90") or p50,
+                        "p99": r.get(f"{flat}_us_p99") or p50}
+            dop, cop = pcts("dispatch", "dispatch"), pcts("combine", "combine")
+            iso = pcts("isolated_sum", "serial")                       # renamed from "serial"
+            rtp = pcts("roundtrip", "roundtrip")                       # MEASURED round trip (v4)
+            if not (dop["p50"] and cop["p50"]):
+                continue
+            if rtp["p50"] is None:                                     # legacy: no measured RT
+                rtp = iso
+            rows.append({
+                "t": r["tokens_per_rank"], "gt": r.get("global_tokens"),
+                "dispatch": dop, "combine": cop, "roundtrip": rtp, "isolated_sum": iso,
+                "fanout": r.get("fanout_mean"),
+                "dbytes": r.get("dispatch_logical_bytes") or r.get("routed_bytes_total") or 0,
+                "cbytes": r.get("combine_logical_bytes") or 0,
+                "recv": r.get("recv_tokens_max") or r.get("recv_tokens") or 0,
+                "straggler": (r.get("per_rank_dispatch_us") or {}).get("slowest_rank"),
+                "correct": bool(r.get("correct")),
+            })
+        if not rows:
+            continue
+        # `or {}` (not a .get default) so an explicit null block is treated like a missing one.
+        sh = d.get("shape") or {}
+        mode = d.get("mode", "normal")
+        dtype = sh.get("dispatch_dtype", "?")
+        rmode = d.get("resource_mode", "")
+        ll = " LL" if mode == "ll" else ""
+        # resource suffix: tuned is the default (omit); flag the others so a normalized
+        # or default-budget line is never confused with the tuned one.
+        rs = {"normalized": " (norm)", "default": " (def)"}.get(rmode, "")
+        contract = d.get("measurement_contract", "?")
+        cl = " [cl]" if contract == "cached-layout-comm-only-v1" else ""   # cached-layout flag
+        backend = d.get("backend")
+        ep = d.get("ep_size")
+        # DeepEP kernel generation (v1 NVSHMEM / v2 NCCL-Gin); default v1 for legacy deepep docs
+        # without the field, n-a for non-deepep. Folds into the line key + label so V1/V2 are distinct.
+        kgen = sh.get("kernel_gen") or ("v1" if backend == "deepep" else "n-a")
+        kg = f" {kgen}" if kgen == "v2" else ""   # only annotate v2 (keep v1 labels unchanged)
+        # Routing axis: base distribution + EPLB. "zipf+eplb" is the balanced-by-replication
+        # variant of zipf; uniform is the baseline (omitted from the label to keep it short).
+        eplb_doc = d.get("eplb") or {}
+        routing_disp = f'{sh.get("routing", "?")}+eplb' if eplb_doc.get("enabled") else sh.get("routing", "?")
+        # temporal step + uneven allocation are distinct workloads — fold into the routing label so
+        # moving-hotspot snapshots / uneven variants draw as separate lines, not overlaid.
+        repro = d.get("reproduction") or {}
+        _step = repro.get("routing_step", 0)
+        _uneven = repro.get("uneven_tokens", "none")
+        if _step:
+            routing_disp += f"@s{_step}"
+        if _uneven != "none":
+            routing_disp += f"·{_uneven}"
+        rt = "" if routing_disp == "uniform" else f' ·{routing_disp}'
+        # FULL per-line label: SKU·EP·backend·dtype[·LL][·resource][·cached-layout][·routing].
+        # EP is explicit because a SKU can span EP degrees (GB300 EP4 on one NVL72 tray, EP8
+        # across two); routing is explicit so balanced/zipf/zipf+eplb don't collide with uniform.
+        label = f'{sku.upper()} EP{ep} · {backend}{kg} · {dtype}{ll}{rs}{cl}{rt}'
+        gr = repro.get("git_run") or {}
+        rid = d.get("routing_identity") or {}
+        wl = d.get("workload") or {}
+        # publication status (v4) gates the default view; legacy v3 docs -> "legacy".
+        pub = d.get("publication_status") or "legacy"
+        # workload signature: prefer the v4 workload block, fall back to routing_identity (v3).
+        wsig = wl.get("trace_signature") or rid.get("trace_signature")
+        series.append({
+            "sku": sku, "backend": backend, "ep": ep,
+            "pub": pub, "wsig": wsig, "wid": wl.get("workload_id"),
+            # combine-quant mode + activation (value) profile are part of workload identity
+            # (review: quant combine can be value-sensitive). Default none/normal for pre-scaffold
+            # results; used by the comparison guard + tooltip so a quantized-combine or
+            # different-value run is never read as the same point as a bf16/normal one.
+            "cqm": (sh.get("quant") or {}).get("combine_quant_mode", "none"),
+            "act": sh.get("activation_profile", "normal"),
+            "phase": d.get("phase", "decode"), "mode": mode,
+            "dtype": dtype, "resource": rmode or "tuned", "contract": contract,
+            # comparison class: best-stack (tuned/default) vs resource-constrained
+            # (normalized) — kept distinct so they're never read as one fair contest.
+            "suite": "resource-constrained" if rmode == "normalized" else "backend-default",
+            "routing": routing_disp,
+            # eplb per-rank load imbalance removed (the headline of zipf vs zipf+eplb).
+            "eplb_before": eplb_doc.get("imbalance_before"), "eplb_after": eplb_doc.get("imbalance_after"),
+            # ep + routing in the key so EP4/EP8 and uniform/balanced/zipf/zipf+eplb of one SKU
+            # get distinct colors/lines (sku stays ckey.split("|")[0] for the family lookup).
+            "kgen": kgen,
+            "ckey": f"{sku}|{backend}|{dtype}|{mode}|{rmode}|{contract}|ep{ep}|{routing_disp}|{kgen}",  # config identity (color); kgen so V1/V2 are distinct lines
+            "label": label,
+            "dash": "" if dtype == "bf16" else "6 4",   # bf16 solid, any other dtype dashed (2nd cue)
+            "color": COLORS.get(sku, "#555"),           # provisional; reassigned below
+            "topo": d.get("topology_class"), "transport": d.get("transport"),
+            "fp8_in_timing": repro.get("fp8_quant_in_timing"),
+            "run_id": gr.get("run_id"), "source_sha": (gr.get("source_sha") or "")[:10],
+            "repo": gr.get("repo"), "image_digest": (repro.get("image_digest") or "")[:19],
+            "routing_consistent": rid.get("consistent_across_ranks"),
+            "trace_sig": rid.get("trace_signature"),
+            "samples": (rows and d["rows"][0].get("samples_pooled")) or None,
+            "prov": d.get("backend_provenance", {}),
+            # model name (from the MoE shape) so the model-shape selector / legend / coverage can
+            # name a series; the raw shape stays for the dims-based match in the chart filter.
+            "model": model_name(sh),
+            "shape": sh, "rows": rows,
+        })
+    # NOTE (goal Part 1, "plot/artifact integrity"): raw series are IMMUTABLE after loading.
+    # An earlier version injected each config's decode-range points into its prefill series so
+    # prefill panels spanned the full token axis — that COPIED observations between series and
+    # is removed. Each phase now plots only its own measured points; the x-axis simply spans
+    # whatever a series measured. (A shaded decode/prefill regime is the cosmetic alternative.)
+
+    # Assign a DISTINCT color per config key, grouped by SKU family (stable across the
+    # decode/prefill panels so a line keeps its color everywhere).
+    by_sku: dict[str, list[str]] = {}
+    for ck in sorted({s["ckey"] for s in series}):
+        by_sku.setdefault(ck.split("|")[0], []).append(ck)
+    ckcolor: dict[str, str] = {}
+    fb = 0
+    for sku, cks in by_sku.items():
+        fam = SKU_FAMILY.get(sku)
+        for j, ck in enumerate(cks):
+            if fam:
+                ckcolor[ck] = fam[j % len(fam)]
+            else:
+                # Unknown SKU: walk the shared fallback palette.
+                ckcolor[ck] = PALETTE[fb % len(PALETTE)]
+                fb += 1
+    for s in series:
+        s["color"] = ckcolor[s["ckey"]]
+    return series
+
+
+def load_nccl_series(results_dir: str) -> list[dict]:
+    """Turn family=nccl docs (run_nccl.py output) into JS-friendly series, one per result doc.
+
+    These are ADDITIVE to the family=moe series and are routed to the All-reduce / All-gather
+    tabs by `op`. Each series is a sweep over message sizes, sorted by size. Colors are assigned
+    per (sku, topology, transport, world-size) config inside the SKU's hue family, the same scheme
+    the EP series use. Docs whose status is not "valid" are kept and flagged through `valid`
+    rather than dropped."""
+    out = []
+    for doc in _iter_docs(results_dir):
+        if doc.get("family") != "nccl" or not doc.get("rows"):
+            continue
+        runner = doc.get("runner") or "?"
+        sku = runner.split("_")[0].split("-")[0]
+        topo = doc.get("topology_class") or "?"
+        transport = doc.get("transport") or ""
+        op = doc.get("op") or "?"
+        status = doc.get("status") or "?"
+        points = []
+        for raw in doc["rows"]:
+            oop = raw.get("out_of_place") or {}
+            ip = raw.get("in_place") or {}
+            best_bw = raw.get("busbw_gbps")
+            ip_bw = ip.get("busbw_gbps")
+            # busbw_gbps is the best across placements. Take time/algbw from in-place only when
+            # in-place alone produced that best value; otherwise use out-of-place, which is also
+            # the default when busbw is absent.
+            in_place_won = (
+                best_bw is not None
+                and ip_bw is not None
+                and ip_bw == best_bw
+                and (oop.get("busbw_gbps") or -1) != best_bw
+            )
+            chosen = ip if in_place_won else oop
+            t_us = chosen.get("time_us")
+            algbw = chosen.get("algbw_gbps")
+            # A point needs both a size and a latency to be plottable.
+            if raw.get("size_bytes") is None or t_us is None:
+                continue
+            points.append({
+                "size": raw["size_bytes"], "dtype": raw.get("dtype"),
+                "t_us": t_us, "algbw": algbw, "busbw": best_bw,
+                "oop_us": oop.get("time_us"), "ip_us": ip.get("time_us"),
+                "correct": raw.get("correct"),
+            })
+        if not points:
+            continue
+        points.sort(key=lambda p: p["size"])
+        transport_part = f" · {transport}" if transport else ""
+        # The label carries topology + transport; world size separates same-topology runs.
+        label = f'{sku.upper()} · {topo}{transport_part} (ws{doc.get("world_size","?")})'
+        out.append({
+            "op": op, "sku": sku, "runner": runner,
+            "topo": topo, "transport": transport,
+            "world_size": doc.get("world_size"), "nodes": doc.get("nodes"),
+            "dtype": points[0].get("dtype"),
+            "comparison_class": doc.get("comparison_class"),
+            "comparison_key": doc.get("comparison_key"),
+            "contract": doc.get("measurement_contract"),
+            "avg_busbw": (doc.get("summary") or {}).get("avg_busbw_gbps"),
+            "status": status, "valid": status == "valid",
+            # Color identity: one (sku, topology, transport, world-size) cohort is one line.
+            "ckey": f"{sku}|{topo}|{transport}|ws{doc.get('world_size')}",
+            "label": label, "color": COLORS.get(sku, "#555"),  # provisional; replaced below
+            "rows": points,
+        })
+    # Group the distinct config keys by SKU (the first `|` field), in sorted key order.
+    keys_by_sku: dict[str, list[str]] = {}
+    for key in sorted({s["ckey"] for s in out}):
+        keys_by_sku.setdefault(key.split("|")[0], []).append(key)
+    # Known SKUs cycle through their own family; unknown SKUs share the fallback palette.
+    color_of: dict[str, str] = {}
+    fallback_idx = 0
+    for sku_name, keys in keys_by_sku.items():
+        family = SKU_FAMILY.get(sku_name)
+        for pos, key in enumerate(keys):
+            if family:
+                color_of[key] = family[pos % len(family)]
+            else:
+                color_of[key] = PALETTE[fallback_idx % len(PALETTE)]
+                fallback_idx += 1
+    for entry in out:
+        entry["color"] = color_of[entry["ckey"]]
+    return out
+
+
+def load_allreduce_fw_series(results_dir: str) -> list[dict]:
+    """Turn family=allreduce-fw docs (allreduce_fw_bench.py output) into JS-friendly series.
+
+    ADDITIVE to the other loaders. One series is emitted per (doc, group), so every impl in a doc
+    gets its own line and color. `op` is always "all_reduce", the key the All-reduce tab filters
+    on. Rows with no size, or with neither a latency nor a nonzero bus bandwidth, are dropped so a
+    not-applicable size does not draw a phantom point."""
+    out = []
+    for doc in _iter_docs(results_dir):
+        if doc.get("family") != "allreduce-fw" or not doc.get("groups"):
+            continue
+        runner = doc.get("runner") or "?"
+        sku = runner.split("_")[0].split("-")[0]
+        transport = doc.get("transport") or ""
+        status = doc.get("status") or "?"
+        for group in doc["groups"]:
+            impl = group.get("impl") or "?"
+            # Group-level values win; the doc-level ones are the fallback.
+            world_size = group.get("world_size", doc.get("world_size"))
+            topo = group.get("topology_class") or doc.get("topology_class") or "?"
+            dtype = group.get("dtype") or doc.get("dtype")
+            points = []
+            for raw in group.get("rows") or []:
+                size = raw.get("size_bytes")
+                t_us = raw.get("latency_us")
+                busbw = raw.get("busbw_gbps")
+                has_observation = t_us is not None or busbw not in (None, 0)
+                if size is None or not has_observation:
+                    continue
+                points.append({
+                    "size": size, "dtype": dtype,
+                    "t_us": t_us, "algbw": raw.get("algbw_gbps"), "busbw": busbw,
+                    "correct": raw.get("correct"),
+                })
+            if not points:
+                continue
+            points.sort(key=lambda p: p["size"])
+            # The impl is part of the label so the different impls read as distinct lines.
+            label = f'{sku.upper()} · {impl} (fw-AR · ws{world_size})'
+            out.append({
+                "op": "all_reduce", "sku": sku, "runner": runner,
+                "topo": topo, "transport": transport,
+                "world_size": world_size, "nodes": doc.get("nodes"),
+                "dtype": dtype,
+                "comparison_class": doc.get("comparison_class"),
+                "comparison_key": group.get("comparison_key") or doc.get("comparison_key"),
+                "contract": doc.get("measurement_contract"),
+                "status": status, "valid": status == "valid",
+                # Color identity: each impl (at a given world size) is its own line.
+                "ckey": f"{sku}|fwar|{impl}|ws{world_size}",
+                "label": label, "color": COLORS.get(sku, "#555"),  # provisional; replaced below
+                "rows": points,
+            })
+    # Group the distinct config keys by SKU (the first `|` field), in sorted key order.
+    keys_by_sku: dict[str, list[str]] = {}
+    for key in sorted({s["ckey"] for s in out}):
+        keys_by_sku.setdefault(key.split("|")[0], []).append(key)
+    # Known SKUs cycle through their own family; unknown SKUs share the fallback palette.
+    color_of: dict[str, str] = {}
+    fallback_idx = 0
+    for sku_name, keys in keys_by_sku.items():
+        family = SKU_FAMILY.get(sku_name)
+        for pos, key in enumerate(keys):
+            if family:
+                color_of[key] = family[pos % len(family)]
+            else:
+                color_of[key] = PALETTE[fallback_idx % len(PALETTE)]
+                fallback_idx += 1
+    for entry in out:
+        entry["color"] = color_of[entry["ckey"]]
+    return out
+
+
+def _assign_coll_colors(series: list[dict]) -> list[dict]:
+    """Give every distinct `ckey` its own color, chosen from the hue family of the key's SKU.
+
+    The SKU is the part of the ckey before the first "|". SKUs listed in SKU_FAMILY cycle through
+    that family's colors (index restarts per SKU); all other SKUs draw from PALETTE using one
+    running index shared between them. The series dicts are updated in place (`color`) and the
+    same list is returned.
+    """
+    # Distinct config keys, visited in sorted order and bucketed by SKU.
+    keys_by_sku: dict[str, list[str]] = {}
+    for key in sorted({entry["ckey"] for entry in series}):
+        keys_by_sku.setdefault(key.partition("|")[0], []).append(key)
+
+    color_of: dict[str, str] = {}
+    fallback_idx = 0  # position in PALETTE, carried across every SKU without a family
+    for sku_name, keys in keys_by_sku.items():
+        family = SKU_FAMILY.get(sku_name)
+        if family:
+            for pos, key in enumerate(keys):
+                color_of[key] = family[pos % len(family)]
+        else:
+            for key in keys:
+                color_of[key] = PALETTE[fallback_idx % len(PALETTE)]
+                fallback_idx += 1
+
+    for entry in series:
+        entry["color"] = color_of[entry["ckey"]]
+    return series
+
+
+def _dedup_newest(docs: list) -> list:
+    """Collapse `docs` to one payload per dedup-key, preferring the newest generated_at.
+
+    `docs` is a list of (dedup_key, generated_at, payload) tuples. The decode and prefill jobs ran
+    the SAME single-process bench, so two files can share a (sku, config) key — drawing both would
+    double every line. A missing generated_at compares as "", and on a tie the entry seen first is
+    kept. Returns the surviving payloads in first-seen key order.
+    """
+    newest: dict = {}
+    for dedup_key, stamp, doc in docs:
+        if dedup_key in newest:
+            held_stamp = newest[dedup_key][0]
+            # only a strictly newer stamp displaces the entry already held
+            if not ((stamp or "") > (held_stamp or "")):
+                continue
+        newest[dedup_key] = (stamp, doc)
+    return [entry[1] for entry in newest.values()]
+
+
+def load_offload_series(results_dir: str) -> list[dict]:
+    """family=offload (CPU<->GPU offload). ONE line per (sku, op, host_memory) so pinned-vs-pageable
+    and h2d-vs-d2h are directly visible (goal P2 "GPU->CPU / CPU->GPU bandwidth/latency, pinned vs
+    pageable"). Dedup to newest doc per (sku, topology, transport); surface the overlap % from
+    diagnostics as a per-doc note. ADDITIVE — independent of the family=moe series.
+
+    Args:
+        results_dir: directory handed to _iter_docs() to enumerate the result documents.
+
+    Returns:
+        Series dicts (rows sorted by ascending size) with colors assigned by
+        _assign_coll_colors(). Rows lacking size_bytes or bandwidth_gbps are dropped.
+    """
+    docs = []
+    for d in _iter_docs(results_dir):
+        if d.get("family") != "offload" or not d.get("rows"):
+            continue
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        # dedup key: a (sku, topology, transport) cohort is one bench regardless of decode/prefill job.
+        docs.append(((sku, d.get("topology_class"), d.get("transport")), d.get("generated_at"), d))
+    series = []
+    for d in _dedup_newest(docs):
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        topo = d.get("topology_class") or "?"
+        transport = d.get("transport") or ""
+        valid = (d.get("status") or "?") == "valid"
+        diag = d.get("diagnostics") or {}
+        ov = diag.get("overlap_with_compute") or {}
+        numa = diag.get("numa") or {}
+        peak = d.get("peak_bandwidth_gbps")
+        # Gather the note fragments and join once, so a doc without a peak value does not yield a
+        # note that starts with a dangling " · " separator.
+        note_parts = []
+        if peak is not None:
+            note_parts.append(f"peak {peak:.0f} GB/s")
+        if ov.get("overlap_pct") is not None:
+            note_parts.append(f"copy/compute overlap {ov['overlap_pct']:.0f}%")
+        if numa.get("node_count") is not None:
+            note_parts.append(f"{numa['node_count']} NUMA node(s)")
+        note = " · ".join(note_parts)
+        lines: dict = {}   # (op, host_memory) -> rows
+        for r in d["rows"]:
+            if r.get("size_bytes") is None or r.get("bandwidth_gbps") is None:
+                continue
+            lines.setdefault((r.get("op"), r.get("host_memory")), []).append({
+                "size": r["size_bytes"], "bw": r.get("bandwidth_gbps"), "lat": r.get("latency_us")})
+        for (op, host), rows in lines.items():
+            rows.sort(key=lambda x: x["size"])
+            series.append({
+                "family": "offload", "sku": sku, "topo": topo, "transport": transport,
+                "op": op, "sub": host, "valid": valid, "status": d.get("status") or "?",
+                "note": note, "peak": peak,
+                "label": f'{sku.upper()} · {op} · {host}',
+                # provisional SKU color; _assign_coll_colors() replaces it per ckey
+                "ckey": f'{sku}|{op}|{host}', "color": COLORS.get(sku, "#555"),
+                "rows": rows,
+            })
+    return _assign_coll_colors(series)
+
+
+def load_copy_engine_series(results_dir: str) -> list[dict]:
+    """family=copy-engine (SDMA copy engine vs SM-driven copy). ONE line per (sku, op, engine) so the
+    copy-engine-vs-SM comparison (the headline of this view) is direct. Dedup to newest doc per
+    (sku, topology, transport); carry copy_engine_uses_near_zero_sms as a note. ADDITIVE.
+
+    Args:
+        results_dir: directory handed to _iter_docs() to enumerate the result documents.
+
+    Returns:
+        Series dicts (rows sorted by ascending size) with colors assigned by
+        _assign_coll_colors(). Rows lacking size_bytes or bandwidth_gbps are dropped.
+    """
+    docs = []
+    for d in _iter_docs(results_dir):
+        if d.get("family") != "copy-engine" or not d.get("rows"):
+            continue
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        docs.append(((sku, d.get("topology_class"), d.get("transport")), d.get("generated_at"), d))
+    series = []
+    for d in _dedup_newest(docs):
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        topo = d.get("topology_class") or "?"
+        transport = d.get("transport") or ""
+        valid = (d.get("status") or "?") == "valid"
+        peak = d.get("peak_bandwidth_gbps")
+        nz = d.get("copy_engine_uses_near_zero_sms")
+        # Gather the note fragments and join once, so a doc without a peak value does not yield a
+        # note that starts with a dangling " · " separator.
+        note_parts = []
+        if peak is not None:
+            note_parts.append(f"peak {peak:.0f} GB/s")
+        if nz is not None:
+            note_parts.append(f"copy-engine uses near-zero SMs: {'yes' if nz else 'no'}")
+        note = " · ".join(note_parts)
+        lines: dict = {}   # (op, engine) -> rows
+        for r in d["rows"]:
+            if r.get("size_bytes") is None or r.get("bandwidth_gbps") is None:
+                continue
+            lines.setdefault((r.get("op"), r.get("engine")), []).append({
+                "size": r["size_bytes"], "bw": r.get("bandwidth_gbps"), "lat": r.get("latency_us")})
+        for (op, engine), rows in lines.items():
+            rows.sort(key=lambda x: x["size"])
+            series.append({
+                "family": "copy-engine", "sku": sku, "topo": topo, "transport": transport,
+                "op": op, "sub": engine, "valid": valid, "status": d.get("status") or "?",
+                "note": note, "peak": peak,
+                "label": f'{sku.upper()} · {op} · {engine}',
+                # provisional SKU color; _assign_coll_colors() replaces it per ckey
+                "ckey": f'{sku}|{op}|{engine}', "color": COLORS.get(sku, "#555"),
+                "rows": rows,
+            })
+    return _assign_coll_colors(series)
+
+
+def load_kvcache_series(results_dir: str) -> list[dict]:
+    """family=kv-cache (KV block transfer). ONE line per (sku, direction, layout, backend) so paged-
+    vs-contiguous and the direction breakdown are visible. groups[] each carry their own rows[]
+    (transfer_bytes -> bandwidth_gb_s / time_ms). Dedup to newest doc per (sku, transport); note the
+    declared-unwired backends. ADDITIVE.
+
+    Args:
+        results_dir: directory handed to _iter_docs() to enumerate the result documents.
+
+    Returns:
+        Series dicts (rows sorted by ascending size) with colors assigned by
+        _assign_coll_colors(). Rows lacking transfer_bytes or bandwidth_gb_s are dropped, as are
+        groups left with no rows.
+    """
+    docs = []
+    for d in _iter_docs(results_dir):
+        if d.get("family") != "kv-cache" or not d.get("groups"):
+            continue
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        docs.append(((sku, d.get("transport")), d.get("generated_at"), d))
+    series = []
+    for d in _dedup_newest(docs):
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        valid = (d.get("status") or "?") == "valid"
+        unwired = d.get("declared_unwired_backends") or []
+        wired = d.get("wired_backends") or []
+        # Gather the note fragments and join once, so a doc with no wired backends does not yield
+        # a note that starts with a dangling " · " separator.
+        note_parts = []
+        if wired:
+            note_parts.append(f"wired: {', '.join(wired)}")
+        if unwired:
+            note_parts.append(f"declared-unwired: {', '.join(unwired)}")
+        note = " · ".join(note_parts)
+        for g in d["groups"]:
+            direction, layout, backend = g.get("direction"), g.get("layout"), g.get("backend")
+            # topo falls back to the doc's transport when the group has no topology_class.
+            topo = g.get("topology_class") or d.get("transport") or "?"
+            rows = []
+            for r in (g.get("rows") or []):
+                if r.get("transfer_bytes") is None or r.get("bandwidth_gb_s") is None:
+                    continue
+                # NOTE(review): `lat` is fed from `time_ms` here, whereas load_offload_series and
+                # load_copy_engine_series feed it from `latency_us` — confirm the consumer accounts
+                # for the unit difference.
+                rows.append({"size": r["transfer_bytes"], "bw": r.get("bandwidth_gb_s"),
+                             "lat": r.get("time_ms"), "size_class": r.get("size_class"),
+                             "correct": r.get("correct")})
+            if not rows:
+                continue
+            rows.sort(key=lambda x: x["size"])
+            series.append({
+                "family": "kv-cache", "sku": sku, "topo": topo, "transport": d.get("transport") or "",
+                "op": direction, "sub": f'{layout}/{backend}', "valid": valid, "status": d.get("status") or "?",
+                "note": note,
+                "label": f'{sku.upper()} · {direction} · {layout} · {backend}',
+                # provisional SKU color; _assign_coll_colors() replaces it per ckey
+                "ckey": f'{sku}|{direction}|{layout}|{backend}', "color": COLORS.get(sku, "#555"),
+                "rows": rows,
+            })
+    return _assign_coll_colors(series)
+
+
+def load_rlmesh_series(results_dir: str) -> list[dict]:
+    """family=rl-mesh (RL trainer<->generator weight-transfer mesh). ONE line per (sku, direction,
+    pattern) so trainer->gen vs gen->trainer AND paired (1:1 send/recv) vs redistribute (disjoint
+    all-to-all reshard) are all visible. groups-nested like kv-cache (each group carries its own
+    rows[]: transfer_bytes -> bandwidth_gb_s / time_ms). Dedup to newest doc per (sku, transport);
+    note the mesh split (trainer N <-> generator M). ADDITIVE.
+
+    Args:
+        results_dir: directory handed to _iter_docs() to enumerate the result documents.
+
+    Returns:
+        Series dicts (rows sorted by ascending size) with colors assigned by
+        _assign_coll_colors(). Rows lacking transfer_bytes or bandwidth_gb_s are dropped, as are
+        groups left with no rows.
+    """
+    docs = []
+    for d in _iter_docs(results_dir):
+        if d.get("family") != "rl-mesh" or not d.get("groups"):
+            continue
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        docs.append(((sku, d.get("transport")), d.get("generated_at"), d))
+    # short direction labels keep the legend compact (raw direction stays in `op` for grouping).
+    short = {"trainer_to_generator": "trn→gen", "generator_to_trainer": "gen→trn"}
+    series = []
+    for d in _dedup_newest(docs):
+        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+        valid = (d.get("status") or "?") == "valid"
+        peak = d.get("peak_bandwidth_gb_s")
+        ws, tr, gr = d.get("world_size"), d.get("trainer_ranks"), d.get("generator_ranks")
+        # Gather the note fragments and join once, so a doc without a peak value does not yield a
+        # note that starts with a dangling " · " separator.
+        note_parts = []
+        if peak is not None:
+            note_parts.append(f"peak {peak:.0f} GB/s")
+        if ws is not None:
+            note_parts.append(f"world={ws}: trainer {tr} ↔ generator {gr}")
+        note = " · ".join(note_parts)
+        for g in d["groups"]:
+            direction, pattern = g.get("direction"), g.get("pattern")
+            # topo falls back to the doc's transport when the group has no topology_class.
+            topo = g.get("topology_class") or d.get("transport") or "?"
+            rows = []
+            for r in (g.get("rows") or []):
+                if r.get("transfer_bytes") is None or r.get("bandwidth_gb_s") is None:
+                    continue
+                rows.append({"size": r["transfer_bytes"], "bw": r.get("bandwidth_gb_s"),
+                             "lat": r.get("time_ms"), "correct": r.get("correct")})
+            if not rows:
+                continue
+            rows.sort(key=lambda x: x["size"])
+            # unknown directions keep their raw name in the label
+            dlab = short.get(direction, direction)
+            series.append({
+                "family": "rl-mesh", "sku": sku, "topo": topo, "transport": d.get("transport") or "",
+                "op": direction, "sub": pattern, "valid": valid, "status": d.get("status") or "?",
+                "note": note,
+                "label": f'{sku.upper()} · {dlab} · {pattern}',
+                # provisional SKU color; _assign_coll_colors() replaces it per ckey
+                "ckey": f'{sku}|{direction}|{pattern}', "color": COLORS.get(sku, "#555"),
+                "rows": rows,
+            })
+    return _assign_coll_colors(series)
+
+
+# Budgets (µs) for the "max tokens / rank under a p99 round-trip budget" decision view (goal P3-D,
+# the previously-missing metric). Picked to bracket a typical decode SLO band.
+# Used as the default `budgets` argument of max_tokens_under_budget(), which emits one
+# "b<budget>" column per entry.
+RT_BUDGETS_US = [100, 250, 500]
+
+
+def _rt_p99(row):
+    """Measured round-trip p99 for a plot_ep row, read from the nested `roundtrip` dict (v4).
+
+    No fallback is applied here: when the row has no (or an empty) `roundtrip` entry, or that
+    dict has no `p99` key, the result is None. In particular this function never reads
+    `isolated_sum`.
+    """
+    rt = row.get("roundtrip") or {}
+    return rt.get("p99")
+
+
+def max_tokens_under_budget(series, budgets=RT_BUDGETS_US):
+    """Largest tokens/rank that fits under each round-trip p99 budget, per headline cell.
+
+    A cell is (sku, backend, phase, dtype, ep, mode) — mode defaults to "normal" — and only
+    HEADLINE series feed it: official, non-empty wid, DeepSeek-V3 shape (7168/8/256), uniform
+    routing. For every budget the result row carries `b<budget>`: the largest tokens/rank whose
+    MEASURED round-trip p99 is <= that budget, or None when no measured point satisfies it
+    (rendered as '—'). This is the "how much load fits under an SLO" number the chart did not
+    previously expose. Cells where every budget is None are omitted.
+    """
+    grouped = {}
+    for entry in series:
+        shape = entry.get("shape") or {}
+        is_headline = (entry.get("pub") == "official" and entry.get("wid")
+                       and shape.get("hidden") == 7168 and shape.get("topk") == 8
+                       and shape.get("experts") == 256
+                       and entry.get("routing") == "uniform")
+        if not is_headline:
+            continue
+        cell = (entry["sku"], entry["backend"], entry["phase"], entry["dtype"], entry["ep"],
+                entry.get("mode", "normal"))
+        bucket = grouped.setdefault(cell, [])
+        for meas in entry["rows"]:
+            p99 = _rt_p99(meas)
+            # need both a measured p99 and a token count to form a point
+            if not p99 or not meas.get("t"):
+                continue
+            bucket.append((meas["t"], p99))
+
+    table = []
+    for cell in sorted(grouped):
+        sku, backend, phase, dtype, ep, mode = cell
+        points = sorted(grouped[cell])
+        record = {"sku": sku, "backend": backend, "phase": phase, "dtype": dtype, "ep": ep, "mode": mode}
+        for limit in budgets:
+            within = [tok for tok, p99 in points if p99 <= limit]
+            record[f"b{limit}"] = max(within) if within else None
+        # keep the table to useful cells: skip a cell no budget can satisfy
+        if any(record.get(f"b{limit}") is not None for limit in budgets):
+            table.append(record)
+    return table
+
+
+def summary_cards(series, sens_rows, failed, ll_rows):
+    """Industry-summary headline cards (goal P3-F), computed from the loaded series.
+
+    Comparisons use the MEASURED round-trip p99 on the official DeepSeek-V3 headline cohort so the
+    cards match the default chart view.
+
+    Args:
+        series: series dicts; only official, non-empty-wid, 7168/8/256, uniform-routing entries
+            take part in the "best backend" / "winner" cards.
+        sens_rows: distribution-sensitivity rows (may be empty or None). Rows whose
+            `distribution_sensitivity_ratio` is None are ignored.
+        failed: collection of invalid/diagnostic cases; only its length is used.
+        ll_rows: analyze_ep's ll_crossover() output (used for the LL→normal crossover card).
+
+    Returns:
+        A list of card dicts {title, value, sub, [warn], [href]}, always in this order: best
+        decode EP8, best prefill EP8, LL crossover, resource-normalized winner, backend-default
+        winner, most unstable config, invalid/diagnostic count.
+    """
+    def headline(s):
+        sh = s.get("shape") or {}
+        return (s.get("pub") == "official" and s.get("wid")
+                and sh.get("hidden") == 7168 and sh.get("topk") == 8 and sh.get("experts") == 256
+                and s.get("routing") == "uniform")
+
+    def best_rt(pred, T_decode=64, T_prefill=256):
+        """lowest round-trip p99 over series matching pred, at the phase's headline token count."""
+        best = None
+        for s in series:
+            if not (headline(s) and pred(s)):
+                continue
+            T = T_decode if s["phase"] == "decode" else T_prefill
+            for r in s["rows"]:
+                if r.get("t") == T:
+                    q = _rt_p99(r)
+                    if q and (best is None or q < best[0]):
+                        best = (q, s, T)
+        return best
+
+    cards = []
+
+    def fmt_best(b, label):
+        """append the card for a best_rt() result, or a 'no data' card when nothing matched."""
+        if not b:
+            cards.append({"title": label, "value": "no data", "sub": "no official headline cell at this phase/EP"})
+            return
+        q, s, T = b
+        cards.append({"title": label,
+                      "value": f"{s['backend']} · {s['sku'].upper()}",
+                      "sub": f"{q:.0f} µs RT p99 · {s['dtype']} · T={T}"})
+
+    fmt_best(best_rt(lambda s: s["phase"] == "decode" and s["ep"] == 8), "Best backend · decode EP8")
+    fmt_best(best_rt(lambda s: s["phase"] == "prefill" and s["ep"] == 8), "Best backend · prefill EP8")
+
+    # LL crossover (measured-roundtrip basis, p50): first cell with a real crossover token count.
+    crosses = [r for r in (ll_rows or [])
+               if r.get("basis") == "measured-roundtrip" and r.get("stat") == "p50"
+               and isinstance(r.get("normal_faster_at_T"), int)]
+    if crosses:
+        c = min(crosses, key=lambda r: r["normal_faster_at_T"])
+        cards.append({"title": "LL → normal crossover",
+                      "value": f"T≈{c['normal_faster_at_T']} tok/rank",
+                      "sub": f"{c['sku'].upper()} EP{c['ep']} {c['dtype']} · normal RT p50 wins above this (measured)"})
+    else:
+        cards.append({"title": "LL → normal crossover", "value": "none in range",
+                      "sub": "normal RT never beats LL within the measured token ladder"})
+
+    # Resource-normalized vs backend-default winners (decode EP8 headline).
+    rn = best_rt(lambda s: s["phase"] == "decode" and s["ep"] == 8 and s["suite"] == "resource-constrained")
+    bd = best_rt(lambda s: s["phase"] == "decode" and s["ep"] == 8 and s["suite"] == "backend-default")
+    fmt_best(rn, "Resource-normalized winner")
+    fmt_best(bd, "Backend-default winner")
+
+    # Most unstable configuration: highest distribution-sensitivity ratio (p99 worst/uniform).
+    # Rows without a ratio cannot be ranked or formatted with ':.2f', so they are left out; if
+    # nothing rankable remains the card degrades to "n/a" instead of raising.
+    rated = [g for g in (sens_rows or []) if g.get("distribution_sensitivity_ratio") is not None]
+    if rated:
+        w = max(rated, key=lambda g: g["distribution_sensitivity_ratio"])
+        cards.append({"title": "Most unstable config", "warn": True,
+                      "value": f"{w['sku'].upper()} · {w['backend']} {w['phase']}",
+                      "sub": f"{w['distribution_sensitivity_ratio']:.2f}× p99 under {w.get('worst_distribution','?')} vs uniform"})
+    else:
+        cards.append({"title": "Most unstable config", "value": "n/a", "sub": "no multi-distribution group yet"})
+
+    # Known invalid / diagnostic cases (count + link to the Evidence tab's failed table).
+    n = len(failed or [])
+    cards.append({"title": "Invalid / diagnostic cases", "warn": n > 0,
+                  "value": str(n), "sub": ("see Evidence ▸ failed table" if n else "none — all runs publishable"),
+                  "href": "#tab-evidence"})
+    return cards
+
+
+# Static page prologue: doctype, <head> with the inline stylesheet (dark palette via CSS
+# variables, controls, legend, tabs, summary cards, decision/provenance tables), then the opening
+# of <body> and the `.wrap` container with the page <h1> and the empty `#prov` paragraph.
+# The container opened here is closed by TAIL.
+HEAD = """<!doctype html><html lang="en"><head><meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>CollectiveX — EP dispatch / combine</title>
+<style>
+:root{--bg:#0f1115;--panel:#171a21;--ink:#e6e9ef;--mut:#9aa4b2;--line:#2a2f3a;--accent:#5b8def}
+*{box-sizing:border-box}
+body{margin:0;background:var(--bg);color:var(--ink);font:14px/1.45 -apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,Helvetica,Arial,sans-serif}
+.wrap{max-width:1080px;margin:0 auto;padding:24px 18px 64px}
+h1{font-size:20px;margin:0 0 4px} h2{font-size:15px;color:var(--mut);font-weight:600;margin:28px 0 10px;border-bottom:1px solid var(--line);padding-bottom:6px}
+.sub{color:var(--mut);font-size:12.5px;margin:0 0 18px}
+.controls{display:flex;flex-wrap:wrap;gap:14px;background:var(--panel);border:1px solid var(--line);border-radius:10px;padding:12px 14px;margin-bottom:14px}
+.grp{display:flex;flex-direction:column;gap:5px}
+.grp .lab{font-size:11px;letter-spacing:.04em;text-transform:uppercase;color:var(--mut)}
+.seg{display:inline-flex;border:1px solid var(--line);border-radius:8px;overflow:hidden}
+.seg button{background:transparent;color:var(--mut);border:0;padding:6px 11px;font-size:12.5px;cursor:pointer}
+.seg button:hover{color:var(--ink)}
+.seg button.on{background:var(--accent);color:#fff}
+.card{background:var(--panel);border:1px solid var(--line);border-radius:10px;padding:10px}
+.legend{display:flex;flex-wrap:wrap;gap:16px;margin:6px 2px 0;color:var(--mut);font-size:12.5px}
+.guard{background:#3a2a14;border:1px solid #6b4f1f;color:#f0c674;border-radius:6px;padding:6px 10px;margin:6px 2px;font-size:12px}
+table.cov{border-collapse:collapse;font-size:12px;width:100%;margin:4px 0 18px}
+table.cov th,table.cov td{border:1px solid var(--line);padding:3px 8px;text-align:left}
+table.cov th{color:var(--mut)}
+.badge{color:#0f1115;border-radius:4px;padding:1px 6px;font-size:11px;font-weight:600}
+.legend .it{display:flex;align-items:center;gap:7px}
+.legend .sw{width:22px;height:3px;border-radius:2px;display:inline-block}
+.grid{display:grid;grid-template-columns:repeat(3,1fr);gap:12px}
+.gtit{font-size:12.5px;color:var(--ink);margin:0 0 2px;font-weight:600}
+.note{color:var(--mut);font-size:12px;margin-top:10px}
+svg{display:block;width:100%;height:auto}
+.ax{stroke:var(--line);stroke-width:1}.gl{stroke:var(--line);stroke-width:1;opacity:.45}
+.tk{fill:var(--mut);font-size:11px}.axl{fill:var(--mut);font-size:11.5px}
+.ttl{fill:var(--ink);font-size:13px;font-weight:600}
+circle.pt{stroke:#0f1115;stroke-width:1}
+@media(max-width:760px){.grid{grid-template-columns:1fr}}
+/* Tabs (goal P3-C): pure CSS/JS, no libs. One nav row; one .tab panel shown at a time. */
+.tabs{display:flex;flex-wrap:wrap;gap:4px;border-bottom:1px solid var(--line);margin:8px 0 16px}
+.tabs button{background:transparent;color:var(--mut);border:0;border-bottom:2px solid transparent;padding:9px 14px;font-size:13px;cursor:pointer;font-weight:600}
+.tabs button:hover{color:var(--ink)}
+.tabs button.on{color:var(--ink);border-bottom-color:var(--accent)}
+.tabs button:disabled{color:#555;cursor:not-allowed;font-weight:400}
+.tabs button:disabled:hover{color:#555}
+.tab{display:none}.tab.on{display:block}
+.soon{color:var(--mut);font-size:13px;background:var(--panel);border:1px dashed var(--line);border-radius:10px;padding:22px 18px;margin:8px 0}
+.soon b{color:var(--ink)}
+/* Industry summary cards (goal P3-F): a responsive row of headline takeaways. */
+.cards{display:grid;grid-template-columns:repeat(auto-fill,minmax(214px,1fr));gap:10px;margin:6px 0 4px}
+.kcard{background:var(--panel);border:1px solid var(--line);border-radius:10px;padding:11px 13px}
+.kcard .kt{font-size:11px;letter-spacing:.03em;text-transform:uppercase;color:var(--mut);margin-bottom:5px}
+.kcard .kv{font-size:15px;font-weight:700;color:var(--ink);line-height:1.25}
+.kcard .ks{font-size:11.5px;color:var(--mut);margin-top:3px}
+.kcard.warn{border-color:#6b4f1f}.kcard.warn .kv{color:#f0c674}
+.kcard a{color:var(--accent);text-decoration:none}.kcard a:hover{text-decoration:underline}
+/* Decision tables (goal P3-D): compact, same palette as the coverage tables. */
+table.dec{border-collapse:collapse;font-size:12px;width:100%;margin:4px 0 20px}
+table.dec th,table.dec td{border:1px solid var(--line);padding:3px 8px;text-align:left;white-space:nowrap}
+table.dec th{color:var(--mut);font-weight:600}
+table.dec td.num{text-align:right;font-variant-numeric:tabular-nums}
+.win{color:#2ca02c;font-weight:600}
+/* Provenance drawer (goal P3-E): collapsible per-series provenance + artifact links. */
+details.prov{background:var(--panel);border:1px solid var(--line);border-radius:10px;padding:4px 12px;margin:6px 0 18px}
+details.prov>summary{cursor:pointer;color:var(--ink);font-weight:600;font-size:13px;padding:7px 0;list-style:none}
+details.prov>summary::-webkit-details-marker{display:none}
+details.prov>summary:before{content:"▸ ";color:var(--mut)}
+details.prov[open]>summary:before{content:"▾ "}
+table.prov{border-collapse:collapse;font-size:11.5px;width:100%;margin:6px 0 8px}
+table.prov th,table.prov td{border:1px solid var(--line);padding:3px 7px;text-align:left;white-space:nowrap}
+table.prov th{color:var(--mut)}
+table.prov a{color:var(--accent);text-decoration:none}table.prov a:hover{text-decoration:underline}
+.mono{font-family:ui-monospace,SFMono-Regular,Menlo,monospace;font-size:11px;color:var(--mut)}
+</style></head><body><div class="wrap">
+<h1>CollectiveX — EP dispatch / combine</h1>
+<p class="sub" id="prov"></p>
+"""
+
+# Page epilogue: closes the `.wrap` div, <body> and <html> opened in HEAD.
+TAIL = "</div></body></html>"
+
+JS = r"""
+const SKUS = [...new Set(DATA.map(s=>s.sku))];
+// roundtrip = INDEPENDENTLY MEASURED chained latency (v4). isolated_sum = Σ of isolated
+// dispatch+combine percentiles — NOT a measured op (no throughput/SLO use). serial(v3)->isolated_sum.
+const OPS = {dispatch:"Dispatch", combine:"Combine", roundtrip:"Round trip (measured)", isolated_sum:"Isolated sum (Σp, not measured)"};
+// NOT algorithmic/bus bandwidth: logical routed payload (recv copies x hidden x dtype)
+// over latency; dispatch & combine count their OWN bytes. Excludes scales/idx/meta/padding.
+const YK  = {lat:"Latency (µs)", tps:"Tokens / s", bw:"Logical routed payload rate (GB/s)"};
+const XK  = {t:"Source tokens / rank", gt:"Global source tokens"};
+const PCT = {p50:"p50", p90:"p90", p99:"p99"};
+const SUITE = {all:"All", "backend-default":"Backend-default", "resource-constrained":"Resource-constrained"};
+// Routing distributions present in the data (+ "all"): uniform (baseline) / balanced /
+// zipf (skewed) / zipf+eplb (skew rebalanced by EPLB replication). Default to uniform so the
+// initial view matches the headline sweep; switch to compare zipf vs zipf+eplb.
+const ROUTING = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.routing))].sort().forEach(r=>{o[r]=r;}); return o; })();
+// Prefill panels show only the real large-T prefill range. MoRI ramps its prefill sweep from 1
+// (cold-jump wedge) and records decode-scale points; the intended prefill floor is the DeepEP
+// prefill ladder min. So every SKU's prefill panel starts there — the sub-floor MoRI points are
+// ramp-warmup (same kernel as decode) and live in the decode panel, not fabricated/duplicated here.
+const _dpf = DATA.filter(s=>s.phase==="prefill"&&s.backend==="deepep").flatMap(s=>s.rows.map(r=>r.t));
+const PREFILL_MIN = _dpf.length? Math.min(..._dpf) : 128;
+// Publication-status filter (goal P1): default hides diagnostic/invalid/failed so the first
+// view is publication-valid; "publishable" = official + comparable-experimental + legacy v3.
+// The OFFICIAL view additionally drops wid=null lines (a non-canonical workload can never be
+// official — goal P1) so an official chart can never show a wid=null or non-official cohort.
+// "official-headline" (goal P0-1a, B6/B7) is the DEFAULT opening filter: official + canonical wid
+// AND the single cross-hardware headline MoE shape (DeepSeek-V3 7168/8/256) — so the page opens on
+// exactly the apples-to-apples headline cohort, never a mixed-shape official set. Every broader set
+// (official / publishable / all) stays one click away.
+// MODEL-SHAPE selector (follow-up): each result carries a MoE shape (hidden/topk/experts) named in
+// Python (s.model). The headline shape is DeepSeek-V3/V4 (7168/8/256). The option list is built
+// DYNAMICALLY from the shapes ACTUALLY present in DATA (a shape with no data is never offered);
+// each option is keyed by "hidden/topk/experts" and labelled "<ModelName> (h/topk/e)". "all" = every
+// shape. Default = the headline shape so the opening view is unchanged.
+const HEADLINE_SHAPE = {hidden:7168, topk:8, experts:256};
+const SHAPE_KEY = sh => (sh? (sh.hidden+'/'+sh.topk+'/'+sh.experts) : '?');
+const HEADLINE_SHAPE_KEY = HEADLINE_SHAPE.hidden+'/'+HEADLINE_SHAPE.topk+'/'+HEADLINE_SHAPE.experts;
+// {shapeKey -> "Model (h/topk/e)"} for every distinct shape in DATA, headline first then by size.
+const MODELS = (()=>{
+  const seen={}; DATA.forEach(s=>{ const k=SHAPE_KEY(s.shape); if(!(k in seen)) seen[k]=s.model||('shape '+k); });
+  const keys=Object.keys(seen).sort((a,b)=>{ if(a===HEADLINE_SHAPE_KEY) return -1; if(b===HEADLINE_SHAPE_KEY) return 1; return a.localeCompare(b,undefined,{numeric:true}); });
+  const o={all:"All shapes"}; keys.forEach(k=>{ o[k]=seen[k]+' ('+k+')'; }); return o;
+})();
+const MODEL_DEFAULT = (HEADLINE_SHAPE_KEY in MODELS)? HEADLINE_SHAPE_KEY : Object.keys(MODELS).filter(k=>k!=="all")[0];
+function modelOk(s){ return ST.model==="all" || SHAPE_KEY(s.shape)===ST.model; }
+// isHeadlineShape now means "matches the SELECTED model shape" (defaults to DeepSeek-V3/V4), so the
+// official-headline filter follows the model selector instead of being pinned to one shape.
+function isHeadlineShape(s){ return modelOk(s); }
+const PUB = {"official-headline":"Official headline", official:"Official only", publishable:"Publishable", all:"All (incl. diagnostic)"};
+function pubOk(s){
+  if(ST.pub==="all") return true;
+  if(ST.pub==="official-headline") return s.pub==="official" && !!s.wid && isHeadlineShape(s);  // official + selected model shape
+  if(ST.pub==="official") return s.pub==="official" && !!s.wid;   // official => canonical wid required
+  // publishable = official + comparable, but ONLY with a NON-NULL workload id (goal P0: every
+  // plotted official/comparable result carries non-null workload identity). A seeded-runtime
+  // (wid=null) line is shown only in the "All (incl. diagnostic)" view, never as publishable.
+  return !["diagnostic","invalid","failed"].includes(s.pub) && !!s.wid;
+}
+// dtype + EP-degree filters (goal P0-1a/B2): the headline opens on BF16 + EP8, but "All" keeps
+// every dtype / EP degree selectable. Applied to the MAIN chart + legend only (the grid + heatmaps
+// facet by EP themselves). Built from the data so a new dtype/EP shows up automatically.
+const DTYPES = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.dtype))].sort().forEach(d=>{o[d]=d;}); return o; })();
+const EPS = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b).forEach(e=>{o[String(e)]="EP"+e;}); return o; })();
+function dtOk(s){ return ST.dtype==="all" || s.dtype===ST.dtype; }
+function epOk(s){ return ST.ep==="all" || String(s.ep)===ST.ep; }
+// HEADLINE DISTRIBUTION CONTRACT (goal P2 "define one headline distribution"): uniform is the
+// single cross-hardware headline — controlled, deterministic, and present on every SKU, so it is
+// the apples-to-apples reference. balanced / zipf / zipf+eplb / hotspot* are SENSITIVITY views
+// (see the Distribution-sensitivity section), NOT peer headline dimensions. (Long-term headline
+// will come from InferenceX trace replay; zipf+eplb is the interim load-realism reference.)
+const HEADLINE_DISTRIBUTION = "uniform";
+// HEADLINE OPENING VIEW (goal P0-1a, B2/B6/B7): the page opens on the MEASURED round trip at p99,
+// resource-constrained (normalized) suite, BF16, EP8, uniform routing, DeepSeek-V3 shape, official
+// headline cohort. Every other value stays selectable via the toggles below — this only sets what
+// the page OPENS with. resolveHeadlineDefaults() (called once at boot) falls the resource suite
+// back to backend-default if no normalized data exists for the headline cell, so the chart is never
+// empty on first paint while still defaulting to normalized whenever it is present.
+const ST  = {op:"roundtrip", phase:"decode", x:"t", y:"lat", xlog:true, ylog:true, pct:"p99",
+             suite:"resource-constrained", dtype:"bf16", ep:"8", model:MODEL_DEFAULT,
+             routing:HEADLINE_DISTRIBUTION, pub:"all"};
+// NOTE: pub defaults to "all" so the page opens showing the full sweep — the bulk of the data is
+// SEEDED-RUNTIME (comparable-experimental, wid=null), which the "official"/"publishable" filters
+// exclude by design (they require a canonical workload id). Toggle the publication filter to
+// "Official headline"/"Official"/"Publishable" for the publication-grade cohort only.
+// Count series visible under a candidate state (used only for graceful headline fallback). Model-
+// aware: the candidate carries o.model, and the official-headline branch matches that shape.
+function _visCount(o){ return DATA.filter(s=>s.phase===o.phase
+    && (o.suite==="all"||s.suite===o.suite) && (o.routing==="all"||s.routing===o.routing)
+    && (o.dtype==="all"||s.dtype===o.dtype) && (o.ep==="all"||String(s.ep)===o.ep)
+    && (o.model==="all"||SHAPE_KEY(s.shape)===o.model)
+    && _pubOkFor(s,o.pub,o.model)).length; }
+function _pubOkFor(s,pub,model){
+  if(pub==="all") return true;
+  const shapeOk = (model==null||model==="all"||SHAPE_KEY(s.shape)===model);
+  if(pub==="official-headline") return s.pub==="official" && !!s.wid && shapeOk;
+  if(pub==="official") return s.pub==="official" && !!s.wid;
+  return !["diagnostic","invalid","failed"].includes(s.pub) && !!s.wid;
+}
+// Resolve the opening view so the FIRST paint is not empty when a relaxed filter set has data, while
+// keeping normalized as the preferred default. The ladder relaxes one filter per step, in order:
+// suite -> "all", dtype -> "all", EP -> "all", then publication -> "publishable" and finally ->
+// "all". A step is applied only while the current candidate still yields no visible series, and
+// earlier relaxations are kept. Routing, model and phase are never relaxed here.
+function resolveHeadlineDefaults(){
+  if(_visCount(ST)>0) return;   // the headline view already has data — leave ST untouched
+  const ladder=[["suite","all"],["dtype","all"],["ep","all"],["pub","publishable"],["pub","all"]];
+  for(const [k,v] of ladder){ ST[k]=v; if(_visCount(ST)>0) return; }
+}
+
+function xval(r,xk){ return xk==="t"? r.t : r.gt; }
+function metric(r,op,yk,pct){
+  const us=(r[op] && r[op][pct]!=null)? r[op][pct] : (r[op]? r[op].p50 : 0);
+  if(yk==="lat") return us;
+  if(yk==="tps") return r.gt/(us*1e-6);
+  const b = op==="dispatch"? r.dbytes : op==="combine"? r.cbytes : (r.dbytes + r.cbytes);
+  return us>0 ? b/(us*1e3) : 0;   // logical routed payload rate (GB/s), per-op bytes
+}
+function fmt(v){
+  if(v>=1e9) return (v/1e9).toFixed(v<1e10?2:0)+"G";
+  if(v>=1e6) return (v/1e6).toFixed(v<1e7?2:0)+"M";
+  if(v>=1e3) return (v/1e3).toFixed(v<1e4?1:0)+"k";
+  if(v>=10)  return v.toFixed(0);
+  if(v>=1)   return v.toFixed(v<3?1:0);
+  return v.toFixed(2);
+}
+function logTicks(mn,mx){
+  const t=[]; let e=Math.floor(Math.log10(mn));
+  for(;Math.pow(10,e)<=mx*1.0001;e++) for(const m of [1,2,5]){const v=m*Math.pow(10,e); if(v>=mn*0.999&&v<=mx*1.001)t.push(v);}
+  return t.length?t:[mn,mx];
+}
+function linTicks(mn,mx){
+  const span=mx-mn||1, step=Math.pow(10,Math.floor(Math.log10(span))); const t=[];
+  let s=step; if(span/step>6)s=step*2; if(span/step<3)s=step/2;
+  for(let v=Math.ceil(mn/s)*s; v<=mx*1.0001; v+=s) t.push(+v.toFixed(6));
+  return t.length?t:[mn,mx];
+}
+const mapLog=(v,a,b,p,q)=>p+(Math.log(v)-Math.log(a))/(Math.log(b)-Math.log(a))*(q-p);
+const mapLin=(v,a,b,p,q)=>p+(v-a)/(b-a)*(q-p);
+
+// Build one SVG chart. opts: {op,phase,x,y,ylog,title,legend,w,h}
+function chart(o){
+  const W=o.w||900, H=o.h||520, m={l:64,r:16,t:34,b:46};
+  const pct=o.pct||"p99", suite=o.suite||"all", routing=o.routing||"all";
+  // o.dtype / o.epf / o.model are the MAIN-chart headline filters (default-off so the grid, which
+  // faces by EP via o.ep, is unaffected). epf "all"|"8"…; dtype "all"|"bf16"…; model "all"|"hidden/topk/experts".
+  const sl = DATA.filter(s=>s.phase===o.phase && (o.ep==null || s.ep===o.ep)
+                            && (suite==="all" || s.suite===suite)
+                            && (routing==="all" || s.routing===routing)
+                            && (!o.dtype || o.dtype==="all" || s.dtype===o.dtype)
+                            && (!o.epf || o.epf==="all" || String(s.ep)===o.epf)
+                            && (!o.model || o.model==="all" || SHAPE_KEY(s.shape)===o.model) && pubOk(s));
+  const pts = sl.map(s=>({s, P:s.rows.map(r=>({x:xval(r,o.x), y:metric(r,o.op,o.y,pct), r}))
+                                     .filter(p=>p.x>0 && (o.ylog? p.y>0 : p.y>=0)
+                                                && (o.phase!=="prefill" || p.r.t>=PREFILL_MIN))}));
+  let xs=[], ys=[]; pts.forEach(g=>g.P.forEach(p=>{xs.push(p.x);ys.push(p.y);}));
+  if(!xs.length) return '<svg viewBox="0 0 '+W+' '+H+'"><text x="'+(W/2)+'" y="'+(H/2)+'" class="axl" text-anchor="middle">no data</text></svg>';
+  const xmn=Math.min(...xs), xmx=Math.max(...xs);
+  let ymn=Math.min(...ys), ymx=Math.max(...ys);
+  if(o.ylog){ ymn=Math.min(...ys.filter(v=>v>0)); } else { ymn=Math.min(0,ymn); }
+  if(ymx===ymn) ymx=ymn+1;
+  const X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t;
+  const xlog = o.xlog!==false;                              // x defaults to log (geometric sweep)
+  const xv=v=>xlog?mapLog(v,xmn,xmx,X0,X1):mapLin(v,xmn,xmx,X0,X1);
+  const yv=v=>o.ylog?mapLog(Math.max(v,ymn),ymn,ymx,Y0,Y1):mapLin(v,ymn,ymx,Y0,Y1);
+  let s='<svg viewBox="0 0 '+W+' '+H+'" role="img">';
+  s+='<text x="'+X0+'" y="20" class="ttl">'+o.title+'</text>';
+  // y grid + ticks
+  const yt=o.ylog?logTicks(ymn,ymx):linTicks(ymn,ymx);
+  yt.forEach(v=>{const y=yv(v); s+='<line class="gl" x1="'+X0+'" y1="'+y+'" x2="'+X1+'" y2="'+y+'"/>'+
+    '<text class="tk" x="'+(X0-7)+'" y="'+(y+3.5)+'" text-anchor="end">'+fmt(v)+'</text>';});
+  // x grid + ticks (label the actual sweep points)
+  const xt=[...new Set(xs)].sort((a,b)=>a-b);
+  xt.forEach(v=>{const x=xv(v); s+='<line class="gl" x1="'+x+'" y1="'+Y0+'" x2="'+x+'" y2="'+Y1+'"/>'+
+    '<text class="tk" x="'+x+'" y="'+(Y0+16)+'" text-anchor="middle">'+fmt(v)+'</text>';});
+  // axes
+  s+='<line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X1+'" y2="'+Y0+'"/><line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X0+'" y2="'+Y1+'"/>';
+  s+='<text class="axl" x="'+((X0+X1)/2)+'" y="'+(H-6)+'" text-anchor="middle">'+XK[o.x]+(xlog?'  (log)':'')+'</text>';
+  s+='<text class="axl" transform="translate(15,'+((Y0+Y1)/2)+') rotate(-90)" text-anchor="middle">'+YK[o.y]+(o.ylog?'  (log)':'')+'</text>';
+  // lines + points
+  pts.forEach(g=>{ if(!g.P.length) return;
+    const d=g.P.map((p,i)=>(i?'L':'M')+xv(p.x).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' ');
+    const dash=g.s.dash?' stroke-dasharray="'+g.s.dash+'"':'';
+    s+='<path d="'+d+'" fill="none" stroke="'+g.s.color+'" stroke-width="2"'+dash+'/>';
+    g.P.forEach(p=>{ const D=p.r.dispatch, C=p.r.combine, R=p.r.roundtrip;
+      // artifact links (goal P1): the workflow run + source SHA + image digest + workload id
+      // that produced this point. (Result JSON / manifest / raw-samples live alongside by name.)
+      const run=g.s.run_id? ('\nrun '+g.s.run_id+(g.s.source_sha?' @'+g.s.source_sha:'')) : '';
+      const art='\nworkload='+(g.s.wid||g.s.wsig||'?')+(g.s.image_digest?'  ·  image '+g.s.image_digest:'')
+                +(g.s.repo?'  ·  '+g.s.repo:'');
+      s+='<circle class="pt" cx="'+xv(p.x).toFixed(1)+'" cy="'+yv(p.y).toFixed(1)+'" r="3.2" fill="'+g.s.color+'">'+
+      '<title>'+g.s.label+'  ['+pct+']  ('+g.s.pub+')'+
+      '\nmodel='+(g.s.model||'?')+'  (hidden/topk/experts '+SHAPE_KEY(g.s.shape)+')'+
+      '\nT/rank='+p.r.t+'  ·  global='+p.r.gt+
+      '\n'+YK[o.y]+' = '+fmt(p.y)+(o.y==='lat'?' µs':o.y==='bw'?' GB/s':'')+
+      '\ndispatch  µs p50/p90/p99 = '+D.p50.toFixed(1)+'/'+D.p90.toFixed(1)+'/'+D.p99.toFixed(1)+
+      '\ncombine   µs p50/p90/p99 = '+C.p50.toFixed(1)+'/'+C.p90.toFixed(1)+'/'+C.p99.toFixed(1)+
+      '\nroundtrip µs p50/p90/p99 = '+R.p50.toFixed(1)+'/'+R.p90.toFixed(1)+'/'+R.p99.toFixed(1)+' (measured)'+
+      '\nfan-out='+(p.r.fanout!=null?p.r.fanout.toFixed(2):'?')+'  ·  recv(max)='+p.r.recv
+      +(p.r.straggler!=null?'  ·  straggler=r'+p.r.straggler:'')+(p.r.correct?'':'  ✗')+
+      '\ncontract='+g.s.contract+'  ·  suite='+g.s.suite+
+      '\ndispatch='+g.s.dtype+'  ·  combine='+(g.s.cqm||'none')+'  ·  activation='+(g.s.act||'normal')+run+art+
+      '</title></circle>'; });
+  });
+  s+='</svg>'; return s;
+}
+// Comparison guard (goal P1): flag when overlaid lines are NOT a direct comparison —
+// differing topology at one EP, or differing realized workload signature within one routing.
+function guardNote(vis){
+  if(!vis.length) return '';
+  const w=[];
+  const topos=[...new Set(vis.map(s=>s.topo).filter(Boolean))];
+  if(topos.length>1) w.push('mixed topology ('+topos.join(', ')+')');
+  const byRt={}; vis.forEach(s=>{ (byRt[s.routing]=byRt[s.routing]||new Set()).add(s.wsig||'?'); });
+  const split=Object.entries(byRt).filter(([k,v])=>v.size>1).map(([k])=>k);
+  if(split.length) w.push('different workload trace within routing ['+split.join(',')+'] — NOT identical workloads');
+  // combine-quant / activation-value / workload-id are part of the workload contract: a quantized
+  // combine, a different value distribution, or a different canonical workload is NOT the same
+  // benchmark as the headline, even at matched routing/dims (review).
+  const cqms=[...new Set(vis.map(s=>s.cqm||'none'))];
+  if(cqms.length>1) w.push('mixed combine-quant ('+cqms.join(', ')+') — quantized combine is a different contract from dispatch');
+  const acts=[...new Set(vis.map(s=>s.act||'normal'))];
+  if(acts.length>1) w.push('mixed activation profile ('+acts.join(', ')+') — value distribution differs');
+  const wids=[...new Set(vis.map(s=>s.wid).filter(Boolean))];
+  if(wids.length>1) w.push('mixed workload_id ('+wids.join(' / ')+') — not the same canonical workload');
+  // source SHA: a cross-SKU OFFICIAL cohort must come from ONE benchmark source SHA (goal P1).
+  const shas=[...new Set(vis.map(s=>s.source_sha).filter(Boolean))];
+  if(shas.length>1) w.push('mixed source SHA ('+shas.join(' / ')+') — official cohorts need one benchmark SHA');
+  // wid=null cohorts can never be official (goal P1) — flag if any non-canonical line is shown.
+  const nullwid=vis.filter(s=>!s.wid).length;
+  if(nullwid && ST.pub==='official') w.push(nullwid+' line(s) have wid=null — excluded from the official view');
+  const eps=[...new Set(vis.map(s=>s.ep))];
+  if(eps.length>1) w.push('mixed EP degree '+eps.join('/')+' — compare only on the global-tokens x-axis');
+  return w.length? '<div class="guard">⚠ not a direct comparison: '+w.join('; ')+'</div>' : '';
+}
+function legend(phase, ep, suite, routing, dtype, epf, model){
+  return '<div class="legend">'+DATA.filter(s=>s.phase===phase && (ep==null||s.ep===ep)
+                                              && (!suite||suite==="all"||s.suite===suite)
+                                              && (!routing||routing==="all"||s.routing===routing)
+                                              && (!dtype||dtype==="all"||s.dtype===dtype)
+                                              && (!model||model==="all"||SHAPE_KEY(s.shape)===model)
+                                              && (!epf||epf==="all"||String(s.ep)===epf) && pubOk(s)).map(s=>{
+    const sw = s.dash ? 'background:repeating-linear-gradient(90deg,'+s.color+' 0 5px,transparent 5px 9px)'
+                      : 'background:'+s.color;   // dashed swatch = fp8 (matches the line)
+    // when shapes are mixed ("All shapes"), prefix the model so same-config lines of different
+    // models are distinguishable; a single-model view keeps the original (uncluttered) label.
+    const lab = (model==="all"? '['+(s.model||'?')+'] ' : '')+s.label;
+    return '<span class="it"><span class="sw" style="'+sw+'"></span>'+lab+'</span>';
+  }).join('')+'</div>';
+}
+function seg(name,opts,cur){
+  return '<div class="seg">'+Object.entries(opts).map(([k,v])=>
+    '<button data-grp="'+name+'" data-val="'+k+'" class="'+(k===cur?'on':'')+'">'+v+'</button>').join('')+'</div>';
+}
+function renderControls(){
+  document.getElementById('controls').innerHTML =
+    '<div class="grp"><span class="lab">Model shape (headline=DeepSeek-V3/V4)</span>'+seg('model',MODELS,ST.model)+'</div>'+
+    '<div class="grp"><span class="lab">Operation</span>'+seg('op',OPS,ST.op)+'</div>'+
+    '<div class="grp"><span class="lab">Phase</span>'+seg('phase',{decode:"Decode",prefill:"Prefill"},ST.phase)+'</div>'+
+    '<div class="grp"><span class="lab">Percentile</span>'+seg('pct',PCT,ST.pct)+'</div>'+
+    '<div class="grp"><span class="lab">Suite</span>'+seg('suite',SUITE,ST.suite)+'</div>'+
+    '<div class="grp"><span class="lab">Dispatch dtype</span>'+seg('dtype',DTYPES,ST.dtype)+'</div>'+
+    '<div class="grp"><span class="lab">EP degree</span>'+seg('ep',EPS,ST.ep)+'</div>'+
+    '<div class="grp"><span class="lab">Routing (headline='+HEADLINE_DISTRIBUTION+')</span>'+seg('routing',ROUTING,ST.routing)+'</div>'+
+    '<div class="grp"><span class="lab">Publication</span>'+seg('pub',PUB,ST.pub)+'</div>'+
+    '<div class="grp"><span class="lab">X-axis</span>'+seg('x',XK,ST.x)+'</div>'+
+    '<div class="grp"><span class="lab">X scale</span>'+seg('xlog',{true:"Log",false:"Linear"},String(ST.xlog))+'</div>'+
+    '<div class="grp"><span class="lab">Y-axis</span>'+seg('y',YK,ST.y)+'</div>'+
+    '<div class="grp"><span class="lab">Y scale</span>'+seg('ylog',{true:"Log",false:"Linear"},String(ST.ylog))+'</div>';
+  document.querySelectorAll('#controls button').forEach(b=>b.onclick=()=>{
+    const g=b.dataset.grp, v=b.dataset.val; ST[g]= (g==='ylog'||g==='xlog')? v==='true' : v;
+    // grid/heatmaps also reflect pct/suite/phase/scale toggles; scaling is headline-only (static).
+    renderControls(); renderMain(); renderGrid(); renderHeatmaps(); });
+}
+function renderMain(){
+  const mtag=(ST.model==='all'?' · all shapes':' · '+(MODELS[ST.model]||ST.model));
+  const tags=mtag+(ST.dtype==='all'?'':' · '+ST.dtype)+(ST.ep==='all'?'':' · EP'+ST.ep);
+  document.getElementById('chart').innerHTML = chart({op:ST.op,phase:ST.phase,x:ST.x,y:ST.y,xlog:ST.xlog,ylog:ST.ylog,
+    pct:ST.pct, suite:ST.suite, routing:ST.routing, dtype:ST.dtype, epf:ST.ep, model:ST.model,
+    title:OPS[ST.op]+' — '+ST.phase+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+tags+' ('+YK[ST.y].toLowerCase()+' vs '+XK[ST.x].toLowerCase()+')'});
+  const vis=DATA.filter(s=>s.phase===ST.phase && (ST.suite==="all"||s.suite===ST.suite)
+                           && (ST.routing==="all"||s.routing===ST.routing)
+                           && dtOk(s) && epOk(s) && modelOk(s) && pubOk(s));
+  document.getElementById('mlegend').innerHTML = guardNote(vis)+legend(ST.phase, null, ST.suite, ST.routing, ST.dtype, ST.ep, ST.model);
+}
+// Fill #grid with one section per (phase, EP degree) pair that has at least one visible series:
+// a heading, the comparison guard note, a legend, and three small latency-vs-t charts (dispatch,
+// combine, roundtrip). Suite, routing, percentile and axis scales follow the current ST toggles.
+function renderGrid(){
+  // SEPARATE panels per (phase, EP degree); within a panel, the SUITE selector keeps
+  // backend-default and resource-constrained lines from being read as one fair contest.
+  const phases=[...new Set(DATA.map(s=>s.phase))].sort();
+  const eps=[...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b);
+  let h='';
+  phases.forEach(ph=>{ eps.forEach(ep=>{
+    const panelVis=DATA.filter(s=>s.phase===ph && s.ep===ep && (ST.suite==="all"||s.suite===ST.suite)
+                     && (ST.routing==="all"||s.routing===ST.routing) && pubOk(s));
+    if(!panelVis.length) return;   // no series for this (phase, EP) — emit no panel
+    const scale=(ST.xlog?'log':'lin')+'–'+(ST.ylog?'log':'lin');
+    h+='<h2>'+ph[0].toUpperCase()+ph.slice(1)+' · EP'+ep+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+' — latency vs source tokens/rank (µs, '+scale+')</h2>'+
+       guardNote(panelVis)+legend(ph,ep,ST.suite,ST.routing)+'<div class="grid">';
+    // the grid always plots latency against t; only scale, percentile, suite and routing vary
+    ['dispatch','combine','roundtrip'].forEach(op=>{ h+='<div class="card"><div class="gtit">'+OPS[op]+'</div>'+
+      chart({op,phase:ph,ep,x:'t',y:'lat',xlog:ST.xlog,ylog:ST.ylog,pct:ST.pct,suite:ST.suite,routing:ST.routing,title:'',w:340,h:260})+'</div>'; });
+    h+='</div>'; }); });
+  document.getElementById('grid').innerHTML=h;
+}
+// Strong + weak SCALING views (goal P2 "separate views for strong and weak scaling" — do NOT rely
+// on the x-axis toggle to reinterpret one experiment). weak = fixed tokens/RANK, latency vs EP
+// (ideal: flat). strong = fixed GLOBAL tokens, latency vs EP (ideal: falls ~1/EP). Each labels its
+// scaling contract. Renders only for SKUs measured at >=2 EP degrees (the headline distribution).
+function scalingChart(kind){
+  // map: sku -> {ep -> {key(T or GT) -> p50 dispatch}}
+  const sl=DATA.filter(s=>s.routing===HEADLINE_DISTRIBUTION && s.mode==="normal"
+                          && s.contract==="layout-and-dispatch-v1" && pubOk(s));
+  const bySku={}; sl.forEach(s=>{ (bySku[s.sku]=bySku[s.sku]||{})[s.ep]=s; });
+  const skuColor={}; DATA.forEach(s=>{ skuColor[s.sku]=skuColor[s.sku]||s.color; });
+  const skus=Object.keys(bySku).filter(k=>Object.keys(bySku[k]).length>=2).sort();
+  if(!skus.length) return '<p class="note">No SKU measured at ≥2 EP degrees yet (needs e.g. GB300 EP4 + EP8). Strong/weak scaling renders here once a multi-EP cohort exists.</p>';
+  // build series: one line per sku; x=EP, y=latency at a fixed anchor (weak: tokens/rank=64; strong: global=512).
+  const anchorT=64, anchorGT=512;
+  const W=900,H=360,m={l:64,r:16,t:34,b:46},X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t;
+  const lines=[]; let xs=[],ys=[];
+  skus.forEach(sku=>{ const pts=[];
+    Object.keys(bySku[sku]).map(Number).sort((a,b)=>a-b).forEach(ep=>{ const s=bySku[sku][ep];
+      let r=null;
+      if(kind==="weak"){ r=s.rows.find(rr=>rr.t===anchorT); }
+      else { r=s.rows.find(rr=>rr.gt===anchorGT) || s.rows.find(rr=>rr.t===Math.round(anchorGT/ep)); }
+      if(r){ const y=r.dispatch.p50; if(y>0){ pts.push({ep,y}); xs.push(ep); ys.push(y);} }
+    });
+    if(pts.length) lines.push({sku,pts,color:(skuColor[sku]||"#888")});
+  });
+  if(!xs.length) return '<p class="note">No matched anchor points for '+kind+' scaling.</p>';
+  const xmn=Math.min(...xs),xmx=Math.max(...xs),ymn=Math.min(...ys),ymx=Math.max(...ys);
+  const xv=v=>mapLin(v,xmn,xmx||xmn+1,X0,X1), yv=v=>mapLin(v,Math.min(0,ymn),ymx||1,Y0,Y1);
+  let s='<svg viewBox="0 0 '+W+' '+H+'">';
+  s+='<text x="'+X0+'" y="20" class="ttl">'+(kind==="weak"?"Weak scaling — fixed tokens/rank="+anchorT+" (ideal: flat)":"Strong scaling — fixed global tokens="+anchorGT+" (ideal: ↓ ~1/EP)")+'</text>';
+  [...new Set(xs)].sort((a,b)=>a-b).forEach(v=>{const x=xv(v);s+='<line class="gl" x1="'+x+'" y1="'+Y0+'" x2="'+x+'" y2="'+Y1+'"/><text class="tk" x="'+x+'" y="'+(Y0+16)+'" text-anchor="middle">EP'+v+'</text>';});
+  linTicks(Math.min(0,ymn),ymx).forEach(v=>{const y=yv(v);s+='<line class="gl" x1="'+X0+'" y1="'+y+'" x2="'+X1+'" y2="'+y+'"/><text class="tk" x="'+(X0-7)+'" y="'+(y+3.5)+'" text-anchor="end">'+fmt(v)+'</text>';});
+  s+='<line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X1+'" y2="'+Y0+'"/><line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X0+'" y2="'+Y1+'"/>';
+  s+='<text class="axl" x="'+((X0+X1)/2)+'" y="'+(H-6)+'" text-anchor="middle">EP degree</text>';
+  s+='<text class="axl" transform="translate(15,'+((Y0+Y1)/2)+') rotate(-90)" text-anchor="middle">dispatch p50 (µs)</text>';
+  lines.forEach(g=>{ const d=g.pts.map((p,i)=>(i?'L':'M')+xv(p.ep).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' ');
+    s+='<path d="'+d+'" fill="none" stroke="'+g.color+'" stroke-width="2"/>';
+    g.pts.forEach(p=>{ s+='<circle class="pt" cx="'+xv(p.ep).toFixed(1)+'" cy="'+yv(p.y).toFixed(1)+'" r="3.5" fill="'+g.color+'"><title>'+g.sku.toUpperCase()+' EP'+p.ep+' '+kind+'-scaling: '+fmt(p.y)+' µs</title></circle>'; }); });
+  s+='</svg>'; return s;
+}
+function renderScaling(){
+  const el=document.getElementById('scaling'); if(!el) return;
+  el.innerHTML='<div class="card">'+scalingChart("weak")+'</div><div class="card" style="margin-top:12px">'+scalingChart("strong")+'</div>'
+    +'<p class="note">Strong vs weak are DISTINCT experiments with distinct scaling contracts (labelled in each title) — not one chart reinterpreted by an x-axis toggle. Headline distribution = '+HEADLINE_DISTRIBUTION+', layout-and-dispatch-v1, normal mode.</p>';
+}
+// HEATMAPS (goal P2): EP×tokens/rank and routing-skew×token-load (latency), placement×node and
+// resource×load where data exists. A cell is colored by dispatch p50 (log scale); empty cells are
+// blank (no measured point). One grid per (metric pairing) for the current phase + publishable set.
+// Build one heatmap as an SVG string, or '' when there is nothing to draw.
+//   rowKeyFn(series) -> row key for that series (null/undefined skips the series)
+//   rowLabel         -> name of the row dimension, used in cell tooltips
+//   rowVals, colVals -> candidate row keys / column values in display order; only those that
+//                       have at least one measured cell are drawn (columns are matched on r.t)
+//   title            -> heading text drawn at the top of the SVG
+// Series are restricted to the current ST.phase and ST.suite and to pubOk(s).
+function heatmap(rowKeyFn, rowLabel, rowVals, colVals, title){
+  const sl=DATA.filter(s=>s.phase===ST.phase && (ST.suite==="all"||s.suite===ST.suite) && pubOk(s));
+  // cell value = min dispatch p50 across series matching (rowVal) at colVal (tokens/rank)
+  const cell={};
+  sl.forEach(s=>{ const rk=rowKeyFn(s); if(rk==null) return;
+    s.rows.forEach(r=>{ const k=rk+'|'+r.t; const y=r.dispatch&&r.dispatch.p50; if(y>0) cell[k]=Math.min(cell[k]||1e9,y); }); });
+  const present=Object.keys(cell); if(!present.length) return '';
+  // keys are "row|col"; keep only the rows and columns that actually occur in some key
+  const cols=colVals.filter(c=>present.some(k=>k.endsWith('|'+c)));
+  const rows=rowVals.filter(rv=>present.some(k=>k.startsWith(rv+'|')));
+  if(!rows.length||!cols.length) return '';
+  // colour range spans every recorded cell value
+  const allv=Object.values(cell), lo=Math.min(...allv), hi=Math.max(...allv);
+  const cw=46,ch=26,L=120,T=30,W=L+cols.length*cw+16,H=T+rows.length*ch+24;
+  const col=v=>{ const t=(Math.log(v)-Math.log(lo))/((Math.log(hi)-Math.log(lo))||1); // green->red
+    const r=Math.round(40+t*200),g=Math.round(190-t*150); return 'rgb('+r+','+g+',70)'; };
+  let s='<svg viewBox="0 0 '+W+' '+H+'"><text x="4" y="16" class="ttl">'+title+'</text>';
+  cols.forEach((c,j)=>{ s+='<text class="tk" x="'+(L+j*cw+cw/2)+'" y="'+(T-4)+'" text-anchor="middle">'+c+'</text>'; });
+  rows.forEach((rv,i)=>{ s+='<text class="tk" x="'+(L-6)+'" y="'+(T+i*ch+ch/2+3)+'" text-anchor="end">'+rv+'</text>';
+    cols.forEach((c,j)=>{ const v=cell[rv+'|'+c]; const x=L+j*cw,y=T+i*ch;
+      // measured cell: coloured rect with tooltip and value label; otherwise a dark placeholder rect
+      if(v) s+='<rect x="'+x+'" y="'+y+'" width="'+(cw-2)+'" height="'+(ch-2)+'" fill="'+col(v)+'"><title>'+rowLabel+'='+rv+' T='+c+': '+fmt(v)+' µs</title></rect><text class="tk" x="'+(x+cw/2-1)+'" y="'+(y+ch/2+3)+'" text-anchor="middle" style="fill:#0b0d10;font-size:9px">'+fmt(v)+'</text>';
+      else s+='<rect x="'+x+'" y="'+y+'" width="'+(cw-2)+'" height="'+(ch-2)+'" fill="#1b1f27" stroke="#2a2f3a"/>'; }); });
+  s+='</svg>'; return s;
+}
+// Fill #heatmaps (no-op when the element is absent) with the EP, routing and resource grids, plus a
+// placement grid when more than one placement value exists. Grids that come back empty are dropped;
+// if none remain a "no cells" note is shown. The trailing explanatory note is always appended.
+function renderHeatmaps(){
+  const el=document.getElementById('heatmaps'); if(!el) return;
+  // column candidates: every distinct r.t seen in the current phase, ascending
+  const Ts=[...new Set(DATA.filter(s=>s.phase===ST.phase).flatMap(s=>s.rows.map(r=>r.t)))].sort((a,b)=>a-b);
+  // row candidates are collected over all of DATA; heatmap() keeps only the rows that have cells
+  const eps=[...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b);
+  const routs=[...new Set(DATA.map(s=>s.routing))].sort();
+  const ress=[...new Set(DATA.map(s=>s.resource))].sort();
+  const places=[...new Set(DATA.map(s=>s.placement||'packed'))].sort();
+  const grids=[
+    heatmap(s=>'EP'+s.ep, 'EP', eps.map(e=>'EP'+e), Ts, 'EP × tokens/rank — dispatch p50 (µs), '+ST.phase),
+    heatmap(s=>s.routing, 'routing', routs, Ts, 'Routing skew × token load — dispatch p50 (µs), '+ST.phase),
+    heatmap(s=>s.resource, 'resource', ress, Ts, 'Resource regime × token load — dispatch p50 (µs), '+ST.phase),
+  ];
+  if(places.length>1) grids.push(heatmap(s=>s.placement||'packed','placement',places,Ts,'Placement × token load — dispatch p50 (µs), '+ST.phase));
+  const shown=grids.filter(Boolean);   // heatmap() returns '' when it has nothing to draw
+  el.innerHTML=(shown.length? shown.map(g=>'<div class="card" style="margin-bottom:10px">'+g+'</div>').join('') : '<p class="note">No heatmap cells for this phase/suite.</p>')
+    +'<p class="note">Cell = min dispatch p50 (µs) over matching publishable series; green→red = fast→slow (log). Blank = no measured point. Placement×node and a populated routing×load grid fill in as multi-node / skew runs land.</p>';
+}
+// Coverage table (goal P2): publication status per measured config (validated=official,
+// experimental=comparable/legacy, failed=invalid/failed). Supported/unsupported come from
+// generate_matrix.py (capability), which records omissions with reasons.
+// Fill #coverage with one table row per series in DATA, grouped by SKU (SKUs sorted by name), then
+// a note explaining the columns. The status badge colour is looked up from s.pub (grey fallback);
+// "correct pts" counts the rows whose correct flag is truthy out of all rows of the series.
+function renderCoverage(){
+  const cls={official:'#2ca02c','comparable-experimental':'#d6a72b',legacy:'#7f7f7f',
+             diagnostic:'#9467bd',invalid:'#d62728',failed:'#a30000'};
+  const by={}; DATA.forEach(s=>{ (by[s.sku]=by[s.sku]||[]).push(s); });
+  let h='<table class="cov"><tr><th>SKU</th><th>model (h/topk/e)</th><th>EP</th><th>config</th><th>phase</th><th>routing</th><th>workload</th><th>status</th><th>correct pts</th></tr>';
+  Object.keys(by).sort().forEach(sku=>{
+    // sort by model then EP then label so the per-model coverage (which SKUs have which shape) groups.
+    by[sku].sort((a,b)=>(a.model||'').localeCompare(b.model||'')||(a.ep-b.ep)||a.label.localeCompare(b.label)).forEach(s=>{
+      const ok=s.rows.filter(r=>r.correct).length;
+      // dispatch dtype / mode / contract, + combine-quant + activation profile ONLY when non-default
+      // (so today's bf16/none/normal rows stay uncluttered; a PR311 quant-combine run shows /cq:…).
+      const cfg=(s.dtype||'?')+'/'+s.mode+'/'+(s.contract||'?').replace('-v1','')
+        +((s.cqm&&s.cqm!=='none')?'/cq:'+s.cqm:'')+((s.act&&s.act!=='normal')?'/'+s.act:'');
+      // workload identity column (goal P1): canonical wid, else flag wid=null as an official blocker.
+      // Only the first 10 characters of the wid are displayed.
+      const wcell = s.wid? ('<span title="canonical workload">'+s.wid.slice(0,10)+'</span>')
+                         : '<span style="color:#d6a72b" title="non-canonical (seeded-runtime) — cannot be official">wid=null ⚠</span>';
+      h+='<tr><td>'+sku+'</td><td>'+(s.model||'?')+' <span class="mono" style="font-size:10px">'+SHAPE_KEY(s.shape)+'</span></td>'
+        +'<td>'+s.ep+'</td><td>'+cfg+'</td><td>'+s.phase+'</td><td>'+s.routing+'</td>'
+        +'<td>'+wcell+'</td>'
+        +'<td><span class="badge" style="background:'+(cls[s.pub]||'#555')+'">'+s.pub+'</span></td>'
+        +'<td>'+ok+'/'+s.rows.length+'</td></tr>';
+    });
+  });
+  document.getElementById('coverage').innerHTML=h+'</table>'
+    +'<p class="note">model column = the MoE shape (hidden/topk/experts) named per the model registry; this is the per-model coverage (which SKUs ran which model shape). workload=wid is the canonical workload id; <b>wid=null</b> marks a seeded-runtime (non-canonical) line that is capped at comparable-experimental and is hidden from the Official view. Status is machine-derived from validity (goal P1).</p>';
+}
+// Failed / quarantined cases (goal immediate P2 "preserve failed cases in aggregation"): no-row
+// failed-case records (classified wedge/timeout/crash) + diagnostic/invalid/failed docs, surfaced
+// so a failure is never silently dropped. Diagnostic = quarantined (e.g. LL-FP8 roundtrip anomaly,
+// MoRI resource-nonconforming) — kept, labelled, excluded from official/comparable.
+function renderFailed(){
+  const el=document.getElementById('failed'); if(!el) return;
+  if(typeof FAILED==='undefined' || !FAILED.length){ el.innerHTML='<p class="note">No failed or quarantined cases — every run completed and is publishable.</p>'; return; }
+  const cls={failed:'#a30000',invalid:'#d62728',diagnostic:'#9467bd'};
+  let h='<table class="cov"><tr><th>SKU</th><th>backend</th><th>phase</th><th>config</th><th>status</th><th>reason / failure mode</th><th>rc</th></tr>';
+  FAILED.slice().sort((a,b)=>(a.sku||'').localeCompare(b.sku||'')).forEach(r=>{
+    h+='<tr><td>'+r.sku+'</td><td>'+(r.backend||'?')+'</td><td>'+(r.phase||'?')+'</td><td>'+r.cfg+'</td>'
+      +'<td><span class="badge" style="background:'+(cls[r.status]||'#555')+'">'+r.status+'</span></td>'
+      +'<td>'+(r.reason||'?')+'</td><td>'+(r.rc==null?'—':r.rc)+'</td></tr>';
+  });
+  el.innerHTML=h+'</table><p class="note">Preserved, not dropped: failed-case records (run_in_container emits a tests/failure_taxonomy classification on a wedge/timeout/crash) + quarantined diagnostic/invalid docs (e.g. an LL-FP8 roundtrip anomaly, or a resource-nonconforming MoRI run). These are excluded from the official/comparable views above.</p>';
+}
+// Distribution-sensitivity summary (review: don't add a 7th chart dimension — collapse it to one
+// ratio per sku/backend/phase). p99(worst stressor distribution) / p99(uniform) at matched
+// tokens/rank, computed by tests/sensitivity.py and injected as SENS.
+function renderSensitivity(){
+  const el=document.getElementById('sensitivity'); if(!el) return;
+  if(typeof SENS==='undefined' || !SENS.length){ el.innerHTML='<p class="note">No multi-distribution groups in this view (need uniform + a stressor at matched tokens/rank).</p>'; return; }
+  let h='<table class="cov"><tr><th>SKU</th><th>backend</th><th>phase</th><th>config</th><th>headline p99 µs</th><th>worst dist @T</th><th>sensitivity</th><th>EPLB zipf→+eplb</th></tr>';
+  SENS.slice().sort((a,b)=>(a.sku.localeCompare(b.sku))||a.backend.localeCompare(b.backend)||a.phase.localeCompare(b.phase)).forEach(r=>{
+    const cfg=r.dispatch_dtype+'·'+r.mode+'·'+(r.contract||'').replace('-v1','');
+    const rng=r.headline_p99_range_us, sr=r.distribution_sensitivity_ratio;
+    const sc = sr>=1.5?'#d62728':(sr>=1.2?'#d6a72b':'#2ca02c');
+    const ev=r.eplb_recovery? (r.eplb_recovery.zipf.toFixed(2)+'→'+r.eplb_recovery['zipf+eplb'].toFixed(2)+'×') : '—';
+    h+='<tr><td>'+r.sku+'</td><td>'+r.backend+'</td><td>'+r.phase+'</td><td>'+cfg+'</td>'
+      +'<td>'+rng[0]+'–'+rng[1]+'</td><td>'+r.worst_distribution+' @'+r.worst_at_T+'</td>'
+      +'<td><span class="badge" style="background:'+sc+'">'+sr.toFixed(2)+'×</span></td><td>'+ev+'</td></tr>';
+  });
+  el.innerHTML=h+'</table>'
+    +'<p class="note">distribution_sensitivity_ratio = p99(worst stressor distribution) ÷ p99(uniform) at matched tokens/rank — how much routing skew/spread degrades this backend (>1 = fragile, ~1 = robust). Stressors exclude the min-comm best case + EPLB-remedied runs. A single number, NOT a chart dimension (tests/sensitivity.py).</p>';
+}
+// Industry summary cards (goal P3-F): CARDS is precomputed in Python (main()) from the loaded
+// series so the numbers match the analysis modules exactly. Rendered as a responsive grid.
+function renderCards(){
+  const el=document.getElementById('cards'); if(!el) return;
+  // bare reference (NOT window.CARDS): top-level const in a classic <script> binds lexically, it is
+  // NOT a property of window — so guard on the binding the same way the chart guards on DATA.
+  if(typeof CARDS==='undefined' || !CARDS.length){ el.innerHTML=''; return; }
+  el.innerHTML=CARDS.map(c=>{
+    const v = c.href? '<a href="'+c.href+'">'+c.value+'</a>' : c.value;
+    return '<div class="kcard'+(c.warn?' warn':'')+'"><div class="kt">'+c.title+'</div>'
+         + '<div class="kv">'+v+'</div>'+(c.sub?'<div class="ks">'+c.sub+'</div>':'')+'</div>';
+  }).join('');
+}
+// Construct a GitHub Actions run URL from the per-series git_run (goal P3-E "raw-artifact links").
+// Falls back to a relative href to the run_id (no repo) — callers handle a fully missing run_id.
+function runUrl(s){
+  if(s.repo && s.run_id) return 'https://github.com/'+s.repo+'/actions/runs'+'/'+s.run_id;
+  if(s.run_id) return '#run-'+s.run_id;          // no repo in data — link to the id anchor
+  return null;
+}
+// DECISION views (goal P3-D): all computed in Python (analyze_ep + the budget metric) and injected
+// as DECISION, so each table renders from the ACTUAL results via the same matching logic the CLI uses.
+function _tbl(headers, rows){
+  if(!rows.length) return '<p class="note">No matching cells in the current result set.</p>';
+  return '<table class="dec"><tr>'+headers.map(h=>'<th'+(h.num?' class="num"':'')+'>'+h.t+'</th>').join('')+'</tr>'
+    + rows.map(r=>'<tr>'+r.map(c=>(typeof c==='object')?'<td class="num">'+c.v+'</td>':'<td>'+c+'</td>').join('')+'</tr>').join('')
+    + '</table>';
+}
+function renderDecision(){
+  const el=document.getElementById('decision'); if(!el) return;
+  const D=(typeof DECISION!=='undefined')?DECISION:{};   // bare const, not a window property
+  let h='';
+  // 1. Recommendations — lowest-p99-dispatch config at the headline token count, per (sku,phase).
+  h+='<h2>Recommended config — lowest dispatch p99 at the headline token count</h2>';
+  h+=_tbl([{t:'SKU'},{t:'phase'},{t:'@T',num:1},{t:'best dispatch p99 (µs)',num:1},{t:'EP',num:1},{t:'config'}],
+    (D.recommendations||[]).map(r=>[r.sku.toUpperCase(),r.phase,{v:r.at_T},{v:'<span class="win">'+r.lowest_p99_dispatch_us+'</span>'},{v:r.ep},r.config]));
+  // 2. Max tokens/rank under a p99 round-trip budget (the previously-missing metric).
+  const bs=(D.budgets||[]);
+  h+='<h2>Max tokens / rank under a p99 round-trip budget <span style="font-weight:400;color:var(--mut)">— official headline (DeepSeek-V3, uniform)</span></h2>';
+  h+=_tbl([{t:'SKU'},{t:'backend'},{t:'phase'},{t:'dtype'},{t:'EP',num:1},{t:'mode'}].concat(bs.map(b=>({t:'≤'+b+'µs',num:1}))),
+    (D.max_tokens_under_budget||[]).map(r=>[r.sku.toUpperCase(),r.backend,r.phase,r.dtype,{v:r.ep},r.mode]
+      .concat(bs.map(b=>({v:(r['b'+b]==null?'—':r['b'+b])})))));
+  // 3. LL vs normal crossover (measured-roundtrip + isolated-kernel bases).
+  h+='<h2>LL → normal crossover <span style="font-weight:400;color:var(--mut)">— token count where normal overtakes low-latency</span></h2>';
+  h+=_tbl([{t:'SKU'},{t:'EP',num:1},{t:'dtype'},{t:'stat'},{t:'basis'},{t:'normal faster at T'}],
+    (D.ll_crossover||[]).map(r=>[r.sku.toUpperCase(),{v:r.ep},r.dtype,r.stat,r.basis,String(r.normal_faster_at_T)]));
+  // 4. Resource Pareto — latency vs achieved comm-resource fraction (curve summarized to endpoints).
+  h+='<h2>Resource ↔ latency Pareto <span style="font-weight:400;color:var(--mut)">— dispatch p50 across the comm-fraction ladder (fixed-kernel excluded)</span></h2>';
+  h+=_tbl([{t:'SKU'},{t:'phase'},{t:'dtype'},{t:'@T',num:1},{t:'pts',num:1},{t:'min frac',num:1},{t:'p50 @min',num:1},{t:'max frac',num:1},{t:'p50 @max',num:1}],
+    (D.resource_pareto||[]).map(r=>{const c=r.curve; const a=c[0],z=c[c.length-1];
+      return [r.sku.toUpperCase(),r.phase,r.dtype,{v:r.T},{v:r.n_points},{v:a.achieved_fraction},{v:a.dispatch_p50},{v:z.achieved_fraction},{v:z.dispatch_p50}];}));
+  // 5. Topology penalty — EP4 vs EP8 dispatch p50.
+  h+='<h2>Topology penalty <span style="font-weight:400;color:var(--mut)">— lower-EP vs higher-EP dispatch p50 at matched tokens/rank</span></h2>';
+  h+=_tbl([{t:'SKU'},{t:'phase'},{t:'dtype'},{t:'@T',num:1},{t:'low-EP p50',num:1},{t:'high-EP p50',num:1},{t:'penalty %',num:1}],
+    (D.topology_penalty||[]).map(r=>{const ks=Object.keys(r).filter(k=>/^ep\d+_p50$/.test(k)).sort();
+      return [r.sku.toUpperCase(),r.phase,r.dtype,{v:r.T},{v:r[ks[0]]},{v:r[ks[1]]},{v:(r.penalty_pct>0?'+':'')+r.penalty_pct}];}));
+  // 6. Routing-skew penalty — zipf* vs matched uniform dispatch amplification.
+  h+='<h2>Routing-skew penalty <span style="font-weight:400;color:var(--mut)">— zipf* dispatch p50/p99 amplification vs matched uniform</span></h2>';
+  const sk=(D.skew_penalty||[]).slice().sort((a,b)=>b.p99_amplification-a.p99_amplification).slice(0,40);
+  h+=_tbl([{t:'SKU'},{t:'EP',num:1},{t:'phase'},{t:'routing'},{t:'@T',num:1},{t:'p50 ×',num:1},{t:'p99 ×',num:1}],
+    sk.map(r=>[r.sku.toUpperCase(),{v:r.ep},r.phase,r.routing,{v:r.T},{v:r.p50_amplification},{v:'<span class="'+(r.p99_amplification>=1.5?'win':'')+'" style="'+(r.p99_amplification>=1.5?'color:#d62728':'')+'">'+r.p99_amplification+'</span>'}]));
+  h+='<p class="note">All decision tables are computed by analyze_ep.py (same matching logic as the CLI) over the loaded results; the budget table adds the "max tokens under a p99 round-trip SLO" metric. Only matching (workload, topology, contract, backend, resource) cells are compared. Skew table truncated to the 40 worst p99 amplifications.</p>';
+  el.innerHTML=h;
+}
+// PROVENANCE drawer (goal P3-E): collapsible per-series git_run / source_sha / run_id /
+// image_digest / backend_provenance + a raw-artifact link to the GitHub Actions run (or a relative
+// href when the repo is absent). One row per series; opens collapsed so it never crowds the chart.
+function renderProvenance(){
+  const el=document.getElementById('provdrawer'); if(!el) return;
+  const rows=DATA.slice().sort((a,b)=>(a.sku.localeCompare(b.sku))||(a.ep-b.ep)||a.label.localeCompare(b.label));
+  let h='<table class="prov"><tr><th>series</th><th>pub</th><th>workload_id</th><th>source SHA</th>'
+       +'<th>image digest</th><th>backend provenance</th><th>artifact / run</th></tr>';
+  rows.forEach(s=>{
+    const url=runUrl(s);
+    const link = url? '<a href="'+url+'" target="_blank" rel="noopener">'+(s.run_id||'run')+'</a>'
+                    : (s.run_id? s.run_id : '<a href="'+'#'+'" title="no run id">'+'—'+'</a>');
+    const prov=s.prov||{};
+    const pv=(prov.deepep_version?('deepep '+prov.deepep_version):'')
+            +(prov.mori_commit?(' mori '+prov.mori_commit):'')
+            +(prov.num_sms!=null?(' · '+prov.num_sms+'/'+(prov.device_sms||'?')+' SM'):'');
+    h+='<tr><td>'+s.label+'</td><td>'+s.pub+'</td>'
+      +'<td class="mono">'+(s.wid?s.wid.slice(0,12):'<span style="color:#d6a72b">null</span>')+'</td>'
+      +'<td class="mono">'+(s.source_sha||'?')+'</td>'
+      +'<td class="mono">'+(s.image_digest||'?')+'</td>'
+      +'<td class="mono">'+(pv||'?')+'</td>'
+      +'<td>'+link+'</td></tr>';
+  });
+  el.innerHTML=h+'</table>';
+}
+// ===== NCCL collective primitives (family=nccl) — All-reduce / All-gather tabs (goal P2/P3) =====
+// NCCL is a separate dataset from the EP series (DATA); these helpers read NCCL (injected below) and
+// never touch DATA/ST, so the EP tabs are completely unaffected. Each line is one (runner, topology,
+// transport) sweep over message size. Two evidence views per op (NST.metric, per tab):
+//   busbw   — bus bandwidth (GB/s) vs message size, log-log (the "bandwidth vs payload" view);
+//   latency — op time (µs) vs message size (the "latency vs size" view; log-x, LINEAR-y by default
+//             so the flat small-message latency floor is read directly — goal "latency-focused
+//             small tensor shapes"). Toggle the y-scale for the large-message ramp.
+const NSTATE = {};   // per-op view state, seeded lazily so all-reduce + all-gather toggle independently
+function nstate(op){ return NSTATE[op] || (NSTATE[op] = {metric:"busbw", xlog:true, ylog:false}); }
+const NMETRIC = {busbw:"Bus bandwidth (GB/s)", latency:"Op time (µs)"};
+function ncclSeries(op){ return (typeof NCCL!=="undefined"? NCCL : []).filter(s=>s.op===op); }
+// y value for a row under the active metric. busbw is 0 for latency-bound small messages — those
+// points are dropped from the (log) bandwidth view (yv>0 filter) but ALL sizes show in latency.
+function ncclY(r, metric){ return metric==="busbw" ? (r.busbw||0) : r.t_us; }
+function ncclChart(op){
+  const st=nstate(op), metric=st.metric;
+  const W=900, H=460, m={l:66,r:16,t:34,b:46};
+  const X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t;
+  const sl=ncclSeries(op);
+  // build per-series point lists; for busbw (log y) keep y>0 only, for latency keep all.
+  const pts=sl.map(s=>({s, P:s.rows.map(r=>({x:r.size, y:ncclY(r,metric), r}))
+                                    .filter(p=>p.x>0 && (metric==="busbw" ? p.y>0 : p.y>=0))}));
+  let xs=[], ys=[]; pts.forEach(g=>g.P.forEach(p=>{xs.push(p.x);ys.push(p.y);}));
+  if(!xs.length) return '<svg viewBox="0 0 '+W+' '+H+'"><text x="'+(W/2)+'" y="'+(H/2)+'" class="axl" text-anchor="middle">no data</text></svg>';
+  const xmn=Math.min(...xs), xmx=Math.max(...xs);
+  const ylog = st.ylog;   // both metrics honor the Y-scale toggle (busbw defaults log via the toggle)
+  let ymn=Math.min(...ys), ymx=Math.max(...ys);
+  if(ylog){ const pos=ys.filter(v=>v>0); ymn=pos.length?Math.min(...pos):1; } else { ymn=Math.min(0,ymn); }
+  if(ymx===ymn) ymx=ymn+1;
+  const xlog=st.xlog;
+  const xv=v=>xlog?mapLog(v,xmn,xmx,X0,X1):mapLin(v,xmn,xmx,X0,X1);
+  const yv=v=>ylog?mapLog(Math.max(v,ymn),ymn,ymx,Y0,Y1):mapLin(v,ymn,ymx,Y0,Y1);
+  let s='<svg viewBox="0 0 '+W+' '+H+'" role="img">';
+  s+='<text x="'+X0+'" y="20" class="ttl">'+NMETRIC[metric]+' vs message size — '+(op==="all_reduce"?"all-reduce":op==="all_gather"?"all-gather":op)+'</text>';
+  // y grid + ticks
+  (ylog?logTicks(ymn,ymx):linTicks(ymn,ymx)).forEach(v=>{const y=yv(v); s+='<line class="gl" x1="'+X0+'" y1="'+y+'" x2="'+X1+'" y2="'+y+'"/>'+
+    '<text class="tk" x="'+(X0-7)+'" y="'+(y+3.5)+'" text-anchor="end">'+fmt(v)+'</text>';});
+  // x grid + ticks (message size, log decades; label the actual sweep points sparsely via logTicks)
+  (xlog?logTicks(xmn,xmx):linTicks(xmn,xmx)).forEach(v=>{const x=xv(v); s+='<line class="gl" x1="'+x+'" y1="'+Y0+'" x2="'+x+'" y2="'+Y1+'"/>'+
+    '<text class="tk" x="'+x+'" y="'+(Y0+16)+'" text-anchor="middle">'+fmt(v)+'B</text>';});
+  s+='<line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X1+'" y2="'+Y0+'"/><line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X0+'" y2="'+Y1+'"/>';
+  s+='<text class="axl" x="'+((X0+X1)/2)+'" y="'+(H-6)+'" text-anchor="middle">Message size (bytes)'+(xlog?'  (log)':'')+'</text>';
+  s+='<text class="axl" transform="translate(15,'+((Y0+Y1)/2)+') rotate(-90)" text-anchor="middle">'+NMETRIC[metric]+(ylog?'  (log)':'')+'</text>';
+  pts.forEach(g=>{ if(!g.P.length) return;
+    const d=g.P.map((p,i)=>(i?'L':'M')+xv(p.x).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' ');
+    // invalid runs (zero-busbw / failed check) are greyed + dashed + dimmed so they read as EXCLUDED,
+    // not as a peer measurement (goal P1: a failed run is preserved-but-flagged, never silently shown).
+    const col=g.s.valid? g.s.color : '#666';
+    const dash=g.s.valid? '' : ' stroke-dasharray="3 4"';
+    const op_attr=g.s.valid? '' : ' opacity="0.5"';
+    s+='<path d="'+d+'" fill="none" stroke="'+col+'" stroke-width="2"'+dash+op_attr+'/>';
+    g.P.forEach(p=>{ const r=p.r;
+      s+='<circle class="pt" cx="'+xv(p.x).toFixed(1)+'" cy="'+yv(p.y).toFixed(1)+'" r="3.2" fill="'+col+'"'+op_attr+'>'+
+      '<title>'+g.s.label+(g.s.valid?'':'  [INVALID — excluded]')+
+      '\nsize='+fmt(r.size)+'B'+(r.dtype?'  ·  '+r.dtype:'')+
+      '\nbusbw = '+(r.busbw!=null?fmt(r.busbw)+' GB/s':'n/a')+(r.algbw!=null?'  ·  algbw '+fmt(r.algbw)+' GB/s':'')+
+      '\ntime  = '+(r.t_us!=null?r.t_us.toFixed(2)+' µs':'n/a')+
+      (r.oop_us!=null||r.ip_us!=null?'\nout-of-place '+(r.oop_us!=null?r.oop_us.toFixed(2):'?')+' µs  ·  in-place '+(r.ip_us!=null?r.ip_us.toFixed(2):'?')+' µs':'')+
+      '\ntopology='+g.s.topo+(g.s.transport?'  ·  transport='+g.s.transport:'')+'  ·  world='+g.s.world_size+
+      '\ncontract='+(g.s.contract||'?')+'  ·  class='+(g.s.comparison_class||'?')+'  ·  status='+g.s.status+
+      (r.correct===false?'  ✗ check failed':'')+
+      '</title></circle>'; });
+  });
+  s+='</svg>'; return s;
+}
+function ncclLegend(op){
+  const sl=ncclSeries(op);
+  if(!sl.length) return '';
+  return '<div class="legend">'+sl.map(s=>{
+    const col=s.valid? s.color : '#666';
+    const sw = s.valid? 'background:'+col
+                      : 'background:repeating-linear-gradient(90deg,'+col+' 0 4px,transparent 4px 8px)';
+    return '<span class="it"><span class="sw" style="'+sw+'"></span>'+s.label+(s.valid?'':' (invalid — excluded)')+'</span>';
+  }).join('')+'</div>';
+}
+// not-a-direct-comparison guard for NCCL: mixed topology/transport/dtype/contract overlaid in one op.
+function ncclGuard(op){
+  const sl=ncclSeries(op).filter(s=>s.valid); if(sl.length<2) return '';
+  const w=[];
+  const tp=[...new Set(sl.map(s=>s.topo))]; if(tp.length>1) w.push('mixed topology ('+tp.join(', ')+')');
+  const tr=[...new Set(sl.map(s=>s.transport).filter(Boolean))]; if(tr.length>1) w.push('mixed transport ('+tr.join(', ')+')');
+  const dt=[...new Set(sl.map(s=>s.dtype).filter(Boolean))]; if(dt.length>1) w.push('mixed dtype ('+dt.join(', ')+')');
+  const ck=[...new Set(sl.map(s=>s.contract).filter(Boolean))]; if(ck.length>1) w.push('mixed contract ('+ck.join(', ')+')');
+  return w.length? '<div class="guard">⚠ not a direct comparison: '+w.join('; ')+' — topology-class is part of the comparison key (B200·IB vs GB200·MNNVL are distinct fabrics).</div>' : '';
+}
+function ncclSeg(op,grp,opts,cur){
+  return '<div class="seg">'+Object.entries(opts).map(([k,v])=>
+    '<button data-nop="'+op+'" data-ngrp="'+grp+'" data-val="'+k+'" class="'+(k===cur?'on':'')+'">'+v+'</button>').join('')+'</div>';
+}
+// Render one NCCL op tab (panelId holds .ncc-ctl/.ncc-chart/.ncc-leg children). Robust to zero data:
+// the whole panel collapses to a "no data yet" note (never a crashing/empty chart).
+function renderNccl(op, panelId){
+  const el=document.getElementById(panelId); if(!el) return;
+  const sl=ncclSeries(op);
+  if(!sl.length){
+    el.innerHTML='<div class="soon">No <b>'+(op==="all_reduce"?"all-reduce":op==="all_gather"?"all-gather":op)+
+      '</b> results yet. This tab populates automatically once a family=nccl '+op+
+      ' sweep lands in the results directory (nccl-tests via run_nccl.py).</div>';
+    return;
+  }
+  const st=nstate(op);
+  const ctl='<div class="controls">'+
+    '<div class="grp"><span class="lab">Metric</span>'+ncclSeg(op,'metric',NMETRIC,st.metric)+'</div>'+
+    '<div class="grp"><span class="lab">X scale</span>'+ncclSeg(op,'xlog',{true:"Log",false:"Linear"},String(st.xlog))+'</div>'+
+    '<div class="grp"><span class="lab">Y scale</span>'+ncclSeg(op,'ylog',{true:"Log",false:"Linear"},String(st.ylog))+'</div>'+
+    '</div>';
+  el.innerHTML=ctl+'<div class="card"><div class="ncc-chart">'+ncclChart(op)+'</div></div>'+
+    '<div>'+ncclGuard(op)+ncclLegend(op)+'</div>'+
+    '<p class="note">One line per (SKU, topology-class, transport) sweep. '+
+    'busbw view drops latency-bound small messages that report 0 GB/s; the latency view (log-x, '+
+    'linear-y default) shows the flat small-message floor directly. Invalid runs (zero-busbw / failed '+
+    'correctness check) are greyed + dashed and excluded from comparison. Measured by nccl-tests '+
+    '(out-of-place + in-place; busbw = best placement); standardized contract — these are stock-NCCL '+
+    'fabric numbers, not framework-integrated EP times. Hover a point for algbw / placements / provenance.</p>';
+  // wire toggles (scoped to this panel via data-nop) -> mutate this op's state + re-render it.
+  el.querySelectorAll('.controls button[data-nop]').forEach(b=>b.onclick=()=>{
+    const g=b.dataset.ngrp, v=b.dataset.val;
+    st[g]= (g==='xlog'||g==='ylog')? v==='true' : v;
+    renderNccl(op, panelId);
+  });
+}
+// ===== Data-movement collective families (offload / copy-engine / kv-cache / rl-mesh) — generic tab =====
+// These families share ONE shape: each series is one config line {label,color,valid,note,op,sub,
+// rows:[{size,bw,lat}]} and the view is "bandwidth vs size + latency vs size, log-log". A single
+// generic renderer (renderColl) drives all of them from their injected global array (OFFLOAD /
+// COPYENGINE / KVCACHE / RLMESH), exactly like renderNccl drives the NCCL tabs. ADDITIVE: reads
+// only its own array + its own per-panel state; never touches DATA/ST/NCCL, so EP + NCCL tabs are unaffected.
+const CSTATE = {};   // per-panel view state, seeded lazily so each collective tab toggles independently
+function cstate(id, latUnit){ return CSTATE[id] || (CSTATE[id] = {metric:"bw", xlog:true, ylog:true, latUnit}); }
+// resolve the injected family array by name. MUST use the bare const (a top-level const in a classic
+// <script> binds lexically — it is NOT a property of window/globalThis), the same way ncclSeries
+// references the bare NCCL. typeof-guarded so a missing array is an empty list, never a crash.
+function collArr(name){
+  if(name==="OFFLOAD")    return (typeof OFFLOAD!=="undefined"&&OFFLOAD)?OFFLOAD:[];
+  if(name==="COPYENGINE") return (typeof COPYENGINE!=="undefined"&&COPYENGINE)?COPYENGINE:[];
+  if(name==="KVCACHE")    return (typeof KVCACHE!=="undefined"&&KVCACHE)?KVCACHE:[];
+  if(name==="RLMESH")     return (typeof RLMESH!=="undefined"&&RLMESH)?RLMESH:[];
+  return [];
+}
+// Build the SVG markup (a string) for one data-movement tab.
+//   arr   — the family's series list; each series supplies rows with size plus bw / lat values
+//   st    — the panel's view state (metric "bw" or other => latency, xlog, ylog, latUnit)
+//   title — chart title text
+// Returns a "no data" SVG when no plottable point remains. Scale/tick/format helpers (mapLog,
+// mapLin, logTicks, linTicks, fmt) are defined elsewhere in this script.
+function collChart(arr, st, title){
+  const W=900, H=460, m={l:70,r:16,t:34,b:46};
+  const X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t;
+  const metric=st.metric, useBw=metric==="bw";
+  const ylabel = useBw ? "Bandwidth (GB/s)" : ("Latency ("+(st.latUnit||"µs")+")");
+  // keep points with a positive size and a non-null y; a log y axis additionally requires y>0
+  const pts=arr.map(s=>({s, P:s.rows.map(r=>({x:r.size, y:(useBw?r.bw:r.lat), r}))
+                                    .filter(p=>p.x>0 && p.y!=null && (st.ylog? p.y>0 : p.y>=0))}));
+  let xs=[], ys=[]; pts.forEach(g=>g.P.forEach(p=>{xs.push(p.x);ys.push(p.y);}));
+  if(!xs.length) return '<svg viewBox="0 0 '+W+' '+H+'"><text x="'+(W/2)+'" y="'+(H/2)+'" class="axl" text-anchor="middle">no data</text></svg>';
+  const xmn=Math.min(...xs), xmx=Math.max(...xs);
+  const ylog=st.ylog; let ymn=Math.min(...ys), ymx=Math.max(...ys);
+  // log y: floor at the smallest positive value; linear y: always include 0 in the range
+  if(ylog){ const pos=ys.filter(v=>v>0); ymn=pos.length?Math.min(...pos):1; } else { ymn=Math.min(0,ymn); }
+  if(ymx===ymn) ymx=ymn+1;   // avoid a zero-height range
+  const xlog=st.xlog;
+  const xv=v=>xlog?mapLog(v,xmn,xmx,X0,X1):mapLin(v,xmn,xmx,X0,X1);
+  const yv=v=>ylog?mapLog(Math.max(v,ymn),ymn,ymx,Y0,Y1):mapLin(v,ymn,ymx,Y0,Y1);
+  let s='<svg viewBox="0 0 '+W+' '+H+'" role="img">';
+  s+='<text x="'+X0+'" y="20" class="ttl">'+title+'</text>';
+  // y grid + ticks
+  (ylog?logTicks(ymn,ymx):linTicks(ymn,ymx)).forEach(v=>{const y=yv(v); s+='<line class="gl" x1="'+X0+'" y1="'+y+'" x2="'+X1+'" y2="'+y+'"/>'+
+    '<text class="tk" x="'+(X0-7)+'" y="'+(y+3.5)+'" text-anchor="end">'+fmt(v)+'</text>';});
+  // x grid + ticks (transfer size)
+  (xlog?logTicks(xmn,xmx):linTicks(xmn,xmx)).forEach(v=>{const x=xv(v); s+='<line class="gl" x1="'+x+'" y1="'+Y0+'" x2="'+x+'" y2="'+Y1+'"/>'+
+    '<text class="tk" x="'+x+'" y="'+(Y0+16)+'" text-anchor="middle">'+fmt(v)+'B</text>';});
+  // axes + axis labels
+  s+='<line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X1+'" y2="'+Y0+'"/><line class="ax" x1="'+X0+'" y1="'+Y0+'" x2="'+X0+'" y2="'+Y1+'"/>';
+  s+='<text class="axl" x="'+((X0+X1)/2)+'" y="'+(H-6)+'" text-anchor="middle">Transfer size (bytes)'+(xlog?'  (log)':'')+'</text>';
+  s+='<text class="axl" transform="translate(15,'+((Y0+Y1)/2)+') rotate(-90)" text-anchor="middle">'+ylabel+(ylog?'  (log)':'')+'</text>';
+  // one polyline + hover points per series; invalid series are drawn grey, dashed and dimmed
+  pts.forEach(g=>{ if(!g.P.length) return;
+    const col=g.s.valid? g.s.color : '#666';
+    const dash=g.s.valid? '' : ' stroke-dasharray="3 4"';
+    const op_attr=g.s.valid? '' : ' opacity="0.5"';
+    const d=g.P.map((p,i)=>(i?'L':'M')+xv(p.x).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' ');
+    s+='<path d="'+d+'" fill="none" stroke="'+col+'" stroke-width="2"'+dash+op_attr+'/>';
+    g.P.forEach(p=>{ const r=p.r;
+      // the <title> child carries the hover text: size, bandwidth, latency, check result, series note
+      s+='<circle class="pt" cx="'+xv(p.x).toFixed(1)+'" cy="'+yv(p.y).toFixed(1)+'" r="3.2" fill="'+col+'"'+op_attr+'>'+
+      '<title>'+g.s.label+(g.s.valid?'':'  [INVALID — excluded]')+
+      '\nsize='+fmt(r.size)+'B'+(r.size_class?'  ·  '+r.size_class:'')+
+      '\nbandwidth = '+(r.bw!=null?fmt(r.bw)+' GB/s':'n/a')+
+      '\nlatency = '+(r.lat!=null?r.lat.toFixed(3)+' '+(st.latUnit||'µs'):'n/a')+
+      (r.correct===false?'\n✗ correctness check failed':'')+
+      (g.s.note?'\n'+g.s.note:'')+
+      '</title></circle>'; });
+  });
+  s+='</svg>'; return s;
+}
+function collLegend(arr){
+  if(!arr.length) return '';
+  return '<div class="legend">'+arr.map(s=>{
+    const col=s.valid? s.color : '#666';
+    const sw = s.valid? 'background:'+col : 'background:repeating-linear-gradient(90deg,'+col+' 0 4px,transparent 4px 8px)';
+    return '<span class="it"><span class="sw" style="'+sw+'"></span>'+s.label+(s.valid?'':' (invalid — excluded)')+'</span>';
+  }).join('')+'</div>';
+}
+function collSeg(panelId,grp,opts,cur){
+  return '<div class="seg">'+Object.entries(opts).map(([k,v])=>
+    '<button data-cid="'+panelId+'" data-cgrp="'+grp+'" data-val="'+k+'" class="'+(k===cur?'on':'')+'">'+v+'</button>').join('')+'</div>';
+}
+// Render one data-movement collective tab. arrName = injected global ("OFFLOAD"|"COPYENGINE"|
+// "KVCACHE"); latUnit = the latency unit for that family ("µs" or "ms"). Robust to zero data.
+function renderColl(arrName, panelId, emptyLabel, latUnit, footNote){
+  const el=document.getElementById(panelId); if(!el) return;
+  const arr=collArr(arrName);
+  if(!arr.length){
+    el.innerHTML='<div class="soon">No <b>'+emptyLabel+'</b> results yet. This tab populates '+
+      'automatically once a family result for it lands in the results directory.</div>';
+    return;
+  }
+  const st=cstate(panelId, latUnit);
+  const CMETRIC={bw:"Bandwidth (GB/s)", lat:"Latency ("+(latUnit||"µs")+")"};
+  const ctl='<div class="controls">'+
+    '<div class="grp"><span class="lab">Metric</span>'+collSeg(panelId,'metric',CMETRIC,st.metric)+'</div>'+
+    '<div class="grp"><span class="lab">X scale</span>'+collSeg(panelId,'xlog',{true:"Log",false:"Linear"},String(st.xlog))+'</div>'+
+    '<div class="grp"><span class="lab">Y scale</span>'+collSeg(panelId,'ylog',{true:"Log",false:"Linear"},String(st.ylog))+'</div>'+
+    '</div>';
+  const title=CMETRIC[st.metric]+' vs transfer size';
+  // a per-sku notes line (peak / overlap / near-zero-sms / unwired) — one note per distinct (sku,note).
+  const seen={}; const notes=[];
+  arr.forEach(s=>{ const k=s.sku+'|'+(s.note||''); if(s.note && !seen[k]){ seen[k]=1; notes.push(s.sku.toUpperCase()+': '+s.note); } });
+  el.innerHTML=ctl+'<div class="card"><div class="coll-chart">'+collChart(arr,st,title)+'</div></div>'+
+    '<div>'+collLegend(arr)+'</div>'+
+    (notes.length? '<p class="note">'+notes.join(' &nbsp;·&nbsp; ')+'</p>' : '')+
+    '<p class="note">'+footNote+' Single-process micro-benchmark; one line per config. Invalid runs are greyed + dashed and excluded from comparison. Hover a point for size / bandwidth / latency. Decode+prefill jobs are deduped to the newest run per (SKU, config) so lines are not doubled.</p>';
+  el.querySelectorAll('.controls button[data-cid]').forEach(b=>b.onclick=()=>{
+    const g=b.dataset.cgrp, v=b.dataset.val;
+    st[g]= (g==='xlog'||g==='ylog')? v==='true' : v;
+    renderColl(arrName, panelId, emptyLabel, latUnit, footNote);
+  });
+}
+function renderOffload(){ renderColl('OFFLOAD','offload','CPU↔GPU offload','µs',
+  'CPU↔GPU offload: host-to-device + device-to-host copy bandwidth/latency, pinned vs pageable host memory (goal P2). Pinned host memory should sustain markedly higher bandwidth than pageable.'); }
+function renderCopyEngine(){ renderColl('COPYENGINE','copyengine','copy-engine / SDMA','µs',
+  'Copy-engine (SDMA) vs SM-driven copy at matched op/size — the copy-engine should reach near-peak bandwidth while using almost no SMs, leaving compute free.'); }
+function renderKvCache(){ renderColl('KVCACHE','kvcache','KV-cache transfer','ms',
+  'KV-cache block transfer: paged vs contiguous layout across directions (D→H / H→D / device-local / device-remote). Contiguous layout transfers far faster than paged (scatter/gather overhead).'); }
+function renderRlMesh(){ renderColl('RLMESH','rlmesh','RL mesh','ms',
+  'RL trainer↔generator weight-transfer mesh: trainer→gen vs gen→trainer, paired (1:1 send/recv) vs redistribute (disjoint all-to-all reshard). The redistribute pattern stresses the fabric harder than paired.'); }
+// TABS (goal P3-C): pure JS/CSS. Toggle .on on a nav button + its matching .tab panel. Disabled
+// buttons (suites not built yet) are inert. Re-renders the active tab's charts so SVGs that need a
+// real layout (the main chart) paint correctly when first shown.
+function showTab(id){
+  document.querySelectorAll('.tab').forEach(t=>t.classList.toggle('on', t.id===id));
+  document.querySelectorAll('.tabs button[data-tab]').forEach(b=>b.classList.toggle('on', b.dataset.tab===id));
+  if(id==='tab-ep'){ renderMain(); renderGrid(); renderScaling(); renderHeatmaps(); }
+  if(id==='tab-allreduce'){ renderNccl('all_reduce','allreduce'); }
+  if(id==='tab-allgather'){ renderNccl('all_gather','allgather'); }
+  if(id==='tab-offload'){ renderOffload(); }
+  if(id==='tab-copyengine'){ renderCopyEngine(); }
+  if(id==='tab-kvcache'){ renderKvCache(); }
+  if(id==='tab-rlmesh'){ renderRlMesh(); }
+}
+function setupTabs(){
+  document.querySelectorAll('.tabs button[data-tab]').forEach(b=>{ if(!b.disabled) b.onclick=()=>showTab(b.dataset.tab); });
+  // honor a #tab-evidence style hash (e.g. the diagnostic-cases card link) on load.
+  const hash=(location.hash||'').replace('#','');
+  showTab(document.getElementById(hash)? hash : 'tab-ep');
+}
+(function(){
+  const sh=(DATA[0]||{shape:{}}).shape||{};
+  const provs=[...new Set(DATA.map(s=>s.backend+' '+(s.prov.deepep_version||s.prov.mori_commit||'?')))];
+  const fo=[...new Set(DATA.map(s=>(s.rows[0]&&s.rows[0].fanout!=null)?s.rows[0].fanout.toFixed(1):'?'))].join('/');
+  const contracts=[...new Set(DATA.map(s=>s.contract))].join(' / ');
+  const dtypes=[...new Set(DATA.map(s=>s.dtype))].join('+');
+  const suites=[...new Set(DATA.map(s=>s.suite))].join(' + ');
+  const samp=[...new Set(DATA.map(s=>s.samples).filter(Boolean))].join('/');
+  const allconsistent=DATA.every(s=>s.routing_consistent!==false);
+  const routings=[...new Set(DATA.map(s=>s.routing))].sort().join(' / ');
+  const ez=DATA.find(s=>s.eplb_after!=null);
+  const eplbNote=ez? ' EPLB (routing=zipf+eplb) replicates hot experts to rebalance per-rank load — imbalance '+ez.eplb_before.toFixed(1)+'x→'+ez.eplb_after.toFixed(1)+'x (vs raw zipf).' : '';
+  document.getElementById('prov').textContent=
+    'Deterministic shared routing trace (seed-fixed; routings: '+routings+' — Routing selector; mean fan-out ≈'+fo+
+    ' dest-ranks/token; cross-rank identity '+(allconsistent?'PROVEN (SHA-256 of topk_idx+weights agrees on every rank)':'NOT proven on some series')+
+    '). Fixed: hidden='+(sh.hidden||'?')+', top-k='+(sh.topk||'?')+', experts='+(sh.experts||'?')+
+    '. dtype/mode/resource/contract vary PER LINE — read the label (dtypes shown: '+dtypes+'). '+
+    'Contract(s): '+contracts+' (layout-and-dispatch times routing-layout INSIDE dispatch; cached-layout [cl] hoists it out). '+
+    'Latency = percentile (selector; p99 default) over POOLED per-iteration cross-rank-MAX samples'+(samp?(' (~'+samp+'/point)'):'')+
+    '. ROUND TRIP is INDEPENDENTLY MEASURED (dispatch→sync→no-op expert→combine, raw per-iter samples); ISOLATED_SUM is Σ of isolated dispatch+combine percentiles, NOT a measured op (no throughput/SLO use). Publication filter defaults to ALL (the full sweep, incl. seeded-runtime comparable-experimental wid=null lines); switch to Official/Publishable for the canonical-wid cohort. Status is machine-derived from validity. The bandwidth axis is a LOGICAL routed-payload rate '+
+    '(recv copies x hidden x dtype / latency; per-op bytes; excludes scales/idx/meta/padding) — NOT algBW/busBW/wire utilization. '+
+    'Suites ('+suites+') are kept distinct (Suite selector): backend-default = best stack; resource-constrained = ~fixed SM/CU fraction — '+
+    'do not read across suites as one contest. Correctness = round-trip reconstruction smoke check (NOT a full per-token routing proof).'+eplbNote+' '+
+    'Backends: '+provs.join(', ')+'. Hover a point for p50/p90/p99, contract, suite, and its workflow run.';
+  resolveHeadlineDefaults();   // pick a non-empty opening view (keeps normalized as the default)
+  renderControls(); renderCards(); renderMain(); renderGrid(); renderScaling(); renderHeatmaps();
+  renderDecision(); renderProvenance(); renderCoverage(); renderSensitivity(); renderFailed();
+  renderNccl('all_reduce','allreduce'); renderNccl('all_gather','allgather');  // family=nccl tabs (no-op if empty)
+  renderOffload(); renderCopyEngine(); renderKvCache(); renderRlMesh();   // data-movement collective tabs (no-op if empty)
+  setupTabs();
+})();
+"""
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX EP HTML plotter")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--out", default="results/plots/collectivex_ep.html")
+    ap.add_argument("--legacy", choices=["all", "exclude", "only"], default="all",
+                    help="exclude -> v4-only main plot; only -> the legacy v3 archive")
+    args = ap.parse_args()
+
+    series = load_series(args.results_dir, args.legacy)
+    if not series:
+        print(f"no family=moe results with rows under {args.results_dir} (legacy={args.legacy})")
+        return 1
+    # Preserve FAILED / quarantined cases (goal immediate P2): failed-case records (no rows, a
+    # classified wedge/timeout/crash) + any diagnostic/invalid/failed doc — surfaced as a table so
+    # a failure is never silently dropped from the aggregation.
+    failed = []
+    for path in sorted(glob.glob(os.path.join(args.results_dir, "**", "*.json"), recursive=True)):
+        try:
+            d = json.load(open(path))
+        except (json.JSONDecodeError, OSError):
+            continue
+        if d.get("family") != "moe":
+            continue
+        rt, pub = d.get("record_type"), d.get("publication_status")
+        if rt == "failed-case" or pub in ("failed", "invalid", "diagnostic"):
+            fa = d.get("failure") or {}
+            sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
+            sh = d.get("shape", {}) or {}
+            cfg = f"{sh.get('dispatch_dtype','?')}/{d.get('mode','?')}/{(d.get('measurement_contract') or '?').replace('-v1','')}"
+            reason = fa.get("failure_mode")
+            if not reason and pub == "diagnostic":
+                rc = d.get("resource_profile") or {}
+                anom = d.get("anomaly_summary") or {}
+                reason = ("resource-nonconforming" if str((d.get("validity") or {}).get("resource_conformance","")).endswith("nonconforming")
+                          else f"anomaly:{','.join(anom.get('types',[]))}" if anom.get("count") else "diagnostic")
+            failed.append({"sku": sku, "backend": d.get("backend"), "phase": d.get("phase"),
+                           "cfg": cfg, "status": pub or "failed", "reason": reason or "?",
+                           "rc": fa.get("return_code")})
+    # Distribution-sensitivity ratios (stdlib; same results dir), embedded as SENS for a small
+    # summary table — collapses the routing axis to one ratio per sku/backend/phase (review).
+    sens_rows = []
+    try:
+        sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "tests"))
+        import sensitivity as _sens
+        sens_rows = [g for g in _sens.analyze(args.results_dir)["groups"]
+                     if g["distribution_sensitivity_ratio"] is not None]
+    except Exception as exc:  # never let the summary break the main plot
+        print(f"  (sensitivity summary skipped: {exc!r})", file=sys.stderr)
+    # DECISION views (goal P3-D): compute from the ACTUAL results via analyze_ep's matching logic
+    # (recommendations / ll_crossover / resource_pareto / topology_penalty / skew_penalty), plus the
+    # previously-missing "max tokens under p99 budget" metric. analyze_ep reads the same JSONs.
+    decision = {"budgets": RT_BUDGETS_US,
+                "max_tokens_under_budget": max_tokens_under_budget(series)}
+    ll_rows = []
+    try:
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+        import analyze_ep as _ae
+        _aser = _ae.load(args.results_dir)
+        ll_rows = _ae.ll_crossover(_aser)
+        decision.update({
+            "recommendations": _ae.recommendations(_aser),
+            "ll_crossover": ll_rows,
+            "resource_pareto": _ae.resource_pareto(_aser),
+            "topology_penalty": _ae.topology_penalty(_aser),
+            "skew_penalty": _ae.skew_penalty(_aser),
+        })
+    except Exception as exc:  # never let the decision tab break the main plot
+        print(f"  (decision views skipped: {exc!r})", file=sys.stderr)
+    cards = summary_cards(series, sens_rows, failed, ll_rows)
+    # NCCL collective-primitive series (family=nccl), routed to the All-reduce / All-gather tabs.
+    # ADDITIVE: independent of the family=moe EP series above; an empty list simply leaves the tabs
+    # as "no data yet" placeholders (GHA nccl runs may still be in flight).
+    nccl_series = load_nccl_series(args.results_dir)
+    # Framework custom all-reduce (family=allreduce-fw): nccl baseline vs flashinfer-oneshot/twoshot.
+    # ADDITIVE — appended into the same list so it flows through the SAME All-reduce tab path, the
+    # has_ar / nccl_ops detection, and the `const NCCL` serialization with zero extra JS.
+    fwar_series = load_allreduce_fw_series(args.results_dir)
+    nccl_series = nccl_series + fwar_series
+    nccl_ops = {s["op"] for s in nccl_series}
+    has_ar, has_ag = "all_reduce" in nccl_ops, "all_gather" in nccl_ops
+    # Data-movement collective families (follow-up): CPU<->GPU offload, copy-engine/SDMA, KV-cache.
+    # ADDITIVE + independent of the family=moe series; an empty list leaves the tab as a placeholder.
+    offload_series = load_offload_series(args.results_dir)
+    copyengine_series = load_copy_engine_series(args.results_dir)
+    kvcache_series = load_kvcache_series(args.results_dir)
+    rlmesh_series = load_rlmesh_series(args.results_dir)
+    has_offload, has_copy, has_kv = bool(offload_series), bool(copyengine_series), bool(kvcache_series)
+    has_rl = bool(rlmesh_series)
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    # Tab nav (goal P3-C): real clickable tabs. Built suites are enabled; not-yet-built collective
+    # suites are disabled "coming soon" placeholders so the framework's scope is visible. Each
+    # collective tab enables as soon as a result for its family lands (else stays disabled).
+    def _navbtn(tab, label, enabled):
+        return (f'<button data-tab="{tab}">{label}</button>' if enabled
+                else f'<button disabled title="no {label} results yet">{label}</button>')
+    tabnav = ('<div class="tabs">'
+              '<button data-tab="tab-ep" class="on">EP dispatch / combine</button>'
+              '<button data-tab="tab-decision">Decision</button>'
+              '<button data-tab="tab-evidence">Evidence</button>'
+              + _navbtn("tab-allreduce", "All-reduce", has_ar)
+              + _navbtn("tab-allgather", "All-gather", has_ag)
+              + _navbtn("tab-offload", "CPU-GPU offload", has_offload)
+              + _navbtn("tab-kvcache", "KV-cache transfer", has_kv)
+              + _navbtn("tab-copyengine", "Copy-engine / SDMA", has_copy)
+              + _navbtn("tab-rlmesh", "RL mesh", has_rl)
+              + '</div>')
+    # Tab panels. EP = the existing chart + grid + scaling + heatmaps (unchanged behavior).
+    tab_ep = ('<div class="tab on" id="tab-ep">'
+              '<div class="controls" id="controls"></div>'
+              '<div class="card"><div id="chart"></div></div><div id="mlegend"></div>'
+              '<details class="prov"><summary>Provenance &amp; raw artifacts — git run / source SHA / image digest / backend (every series)</summary>'
+              '<div id="provdrawer"></div>'
+              '<p class="note">Each row links to its GitHub Actions run (github.com/&lt;repo&gt;/actions/runs/&lt;run_id&gt;); a series with no repo links to its run id anchor. workload_id / source SHA / image digest / backend build pin the result.</p></details>'
+              '<div id="grid"></div>'
+              '<h2>Scaling (strong + weak — distinct contracts)</h2><div id="scaling"></div>'
+              '<h2>Heatmaps</h2><div id="heatmaps"></div>'
+              '</div>')
+    tab_decision = ('<div class="tab" id="tab-decision">'
+                    '<p class="sub">Decision-oriented summaries computed by analyze_ep.py from the loaded results (best config by latency budget, LL crossover, resource↔latency Pareto, topology + routing-skew penalties) plus the max-tokens-under-a-p99-SLO metric.</p>'
+                    '<div id="decision"></div></div>')
+    tab_evidence = ('<div class="tab" id="tab-evidence">'
+                    '<h2>Distribution sensitivity <span style="font-weight:400;color:var(--mut)">— NOT the headline (headline = uniform)</span></h2><div id="sensitivity"></div>'
+                    '<h2>Failed / quarantined cases</h2><div id="failed"></div>'
+                    '<h2>Coverage</h2><div id="coverage"></div></div>')
+    # NCCL collective tabs (family=nccl): the panel body is rendered by renderNccl() when the tab is
+    # shown (and once at boot). Robust to zero data — renderNccl prints a "no data yet" note.
+    tab_allreduce = ('<div class="tab" id="tab-allreduce">'
+                     '<p class="sub">Standardized NCCL <b>all-reduce</b> (nccl-tests): bus bandwidth vs payload and op-time vs message size. One line per (SKU, topology-class, transport). Topology-class is part of the comparison key, so distinct fabrics are never silently overlaid.</p>'
+                     '<div id="allreduce"></div></div>')
+    tab_allgather = ('<div class="tab" id="tab-allgather">'
+                     '<p class="sub">Standardized NCCL <b>all-gather</b> (nccl-tests): bus bandwidth vs payload and op-time vs message size. One line per (SKU, topology-class, transport).</p>'
+                     '<div id="allgather"></div></div>')
+    # Data-movement collective tabs: bodies rendered by renderColl() at boot + on tab show. Zero-data safe.
+    tab_offload = ('<div class="tab" id="tab-offload">'
+                   '<p class="sub">CPU↔GPU <b>offload</b>: host-to-device + device-to-host copy bandwidth/latency, <b>pinned vs pageable</b> host memory. One line per (SKU, op, host-memory).</p>'
+                   '<div id="offload"></div></div>')
+    tab_kvcache = ('<div class="tab" id="tab-kvcache">'
+                   '<p class="sub"><b>KV-cache</b> block transfer: <b>paged vs contiguous</b> layout across directions (D→H / H→D / device-local / device-remote). One line per (SKU, direction, layout, backend).</p>'
+                   '<div id="kvcache"></div></div>')
+    tab_copyengine = ('<div class="tab" id="tab-copyengine">'
+                      '<p class="sub"><b>Copy-engine / SDMA</b> vs SM-driven copy at matched op/size — the copy-engine reaches near-peak bandwidth using almost no SMs. One line per (SKU, op, engine).</p>'
+                      '<div id="copyengine"></div></div>')
+    tab_rlmesh = ('<div class="tab" id="tab-rlmesh">'
+                  '<p class="sub"><b>RL mesh</b> — trainer↔generator weight transfer: <b>trainer→gen vs gen→trainer</b> and <b>paired</b> (1:1 send/recv) vs <b>redistribute</b> (disjoint all-to-all reshard). One line per (SKU, direction, pattern).</p>'
+                  '<div id="rlmesh"></div></div>')
+    # Every collective suite now has results except where a SKU is still landing; tabs without data
+    # stay disabled via _navbtn, so no standing "coming soon" note is needed.
+    placeholder = ''
+    html = HEAD \
+        + '<div class="cards" id="cards"></div>' \
+        + tabnav + tab_ep + tab_decision + tab_evidence + tab_allreduce + tab_allgather \
+        + tab_offload + tab_kvcache + tab_copyengine + tab_rlmesh + placeholder \
+        + '<p class="note">Self-contained (inline SVG, no external scripts). Generated from ' \
+        + f'{len(series)} EP sweeps' + (f' + {len(nccl_series)} NCCL sweeps' if nccl_series else '') \
+        + (f' + {len(offload_series)} offload + {len(copyengine_series)} copy-engine + {len(kvcache_series)} KV-cache + {len(rlmesh_series)} RL-mesh lines'
+           if (has_offload or has_copy or has_kv or has_rl) else '') + '. ' \
+        + 'Latency (p50/p90/p99 selector) is the primary EP metric; the EP ' \
+        + 'bandwidth axis is a LOGICAL routed-payload rate (per-op bytes ÷ latency), not bus/alg ' \
+        + 'bandwidth. The All-reduce / All-gather + offload / copy-engine / KV-cache / RL-mesh tabs show measured ' \
+        + 'bandwidth + latency vs transfer size. dtype/mode/resource/contract vary per line — see labels + provenance.</p>' \
+        + "<script>\nconst DATA = " + json.dumps(series) + ";\nconst SENS = " + json.dumps(sens_rows) \
+        + ";\nconst FAILED = " + json.dumps(failed) + ";\nconst DECISION = " + json.dumps(decision) \
+        + ";\nconst CARDS = " + json.dumps(cards) + ";\nconst NCCL = " + json.dumps(nccl_series) \
+        + ";\nconst OFFLOAD = " + json.dumps(offload_series) \
+        + ";\nconst COPYENGINE = " + json.dumps(copyengine_series) \
+        + ";\nconst KVCACHE = " + json.dumps(kvcache_series) \
+        + ";\nconst RLMESH = " + json.dumps(rlmesh_series) \
+        + ";\n" + JS + "\n</script>\n" + TAIL
+    with open(args.out, "w") as fh:
+        fh.write(html)
+    phases = sorted({s["phase"] for s in series})
+    print(f"wrote {args.out}  ({len(series)} series across SKUs={sorted({s['sku'] for s in series})}, phases={phases})")
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the process exit code
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/prune_results.py b/experimental/CollectiveX/prune_results.py
new file mode 100644
index 000000000..e48cb8504
--- /dev/null
+++ b/experimental/CollectiveX/prune_results.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""CollectiveX — prune results/ to the fresh canonical set.
+
+The results/ dir accumulates every GHA download across sessions (885+ files): many are SUPERSEDED
+debug re-runs of the same config, stale runs from older code, or failed-case stubs that now have a
+valid newer counterpart. This prunes to the FRESH canonical set:
+
+  * group every result by its comparison_key (the config identity the plot/aggregator uses);
+  * within a group, keep the newest KEEP_PER_KEY runs whose publication_status/status is usable
+    (official | comparable-experimental | valid) — newest by generated_at;
+  * move everything else (older-than-KEEP valids, and failed/invalid runs that have >=1 usable run in
+    their group) to results/.superseded/ (NOT hard-deleted — recoverable; already out of the plot glob).
+
+Keeping KEEP_PER_KEY>1 preserves the repeat-run aggregation (median + error bands across runs, a
+P0 deliverable) while removing the long tail of stale debug duplicates. A failed-case with NO usable
+counterpart is KEPT (the "preserve genuinely-failed cases" deliverable). env_*.json + analysis.json
+are kept. Stdlib only.
+
+  python3 prune_results.py --results-dir results            # prune (move to .superseded)
+  python3 prune_results.py --results-dir results --dry-run   # just report
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+
+KEEP_PER_KEY = 3                       # newest usable runs to keep per config (repeat-run aggregation)
+USABLE = {"official", "comparable-experimental", "valid"}  # publication_status/status values _usable() accepts
+
+
+def _doc_key(d: dict) -> str:
+    """Config identity: top-level comparison_key (EP), else family+runner+a stable signature."""
+    if d.get("comparison_key"):
+        return str(d["comparison_key"])
+    # collective families (kv-cache/copy-engine/nccl/rl-mesh/allreduce-fw): derive from group keys.
+    keys = [g.get("comparison_key") for g in d.get("groups", []) if g.get("comparison_key")]
+    if keys:
+        return "|".join(sorted(str(k) for k in keys))
+    return "|".join(str(d.get(k, "")) for k in ("family", "runner", "backend", "phase", "measurement_contract"))
+
+
+def _usable(d: dict) -> bool:
+    ps = d.get("publication_status") or d.get("status")
+    return ps in USABLE
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Prune CollectiveX results/ to the fresh canonical set")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--keep-per-key", type=int, default=KEEP_PER_KEY)
+    ap.add_argument("--dry-run", action="store_true")
+    a = ap.parse_args()
+
+    rd = a.results_dir
+    sup = os.path.join(rd, ".superseded")
+    files = [f for f in os.listdir(rd) if f.endswith(".json")
+             and not f.startswith("env_") and f != "analysis.json"]
+    docs = []  # (fname, key, generated_at, usable, is_failed)
+    for f in files:
+        try:
+            d = json.load(open(os.path.join(rd, f)))
+        except Exception:
+            continue
+        docs.append((f, _doc_key(d), d.get("generated_at") or d.get("generated_at", ""),
+                     _usable(d), f.startswith("failed_") or d.get("record_type") == "failed-case"))
+
+    # group by key
+    groups: dict = {}
+    for rec in docs:
+        groups.setdefault(rec[1], []).append(rec)
+
+    move = []
+    for key, recs in groups.items():
+        usable = sorted([r for r in recs if r[3]], key=lambda r: r[2], reverse=True)
+        keep = set(r[0] for r in usable[:a.keep_per_key])
+        for r in recs:
+            f, _, _, is_usable, is_failed = r
+            if f in keep:
+                continue
+            # keep a failed/unusable run ONLY if its group has NO usable run at all
+            if (is_failed or not is_usable) and not usable:
+                continue
+            move.append(f)
+
+    print(f"prune: {len(files)} result files, {len(groups)} configs, keep<= {a.keep_per_key}/config -> "
+          f"move {len(move)} superseded/stale to {sup}")
+    if a.dry_run:
+        for f in sorted(move)[:20]:
+            print("  would move:", f)
+        return 0
+    os.makedirs(sup, exist_ok=True)
+    for f in move:
+        try:
+            shutil.move(os.path.join(rd, f), os.path.join(sup, f))
+        except Exception as e:
+            print(f"  WARN move {f}: {e!r}")
+    print(f"pruned -> {len([x for x in os.listdir(rd) if x.endswith('.json')])} json kept in {rd}, "
+          f"{len(os.listdir(sup))} in .superseded")
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the process exit code
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/regression.py b/experimental/CollectiveX/regression.py
new file mode 100644
index 000000000..7d48af5b0
--- /dev/null
+++ b/experimental/CollectiveX/regression.py
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+"""CollectiveX performance-regression thresholds (goal P1 "Add regression thresholds").
+
+Threshold-based regression detection ACROSS independent benchmark runs of the same fixed config.
+A config's identity is its `comparison_key` (same as repeated_runs.py / validate_results.py); a
+config is measured at several `tokens_per_rank` (T) ladders. For each (comparison_key, T) we form:
+
+  * CANDIDATE — the NEWEST independent run (latest `generated_at`).
+  * BASELINE  — either an explicit baseline (a --baseline file/dir, e.g. last published headline),
+                or, by default, the run-to-run MEDIAN of all-but-the-newest runs (historical
+                median). The candidate is compared against that.
+
+A larger metric is slower (these are microsecond latencies). We flag:
+
+  * REGRESSION  candidate exceeds baseline by > --threshold (default 10%), AND the change is OUTSIDE
+                run-to-run noise. Noise is the historical variability of THIS (ck, T) point measured
+                by repeated runs (MAD / CV, computed exactly like repeated_runs.py). A "regression"
+                whose candidate value still sits inside the historical [median ± k·MAD] band — or
+                whose pct delta is within the historical CV — is reported as `regression-in-noise`
+                (noted, but NOT a CI-gating failure), because we cannot distinguish it from jitter.
+  * IMPROVEMENT candidate faster than baseline by > --threshold (and outside noise).
+  * OK          |delta| within threshold.
+
+Configs with < 2 independent runs (and no explicit baseline) have no baseline -> `insufficient
+history` (skipped, not failed). Missing rows / missing the chosen metric+percentile are skipped
+gracefully.
+
+Exit code is non-zero iff at least one HARD regression (outside noise) is found, so CI can gate on
+it. `--json` writes the full machine-readable report; a markdown table always goes to stdout.
+
+  python3 regression.py results/
+  python3 regression.py results/ --metric roundtrip --pct p99 --threshold 0.10
+  python3 regression.py results/ --baseline published/headline/ --json regression.json
+  python3 regression.py results/ --metric dispatch --pct p95 --threshold 0.05
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+# Operations / percentiles a row may carry. Mirrors the row schema used across the repo.
+OPS = ("roundtrip", "dispatch", "combine")   # also the accepted --metric choices
+PCTS = ("p50", "p90", "p95", "p99")          # also the accepted --pct choices
+
+# How many MADs around the historical median still count as "within run-to-run noise" (the band
+# test in _verdict). Raw 3·MAD is a robust stand-in for a sigma band — for normal data MAD is about
+# 0.6745·σ, so this is roughly 2σ; a candidate inside it is never called a hard regression.
+NOISE_MAD_K = 3.0
+
+
+def _p(r, op, pct):
+    """Extract one percentile for one op from a row, tolerating both the nested-dict form
+    (`r[op][pct]`) and the flat `r["{op}_us_{pct}"]` form. Same accessor as repeated_runs.py."""
+    if isinstance(r.get(op), dict):
+        return r[op].get(pct)
+    return r.get(f"{op}_us_{pct}")
+
+
+def _median(xs):
+    s = sorted(xs)
+    n = len(s)
+    return (s[n // 2] if n % 2 else (s[n // 2 - 1] + s[n // 2]) / 2.0) if n else float("nan")
+
+
+def _noise_stats(xs):
+    """Run-to-run dispersion of a metric at one (ck, T). Same math as repeated_runs._stats:
+    median / MAD / CV over the independent-run values. Returns None for <2 points (no dispersion)."""
+    n = len(xs)
+    if n < 2:
+        return None
+    mean = sum(xs) / n
+    std = (sum((x - mean) ** 2 for x in xs) / n) ** 0.5
+    med = _median(xs)
+    mad = _median([abs(x - med) for x in xs])
+    return {"n": n, "median": round(med, 3), "mad": round(mad, 3),
+            "cv": round(std / mean, 4) if mean > 0 else None}
+
+
+def _parse_ts(doc):
+    """Sort key for recency: the doc's `generated_at` string returned as-is (ISO-8601, e.g.
+    2026-06-27T00:54:19.552522+00:00). Callers compare it lexicographically, which is chronological
+    only while all stamps share one UTC offset/format. Missing or non-string -> "" (sorts oldest)."""
+    ts = doc.get("generated_at")
+    if isinstance(ts, str) and ts:
+        return ts
+    return ""
+
+
+def load(paths):
+    """Load moe result docs from files/dirs into per-run records, mirroring repeated_runs.load():
+    skip env_* sidecars, require family==moe with rows, drop preserved failed-case records (they
+    carry no comparable timings), and collapse to ONE record per independent run via its git run_id
+    (falling back to the filename) so in-process repeats of one job aren't counted as separate runs.
+    Returns {comparison_key: {run_id: record}} where record.rows maps T -> row."""
+    files = []
+    for p in paths:
+        if os.path.isdir(p):
+            files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True)
+        elif os.path.isfile(p):
+            files.append(p)
+    files = sorted(f for f in files if not os.path.basename(f).startswith("env_"))
+
+    by_ck = defaultdict(dict)   # ck -> {run_id: record}
+    for f in files:
+        try:
+            doc = json.load(open(f))
+        except (json.JSONDecodeError, OSError):
+            continue
+        if doc.get("family") != "moe" or not doc.get("rows"):
+            continue
+        if doc.get("record_type") == "failed-case":
+            continue
+        ck = doc.get("comparison_key")
+        if not ck:
+            continue
+        gr = (doc.get("reproduction") or {}).get("git_run") or {}
+        run_id = gr.get("run_id") or os.path.basename(f)
+        rec = {
+            "file": os.path.basename(f),
+            "run_id": run_id,
+            "generated_at": _parse_ts(doc),
+            "runner": doc.get("runner") or "?",
+            "publication_status": doc.get("publication_status"),
+            "rows": {r["tokens_per_rank"]: r for r in doc["rows"] if "tokens_per_rank" in r},
+        }
+        # If the same run_id appears more than once (e.g. several files from one job), keep the
+        # newest by generated_at so each independent run contributes a single set of values.
+        prev = by_ck[ck].get(run_id)
+        if prev is None or rec["generated_at"] >= prev["generated_at"]:
+            by_ck[ck][run_id] = rec
+    return by_ck
+
+
+def _baseline_index(paths, metric, pct):
+    """Build an explicit-baseline lookup {(comparison_key, T): value} from a baseline file/dir.
+    Each (ck, T) takes its value from the newest baseline doc that carries that point."""
+    idx = {}                 # (ck, T) -> (generated_at, value)
+    for ck, runs in load(paths).items():
+        for run in runs.values():
+            for T, row in run["rows"].items():
+                val = _p(row, metric, pct)
+                if val is None:
+                    continue
+                key = (ck, T)
+                cur = idx.get(key)
+                if cur is None or run["generated_at"] >= cur[0]:
+                    idx[key] = (run["generated_at"], val)
+    return {k: v[1] for k, v in idx.items()}
+
+
+def _verdict(baseline, candidate, threshold, noise):
+    """Classify one (ck, T). Returns (verdict, pct_delta, within_noise).
+
+    pct_delta > 0 means the candidate is SLOWER (worse) than baseline. within_noise is True when the
+    change cannot be distinguished from this point's historical run-to-run jitter: either the
+    candidate still lies inside the historical [median ± k·MAD] band, or |pct_delta| is within the
+    historical CV. A change inside noise is never a HARD regression/improvement."""
+    if baseline is None or candidate is None or baseline <= 0:
+        return "skip", None, False
+    delta = (candidate - baseline) / baseline
+
+    within_noise = False
+    if noise:
+        cv = noise.get("cv")
+        med, mad = noise.get("median"), noise.get("mad")
+        # band test: candidate within k·MAD of the historical median.
+        if med is not None and mad is not None and mad > 0 and abs(candidate - med) <= NOISE_MAD_K * mad:
+            within_noise = True
+        # cv test: the observed move is no larger than typical run-to-run variation.
+        if cv is not None and abs(delta) <= cv:
+            within_noise = True
+
+    if delta > threshold:
+        return ("regression-in-noise" if within_noise else "regression"), delta, within_noise
+    if delta < -threshold:
+        return ("improvement-in-noise" if within_noise else "improvement"), delta, within_noise
+    return "ok", delta, within_noise
+
+
+def analyze(paths, metric="roundtrip", pct="p99", threshold=0.10, baseline_paths=None):
+    """Core comparison. For each (comparison_key, T): establish baseline (explicit if provided, else
+    historical median of all-but-newest runs), candidate (newest run), historical noise (MAD/CV over
+    all runs at that point), and a verdict. Returns a structured report dict."""
+    explicit = _baseline_index(baseline_paths, metric, pct) if baseline_paths else None
+    by_ck = load(paths)
+
+    points = []
+    insufficient = []
+    for ck in sorted(by_ck):
+        runs = sorted(by_ck[ck].values(), key=lambda r: r["generated_at"])
+        n_runs = len(runs)
+        # All T measured across this config's runs.
+        all_T = sorted({T for r in runs for T in r["rows"]})
+        for T in all_T:
+            # values for this (ck, T) in chronological order (one per independent run that has it).
+            series = [(r, _p(r["rows"][T], metric, pct)) for r in runs if T in r["rows"]]
+            series = [(r, v) for r, v in series if v is not None]
+            if not series:
+                continue
+            cand_run, cand_val = series[-1]                 # newest run with this point
+            hist_vals = [v for _, v in series]              # all runs (incl. candidate) for noise
+            noise = _noise_stats(hist_vals)
+
+            if explicit is not None:
+                # An explicit baseline is authoritative: compare ONLY points it covers. Points it
+                # lacks are insufficient — we never silently fall back to a historical median, so a
+                # single report mixes only one baseline notion.
+                if (ck, T) not in explicit:
+                    insufficient.append({"comparison_key": ck, "tokens_per_rank": T,
+                                         "runner": cand_run["runner"], "n_runs": n_runs,
+                                         "reason": "not in explicit baseline"})
+                    continue
+                base_val = explicit[(ck, T)]
+                base_kind = "explicit"
+                base_n = 1
+            else:
+                older = [v for _, v in series[:-1]]          # all-but-newest
+                if not older:
+                    # <2 independent runs -> no historical baseline for this point.
+                    insufficient.append({"comparison_key": ck, "tokens_per_rank": T,
+                                         "runner": cand_run["runner"], "n_runs": n_runs,
+                                         "reason": "<2 independent runs"})
+                    continue
+                base_val = _median(older)
+                base_kind = "historical-median"
+                base_n = len(older)
+
+            verdict, delta, within_noise = _verdict(base_val, cand_val, threshold, noise)
+            if verdict == "skip":
+                continue
+            points.append({
+                "comparison_key": ck,
+                "tokens_per_rank": T,
+                "runner": cand_run["runner"],
+                "publication_status": cand_run["publication_status"],
+                "baseline_kind": base_kind,
+                "baseline_runs": base_n,
+                "n_independent_runs": n_runs,
+                "baseline": round(base_val, 3),
+                "candidate": round(cand_val, 3),
+                "candidate_file": cand_run["file"],
+                "pct_delta": round(delta, 4),
+                "verdict": verdict,
+                "within_noise": within_noise,
+                "noise": noise,
+            })
+
+    n_reg = sum(1 for p in points if p["verdict"] == "regression")
+    n_reg_noise = sum(1 for p in points if p["verdict"] == "regression-in-noise")
+    n_imp = sum(1 for p in points if p["verdict"].startswith("improvement"))
+    n_ok = sum(1 for p in points if p["verdict"] == "ok")
+    # rank worst-first: hard regressions, then by delta.
+    points.sort(key=lambda p: (p["verdict"] != "regression", -p["pct_delta"]))
+    return {
+        "metric": metric, "percentile": pct, "threshold": threshold,
+        "noise_mad_k": NOISE_MAD_K,
+        "baseline_source": ("explicit:" + ",".join(baseline_paths)) if baseline_paths else "historical-median",
+        "n_comparison_keys": len(by_ck),
+        "n_points_compared": len(points),
+        "n_insufficient_history": len(insufficient),
+        "counts": {"regression": n_reg, "regression_in_noise": n_reg_noise,
+                   "improvement": n_imp, "ok": n_ok},
+        "hard_regressions": n_reg,
+        "points": points,
+        "insufficient_history": insufficient,
+    }
+
+
+_VERDICT_MARK = {   # verdict code -> label printed in the markdown table's "verdict" column
+    "regression": "REGRESSION", "regression-in-noise": "regression (noise)",
+    "improvement": "improvement", "improvement-in-noise": "improvement (noise)",
+    "ok": "ok",
+}
+
+
+def to_markdown(report):
+    m, pct, thr = report["metric"], report["percentile"], report["threshold"]
+    c = report["counts"]
+    h = (f"### Performance regression — {m} {pct} (threshold ±{thr:.0%}, "
+         f"noise band {report['noise_mad_k']:g}·MAD)\n\n"
+         f"Baseline: {report['baseline_source']}.  "
+         f"{report['n_points_compared']} (config, T) point(s) compared across "
+         f"{report['n_comparison_keys']} comparison_key(s); "
+         f"{report['n_insufficient_history']} point(s) have insufficient history.\n\n"
+         f"**{c['regression']} regression · {c['improvement']} improvement · {c['ok']} ok · "
+         f"{c['regression_in_noise']} regression-in-noise.**\n\n")
+
+    # Only surface points that moved (regression/improvement, either side of the noise line). A wall
+    # of "ok" rows is noise; the counts line above already accounts for them.
+    moved = [p for p in report["points"] if p["verdict"] != "ok"]
+    if not moved:
+        h += ("_No (config, T) point moved beyond the threshold — every compared point is within "
+              f"±{thr:.0%} of its baseline (or inside run-to-run noise)._\n")
+        return h
+    h += ("| comparison_key | T | runner | baseline | candidate | Δ% | verdict | within noise |\n"
+          "|---|--:|---|--:|--:|--:|---|---|\n")
+    for p in moved:
+        n = p["noise"]
+        noise_txt = (f"CV={n['cv']}, MAD={n['mad']} (n={n['n']})" if n and n.get("cv") is not None
+                     else ("n<2" if not n else "—"))
+        h += (f"| `{(p['comparison_key'] or '')[:12]}` | {p['tokens_per_rank']} | {p['runner']} | "
+              f"{p['baseline']:.1f} | {p['candidate']:.1f} | {p['pct_delta']:+.1%} | "
+              f"{_VERDICT_MARK.get(p['verdict'], p['verdict'])} | "
+              f"{'yes' if p['within_noise'] else 'no'} |\n")
+    if report["hard_regressions"]:
+        h += (f"\n**{report['hard_regressions']} hard regression(s) outside run-to-run noise — "
+              f"CI gate fails (exit 1).**\n")
+    return h
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX performance-regression thresholds")
+    ap.add_argument("paths", nargs="*", default=["results"],
+                    help="result JSON files or dirs (default: results)")
+    ap.add_argument("--baseline", action="append", default=None,
+                    help="explicit baseline file/dir (repeatable). Default: historical median of "
+                         "all-but-newest runs per (config, T).")
+    ap.add_argument("--metric", default="roundtrip", choices=list(OPS),
+                    help="operation to compare (default roundtrip)")
+    ap.add_argument("--pct", default="p99", choices=list(PCTS),
+                    help="percentile to compare (default p99)")
+    ap.add_argument("--threshold", type=float, default=0.10,
+                    help="fractional change to flag, e.g. 0.10 = ±10%% (default 0.10)")
+    ap.add_argument("--json", dest="json_out", help="also write the full report to this JSON file")
+    a = ap.parse_args()
+
+    report = analyze(a.paths or ["results"], metric=a.metric, pct=a.pct,
+                     threshold=a.threshold, baseline_paths=a.baseline)
+    if a.json_out:
+        os.makedirs(os.path.dirname(a.json_out) or ".", exist_ok=True)
+        json.dump(report, open(a.json_out, "w"), indent=2, sort_keys=True)
+        print(f"wrote {a.json_out}")
+    print(to_markdown(report))
+    # Non-zero exit iff a hard regression (outside noise) exists, so CI can gate on it.
+    return 1 if report["hard_regressions"] else 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the process exit code
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/repeated_runs.py b/experimental/CollectiveX/repeated_runs.py
new file mode 100644
index 000000000..f9beeaed3
--- /dev/null
+++ b/experimental/CollectiveX/repeated_runs.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""CollectiveX repeated independent-run statistics (goal Part 1 "repeated independent workflow-run
+statistics"). Distinguishes TWO kinds of repetition that are easy to conflate:
+
+  * in-process trials   — the `trials x iters` samples POOLED inside ONE result doc (already
+                          reduced into that doc's p50/p90/p99). Counted as `samples_pooled`.
+  * independent job reps — SEPARATE benchmark jobs (distinct GitHub run ids / files) of the SAME
+                          fixed config (same `comparison_key`). These reveal run-to-run variance
+                          that a single job cannot — clock state, fabric warm-up, scheduling.
+
+For each (comparison_key, tokens/rank, op, percentile) measured by >= 2 independent runs it reports
+the run-to-run median / min / max / coefficient-of-variation / MAD. An official p99 claim should be
+backed by repeated-run STABILITY: >= `--min-runs` independent runs whose p99 CV <= `--cv-threshold`.
+
+  python3 repeated_runs.py --results-dir results
+  python3 repeated_runs.py --results-dir results --cv-threshold 0.15 --min-runs 2 --out results/repeated.json
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+
+def _p(r, op, pct):
+    """Look up the `pct` value of operation `op` in result row `r`.
+
+    Two row layouts are understood: nested (`r[op]` is a dict keyed by percentile) and flat
+    (`r["<op>_us_<pct>"]`). Whenever `r[op]` is a dict the nested layout is used exclusively.
+    Returns None when the value is absent.
+    """
+    nested = r.get(op)
+    if not isinstance(nested, dict):
+        return r.get(f"{op}_us_{pct}")
+    return nested.get(pct)
+
+
+def _median(xs):
+    """Median of the numbers in `xs`; NaN when `xs` is empty.
+
+    An odd count returns the middle element as-is; an even count returns the float mean of the
+    two middle elements.
+    """
+    ordered = sorted(xs)
+    count = len(ordered)
+    if count == 0:
+        return float("nan")
+    mid = count // 2
+    if count % 2:
+        return ordered[mid]
+    return (ordered[mid - 1] + ordered[mid]) / 2.0
+
+
+def _stats(xs):
+    """Summarise a list of per-run values; None when the list is empty.
+
+    The result holds `n`, `median`, `min`, `max`, `mean` and `mad` (median absolute deviation
+    around the median), each rounded to 3 places, plus `cv` (population standard deviation
+    divided by the mean, 4 places), which is None unless the mean is positive.
+    """
+    count = len(xs)
+    if not count:
+        return None
+    avg = sum(xs) / count
+    # Population (divide-by-n) standard deviation.
+    spread = (sum((x - avg) ** 2 for x in xs) / count) ** 0.5
+    centre = _median(xs)
+    deviation = _median([abs(x - centre) for x in xs])
+    cv = round(spread / avg, 4) if avg > 0 else None
+    return {
+        "n": count,
+        "median": round(centre, 3),
+        "min": round(min(xs), 3),
+        "max": round(max(xs), 3),
+        "mean": round(avg, 3),
+        "cv": cv,
+        "mad": round(deviation, 3),
+    }
+
+
+def load(results_dir):
+    """Collect every MoE result document found (recursively) under `results_dir`.
+
+    Skipped silently: `env_*.json` files, files that cannot be read or decoded as JSON, JSON
+    whose top level is not an object, documents whose `family` is not "moe", and documents
+    without a non-empty `rows` list. Rows that are not objects or lack `tokens_per_rank` are
+    ignored rather than aborting the whole scan.
+
+    Returns:
+        A list with one dict per accepted document:
+          file            base name of the JSON file
+          ck              the document's `comparison_key` (may be None)
+          run_id          `reproduction.git_run.run_id`, falling back to the file name
+          sku             `runner` up to the first "_" and then the first "-" ("?" if unset)
+          samples_pooled  `samples_pooled` of the first row
+          rows            {tokens_per_rank: row}
+    """
+    runs = []
+    pattern = os.path.join(results_dir, "**", "*.json")
+    for f in sorted(glob.glob(pattern, recursive=True)):
+        name = os.path.basename(f)
+        if name.startswith("env_"):
+            continue
+        try:
+            with open(f) as fh:
+                doc = json.load(fh)
+        except (ValueError, OSError):
+            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+            continue
+        if not isinstance(doc, dict):
+            # e.g. a JSON array dropped into the results tree; it has no .get().
+            continue
+        rows = doc.get("rows")
+        if doc.get("family") != "moe" or not rows or not isinstance(rows, list):
+            continue
+        gr = (doc.get("reproduction") or {}).get("git_run") or {}
+        first = rows[0]
+        runs.append({
+            "file": name, "ck": doc.get("comparison_key"),
+            "run_id": gr.get("run_id") or name,
+            "sku": (doc.get("runner") or "?").split("_")[0].split("-")[0],
+            "samples_pooled": first.get("samples_pooled") if isinstance(first, dict) else None,
+            "rows": {r["tokens_per_rank"]: r for r in rows
+                     if isinstance(r, dict) and "tokens_per_rank" in r},
+        })
+    return runs
+
+
+def analyze(results_dir, metric="roundtrip", cv_threshold=0.15, min_runs=2):
+    """Group loaded runs by comparison_key and compute run-to-run statistics per cohort.
+
+    For each cohort and each tokens/rank value T, one value per independent run (the first file
+    seen for a run id) feeds `_stats` for dispatch/combine/roundtrip at p50 and p99. A point
+    takes part in the stability verdict when `<metric>_p99` has at least `min_runs` values and a
+    defined CV; a cohort is stable when it has at least one such point and none of them exceeds
+    `cv_threshold`.
+
+    Returns:
+        A report dict with `metric`, `cv_threshold`, `min_runs`, `n_comparison_keys`,
+        `n_with_repeats` and `cohorts` (sorted by descending number of independent runs).
+    """
+    runs = load(results_dir)
+    by_ck = defaultdict(list)
+    for r in runs:
+        if r["ck"]:
+            by_ck[r["ck"]].append(r)
+    out = []
+    for ck, group in by_ck.items():
+        # independent job reps = distinct run ids within this comparison_key.
+        ids = {g["run_id"] for g in group}
+        try:
+            run_ids = sorted(ids)
+        except TypeError:
+            # Ids of different types (e.g. a numeric run id next to a file-name fallback) are
+            # not mutually orderable; fall back to ordering by their text form.
+            run_ids = sorted(ids, key=str)
+        n_runs = len(run_ids)
+        # one value per independent run (take the first file for a run id) per T.
+        per_run = {}
+        for g in group:
+            per_run.setdefault(g["run_id"], g)
+        Ts = sorted({t for g in per_run.values() for t in g["rows"]})
+        points = []
+        for T in Ts:
+            rec = {"tokens_per_rank": T, "n_independent_runs": 0}
+            for op in ("dispatch", "combine", "roundtrip"):
+                for pct in ("p50", "p99"):
+                    xs = [_p(g["rows"][T], op, pct) for g in per_run.values()
+                          if T in g["rows"] and _p(g["rows"][T], op, pct) is not None]
+                    st = _stats(xs)
+                    if st:
+                        rec[f"{op}_{pct}"] = st
+                        rec["n_independent_runs"] = max(rec["n_independent_runs"], st["n"])
+            points.append(rec)
+        # stability verdict on the chosen metric's p99.
+        stable_pts, unstable_pts = [], []
+        for rec in points:
+            st = rec.get(f"{metric}_p99")
+            if st and st["n"] >= min_runs and st["cv"] is not None:
+                (stable_pts if st["cv"] <= cv_threshold else unstable_pts).append(
+                    {"T": rec["tokens_per_rank"], "cv": st["cv"], "n": st["n"]})
+        out.append({
+            "comparison_key": ck, "skus": sorted({g["sku"] for g in group}),
+            "n_independent_runs": n_runs, "run_ids": run_ids,
+            "in_process_samples_per_run": sorted({g["samples_pooled"] for g in group if g["samples_pooled"]}),
+            f"{metric}_p99_stable": len(stable_pts) > 0 and not unstable_pts,
+            "stable_points": stable_pts, "unstable_points": unstable_pts,
+            "points": points,
+        })
+    out.sort(key=lambda c: -c["n_independent_runs"])
+    return {"metric": metric, "cv_threshold": cv_threshold, "min_runs": min_runs,
+            "n_comparison_keys": len(out),
+            "n_with_repeats": sum(1 for c in out if c["n_independent_runs"] >= min_runs),
+            "cohorts": out}
+
+
+def to_markdown(report):
+    """Render the report produced by `analyze` as a Markdown summary string.
+
+    Emits a heading and a coverage sentence, then either a note that no cohort has reached
+    `min_runs` independent runs yet, or a table with one row per cohort that has (comparison_key
+    truncated to 12 characters, SKUs, run count, stability verdict, stable/unstable point counts).
+    """
+    min_runs = report["min_runs"]
+    rep = [c for c in report["cohorts"] if c["n_independent_runs"] >= min_runs]
+    h = (f"### Repeated-run stability ({report['metric']} p99; CV ≤ {report['cv_threshold']} over "
+         f"≥ {min_runs} independent runs)\n\n"
+         f"{report['n_with_repeats']}/{report['n_comparison_keys']} comparison_keys have ≥ "
+         f"{min_runs} independent runs.\n\n")
+    if not rep:
+        # Quote the configured minimum instead of a fixed "2" so the note agrees with the
+        # heading when --min-runs is raised.
+        return h + (f"_No config has been run as ≥{min_runs} independent jobs yet — every point is a single "
+                    "job's pooled in-process trials. Re-dispatch a config to populate run-to-run "
+                    "stability (an official p99 claim requires it)._\n")
+    h += "| comparison_key | SKUs | runs | p99 stable | stable/unstable pts |\n|---|---|---|---|---|\n"
+    for c in rep:
+        h += (f"| `{(c['comparison_key'] or '')[:12]}` | {','.join(c['skus'])} | "
+              f"{c['n_independent_runs']} | {'YES' if c[report['metric']+'_p99_stable'] else 'NO'} | "
+              f"{len(c['stable_points'])}✓/{len(c['unstable_points'])}✗ |\n")
+    return h
+
+
+def main() -> int:
+    """CLI entry point: analyse repeated independent runs and print the Markdown summary.
+
+    Options: --results-dir (default "results"), --metric (roundtrip | dispatch | combine),
+    --cv-threshold (default 0.15), --min-runs (default 2) and --out (optional path for the full
+    JSON report; its parent directory is created when missing).
+
+    Returns:
+        Always 0; this tool reports stability and does not gate.
+    """
+    ap = argparse.ArgumentParser(description="CollectiveX repeated independent-run statistics")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--metric", default="roundtrip", choices=["roundtrip", "dispatch", "combine"])
+    ap.add_argument("--cv-threshold", type=float, default=0.15)
+    ap.add_argument("--min-runs", type=int, default=2)
+    ap.add_argument("--out")
+    a = ap.parse_args()
+    report = analyze(a.results_dir, a.metric, a.cv_threshold, a.min_runs)
+    if a.out:
+        os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
+        # Context manager: the report is flushed and the handle closed before we announce it.
+        with open(a.out, "w") as fh:
+            json.dump(report, fh, indent=2, sort_keys=True)
+        print(f"wrote {a.out}")
+    print(to_markdown(report))
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt
new file mode 100644
index 000000000..574afb1f0
--- /dev/null
+++ b/experimental/CollectiveX/requirements.txt
@@ -0,0 +1,9 @@
+# CollectiveX spike dependencies.
+#
+# run_nccl.py + env_capture.py : Python standard library only (run anywhere).
+# run_deepep.py                : torch + deep_ep — provided by the benchmark
+#                                container; DeepEP is built at job setup
+#                                (rebuild-deepep), NOT pinned here.
+# plot.py                      : the only thing worth a local venv:
+matplotlib
+numpy
diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep
new file mode 100644
index 000000000..8940934a2
--- /dev/null
+++ b/experimental/CollectiveX/results/.gitkeep
@@ -0,0 +1,3 @@
+# CollectiveX result bundles land here as flat *.json (one per runner×op),
+# plus plots/ and raw_*.txt captures (gitignored). Keep this file so the dir
+# exists before the first run.
diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py
new file mode 100644
index 000000000..c22654c59
--- /dev/null
+++ b/experimental/CollectiveX/run_nccl.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — NCCL primitive benchmark wrapper.
+
+Runs stock `nccl-tests` binaries (built in-container at job time — the login
+nodes have no nvcc), parses the text table (NOT JSON — we do not assume the
+build emits JSON), and writes a flat, provenance-tagged JSON result the plot
+script and the eventual schema-freeze can consume.
+
+Standard library only, so it runs in any minimal container.
+
+Run (inside the container, after building nccl-tests):
+    python run_nccl.py --op all_reduce \\
+        --nccl-tests-dir /tmp/nccl-tests/build \\
+        --world-size 8 --min-bytes 8 --max-bytes 8G \\
+        --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\
+        --env-json results/env.json --out results/b200_all_reduce.json
+
+Verify the parser offline (no GPU needed):
+    python run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \\
+        --world-size 8 --runner b200-dgxc --topology-class b200-nvlink-island \\
+        --out /tmp/parsed.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import subprocess
+import sys
+
+# Written to every result document as "schema_version".
+SCHEMA_VERSION = 1
+# Stored in every result document and hashed into its comparison_key.
+MEASUREMENT_CONTRACT = "nccl-tests-v1"
+
+# op -> nccl-tests binary name
+# ("alltoall" and "all_to_all" are two accepted spellings that select the same binary.)
+OP_BINARY = {
+    "all_reduce": "all_reduce_perf",
+    "all_gather": "all_gather_perf",
+    "reduce_scatter": "reduce_scatter_perf",
+    "alltoall": "alltoall_perf",
+    "all_to_all": "alltoall_perf",
+    "broadcast": "broadcast_perf",
+    "sendrecv": "sendrecv_perf",
+}
+
+
+def _f(tok: str):
+    """Convert one table cell to a float, or None when it holds no number.
+
+    nccl-tests prints 'N/A' in the #wrong column when run with -c 0; that placeholder (and
+    'n/a' / '-') as well as any other text float() rejects map to None.
+    """
+    placeholders = ("N/A", "n/a", "-")
+    if tok not in placeholders:
+        try:
+            return float(tok)
+        except ValueError:
+            pass
+    return None
+
+
+def parse_nccl_table(text: str) -> tuple[list[dict], dict]:
+    """Turn nccl-tests stdout into (per-size rows, run summary).
+
+    The number of leading columns differs between ops, but each data line starts with the byte
+    size, has the element type as its third token, and ends with eight numeric cells:
+    out-of-place (time, algbw, busbw, #wrong) followed by in-place (time, algbw, busbw, #wrong).
+    Parsing therefore only relies on the first token, the third token and the last eight.
+
+    '#' comment lines feed the summary: "Avg bus bandwidth" sets `avg_busbw_gbps`, and
+    "Out of bounds values" sets `out_of_bounds` plus `check_passed` (text ends with "OK").
+    """
+    def _placement(cells):
+        # Four cells of one placement, in table order.
+        time_us, algbw, busbw, wrong = (_f(cell) for cell in cells)
+        return {"time_us": time_us, "algbw_gbps": algbw, "busbw_gbps": busbw, "wrong": wrong}
+
+    parsed: list[dict] = []
+    summary: dict = {"avg_busbw_gbps": None, "out_of_bounds": None, "check_passed": None}
+    for raw in text.splitlines():
+        stripped = raw.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("#"):
+            value = stripped.split(":")[-1].strip()
+            if "Avg bus bandwidth" in stripped:
+                summary["avg_busbw_gbps"] = _f(value)
+            elif "Out of bounds values" in stripped:
+                summary["out_of_bounds"] = value
+                summary["check_passed"] = value.endswith("OK")
+            continue
+        cells = stripped.split()
+        # A data line needs size+count+type up front and the 8 metric cells at the end
+        # (>= 11 tokens), with an all-digit byte size first.
+        if len(cells) < 11 or not cells[0].isdigit():
+            continue
+        oop = _placement(cells[-8:-4])
+        inp = _placement(cells[-4:])
+        wrongs = (oop["wrong"], inp["wrong"])
+        if wrongs[0] is None and wrongs[1] is None:
+            correct = None  # no correctness information in either placement
+        else:
+            correct = all((w or 0) == 0 for w in wrongs)
+        known_busbw = [b for b in (oop["busbw_gbps"], inp["busbw_gbps"]) if b is not None]
+        parsed.append(
+            {
+                "size_bytes": int(cells[0]),
+                "dtype": cells[2],
+                "out_of_place": oop,
+                "in_place": inp,
+                # convenience: best (max) busbw across the two placements
+                "busbw_gbps": max(known_busbw, default=None),
+                "correct": correct,
+            }
+        )
+    return parsed, summary
+
+
+def comparison_key(meta: dict) -> str:
+    """Machine key gating which rows may share a curve (see plan §Comparability).
+    Topology-class is intentionally part of the key, so B200(IB) and
+    GB200(MNNVL) are labelled distinct rather than silently overlaid.
+
+    Args:
+        meta: mapping with `op`, `dtype`, `world_size`, `nodes`, `topology_class`,
+            `comparison_class` and `measurement_contract`.
+
+    Returns:
+        The first 16 hex characters of the SHA-256 of the "|"-joined field values.
+    """
+    fields = (
+        "op",
+        "dtype",
+        "world_size",
+        "nodes",
+        "topology_class",
+        "comparison_class",
+        "measurement_contract",
+    )
+    # str() every part: values that are already strings hash exactly as before, while a None
+    # dtype (no rows were parsed) no longer makes "|".join raise TypeError.
+    parts = [str(meta[name]) for name in fields]
+    digest = hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+    return digest
+
+
+def build_command(args, binary_path: str) -> list[str]:
+    """Assemble the argv for one nccl-tests run.
+
+    Layout: optional launcher words from `args.launch_prefix`, the binary path, the fixed
+    -b/-e/-f/-g/-c/-w/-n knobs (values stringified), then any words from `args.extra_args`.
+    Prefix and extra args are split on whitespace only.
+    """
+    launcher = args.launch_prefix.split() if args.launch_prefix else []
+    knobs = (
+        ("-b", args.min_bytes),
+        ("-e", args.max_bytes),
+        ("-f", args.factor),
+        ("-g", args.gpus_per_proc),
+        ("-c", args.check),
+        ("-w", args.warmup),
+        ("-n", args.iters),
+    )
+    flags = [word for flag, value in knobs for word in (flag, str(value))]
+    trailing = args.extra_args.split() if args.extra_args else []
+    return [*launcher, binary_path, *flags, *trailing]
+
+
+def main() -> int:
+    """Run (or replay) one nccl-tests binary and write a provenance-tagged JSON result.
+
+    With --parse-only the given captured-stdout file is parsed and nothing is launched.
+    Otherwise the binary for --op is looked up under --nccl-tests-dir and executed with the
+    argv from build_command(). The parsed rows, the run summary, the provenance flags and the
+    optional --env-json payload are written to --out.
+
+    Returns:
+        0 when the written document has status "valid", 1 when it is "invalid", and 2 when the
+        binary is missing or the command could not be launched (no document is written then).
+    """
+    ap = argparse.ArgumentParser(description="CollectiveX NCCL primitive runner")
+    ap.add_argument("--op", required=True, choices=sorted(OP_BINARY))
+    ap.add_argument("--nccl-tests-dir", help="dir containing <op>_perf binaries (build/)")
+    ap.add_argument("--parse-only", help="parse this captured stdout file instead of running")
+    # nccl-tests knobs
+    ap.add_argument("--min-bytes", default="8")
+    ap.add_argument("--max-bytes", default="8G")
+    ap.add_argument("--factor", type=int, default=2, help="size step factor")
+    ap.add_argument("--gpus-per-proc", type=int, default=8,
+                    help="-g: GPUs per process (single-node multi-GPU). Use 1 under MPI.")
+    ap.add_argument("--check", type=int, default=1, help="-c: 1 enables correctness check")
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=20)
+    ap.add_argument("--extra-args", default="", help="extra args appended to the binary")
+    ap.add_argument("--launch-prefix", default="",
+                    help="e.g. 'mpirun -np 16 --hostfile hf' for multi-node; empty for single-node -g mode")
+    # provenance
+    ap.add_argument("--runner", required=True, help="runner label, e.g. b200-dgxc")
+    ap.add_argument("--world-size", type=int, required=True, help="total ranks/GPUs in the run")
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True,
+                    help="e.g. b200-nvlink-island, b200-nvlink-island+cx7-ib, gb200-nvl72-mnnvl")
+    ap.add_argument("--transport", default="", help="observed transport label: nvlink | ib | mnnvl")
+    ap.add_argument("--comparison-class", default="standardized",
+                    choices=["standardized", "backend-optimized", "framework-integrated"])
+    ap.add_argument("--env-json", help="path to env_capture.py output to embed")
+    ap.add_argument("--timestamp", help="ISO timestamp (default now)")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    binary = OP_BINARY[args.op]
+    command = None
+    if args.parse_only:
+        with open(args.parse_only) as fh:
+            stdout = fh.read()
+        ran_ok = True
+    else:
+        if not args.nccl_tests_dir:
+            ap.error("--nccl-tests-dir is required unless --parse-only is given")
+        binary_path = os.path.join(args.nccl_tests_dir, binary)
+        if not os.path.exists(binary_path):
+            print(f"ERROR: binary not found: {binary_path}", file=sys.stderr)
+            return 2
+        command = build_command(args, binary_path)
+        print("running:", " ".join(command), file=sys.stderr)
+        try:
+            proc = subprocess.run(command, capture_output=True, text=True, check=False)
+        except OSError as exc:
+            # e.g. the --launch-prefix executable is not on PATH, or the binary is not
+            # executable: report it like the missing-binary case instead of a traceback.
+            print(f"ERROR: could not launch {command[0]}: {exc}", file=sys.stderr)
+            return 2
+        stdout = proc.stdout
+        ran_ok = proc.returncode == 0
+        if not ran_ok:
+            print(stdout, file=sys.stderr)
+            print(proc.stderr, file=sys.stderr)
+            print(f"ERROR: {binary} exited {proc.returncode}", file=sys.stderr)
+
+    rows, summary = parse_nccl_table(stdout)
+    dtype = rows[0]["dtype"] if rows else None
+
+    meta = {
+        "op": args.op,
+        "dtype": dtype,
+        "world_size": args.world_size,
+        "nodes": args.nodes,
+        "topology_class": args.topology_class,
+        "comparison_class": args.comparison_class,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+    }
+
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    # All-zero busbw means the benchmark didn't actually communicate — e.g. an
+    # MPI=0 binary launched under srun --mpi=pmix runs as N standalone world=1
+    # procs (busbw formula -> 0). Don't let that pass the gate as "valid".
+    peak_busbw = max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)
+
+    doc = {
+        "schema_version": SCHEMA_VERSION,
+        "family": "nccl",
+        "generated_by": "run_nccl.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner,
+        "binary": binary,
+        "command": " ".join(command) if command else f"<parse-only {args.parse_only}>",
+        "transport": args.transport,
+        "status": ("valid" if (rows and ran_ok and peak_busbw > 0.0
+                   and (summary.get("check_passed") is True
+                   or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"),
+        # dtype is None when no rows were parsed (typically a failed run). Hand the key
+        # builder its text form so the status="invalid" document still gets written; a real
+        # dtype string is unaffected by str().
+        "comparison_key": comparison_key({**meta, "dtype": str(dtype)}),
+        **meta,
+        "summary": summary,
+        "num_rows": len(rows),
+        "rows": rows,
+        "environment": env,
+    }
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+
+    print(
+        f"{args.op}: parsed {len(rows)} sizes -> {args.out} "
+        f"(status={doc['status']}, avg_busbw={summary.get('avg_busbw_gbps')} GB/s, "
+        f"key={doc['comparison_key']})"
+    )
+    return 0 if doc["status"] == "valid" else 1
+
+
+# Script entry point: main()'s return value (0 valid, 1 invalid, 2 binary missing) is the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/runtime/_xnode_net.sh b/experimental/CollectiveX/runtime/_xnode_net.sh
new file mode 100644
index 000000000..ffbd2172a
--- /dev/null
+++ b/experimental/CollectiveX/runtime/_xnode_net.sh
@@ -0,0 +1,42 @@
+# shellcheck shell=bash
+# CollectiveX — cross-node PG bootstrap network fix + diagnostic (sourced per-rank/per-node).
+#
+# torch.distributed's gloo/NCCL TCP bootstrap advertises each rank's address from its hostname. On
+# clusters whose /etc/hosts aliases the hostname to loopback 127.0.1.1 (MI355X) the per-rank gloo
+# connectFullMesh then tries to connect to 127.0.1.1 and fails ("Gloo connectFullMesh ... Connection
+# refused, remote=[127.0.1.1]"). Pinning GLOO_SOCKET_IFNAME / NCCL_SOCKET_IFNAME to the NIC that holds
+# the cluster's routable address (the 10.x management/ethernet subnet) makes the mesh advertise the
+# reachable interface. RDMA EP transports (UCCL/MoRI/IBGDA) use their own RDMA NICs; this only fixes
+# the TCP control-plane rendezvous.
+#
+# NOTE this does NOT change the TCPStore *connect target* (that is MASTER_ADDR, fixed by the launcher):
+# if the rank-0 MASTER_ADDR is unreachable from inside a peer's container network namespace, no iface
+# pin helps — that is a cluster topology / container-net property, surfaced by the diagnostic below.
+#
+# The diagnostic ALWAYS prints what the container can see (hostname + every IPv4), so a cross-node GHA
+# log is self-documenting even when auto-detection or reachability fails. Robust to a missing iproute2
+# (`ip`) in minimal CUDA images: the diagnostic falls back to `hostname -I`. Interface auto-detection
+# itself needs `ip`; without it no interface is pinned and a log line says so.
+
+# ---- diagnostic: what does this container's network namespace actually see? ----
+# Emits one stderr line with the hostname, $RANK (or "?") and the address list.
+_cx_host="$(hostname 2>/dev/null || echo '?')"
+if command -v ip >/dev/null 2>&1; then
+  # Fields 2 and 4 of each `ip -o -4 addr show` line, joined as "field2=field4"
+  # (presumably interface name and CIDR address — the pin below treats field 2 as the iface).
+  _cx_addrs="$(ip -o -4 addr show 2>/dev/null | awk '{print $2"="$4}' | tr '\n' ' ')"
+else
+  _cx_addrs="(no iproute2) hostname-I=[$(hostname -I 2>/dev/null)]"
+fi
+printf '[collectivex] xnode-net host=%s rank=%s addrs: %s\n' "$_cx_host" "${RANK:-?}" "$_cx_addrs" >&2
+
+# ---- pin GLOO/NCCL bootstrap iface to the routable 10.x NIC (operator override respected) ----
+# Only GLOO_SOCKET_IFNAME is consulted for the override: when it is already set, nothing below runs.
+# NOTE(review): when GLOO_SOCKET_IFNAME is unset but NCCL_SOCKET_IFNAME was set by the operator, the
+# export below overwrites NCCL_SOCKET_IFNAME — confirm that is intended.
+if [ -z "${GLOO_SOCKET_IFNAME:-}" ]; then
+  _cx_if=""
+  if command -v ip >/dev/null 2>&1; then
+    # First line whose field 4 starts with "10." wins; its field 2 is taken as the iface name.
+    _cx_if="$(ip -o -4 addr show 2>/dev/null | awk '$4 ~ /^10\./ {print $2; exit}')"
+  fi
+  if [ -n "$_cx_if" ]; then
+    export GLOO_SOCKET_IFNAME="$_cx_if" NCCL_SOCKET_IFNAME="$_cx_if"
+    printf '[collectivex] cross-node PG iface: GLOO/NCCL_SOCKET_IFNAME=%s\n' "$_cx_if" >&2
+  else
+    # Nothing is exported in this branch; the log records whether `ip` was even available.
+    printf '[collectivex] xnode-net: no routable 10.x iface auto-detected (ip present=%s); relying on MASTER_ADDR\n' \
+      "$(command -v ip >/dev/null 2>&1 && echo yes || echo no)" >&2
+  fi
+fi
diff --git a/experimental/CollectiveX/runtime/common.sh b/experimental/CollectiveX/runtime/common.sh
new file mode 100644
index 000000000..992485a77
--- /dev/null
+++ b/experimental/CollectiveX/runtime/common.sh
@@ -0,0 +1,209 @@
+# shellcheck shell=bash
+# CollectiveX — shared launcher helpers (sourced, not executed).
+#
+# Cluster-generic scaffolding only (Slurm/container/build/staging); no
+# model-serving. Logging goes to stderr so functions can `echo` a single
+# result on stdout.
+
+# cx_log <msg...>: write one "[collectivex] ..." line to stderr.
+cx_log() {
+  printf '[collectivex] %s\n' "$*" >&2
+}
+
+# cx_die <msg...>: write a "[collectivex] FATAL: ..." line to stderr, then exit the current
+# shell (or subshell) with status 1.
+cx_die() {
+  printf '[collectivex] FATAL: %s\n' "$*" >&2
+  exit 1
+}
+
+# Allocate via salloc (--no-shell is appended) and echo the GRANTED Slurm job id, parsed from
+# salloc's OWN output. Use INSTEAD of `salloc ...; JOB_ID=$(squeue --name=<name> -h -o %A | head -1)`:
+# that lookup is not unique per allocation, so under GHA-matrix concurrency (several cells calling
+# salloc with the same --job-name on one cluster) it returns a SIBLING cell's job id. Observed on
+# gb300: salloc granted 11354 but the name lookup returned a still-pending 11356 -> srun "Expired or
+# invalid job 11356" -> the cell failed even though its own allocation was fine. Parsing salloc's own
+# "Granted job allocation N" is race-free. salloc progress still streams live to the job log via tee.
+cx_salloc_jobid() {
+  # Usage: cx_salloc_jobid <salloc args...>
+  # stdout: the granted job id; nothing is echoed when no "Granted job allocation" line appeared.
+  # Temp file capturing salloc's combined stdout+stderr so the id can be parsed afterwards.
+  local _t; _t="$(mktemp)"
+  # tee mirrors the output to stderr while saving it; `|| true` keeps a failed pipeline from
+  # aborting a caller that runs with errexit.
+  salloc "$@" --no-shell 2>&1 | tee "$_t" >&2 || true
+  # The first matching line wins.
+  sed -n 's/.*Granted job allocation \([0-9][0-9]*\).*/\1/p' "$_t" | head -n1
+  rm -f "$_t"
+}
+
+# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
+# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
+# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built
+# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and
+# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected
+# multi-arch index digest is recorded for provenance/verification:
+# (No helper in this file reads CX_IMAGE_DIGEST; presumably launchers or provenance capture
+# consume it — verify before relying on it as an enforced pin.)
+CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
+# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
+# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
+# v0.5.11-cu130 imports cleanly and is pre-staged on GB200.)
+# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep.
+# (The arch-specific deepseek-v4-{blackwell,grace-blackwell} images DO bundle
+# DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.)
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
+
+# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and
+# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the
+# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest-
+# pinned yet — pin once validated on the runner. See CONTAINERS.md.
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+
+# NIXL stack: the sglang multiarch image has neither the NIXL agent nor the device-EP build deps,
+# and its Abseil (20220623) is what blocked the NIXL EP meson build (docs/gated.md). The dynamo
+# tensorrtllm-runtime image (CUDA-13, 2026) ships NIXL + a modern Abseil/UCX — the container-switch
+# the gated NIXL item calls for. Selected automatically for CX_BENCH=nixl on NVIDIA SKUs (override
+# with CX_IMAGE). Listed in .github/configs/nvidia-master.yaml.
+# Unlike the two image variables above, this one keeps a value already present in the environment.
+CX_IMAGE_NIXL="${CX_IMAGE_NIXL:-nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13}"
+
+# cx_default_image <runner_prefix>  ->  echoes the default container image for that SKU family.
+cx_default_image() {
+  case "$1" in
+    mi355x*|mi350x*|mi325x*|mi300x*)
+      # AMD always gets the MoRI image, whatever CX_BENCH says (no NIXL build there).
+      echo "$CX_IMAGE_AMD_MORI"
+      ;;
+    b200*|gb200*|b300*|gb300*|h100*|h200*)
+      # NVIDIA: CX_BENCH=nixl (already in the inherited env here) switches to the NIXL/dynamo
+      # container; every other benchmark uses the multi-arch image.
+      if [ "${CX_BENCH:-}" = "nixl" ]; then
+        echo "$CX_IMAGE_NIXL"
+      else
+        echo "$CX_IMAGE_MULTIARCH"
+      fi
+      ;;
+    *)
+      cx_die "no default image for runner prefix: $1"
+      ;;
+  esac
+}
+
+# cx_ensure_squash <squash_dir> <image>  ->  echoes the squash file path.
+# Imports via enroot only if a valid squash is not already present (flock-guarded,
+# mirroring runners/launch_b200-dgxc.sh).
+cx_ensure_squash() {
+  # Args:   $1 = directory holding the squash files, $2 = image reference (repo:tag).
+  # Stdout: path of the squash file — echoed only when a valid squash exists.
+  # Return: 0 on success; 1 when the lock timed out, the import failed, or the lock file
+  #         could not be opened.
+  local squash_dir="$1" image="$2"
+  mkdir -p "$squash_dir" 2>/dev/null || true
+  local key sq locks
+  # File-system-safe cache key: '/', ':', '@' and '#' become '_'. '|' is used as the sed
+  # delimiter so the '#' inside the bracket set cannot be taken for a delimiter.
+  key="$(printf '%s' "$image" | sed 's|[/:@#]|_|g')"
+  sq="$squash_dir/${key}.sqsh"
+  locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true
+  # The subshell holds an flock on fd 9 for the whole check-or-import sequence. cx_die inside
+  # it only terminates the subshell, so its failure status is turned into a function failure
+  # below; otherwise a caller using $(cx_ensure_squash ...) would still receive a path.
+  (
+    flock -w 900 9 || cx_die "lock timeout for $sq"
+    if unsquashfs -l "$sq" >/dev/null 2>&1; then
+      cx_log "squash present: $sq"
+    else
+      cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)"
+      rm -f "$sq"
+      # </dev/null: never block on enroot's interactive password prompt (a missing
+      # anonymous token must fail fast, not hang the CI job).
+      enroot import -o "$sq" "docker://$image" </dev/null >&2 \
+        || cx_die "enroot import failed for $image (anonymous auth needs a TAG ref, not a bare digest; or pre-stage the squash)"
+      unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq"
+    fi
+  ) 9>"$locks/${key}.lock" || return 1
+  echo "$sq"
+}
+
+# cx_stage_repo <repo_root> <stage_dir>  ->  echoes the mount-source root.
+# Some clusters (e.g. GB200/watchtower) do not cross-mount the runner workspace
+# to compute nodes. If CX_STAGE_DIR is set, rsync the CollectiveX tree onto that
+# compute-visible shared FS and mount from there. No-op (echo repo_root) when
+# stage_dir is empty or equals repo_root.
+cx_stage_repo() {
+  # Args:   $1 = repo root of the checkout, $2 = optional compute-visible staging base dir.
+  # Stdout: the directory to mount from — $1 itself, or the (per-runner) stage dir.
+  local repo_root="$1" stage_dir="${2:-}"
+  if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
+    echo "$repo_root"; return 0
+  fi
+  # Concurrency isolation. Under GHA the per-config concurrency fan-out runs many
+  # same-SKU dispatches at once, all staging into the SAME shared base dir; a
+  # shared dir + `rsync --delete` lets one job unlink/replace a file a peer is
+  # mid-read of -> "error reading input file: Stale file handle" on the next
+  # `srun ... run_in_container.sh`. Give each EXECUTING job its own subdir keyed on
+  # the runner name (a self-hosted runner runs one job at a time, so concurrent
+  # jobs never share a dir); sequential reuse on one runner is safe (the jobs do
+  # not overlap, and --delete refreshes the tree). Outside GHA (no RUNNER_NAME /
+  # GITHUB_RUN_ID) keep the single shared dir — SSH use is single-tenant.
+  # RUNNER_NAME is preferred; GITHUB_RUN_ID is used only when RUNNER_NAME is unset or empty.
+  local tag="${RUNNER_NAME:-${GITHUB_RUN_ID:-}}"
+  if [ -n "$tag" ]; then
+    # Every character outside A-Za-z0-9._- becomes '_' so the tag is a safe directory name.
+    stage_dir="$stage_dir/job_$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
+  fi
+  mkdir -p "$stage_dir/experimental" || cx_die "cannot create stage dir $stage_dir"
+  cx_log "staging experimental/CollectiveX -> $stage_dir (compute-visible)"
+  # The source has no trailing slash, so the CollectiveX directory itself lands under
+  # $stage_dir/experimental/. rsync's own output goes to stderr to keep stdout for the result.
+  rsync -a --delete \
+    --exclude='.nccl-tests/' --exclude='__pycache__/' --exclude='results/plots/' \
+    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" >&2 \
+    || cx_die "rsync to stage dir failed"
+  echo "$stage_dir"
+}
+
+# cx_collect_results <mount_src> <repo_root>
+# When the run used a staged (compute-visible) mount, copy result JSONs back to
+# the original checkout's results/ so the workflow's upload-artifact (which reads
+# the checkout, not the stage dir) finds them. No-op when no staging was used.
+cx_collect_results() {
+  # Args:   $1 = mount source used for the run, $2 = repo root of the original checkout.
+  # Effect: copies the top-level *.json files from the staged results/ dir into the checkout's
+  #         results/ dir. Best-effort: a failed or empty copy is logged, never fatal.
+  local mount_src="$1" repo_root="$2" src dst
+  # No staging was used: the results are already in the checkout.
+  [ "$mount_src" = "$repo_root" ] && return 0
+  src="$mount_src/experimental/CollectiveX/results"
+  dst="$repo_root/experimental/CollectiveX/results"
+  mkdir -p "$dst"
+  # Report what actually happened: an unmatched glob or a cp error used to be logged as a
+  # successful copy, which hid missing artifacts until the upload step.
+  if cp "$src/"*.json "$dst/" 2>/dev/null; then
+    cx_log "copied results from stage dir -> $dst (for artifact upload)"
+  else
+    cx_log "WARNING: no result JSONs copied from $src -> $dst (none produced, or cp failed)"
+  fi
+  return 0
+}
+
+# cx_build_nccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
+# Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built.
+# CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang
+# cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed.
+cx_build_nccl_tests() {
+  # Args:   $1 = parent dir for the checkout/build cache, $2 = MPI flavor (0 default | 1).
+  # Stdout: the build/ directory containing the <op>_perf binaries.
+  # Dies (cx_die) when the clone or the build fails, or when no binary was produced.
+  local parent="$1" mpi="${2:-0}" dir bin sfx=""
+  # Cache MPI=0 and MPI=1 builds in SEPARATE dirs. A single-node (MPI=0) binary
+  # reused under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw=0);
+  # keying the cache by flavor prevents that cross-contamination.
+  [ "$mpi" = "1" ] && sfx="-mpi"
+  dir="$parent/nccl-tests$sfx"
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "nccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning nccl-tests -> $dir"
+    git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \
+      || cx_die "git clone nccl-tests failed"
+  fi
+  # MPI=1 needs MPI_HOME. On Debian/Ubuntu OpenMPI the headers live under
+  # /usr/lib/<arch>/openmpi/include (NOT /usr/include), so MPI_HOME=/usr fails;
+  # point it at that openmpi dir (libmpi resolves via the default linker path).
+  # Works for both x86_64 (B200) and aarch64 (GB200). Override with CX_MPI_HOME.
+  local mpi_home="${CX_MPI_HOME:-}"
+  if [ "$mpi" = "1" ] && [ -z "$mpi_home" ]; then
+    # `|| true`: with no openmpi dir the ls|head pipeline fails; under a caller's
+    # `set -eo pipefail` that would abort right here without a message. Leave mpi_home
+    # empty instead so the build below fails with the descriptive cx_die.
+    mpi_home="$(ls -d /usr/lib/*/openmpi 2>/dev/null | head -n1)" || true
+  fi
+  cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr}${mpi_home:+, MPI_HOME=$mpi_home})"
+  # MPI_HOME is passed only when one was configured or detected.
+  make -C "$dir" -j MPI="$mpi" \
+       CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \
+       NCCL_HOME="${CX_NCCL_HOME:-/usr}" \
+       ${mpi_home:+MPI_HOME="$mpi_home"} >&2 \
+    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME/CX_MPI_HOME; need nccl.h + libnccl)"
+  [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
+
+# cx_build_rccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
+# AMD/ROCm counterpart of cx_build_nccl_tests: ROCm/rccl-tests is a fork of
+# nccl-tests producing the SAME binary names (<op>_perf) and output format, so
+# run_nccl.py parses it unchanged. `make` defaults to ROCm at /opt/rocm
+# (amdclang++ + librccl); validated building in-container on MI355X. Override
+# CX_ROCM_HOME / CX_RCCL_HOME / CX_MPI_HOME if the toolchain lives elsewhere.
+cx_build_rccl_tests() {
+  # Args:   $1 = parent dir for the checkout/build cache, $2 = MPI flavor (0 default | 1).
+  # Stdout: the build/ directory containing the <op>_perf binaries.
+  # Dies (cx_die) when the clone or the build fails, or when no binary was produced.
+  local parent="$1" mpi="${2:-0}" dir bin sfx=""
+  # Keep MPI=0 and MPI=1 builds in separate dirs, exactly as cx_build_nccl_tests does: with a
+  # single shared dir, whichever flavor was built first would be returned for both, and an
+  # MPI=0 binary reused for a multi-process launch is the hazard described there.
+  # The MPI=0 location ($parent/rccl-tests) is unchanged.
+  [ "$mpi" = "1" ] && sfx="-mpi"
+  dir="$parent/rccl-tests$sfx"
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "rccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning rccl-tests -> $dir"
+    git clone --depth 1 https://github.com/ROCm/rccl-tests.git "$dir" >&2 \
+      || cx_die "git clone rccl-tests failed"
+  fi
+  cx_log "building rccl-tests (MPI=$mpi, ROCm ${CX_ROCM_HOME:-/opt/rocm})"
+  # Each *_HOME variable is forwarded to make only when the matching CX_* override is set.
+  make -C "$dir" -j MPI="$mpi" \
+       ${CX_ROCM_HOME:+HIP_HOME="$CX_ROCM_HOME"} \
+       ${CX_RCCL_HOME:+RCCL_HOME="$CX_RCCL_HOME"} \
+       ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
+    || cx_die "rccl-tests build failed (need ROCm + librccl; try CX_ROCM_HOME)"
+  [ -x "$bin" ] || cx_die "rccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
diff --git a/experimental/CollectiveX/runtime/run_in_container.sh b/experimental/CollectiveX/runtime/run_in_container.sh
new file mode 100644
index 000000000..10777bdcf
--- /dev/null
+++ b/experimental/CollectiveX/runtime/run_in_container.sh
@@ -0,0 +1,722 @@
+#!/usr/bin/env bash
+# CollectiveX — generic in-container benchmark dispatcher (single-node).
+#
+# Runs INSIDE the container under `srun`, invoked by every per-SKU adapter
+# (launch_<sku>.sh). The SKU adapter handles allocation/container/transport-env;
+# this script decides WHICH benchmark to run from CX_BENCH, so any benchmark can
+# be driven through any SKU's launch script. Writes provenance-tagged JSON to
+# results/.
+#
+# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
+# Selector:        CX_BENCH = nccl | deepep | mori | all    (default nccl)
+#                  (mori = AMD ROCm EP; nccl/deepep = NVIDIA. `all` = nccl+deepep.)
+# NCCL knobs:      CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME
+# EP knobs (DeepEP/MoRI), all -> tests/run_ep.py:
+#   CX_PHASE = decode | prefill | both (default decode)   <- picks the token sweep
+#   CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point)
+#   CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_MODE(normal|ll)
+#   CX_NUM_SMS (DeepEP comm SMs) CX_SEED CX_ITERS
+set -euo pipefail
+
+cd /ix/experimental/CollectiveX
+# shellcheck source=../runtime/common.sh
+source runtime/common.sh
+mkdir -p results
+
+# Fail fast when the adapter did not export the required identity/provenance variables.
+: "${CX_RUNNER:?CX_RUNNER not set}"
+: "${CX_NGPUS:?CX_NGPUS not set}"
+: "${CX_TS:?CX_TS not set}"
+: "${CX_TOPO:?CX_TOPO not set}"
+CX_BENCH="${CX_BENCH:-nccl}"
+CX_TRANSPORT="${CX_TRANSPORT:-}"
+ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json"
+
+# CX_TIMING="iters:trials:warmup" unpacks into the individual knobs (one workflow input feeds three,
+# since GitHub caps workflow_dispatch at 25 inputs). Blank fields keep their defaults. Used for the
+# MoRI/MI355X large-T probe (e.g. "8:1:4" — minimal sustained load to dodge the wedge).
+# Fields are positional: "8:1" sets iters and trials and leaves warmup at its default. A value with
+# no ':' at all is not in the documented format and is ignored (every knob keeps its default).
+if [ -n "${CX_TIMING:-}" ]; then
+  _ti=""; _tt=""; _tw=""
+  case "$CX_TIMING" in
+    *:*)
+      _ti="${CX_TIMING%%:*}"; _rest="${CX_TIMING#*:}"
+      case "$_rest" in
+        *:*) _tt="${_rest%%:*}"; _tw="${_rest#*:}" ;;
+        *)   _tt="$_rest" ;;   # only two fields given: the remainder is the trials value
+      esac
+      ;;
+  esac
+  if [ -n "$_ti" ]; then export CX_ITERS="$_ti"; fi
+  if [ -n "$_tt" ]; then export CX_TRIALS="$_tt"; fi
+  if [ -n "$_tw" ]; then export CX_WARMUP="$_tw"; fi
+  cx_log "CX_TIMING=$CX_TIMING -> iters=${CX_ITERS:-200} trials=${CX_TRIALS:-3} warmup=${CX_WARMUP:-32}"
+fi
+
+cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
+python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS"
+
+run_nccl_suite() {
+  # Build the vendor collective test suite, then run run_nccl.py once per op in CX_OPS.
+  # ROCm hosts (/opt/rocm present or hipcc on PATH) use rccl-tests; everything else uses nccl-tests.
+  # Both builds are requested with MPI=0. Returns 1 if the build fails or any op
+  # fails, 0 otherwise.
+  local impl=nccl builder=cx_build_nccl_tests tests_dir op any_failed=0
+  local op_list="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+  if [ -d /opt/rocm ] || command -v hipcc >/dev/null 2>&1; then
+    impl=rccl
+    builder=cx_build_rccl_tests
+  fi
+  tests_dir="$("$builder" "$PWD/.nccl-tests" 0)" || return 1
+  cx_log "collective impl=$impl build=$tests_dir"
+
+  # $op_list is deliberately unquoted: it is a space-separated list of op names.
+  for op in $op_list; do
+    python3 run_nccl.py --op "$op" --nccl-tests-dir "$tests_dir" \
+        --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \
+        --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+        --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \
+        --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1 \
+      || { cx_log "WARN: $impl $op failed or invalid"; any_failed=1; }
+  done
+  return "$any_failed"
+}
+
+# Resolve the source-tokens-per-rank sweep: explicit CX_TOKENS_LADDER wins; else
+# the legacy single-point CX_TOKENS_PER_RANK becomes a one-point ladder; else
+# blank => tests/run_ep.py picks the phase default (decode small / prefill large).
+cx_ep_ladder() {
+  if [ -n "${CX_TOKENS_LADDER:-}" ]; then printf '%s' "$CX_TOKENS_LADDER"
+  elif [ -n "${CX_TOKENS_PER_RANK:-}" ]; then printf '%s' "$CX_TOKENS_PER_RANK"
+  else printf ''; fi
+}
+
+# Canonical workload staging (goal P1 "official" cohort). make_workloads.py is DETERMINISTIC, so
+# every SKU/backend generates byte-identical serialized traces in-container => identical workload_id
+# + checksum => proven cross-hardware workload identity with NO shared filesystem. When CX_CANONICAL=1
+# (and CX_WORKLOAD_DIR not already provided) we generate the routing's traces for the run's ladder
+# into a NON-results dir (.cx_workloads/ — so the *.manifest.json never pollute the results glob) and
+# point run_ep at it. A canonical-serialized run with full GHA provenance is publication 'official'.
+cx_stage_canonical() {
+  [ "${CX_CANONICAL:-0}" = "1" ] || return 0
+  [ -n "${CX_WORKLOAD_DIR:-}" ] && return 0
+  local dir="$PWD/.cx_workloads"
+  local ladder; ladder="$(cx_ep_ladder)"
+  # cover both phase ladders when none is given, so either phase finds its files.
+  [ -z "$ladder" ] && ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+  cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')"
+  python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \
+    --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \
+    --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder" \
+    || { cx_log "WARN: canonical workload staging failed — falling back to seeded-runtime"; return 0; }
+  export CX_WORKLOAD_DIR="$dir"
+  cx_log "canonical workloads staged at $dir"
+}
+
+# run_ep_suite <backend: deepep|deepep-hybrid|mori|uccl|nccl-ep>  (defined below, after emit_failed_case)
+# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and
+# combine are timed separately inside it. One JSON per (backend, phase).
+# Preserve a FAILED case as a classified record (goal immediate P2 "preserve failed cases in
+# aggregation") so a wedge/timeout/crash becomes a bounded artifact in results/ (uploaded + surfaced
+# by the plot/validator) instead of vanishing. Uses tests/failure_taxonomy.py for the mode.
+emit_failed_case() {  # backend phase rc
+  # Write results/failed_<runner>_<backend>_<phase>_<ts>.json describing a failed EP run.
+  #   $1 backend, $2 phase, $3 exit code of the failed run.
+  # The failure block comes from tests/failure_taxonomy.py (ft.record). Best-effort: any error in
+  # the embedded Python is swallowed (`|| true`) so record-keeping never changes the suite outcome.
+  python3 - "$1" "$2" "$3" "$CX_RUNNER" "$CX_TOPO" \
+    "results/failed_${CX_RUNNER}_${1}_${2}_${CX_TS}.json" <<'PY' || true
+import sys, json, os
+sys.path.insert(0, "tests")
+import failure_taxonomy as ft
+backend, phase, rc, runner, topo, out = sys.argv[1:7]
+rec = {"family": "moe", "record_type": "failed-case", "schema_version": 3,
+       "generated_by": "run_in_container.sh", "runner": runner, "backend": backend,
+       "phase": phase, "topology_class": topo, "status": "failed",
+       "publication_status": "failed", "rows": [],
+       "failure": ft.record(rc=int(rc), case={"backend": backend, "phase": phase,
+                            "dispatch_dtype": os.environ.get("CX_DISPATCH_DTYPE", "bf16"),
+                            "mode": os.environ.get("CX_MODE", "normal"),
+                            "contract": os.environ.get("CX_MEASUREMENT_CONTRACT", "layout-and-dispatch-v1"),
+                            "routing": os.environ.get("CX_ROUTING", "uniform")})}
+# Close the file deterministically so the record is fully flushed before the success message.
+with open(out, "w") as fh:
+    json.dump(rec, fh, indent=2)
+print(f"preserved failed-case record ({rec['failure']['failure_mode']}) -> {out}")
+PY
+}
+
+run_ep_suite() {
+  # Run tests/run_ep.py for backend $1, once per phase in CX_PHASE ("both" expands to
+  # "decode prefill"). Each run is wrapped in `timeout -k 30 ${CX_RUN_TIMEOUT:-900}`.
+  # Returns 0 when every phase succeeded and 1 when any phase failed or timed out; each failing
+  # phase also gets a failed-case record via emit_failed_case.
+  local backend="$1" phase phases ladder rc=0 rc_run
+  ladder="$(cx_ep_ladder)"
+  phases="${CX_PHASE:-decode}"
+  [ "$phases" = "both" ] && phases="decode prefill"
+  cx_stage_canonical || true   # sets CX_WORKLOAD_DIR when CX_CANONICAL=1 (official cohort)
+  # CROSS-NODE EP (goal 182): when CX_NNODES>1 (set per-node by a multi-node launcher with
+  # CX_NODE_RANK + CX_RDZV_FILE) we span CX_NNODES*CX_NGPUS ranks over the inter-node fabric. We do
+  # NOT use torchrun: its elastic agent runs its OWN cross-node TCPStore at --master-addr, which is
+  # unreachable from a peer rank's enroot container net namespace (the management-subnet NodeAddr is
+  # not in the container's net view — torchrun timed out 900s at exactly that bootstrap). Instead each
+  # node spawns its NGPUS local ranks directly (global RANK = CX_NODE_RANK*NGPUS + local) and they
+  # rendezvous via a FileStore on the compute-visible shared mount (CX_RDZV_FILE, consumed by
+  # run_ep.py), so NCCL exchanges its unique-id through the shared file and connects peers over IB.
+  local xnode=0
+  if [ -n "${CX_NNODES:-}" ] && [ "${CX_NNODES}" -gt 1 ]; then
+    xnode=1
+    # shellcheck source=_xnode_net.sh
+    source runtime/_xnode_net.sh 2>/dev/null || true
+    : "${CX_RDZV_FILE:=$PWD/.rdzv_${CX_TS}}"; export CX_RDZV_FILE
+    cx_log "cross-node EP: nnodes=$CX_NNODES node_rank=${CX_NODE_RANK:-0} world=$((CX_NNODES*CX_NGPUS)) rdzv=file://$CX_RDZV_FILE (no torchrun agent)"
+  fi
+  for phase in $phases; do
+    cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-<phase-default>}'"
+    local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"
+    # Common run_ep.py args (shared by single-node torchrun + cross-node local-spawn).
+    local -a EPARGS=(--backend "$backend" --phase "$phase" --tokens-ladder "$ladder" --mode "${CX_MODE:-normal}"
+      --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}"
+      --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-uniform}"
+      --num-sms "${CX_NUM_SMS:-24}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-200}"
+      --trials "${CX_TRIALS:-3}" --warmup "${CX_WARMUP:-32}"
+      --measurement-contract "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}"
+      --resource-mode "${CX_RESOURCE_MODE:-normalized}" --sm-fraction "${CX_SM_FRACTION:-0.18}"
+      --activation-profile "${CX_ACTIVATION_PROFILE:-normal}" --placement "${CX_PLACEMENT:-packed}"
+      --routing-step "${CX_ROUTING_STEP:-0}" --uneven-tokens "${CX_UNEVEN_TOKENS:-none}"
+      --combine-dtype "${CX_COMBINE_DTYPE:-bf16}" --combine-quant-mode "${CX_COMBINE_QUANT_MODE:-none}"
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT"
+      --env-json "$ENVJSON" --out "$out")
+    [ -n "${CX_EPLB:-}" ] && EPARGS+=(--eplb)
+    [ -n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR")
+    [ -n "${CX_WAIVE_ANOMALY:-}" ] && EPARGS+=(--waive-anomaly)
+    # Hard wall-clock guard: a wedged collective must FAIL FAST (timeout -k SIGKILLs after grace).
+    if [ "$xnode" = 1 ]; then
+      # Cross-node: spawn NGPUS local ranks, FileStore rendezvous (no torchrun agent). Only the global
+      # rank 0 writes --out; the rest participate in the collectives. wait collects every rank's rc.
+      local base=$(( ${CX_NODE_RANK:-0} * CX_NGPUS )) world=$(( CX_NNODES * CX_NGPUS )) i; local -a pids=()
+      for i in $(seq 0 $((CX_NGPUS - 1))); do
+        RANK=$((base + i)) LOCAL_RANK="$i" WORLD_SIZE="$world" \
+          timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 tests/run_ep.py "${EPARGS[@]}" &
+        pids+=($!)
+      done
+      rc_run=0; for i in "${pids[@]}"; do wait "$i" || rc_run=$?; done
+    else
+      # Capture the status with `|| rc_run=$?`: this script runs under `set -euo pipefail`, so a bare
+      # failing command followed by `rc_run=$?` would abort the shell before the failed-case record
+      # below is written (unless the caller happens to suppress errexit).
+      rc_run=0
+      timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+          torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}" || rc_run=$?
+    fi
+    if [ "$rc_run" != 0 ]; then
+      cx_log "WARN: $backend $phase run failed/timed out rc=$rc_run (CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900}s)"
+      emit_failed_case "$backend" "$phase" "$rc_run"   # preserve the classified failed case
+      rc=1
+    fi
+  done
+  return "$rc"
+}
+
+# Build DeepEP V2 (NCCL Gin backend) from source, overriding the image's bundled V1 (1.2.1).
+# V2 needs NCCL>=2.30.4 (symmetric memory) STRICTLY matching the NCCL torch loads, and builds JIT
+# (no precompile). arch 9.0 for Hopper (H100/H200), 10.0 for Blackwell (B200/GB200/B300/GB300).
+# Best-effort: on failure the deepep run still fails loudly (preserved failed-case), never a silent
+# V1 fallback. Returns 0 on success (or when already built), 1 on clone/build/import failure.
+cx_build_deepep_v2() {
+  # IDEMPOTENT: SHARD mode calls dispatch_bench (hence this) once PER CASE. Build once per allocation,
+  # then skip — else a 60-case shard re-runs the from-source build 60x (force-reinstall) and blows the
+  # slurm --time. Sentinel lives in the container fs (persists across the x86 in-container case loop).
+  [ -f /tmp/.cx_built_deepep_v2 ] && { cx_log "DeepEP V2 already built this allocation — skip rebuild"; return 0; }
+  # `b200*` does not match a runner named gb200…, so GB200 needs its own pattern; without it a
+  # Blackwell GB200 runner would be built for the Hopper arch.
+  local arch="9.0"; case "$CX_RUNNER" in b300*|gb300*|b200*|gb200*) arch="10.0";; esac
+  cx_log "DeepEP V2: building from source (TORCH_CUDA_ARCH_LIST=$arch) — overrides bundled V1"
+  # PEP 668: newer images (H200/B300) ship an externally-managed Python that refuses `pip install`.
+  # PIP_BREAK_SYSTEM_PACKAGES is honored by pip>=23.0.1 and silently ignored by older pip (H100),
+  # so this is safe across every image; --break-system-packages as a flag would error on old pip.
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q "nvidia-nccl-cu13>=2.30.4" >&2 2>&1 || cx_log "WARN: nvidia-nccl-cu13 install warning"
+  rm -rf /tmp/DeepEP_v2
+  git clone --depth 1 https://github.com/deepseek-ai/DeepEP /tmp/DeepEP_v2 >&2 2>&1 \
+    || { cx_log "ERROR: DeepEP V2 git clone failed (compute-node network?)"; return 1; }
+  # Exported provenance tag: "v2-<short sha>", or "v2-main" if the sha cannot be read.
+  export DEEPEP_COMMIT="v2-$(git -C /tmp/DeepEP_v2 rev-parse --short HEAD 2>/dev/null || echo main)"
+  ( cd /tmp/DeepEP_v2 && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+      pip install -q --no-build-isolation --force-reinstall . ) >&2 2>&1 \
+    || { cx_log "ERROR: DeepEP V2 build/install failed (arch=$arch; NCCL/toolchain?)"; return 1; }
+  python3 -c "import deep_ep; print('built deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+    || { cx_log "ERROR: DeepEP V2 import failed after build (NCCL version mismatch?)"; return 1; }
+  : > /tmp/.cx_built_deepep_v2   # sentinel: skip rebuild on subsequent cases in this allocation
+  cx_log "DeepEP V2 ready ($DEEPEP_COMMIT)"
+}
+
+# Build the DeepEP `hybrid-ep` branch (NVIDIA's TMA-based impl: HybridEPBuffer, intranode NVLink +
+# internode RDMA/NIXL). Three container-specific fixes, all probe-confirmed on the B300 sglang image:
+#   1. CUDA-13 moved cccl/libcudacxx headers to <cuda>/include/cccl/ (not on nvcc's default path) —
+#      its nvshmem_tensor.h #includes <cuda/std/tuple> -> add that dir via CPATH/NVCC_PREPEND_FLAGS.
+#   2. The final link wants -l:libnvshmem_host.so but the bundled nvshmem ships only .so.3 -> create
+#      the unversioned symlink.
+#   3. NVSHMEM_DIR set to the bundled nvshmem enables build; unset => intranode-only (internode/LL off).
+# Intranode HybridEPBuffer (single NVLink domain, <=8 ranks) needs no multi-node/NVSHMEM bring-up.
+cx_build_deepep_hybrid() {
+  # Build and install the DeepEP `hybrid-ep` branch, once per allocation (sentinel file in /tmp).
+  # Exports PIP_BREAK_SYSTEM_PACKAGES, NVSHMEM_DIR, LD_LIBRARY_PATH, DEEPEP_COMMIT and, when the cccl
+  # include dir exists, CPATH/NVCC_PREPEND_FLAGS; PYTHONPATH is exported only on the in-place fallback.
+  # Returns 0 on success (or when already built), 1 on clone/build/import failure.
+  [ -f /tmp/.cx_built_deepep_hybrid ] && { cx_log "hybrid-ep already built this allocation — skip rebuild"; return 0; }
+  # `b200*` does not match a runner named gb200…, so GB200 needs its own pattern; without it a
+  # Blackwell GB200 runner would be built for the Hopper arch.
+  local arch="9.0"; case "$CX_RUNNER" in b300*|gb300*|b200*|gb200*) arch="10.0";; esac
+  cx_log "DeepEP hybrid-ep: building NVIDIA TMA branch from source (TORCH_CUDA_ARCH_LIST=$arch)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  export NVSHMEM_DIR="$(python3 -c 'import os,nvidia.nvshmem as n; print(os.path.dirname(n.__file__))' 2>/dev/null || echo /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem)"
+  # First match of the cccl include glob; if nothing matches, the literal pattern fails the -d test.
+  local cccl; cccl="$(echo /usr/local/cuda*/targets/*/include/cccl | awk '{print $1}')"
+  [ -d "$cccl" ] && { export CPATH="$cccl:${CPATH:-}"; export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}"; }
+  [ -e "$NVSHMEM_DIR/lib/libnvshmem_host.so.3" ] && ln -sf libnvshmem_host.so.3 "$NVSHMEM_DIR/lib/libnvshmem_host.so" 2>/dev/null || true
+  export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib:${LD_LIBRARY_PATH:-}"
+  rm -rf /tmp/DeepEP_hybrid
+  git clone --depth 1 --branch hybrid-ep https://github.com/deepseek-ai/DeepEP /tmp/DeepEP_hybrid >&2 2>&1 \
+    || { cx_log "ERROR: hybrid-ep git clone failed"; return 1; }
+  export DEEPEP_COMMIT="hybrid-$(git -C /tmp/DeepEP_hybrid rev-parse --short HEAD 2>/dev/null || echo hybrid-ep)"
+  # Install into SITE-PACKAGES so the build persists across srun steps in the pyxis named container. The
+  # EP8 multi-srun runs the build-once and each case as SEPARATE srun steps; only the container rootfs
+  # (site-packages) persists — /tmp does NOT. The old `build_ext --inplace` under /tmp/DeepEP_hybrid +
+  # PYTHONPATH worked for the EP4 single-node path (build+run share one process) but was LOST at EP8,
+  # giving `module deep_ep has no attribute HybridEPBuffer`. pip install mirrors deepep-v2 (which persists
+  # correctly at EP8). Fall back to in-place build (EP4 single-node only) if this branch can't plain-install.
+  if ( cd /tmp/DeepEP_hybrid && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+        pip install -q --no-build-isolation --force-reinstall . ) >&2 2>&1; then
+    cx_log "hybrid-ep installed into site-packages (persists across srun steps)"
+  else
+    cx_log "WARN: hybrid-ep pip install failed — falling back to build_ext --inplace (EP4 single-node only)"
+    ( cd /tmp/DeepEP_hybrid && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 python3 setup.py build_ext --inplace ) >&2 2>&1 \
+      || { cx_log "ERROR: hybrid-ep build failed (arch=$arch; cccl/nvshmem?)"; return 1; }
+    export PYTHONPATH="/tmp/DeepEP_hybrid:${PYTHONPATH:-}"
+  fi
+  python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+    || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; }
+  # nvshmem runtime libs are in site-packages (persistent); the env pointing at them is process-local, and
+  # a PYTHONPATH is needed only if the in-place fallback ran. Persist both to a file the EP8 case-srun WRAP
+  # sources (best-effort; with pip install the package itself is already on the default site-packages path).
+  # The PYTHONPATH line uses `if`, not `[ ... ] &&`: as the last command of the group, a false test
+  # would make the whole group fail and log the "could not write" warning after a successful write.
+  { printf 'export LD_LIBRARY_PATH=%s/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}\n' "$NVSHMEM_DIR"
+    if [ -n "${PYTHONPATH:-}" ]; then printf 'export PYTHONPATH=%s\n' "$PYTHONPATH"; fi
+  } > /tmp/.cx_hybrid_env 2>/dev/null || cx_log "WARN: could not write /tmp/.cx_hybrid_env"
+  : > /tmp/.cx_built_deepep_hybrid   # sentinel: skip rebuild on subsequent cases in this allocation
+  cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT)"
+}
+
+# UCCL EP (uccl.ep.Buffer is a DeepEP-API clone). The prebuilt wheel is cu12; on a cu13
+# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668
+# images need PIP_BREAK_SYSTEM_PACKAGES. Best-effort; failure to import fails loudly.
+cx_build_uccl() {
+  # Install the uccl wheel plus the cu12 CUDA runtime, verify `uccl.ep` imports, then try to vendor
+  # UCCL's DeepEP-API wrapper as the `uccl_deepep` package.
+  # Exports: PIP_BREAK_SYSTEM_PACKAGES, UCCL_COMMIT; LD_LIBRARY_PATH when the cu12 lib dir resolves;
+  #          PYTHONPATH when the wrapper dir was found; CX_UCCL_WRAPPER=1 only if the wrapper imports.
+  # Returns 1 when `pip install uccl` or the `uccl.ep` import fails. A missing/unimportable wrapper
+  # only logs a WARN and the function still succeeds.
+  cx_log "UCCL EP: pip install uccl + cu12 runtime shim"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q uccl >&2 2>&1 || { cx_log "ERROR: pip install uccl failed"; return 1; }
+  pip install -q nvidia-cuda-runtime-cu12 >&2 2>&1 || cx_log "WARN: nvidia-cuda-runtime-cu12 warning"
+  # Locate the pip-installed cu12 runtime's lib/ dir; left empty if the module cannot be imported.
+  local cu12lib
+  cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)"
+  [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}"
+  # Provenance tag: "pkg-<installed uccl version>", or "pkg-uccl" if the version lookup fails.
+  export UCCL_COMMIT="pkg-$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))' 2>/dev/null || echo uccl)"
+  # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader
+  # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too.
+  python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \
+    || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; }
+  # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name
+  # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes,
+  # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full
+  # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks.
+  rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg
+  # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's
+  # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel
+  # mismatches signatures. Match them.
+  # NOTE: _uccl_tag is not declared local, so it remains set in the calling shell afterwards.
+  _uccl_tag="v$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))' 2>/dev/null || echo 0.1.1)"
+  # Try the version tag first; if that clone fails, fall back to the default branch.
+  if { git clone --depth 1 --branch "$_uccl_tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \
+       || git clone --depth 1 https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1; } \
+     && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then
+    mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep
+    # A failed copy is not checked here; the import probe below is what decides wrapper availability.
+    cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null
+    export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}"
+    if python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2; then
+      export CX_UCCL_WRAPPER=1
+    else
+      cx_log "WARN: uccl_deepep wrapper import failed — falling back to low-level uccl.ep"
+    fi
+  else
+    cx_log "WARN: uccl deep_ep_wrapper not vendored (clone/path) — low-level uccl.ep fallback"
+  fi
+  cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})"
+}
+
+run_deepep_suite() {
+  # Optional V2 build (CX_DEEPEP_V2=1), then make sure deep_ep imports, then run the EP sweep.
+  # Any setup failure returns 1 — the caller asked for deepep, so it is never silently skipped.
+  if [ "${CX_DEEPEP_V2:-0}" = "1" ] && ! cx_build_deepep_v2; then
+    cx_log "WARN: DeepEP V2 setup failed — cannot run V2"
+    return 1
+  fi
+
+  # deep_ep missing: the only recovery is the rebuild-deepep.sh helper, if it is on PATH.
+  if ! python3 -c "import deep_ep" 2>/dev/null; then
+    if ! command -v rebuild-deepep.sh >/dev/null 2>&1; then
+      cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; cannot run deepep"
+      return 1
+    fi
+    cx_log "building DeepEP via rebuild-deepep.sh"
+    if ! rebuild-deepep.sh >&2; then
+      cx_log "WARN: rebuild-deepep.sh failed"
+      return 1
+    fi
+  fi
+
+  run_ep_suite deepep
+}
+
+run_mori_suite() {
+  # MoRI (AMD ROCm EP), bundled in the AMD MoRI image. If absent this is a
+  # failure (MoRI is not rebuildable here), not a silent skip. Single-node
+  # 8x MI355X over XGMI; torch.cuda maps onto ROCm/HIP.
+  if ! python3 -c "import mori" 2>/dev/null; then
+    cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori"
+    return 1
+  fi
+  run_ep_suite mori
+}
+
+run_uccl_suite() {
+  # UCCL EP (NVIDIA) — DeepEP-API clone; build the wheel + cu12 shim, then reuse the generic
+  # EP sweep (run_ep.py --backend uccl). Inability to install/import is a failure, not a skip.
+  cx_build_uccl || { cx_log "WARN: UCCL EP setup failed — cannot run uccl"; return 1; }
+  run_ep_suite uccl
+}
+run_nccl_ep_suite() {
+  # All-to-all EP over torch.distributed collectives (tests/ep_nccl.py). There is no build step:
+  # this simply runs the generic EP sweep with backend "nccl-ep" and returns its status.
+  run_ep_suite "nccl-ep"
+}
+run_deepep_hybrid_suite() {
+  # DeepEP hybrid-ep branch (NVIDIA TMA HybridEPBuffer) — build from source (cccl + libnvshmem
+  # fixes), then the generic EP sweep (run_ep.py --backend deepep-hybrid). Intranode NVLink path.
+  cx_build_deepep_hybrid || { cx_log "WARN: hybrid-ep setup failed — cannot run deepep-hybrid"; return 1; }
+  run_ep_suite deepep-hybrid
+}
+
+run_collective_bench() {
+  # Run one single-process benchmark script selected by $1 (offload | copy-engine | kv-cache)
+  # under the CX_RUN_TIMEOUT wall-clock guard. Returns the script's (or timeout's) exit status.
+  local kind="$1" script tag out extra="" rc=0
+  case "$kind" in
+    offload)     script="tests/offload_bench.py";     tag="offload" ;;
+    copy-engine) script="tests/copy_engine_bench.py"; tag="copy_engine" ;;
+    kv-cache)    script="tests/kv_cache_transfer.py"; tag="kvcache"; extra="--direction all" ;;
+    *) cx_die "unknown collective kind '$kind'" ;;
+  esac
+  out="results/${CX_RUNNER}_${tag}_${CX_TS}.json"
+  cx_log "collective bench=$kind -> $out"
+  # $extra is intentionally unquoted so "--direction all" splits into two arguments.
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 "$script" $extra \
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+      --env-json "$ENVJSON" --out "$out" || rc=$?
+  if [ "$rc" != 0 ]; then
+    cx_log "WARN: collective $kind failed/timed out rc=$rc"
+  fi
+  return "$rc"
+}
+
+run_rl_mesh() {
+  # RL trainer<->generator mesh transfer (multi-process: torchrun splits world into two meshes).
+  # Returns the exit status of the timeout-wrapped torchrun (0 on success).
+  local rc=0
+  cx_log "rl-mesh bench ngpus=$CX_NGPUS"
+  # `|| rc=$?` keeps a failure from tripping `set -e` before the WARN below is logged, matching
+  # how run_collective_bench captures its status.
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+      torchrun --nproc_per_node="$CX_NGPUS" tests/rl_mesh_bench.py \
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+      --env-json "$ENVJSON" --out "results/${CX_RUNNER}_rl_mesh_${CX_TS}.json" || rc=$?
+  [ "$rc" = 0 ] || cx_log "WARN: rl-mesh failed/timed out rc=$rc"
+  return "$rc"
+}
+
+run_allreduce_fw() {
+  # Framework custom all-reduce (flashinfer one-shot/two-shot + sglang/vllm), multi-process torchrun.
+  # Returns the exit status of the timeout-wrapped torchrun (0 on success).
+  local rc=0
+  cx_log "allreduce-fw bench ngpus=$CX_NGPUS"
+  # `|| rc=$?` keeps a failure from tripping `set -e` before the WARN below is logged, matching
+  # how run_collective_bench captures its status.
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+      torchrun --nproc_per_node="$CX_NGPUS" tests/allreduce_fw_bench.py \
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+      --env-json "$ENVJSON" --out "results/${CX_RUNNER}_allreduce_fw_${CX_TS}.json" || rc=$?
+  [ "$rc" = 0 ] || cx_log "WARN: allreduce-fw failed/timed out rc=$rc"
+  return "$rc"
+}
+
+# Upgrade FlashInfer in-container to the latest wheel — the bundled 0.6.8.post1 lacks the
+# quantized-COMBINE OUTPUT path (moe_a2a_combine output_dtype/output_scales, added in a newer
+# release; confirmed in the main-branch source). A combine-quant run needs it; the dispatch path
+# (bf16/fp8/mxfp8/nvfp4) is unaffected and stays on whatever is installed. Best-effort: a failed
+# upgrade leaves the run on the bundled version (the combine-quant adapter then rejects loudly).
+cx_build_flashinfer_latest() {
+  # Upgrade FlashInfer until MoeAlltoAll.combine exposes `output_dtype`, trying in order:
+  # nightly wheel index -> PyPI -U fallback -> source build of main (only if the signature is still
+  # missing). Runs once per allocation (sentinel file in /tmp).
+  # Exports: PIP_BREAK_SYSTEM_PACKAGES, FLASHINFER_DISABLE_VERSION_CHECK, FLASHINFER_COMMIT,
+  #          CX_FLASHINFER_STACK.
+  # Returns 0 when the sentinel exists or the final signature check passes; 1 when combine still
+  # lacks output_dtype after every attempt. Individual install failures along the way only log WARN.
+  [ -f /tmp/.cx_built_flashinfer ] && { cx_log "FlashInfer quant-combine build already done this allocation — skip"; return 0; }
+  cx_log "FlashInfer: upgrading to latest wheel for quantized-combine output (moe_a2a_combine output_dtype)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  # moe_a2a_combine output_dtype is on flashinfer MAIN but NOT in the latest PyPI release (0.6.13) —
+  # so `pip -U flashinfer-python` (PyPI) is insufficient. Install from the NIGHTLY wheel index
+  # (built from main): flashinfer-python (--no-deps; the container already has torch etc.) + the
+  # matching cubin + cu130 jit-cache. FLASHINFER_DISABLE_VERSION_CHECK=1 bypasses any residual
+  # sub-package skew. Falls back to a PyPI -U (which then asserts-out cleanly if it lacks output_dtype).
+  export FLASHINFER_DISABLE_VERSION_CHECK=1
+  local before after NIDX="https://flashinfer.ai/whl/nightly"
+  # Version before the upgrade ("none" if flashinfer is not importable); used only for the log line.
+  before="$(python3 -c 'import flashinfer;print(flashinfer.__version__)' 2>/dev/null || echo none)"
+  { pip install -q -U --pre flashinfer-python --index-url "$NIDX/" --no-deps >&2 2>&1 \
+      && pip install -q -U --pre flashinfer-cubin --index-url "$NIDX/" >&2 2>&1 \
+      && pip install -q -U --pre flashinfer-jit-cache --index-url "$NIDX/cu130" >&2 2>&1; } \
+    || { cx_log "WARN: flashinfer nightly index failed — falling back to PyPI -U"; \
+         pip install -q -U flashinfer-python flashinfer-cubin flashinfer-jit-cache >&2 2>&1 || true; }
+  # The nightly (main) flashinfer's CuTe-DSL kernels import newer cutlass.cute symbols (e.g.
+  # OperandMajorMode) than the bundled nvidia-cutlass-dsl provides — upgrade it to match (PyPI).
+  pip install -q -U nvidia-cutlass-dsl >&2 2>&1 || cx_log "WARN: nvidia-cutlass-dsl upgrade warning"
+  # The cu130 nightly WHEEL (0.6.13.dev20260612) still predates the combine output_dtype PR — if it's
+  # absent, build flashinfer MAIN from source (the container has the cu130 toolchain that built
+  # deep_ep-v2 + hybrid-ep; cutlass-dsl 4.5.2 is now installed; JIT-first build, time-boxed).
+  if ! python3 -c "import inspect, flashinfer.comm as c; assert 'output_dtype' in str(inspect.signature(c.MoeAlltoAll.combine))" 2>/dev/null; then
+    cx_log "FlashInfer nightly wheel lacks combine output_dtype — building flashinfer main from source"
+    # Uninstall the precompiled cubin + jit-cache FIRST: they ship the OLD 10-arg moe_a2a_combine
+    # kernel, which the main Python wrapper (14-arg, with output_dtype) then mis-calls ("Expected 10
+    # but got 14 arguments"). Removing them forces get_moe_alltoall_module() to JIT-compile the
+    # kernel FRESH from main's csrc at runtime (14-arg, matching the wrapper).
+    pip uninstall -y flashinfer-cubin flashinfer-jit-cache >&2 2>&1 || true
+    rm -rf /tmp/fi_main ~/.cache/flashinfer 2>/dev/null || true
+    if git clone --recursive --depth 1 https://github.com/flashinfer-ai/flashinfer.git /tmp/fi_main >&2 2>&1; then
+      # Source install is capped at 2400 s; a failure/timeout only warns — the final check decides.
+      ( cd /tmp/fi_main && timeout 2400 pip install -q --no-build-isolation . >&2 2>&1 ) \
+        || cx_log "WARN: flashinfer main source build failed/timed out"
+    else
+      cx_log "WARN: flashinfer main clone failed (compute-node network?)"
+    fi
+  fi
+  after="$(python3 -c 'import flashinfer;print(flashinfer.__version__)' 2>/dev/null || echo none)"
+  export FLASHINFER_COMMIT="pkg-$after"
+  # Record the EXACT upgraded library stack for reproducibility — the upgrade happens AFTER
+  # env_capture, so these versions live nowhere else. CX_FLASHINFER_STACK is read into the result's
+  # backend_provenance by ep_flashinfer. Also logged to the GHA log even if the run later fails.
+  export CX_FLASHINFER_STACK="$(python3 - <<'PY' 2>/dev/null || echo 'capture-failed'
+import importlib.metadata as m
+def v(p):
+    try: return m.version(p)
+    except Exception: return "absent"
+pkgs=["flashinfer-python","flashinfer-cubin","flashinfer-jit-cache","nvidia-cutlass-dsl","torch"]
+print(" ".join(f"{p}={v(p)}" for p in pkgs))
+PY
+)"
+  cx_log "FlashInfer upgrade (nightly): $before -> $after"
+  cx_log "FlashInfer stack: $CX_FLASHINFER_STACK"
+  # Final gate: the sentinel is written only if combine's signature really has output_dtype.
+  python3 -c "import inspect, flashinfer.comm as c; assert 'output_dtype' in str(inspect.signature(c.MoeAlltoAll.combine)), 'combine still has no output_dtype'; print('combine output_dtype: present')" >&2 \
+    || { cx_log "ERROR: upgraded FlashInfer combine still lacks output_dtype — cannot quant-combine"; return 1; }
+  : > /tmp/.cx_built_flashinfer   # sentinel: skip rebuild on subsequent cases in this allocation
+}
+
+# NIXL device-EP build-probe — the gated EP item (goal "NIXL EP"). The OLD sglang image blocked the
+# meson build on Abseil 20220623; this runs in the dynamo tensorrtllm-runtime image (container switch)
+# and reports whether THIS container clears it. Reports the build deps the meson tree needs (nixl lib,
+# Abseil, meson/ninja/ucx) then attempts `meson setup` (which enumerates any missing dep) + a
+# time-boxed compile. Informational: logs the precise outcome; never fails the suite (the transfer
+# bench is the guaranteed datapoint). If it SUCCEEDS we wire ep_nixl.py against nixl_ep_cpp next.
+cx_probe_nixl_ep() {
+  cx_log "NIXL device-EP build-probe (gated EP item — does examples/device/ep build on this container?)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  python3 - >&2 2>&1 <<'PY' || true
+import importlib.metadata as m, shutil, glob
+def v(p):
+    try: return m.version(p)
+    except Exception: return "absent"
+print("NIXL_EP_PROBE deps: nixl=%s meson=%s ninja=%s pybind11=%s cmake=%s" %
+      (v("nixl"), shutil.which("meson"), shutil.which("ninja"), v("pybind11"), shutil.which("cmake")))
+# Abseil version was the OLD container's blocker (20220623) — report what THIS container ships.
+hits = glob.glob("/usr/**/libabsl_base*", recursive=True) + glob.glob("/opt/**/libabsl_base*", recursive=True)
+print("NIXL_EP_PROBE abseil libs:", hits[:4] or "not found on /usr,/opt")
+try:
+    import nixl, os; print("NIXL_EP_PROBE nixl at", os.path.dirname(nixl.__file__))
+except Exception as e:
+    print("NIXL_EP_PROBE nixl import:", repr(e))
+PY
+  pip install -q meson ninja pybind11 >&2 2>&1 || cx_log "NIXL_EP_PROBE: meson/ninja/pybind11 pip warn"
+  # The device-EP build needs UCX's GPU device API header <ucp/api/device/ucp_device_impl.h>; the
+  # dynamo image's UCX lacks it (meson "UCX GPU Device API: NO"). Build a recent UCX from source WITH
+  # CUDA (ships the device-API header) and point pkg-config at it. Steps are &&-chained under pipefail
+  # so a failed autogen/configure/make — not only the timeout — reaches the "failed/timed out" log. If
+  # the header is still absent (device-comm needs GPUDirect-Async driver support), meson reports NO again.
+  if ! find /usr /opt -name 'ucp_device_impl.h' 2>/dev/null | grep -q .; then
+    cx_log "NIXL_EP_PROBE: building UCX from source with CUDA device API -> /opt/ucx-dev"
+    rm -rf /tmp/ucx_src
+    if git clone --depth 1 https://github.com/openucx/ucx /tmp/ucx_src >&2 2>&1; then
+      ( cd /tmp/ucx_src && timeout 1300 bash -c '
+          set -o pipefail; ./autogen.sh >/dev/null 2>&1 &&
+          ./configure --prefix=/opt/ucx-dev --with-cuda=/usr/local/cuda --enable-mt --without-go --without-java >/dev/null 2>&1 &&
+          make -j"$(nproc)" install 2>&1 | tail -4' ) >&2 2>&1 || cx_log "NIXL_EP_PROBE: UCX build failed/timed out"
+      export PKG_CONFIG_PATH="/opt/ucx-dev/lib/pkgconfig:${PKG_CONFIG_PATH:-}"
+      export LD_LIBRARY_PATH="/opt/ucx-dev/lib:${LD_LIBRARY_PATH:-}"
+    fi
+    find /opt/ucx-dev -name 'ucp_device_impl.h' 2>/dev/null | head -1 | sed 's/^/NIXL_EP_PROBE built-ucx device header: /' >&2 || true
+  fi
+  rm -rf /tmp/nixl_src
+  git clone --depth 1 https://github.com/ai-dynamo/nixl /tmp/nixl_src >&2 2>&1 \
+    || { cx_log "NIXL_EP_PROBE: clone failed (compute-node network?)"; return 0; }
+  # meson-setup the whole project (it now sees the source-built UCX via PKG_CONFIG_PATH -> the "UCX
+  # GPU Device API" line shows YES/NO), then a time-boxed compile. tail the decisive lines to the log.
+  ( cd /tmp/nixl_src && timeout 1500 bash -c '
+      echo "--- meson setup ---"; meson setup build 2>&1 | tail -34
+      echo "--- meson compile (time-boxed) ---"; meson compile -C build 2>&1 | tail -40
+    ' ) >&2 2>&1 || true
+  if find /tmp/nixl_src/build -name 'nixl_ep_cpp*.so' 2>/dev/null | grep -q .; then
+    cx_log "NIXL_EP_PROBE: SUCCESS — nixl_ep_cpp built on this container (wire ep_nixl.py next)"
+  else
+    cx_log "NIXL_EP_PROBE: nixl_ep_cpp NOT produced — see 'meson setup' output above for the blocker"
+  fi
+}
+
+run_mooncake_suite() {
+  # MoonCake KV transfer (the kv-cache 'mooncake' backend). No CollectiveX container ships Mooncake, so
+  # pip-install mooncake-transfer-engine when it is not importable, then run the single-process RDMA
+  # loopback bench (needs an RDMA NIC). Returns 0 on success, else the timeout-wrapped bench's status.
+  local rc=0 out="results/${CX_RUNNER}_mooncake_${CX_TS}.json"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  python3 -c "import mooncake.engine" 2>/dev/null || {
+    cx_log "mooncake: pip install mooncake-transfer-engine"
+    pip install -q mooncake-transfer-engine >&2 2>&1 || cx_log "WARN: mooncake pip install failed"
+  }
+  cx_log "mooncake transfer bench -> $out"
+  local bench_cmd=(python3 tests/mooncake_transfer.py --runner "$CX_RUNNER" --topology-class "$CX_TOPO"
+                   --transport "${CX_TRANSPORT:-rdma}" --env-json "$ENVJSON" --out "$out")
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" "${bench_cmd[@]}" || { rc=$?; cx_log "WARN: mooncake failed/timed out rc=$rc"; }
+  return "$rc"
+}
+
+run_nccl_kv_suite() {
+  # NCCL/RCCL KV-cache transfer (kv-cache 'nccl'/'rccl' backend): torchrun with 2 ranks, rank0 dist.send ->
+  # rank1 dist.recv of KV-block-sized buffers (same torch.distributed API on NVIDIA and ROCm). Needs >=2 GPUs.
+  local np=2 rc=0 out="results/${CX_RUNNER}_nccl_kv_${CX_TS}.json"
+  if [ "$CX_NGPUS" -lt 2 ]; then
+    cx_log "WARN: nccl-kv needs >=2 GPUs (have $CX_NGPUS)"; return 1
+  fi
+  cx_log "nccl-kv transfer bench (2-rank send/recv) -> $out"
+  local bench_cmd=(torchrun --nproc_per_node="$np" tests/nccl_kv_transfer.py --runner "$CX_RUNNER"
+                   --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" --env-json "$ENVJSON" --out "$out")
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" "${bench_cmd[@]}" || { rc=$?; cx_log "WARN: nccl-kv failed/timed out rc=$rc"; }
+  return "$rc"
+}
+
+run_mori_io_suite() {
+  # MoRI-IO (ROCm/mori mori.io) — AMD RDMA p2p transfer engine bundled in the AMD MoRI image; the wired
+  # kv-cache 'mori-io' backend. Single process, 2 IOEngines, GPU0<->GPU1 RDMA read. Bails out with 1
+  # when mori.io cannot be imported; otherwise returns 0 or the timeout-wrapped bench's failure status.
+  python3 -c "import mori.io" 2>/dev/null \
+    || { cx_log "WARN: mori.io not importable — needs the AMD MoRI image; cannot run mori-io"; return 1; }
+  local rc=0 out="results/${CX_RUNNER}_mori_io_${CX_TS}.json"
+  cx_log "mori-io transfer bench -> $out"
+  local bench_cmd=(python3 tests/mori_io_transfer.py --runner "$CX_RUNNER" --topology-class "$CX_TOPO"
+                   --transport "${CX_TRANSPORT:-rdma}" --env-json "$ENVJSON" --out "$out")
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" "${bench_cmd[@]}" \
+      || { rc=$?; cx_log "WARN: mori-io failed/timed out rc=$rc"; }
+  return "$rc"
+}
+
+run_nixl_suite() {
+  # NIXL (ai-dynamo/nixl) — runs in the dynamo tensorrtllm-runtime image (cx_default_image switched
+  # CX_IMAGE for CX_BENCH=nixl). Part 1 is the point-to-point TRANSFER bench (the wired KV-cache 'nixl'
+  # backend); its status alone is this suite's return value. Part 2 is the device-EP build-probe.
+  local rc=0 out="results/${CX_RUNNER}_nixl_${CX_TS}.json"
+  cx_log "nixl transfer bench -> $out"
+  local bench_cmd=(python3 tests/nixl_transfer.py --direction all --runner "$CX_RUNNER" --topology-class "$CX_TOPO"
+                   --transport "${CX_TRANSPORT:-nvlink}" --env-json "$ENVJSON" --out "$out")
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" "${bench_cmd[@]}" \
+      || { rc=$?; cx_log "WARN: nixl transfer failed/timed out rc=$rc"; }
+  # The gated NIXL EP probe is informational only: its outcome is logged and never fails the suite.
+  cx_probe_nixl_ep || true
+  return "$rc"
+}
+
+run_flashinfer_suite() {
+  # FlashInfer EP (flashinfer.comm.MoeAlltoAll) — pre-installed in the sglang image, so by default the
+  # bundled version runs. Upgrade to the newer wheel first when either (a) a combine-quant run is
+  # requested (CX_COMBINE_DTYPE set and != bf16 — needs the quantized-combine output_dtype path), or
+  # (b) CX_FLASHINFER_UPGRADE=1 — the bundled 0.6.8 MoeAlltoAll MNNVL barrier intermittently deadlocks on
+  # h100 ('Rank N timed out waiting for completion flag' -> CUDA unspecified launch failure) and newer
+  # flashinfer carries MNNVL fixes (e.g. socket-collision #36674).
+  local fi_upgrade="${CX_FLASHINFER_UPGRADE:-}"
+  case "${CX_COMBINE_DTYPE:-}" in ""|bf16) ;; *) fi_upgrade=1 ;; esac
+  if [ "$fi_upgrade" = "1" ]; then
+    cx_build_flashinfer_latest || { cx_log "WARN: flashinfer upgrade setup failed"; return 1; }
+  fi
+  python3 -c "import flashinfer.comm" 2>/dev/null \
+    || { cx_log "WARN: flashinfer.comm not importable — cannot run flashinfer EP"; return 1; }
+  run_ep_suite flashinfer
+}
+
+# dispatch_bench runs the CURRENT CX_BENCH (+ CX_* config env) once; returns 0 only if every suite it
+# invoked succeeded, else 1. The sweep workflow runs many of these per allocation (SHARD mode below),
+# reusing this single container + its built backend.
+dispatch_bench() {
+  local rc=0
+  case "$CX_BENCH" in
+    # Backend suites: the runner is run_<bench>_suite with '-' mapped to '_' (nccl-ep -> run_nccl_ep_suite).
+    nccl|deepep|mori|uccl|nccl-ep|flashinfer|deepep-hybrid|nixl|mori-io|nccl-kv|mooncake)
+      "run_${CX_BENCH//-/_}_suite" || rc=1 ;;
+    # offload / copy-engine / kv-cache share one runner that takes the bench name as its argument.
+    offload|copy-engine|kv-cache)
+      run_collective_bench "$CX_BENCH" || rc=1 ;;
+    rl-mesh)
+      run_rl_mesh || rc=1 ;;
+    allreduce-fw)
+      run_allreduce_fw || rc=1 ;;
+    all)
+      # "all" means the NCCL and DeepEP suites only; DeepEP still runs when NCCL fails.
+      run_nccl_suite || rc=1
+      run_deepep_suite || rc=1 ;;
+    # Anything else is a configuration error.
+    *)
+      cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|mori|uccl|nccl-ep|flashinfer|deepep-hybrid|nixl|mori-io|nccl-kv|mooncake|offload|copy-engine|kv-cache|rl-mesh|allreduce-fw|all)" ;;
+  esac
+  return $rc
+}
+
+rc=0
+# Build-only mode (CX_BUILD_ONLY set): the rack EP8 launcher runs this ONCE per node inside a PERSISTENT
+# named container to pre-build the from-source kernels (DeepEP V2 / DeepEP hybrid / flashinfer
+# quant-combine) that the per-rank multi-srun case loop cannot build itself (8 separate ephemeral
+# containers). The builds land in that container's site-packages; then exit — no benchmark run.
+if [ -n "${CX_BUILD_ONLY:-}" ]; then
+  if [ -n "${CX_DEEPEP_V2:-}" ]; then cx_build_deepep_v2 || rc=1; fi
+  if [ "${CX_BENCH:-}" = "deepep-hybrid" ]; then cx_build_deepep_hybrid || rc=1; fi
+  case "${CX_COMBINE_DTYPE:-}" in ""|bf16) ;; *) cx_build_flashinfer_latest || rc=1 ;; esac
+  cx_log "CX_BUILD_ONLY: build complete rc=$rc (deepep_v2=${CX_DEEPEP_V2:-} bench=${CX_BENCH:-} combine=${CX_COMBINE_DTYPE:-})"
+  exit "$rc"
+fi
+if [ -n "${CX_SHARD_FILE:-}" ] && [ -f "${CX_SHARD_FILE:-/nonexistent}" ]; then
+  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
+  # All cases share (sku, backend, v2, nodes) so the backend build (cx_build_*) is paid once and cached
+  # for the rest. Each case overrides its own mode/resource_mode/dtype/contract/routing/phase/eplb/
+  # workload, then reuses the same per-config path (dispatch_bench). The shard path reaches Python via
+  # argv (never spliced into the -c source), so quotes/backslashes in it cannot break the case count.
+  ncases="$(python3 -c "import json,sys;print(len(json.load(open(sys.argv[1])).get('cases',[])))" "$CX_SHARD_FILE" 2>/dev/null || echo 0)"
+  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
+  _cx_ts_base="$CX_TS"   # per-case CX_TS suffix below keeps each case's result file UNIQUE (else
+                         # cases sharing backend+phase overwrite each other at the same timestamp).
+  ci=0
+  while [ "$ci" -lt "$ncases" ]; do
+    export CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
+    # Map case[ci] fields -> CX_* env (shell-quoted). The setup job pre-resolved hidden/topk/experts
+    # + the token ladder into each case, so the loop is config-only (no workloads.yaml lookup here).
+    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
+import json, sys, shlex
+c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
+def g(k, d=""):
+    v = c.get(k, d); return "" if v is None else str(v)
+env = {
+  "CX_BENCH": g("backend"), "CX_MODE": g("mode", "normal"),
+  "CX_DISPATCH_DTYPE": g("dtype", "bf16"),
+  "CX_MEASUREMENT_CONTRACT": g("contract", "layout-and-dispatch-v1"),
+  "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
+  "CX_RESOURCE_MODE": g("resource_mode", "normalized"),
+  "CX_ACTIVATION_PROFILE": g("activation_profile", "normal"),
+  "CX_PLACEMENT": g("placement", "packed"), "CX_ROUTING_STEP": g("routing_step", "0"),
+  "CX_UNEVEN_TOKENS": g("uneven_tokens", "none"),
+  "CX_EPLB": "1" if c.get("eplb") else "",
+  "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
+  "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
+}
+print("\n".join(f"export {k}={shlex.quote(v)}" for k, v in env.items()))
+PY
+)"
+    eval "$_exports"
+    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
+    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset the first case's
+    # staged dir is reused for the rest and run_ep.py can't find the later cases' manifests
+    # (FileNotFoundError .cx_workloads/<wid>.manifest.json). Unset so every case re-stages its own.
+    unset CX_WORKLOAD_DIR 2>/dev/null || true
+    cx_log "  [$((ci+1))/$ncases] $CX_BENCH $CX_PHASE $CX_DISPATCH_DTYPE/$CX_MODE/${CX_MEASUREMENT_CONTRACT/-v1/} rt=$CX_ROUTING eplb=${CX_EPLB:-0}"
+    # flashinfer's MoeAlltoAll MNNVL barrier INTERMITTENTLY deadlocks on h100 ('Rank N timed out waiting
+    # for completion flag' -> CUDA unspecified launch failure): ~half of cases, scattered across T/routing,
+    # the SAME config both crashes AND passes (a transient, not config/pidfd). Upgrade to flashinfer 0.6.14
+    # + a between-case shm-drop settle were both TESTED and did NOT fix it (the settle made it worse). Since
+    # it's intermittent, RETRY: each fresh torchrun is another independent ~50% shot, so a few retries
+    # recover almost all cases. On a retry success, drop this case's intermediate failed-case record so it
+    # doesn't pollute the shard. Non-flashinfer backends run ONCE — their failures are deterministic
+    # (h200 flashinfer pidfd, aarch64 uccl, deepep-hybrid ll) so retrying only wastes the allocation.
+    attempts=1; [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 ))
+    a=1
+    while :; do
+      if dispatch_bench; then
+        [ "$a" -gt 1 ] && rm -f results/failed_*"${CX_TS}"*.json 2>/dev/null || true
+        break
+      fi
+      [ "$a" -ge "$attempts" ] && { rc=1; break; }
+      cx_log "  [$((ci+1))/$ncases] $CX_BENCH attempt $a/$attempts failed — retry (intermittent MNNVL barrier)"
+      a=$((a+1))
+    done
+    ci=$((ci + 1))
+  done
+else
+  dispatch_bench || rc=1
+fi
+
+# Summary table + result gate (fails the job if no valid results). SHARD mode leaves CX_TS at the LAST case's suffix, so pass the base ts: summarize.py matches it as a filename substring, covering every case.
+python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "${_cx_ts_base:-$CX_TS}" || rc=1
+exit "$rc"
diff --git a/experimental/CollectiveX/schemas/ep-result-v4.schema.json b/experimental/CollectiveX/schemas/ep-result-v4.schema.json
new file mode 100644
index 000000000..0d0035997
--- /dev/null
+++ b/experimental/CollectiveX/schemas/ep-result-v4.schema.json
@@ -0,0 +1,195 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/ep-result-v4.schema.json",
+  "title": "CollectiveX EP dispatch/combine result (v4)",
+  "description": "One (backend, phase, dtype, mode, contract, routing) sweep. v4 adds multi-dimensional validity + machine-derived publication_status, measured roundtrip, dual byte contracts, per-rank diagnostics, raw-sample histograms, and workload identity. v3 docs load via compatibility (publication_status absent => treated as legacy/experimental).",
+  "type": "object",
+  "required": ["schema_version", "family", "runner", "backend", "mode", "phase",
+               "ep_size", "measurement_contract", "shape", "rows",
+               "validity", "publication_status", "workload", "reproduction",
+               "backend_provenance", "comparison_key"],
+  "properties": {
+    "schema_version": {"type": "integer", "minimum": 3},
+    "family": {"const": "moe"},
+    "runner": {"type": "string"},
+    "backend": {"type": "string", "enum": ["deepep", "deepep-hybrid", "mori", "aiter", "uccl", "flashinfer"]},
+    "mode": {"type": "string", "enum": ["normal", "ll"]},
+    "phase": {"type": "string", "enum": ["decode", "prefill"]},
+    "ep_size": {"type": "integer", "minimum": 1},
+    "world_size": {"type": "integer", "minimum": 1},
+    "nodes": {"type": "integer", "minimum": 1},
+    "topology_class": {"type": "string"},
+    "transport": {"type": "string"},
+    "resource_mode": {"type": "string", "enum": ["normalized", "tuned", "default"]},
+    "measurement_contract": {"type": "string",
+      "enum": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1",
+               "mori-quant-combine-v1"]},
+    "publication_status": {"type": "string",
+      "enum": ["official", "comparable-experimental", "diagnostic", "invalid", "failed"]},
+    "validity": {
+      "type": "object",
+      "required": ["execution_status", "semantic_correctness", "workload_identity",
+                   "measurement_conformance", "resource_conformance", "provenance_complete"],
+      "properties": {
+        "execution_status": {"type": "string", "enum": ["complete", "failed"]},
+        "semantic_correctness": {"type": "string", "enum": ["pass", "fail"]},
+        "workload_identity": {"type": "string"},
+        "workload_source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "measurement_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]},
+        "resource_conformance": {"type": "string"},
+        "provenance_complete": {"type": "boolean"},
+        "anomaly_free": {"type": "boolean"}
+      }
+    },
+    "workload": {
+      "type": "object",
+      "required": ["source", "trace_signature", "cross_rank_consistent"],
+      "properties": {
+        "source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "workload_id": {"type": ["string", "null"]},
+        "manifest_checksums": {"type": ["object", "null"]},
+        "trace_signature": {"type": "string"},
+        "distinct_per_T_hashes": {"type": "array", "items": {"type": "string"}},
+        "cross_rank_consistent": {"type": "boolean"},
+        "activation_profile": {"type": "string"},
+        "activation_identity": {"type": ["string", "null"]}
+      }
+    },
+    "shape": {
+      "type": "object",
+      "required": ["hidden", "topk", "experts", "experts_per_rank", "dispatch_dtype", "routing"],
+      "properties": {
+        "hidden": {"type": "integer"}, "topk": {"type": "integer"},
+        "experts": {"type": "integer"}, "experts_per_rank": {"type": "integer"},
+        "dispatch_dtype": {"type": "string", "enum": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"]},
+        "routing": {"type": "string"},
+        "eplb": {"type": "boolean"}, "num_logical_experts": {"type": "integer"},
+        "kernel_gen": {"type": "string"},
+        "activation_profile": {"type": "string"},
+        "quant": {
+          "type": "object",
+          "properties": {
+            "combine_input_dtype": {"type": "string"},
+            "combine_accum_dtype": {"type": "string"},
+            "combine_output_dtype": {"type": "string"},
+            "combine_quant_mode": {"type": "string"},
+            "scale_layout": {"type": ["string", "null"]}
+          }
+        }
+      }
+    },
+    "reproduction": {
+      "type": "object",
+      "required": ["command", "seed", "warmup", "iters", "trials", "measurement_contract"],
+      "properties": {
+        "command": {"type": "string"},
+        "image": {"type": ["string", "null"]},
+        "image_digest": {"type": ["string", "null"]},
+        "image_arch": {"type": ["string", "null"]},
+        "squash_sha256": {"type": ["string", "null"]},
+        "git_run": {"type": ["object", "null"]},
+        "fp8_quant_in_timing": {"type": ["boolean", "null"]},
+        "combine_quant_in_timing": {"type": ["boolean", "null"]},
+        "combine_dequant_in_timing": {"type": ["boolean", "null"]},
+        "combine_dtype": {"type": "string"}, "combine_quant_mode": {"type": "string"},
+        "activation_profile": {"type": "string"},
+        "routing_step": {"type": "integer"}, "uneven_tokens": {"type": "string"},
+        "waive_anomaly": {"type": "boolean"}, "roundtrip_anomaly_threshold": {"type": "number"}
+      }
+    },
+    "backend_provenance": {"type": "object"},
+    "phase_profile": {"type": "object"},
+    "source_allocation": {
+      "type": "object",
+      "properties": {
+        "mode": {"type": "string", "enum": ["none", "linear", "empty-rank"]},
+        "routing_step": {"type": "integer"}
+      }
+    },
+    "placement": {
+      "type": "object",
+      "properties": {
+        "kind": {"type": "string", "enum": ["packed", "striped", "runtime-native", "adversarial"]},
+        "nodes": {"type": "integer"}, "gpus_per_node": {"type": "integer"},
+        "scale_up_domain": {"type": "integer"}, "ranks": {"type": "integer"}
+      }
+    },
+    "eplb": {
+      "type": "object",
+      "properties": {
+        "enabled": {"type": "boolean"},
+        "num_logical_experts": {"type": "integer"}, "num_physical_experts": {"type": "integer"},
+        "imbalance_before": {"type": "number"}, "imbalance_after": {"type": "number"},
+        "mapping_hash": {"type": ["string", "null"]}
+      }
+    },
+    "anomalies": {"type": "array", "items": {"type": "object"}},
+    "anomaly_summary": {
+      "type": "object",
+      "properties": {
+        "count": {"type": "integer"}, "waived": {"type": "boolean"},
+        "types": {"type": "array", "items": {"type": "string"}}
+      }
+    },
+    "rows": {
+      "type": "array", "minItems": 1,
+      "items": {
+        "type": "object",
+        "required": ["tokens_per_rank", "global_tokens", "dispatch", "combine", "roundtrip",
+                     "isolated_sum", "samples_pooled", "byte_contracts", "correct"],
+        "properties": {
+          "tokens_per_rank": {"type": "integer", "minimum": 1},
+          "global_tokens": {"type": "integer", "minimum": 1},
+          "dispatch": {"$ref": "#/definitions/percentiles"},
+          "combine": {"$ref": "#/definitions/percentiles"},
+          "roundtrip": {"$ref": "#/definitions/percentiles"},
+          "isolated_sum": {"type": "object"},
+          "samples_pooled": {"type": "integer", "minimum": 1},
+          "percentile_interpolation": {"type": "string"},
+          "per_rank_dispatch_us": {"type": "object"},
+          "raw_samples": {"type": "object"},
+          "byte_contracts": {
+            "type": "object",
+            "required": ["token_rank_payload_copies", "token_expert_payload_copies",
+                         "dispatch_bytes", "combine_bytes"],
+            "properties": {
+              "token_rank_payload_copies": {"type": "integer"},
+              "token_expert_payload_copies": {"type": "integer"},
+              "dispatch_bytes": {"type": "integer"}, "combine_bytes": {"type": "integer"}
+            }
+          },
+          "roundtrip_tokens_per_second": {"type": ["number", "null"]},
+          "bandwidth": {
+            "type": "object",
+            "properties": {
+              "logical_payload_rate_gbps": {"type": "object"},
+              "backend_buffer_rate_gbps": {"type": "object"},
+              "algorithm_bandwidth_gbps": {"type": ["number", "null"]},
+              "bus_bandwidth_gbps": {"type": ["number", "null"]},
+              "wire_utilization": {"type": ["number", "null"]}
+            }
+          },
+          "fanout_hist": {"type": "array"},
+          "rank_load_hist": {"type": "array"},
+          "expert_load_cv": {"type": "number"}, "rank_load_cv": {"type": "number"},
+          "hotspot_ratio": {"type": "number"},
+          "dest_rank_load_max": {"type": "integer"}, "dest_rank_load_mean": {"type": "number"},
+          "empty_expert_count": {"type": "integer"}, "empty_rank_count": {"type": "integer"},
+          "source_token_stats": {"type": ["object", "null"]},
+          "anomalies": {"type": "array", "items": {"type": "object"}},
+          "correct": {"type": "boolean"}
+        }
+      }
+    }
+  },
+  "definitions": {
+    "percentiles": {
+      "type": "object",
+      "required": ["p50", "p90", "p95", "p99"],
+      "properties": {
+        "p50": {"type": "number"}, "p90": {"type": "number"},
+        "p95": {"type": "number"}, "p99": {"type": "number"}
+      }
+    }
+  }
+}
diff --git a/experimental/CollectiveX/schemas/workload-v1.schema.json b/experimental/CollectiveX/schemas/workload-v1.schema.json
new file mode 100644
index 000000000..5a12b5af0
--- /dev/null
+++ b/experimental/CollectiveX/schemas/workload-v1.schema.json
@@ -0,0 +1,50 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/workload-v1.schema.json",
+  "title": "CollectiveX canonical MoE routing workload manifest",
+  "description": "Manifest for a serialized routing trace (tests/workload.py). The <workload_id>.npz holds topk_idx/topk_weights; this manifest carries the identity, dimensions, routing profile, and SHA-256 checksums that gate cross-hardware comparison.",
+  "type": "object",
+  "additionalProperties": false,
+  "required": ["schema_version", "workload_id", "generator_version", "gate_weight_format",
+               "dims", "routing_profile", "seed", "checksums"],
+  "properties": {
+    "schema_version": {"const": 1},
+    "workload_id": {"type": "string", "pattern": "^[0-9a-f]{16}$",
+      "description": "Immutable id = sha256(generator|routing|hidden|topk|experts|gt|seed)[:16]."},
+    "generator_version": {"type": "string",
+      "description": "Routing generator identity; bump when numerics change so stale files can't masquerade."},
+    "gate_weight_format": {"type": "string"},
+    "dims": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["hidden", "topk", "experts", "global_tokens", "experts_per_rank"],
+      "properties": {
+        "hidden": {"type": "integer", "minimum": 1},
+        "topk": {"type": "integer", "minimum": 1},
+        "experts": {"type": "integer", "minimum": 1},
+        "global_tokens": {"type": "integer", "minimum": 1},
+        "experts_per_rank": {"type": "integer", "minimum": 1}
+      }
+    },
+    "routing_profile": {"type": "string",
+      "enum": ["uniform", "balanced", "balanced-rank-local", "zipf",
+               "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"]},
+    "seed": {"type": "integer"},
+    "checksums": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["topk_idx", "topk_weights", "trace"],
+      "properties": {
+        "topk_idx": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
+        "topk_weights": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
+        "trace": {"type": "string", "pattern": "^[0-9a-f]{64}$"}
+      }
+    },
+    "routing_stats": {"type": "object",
+      "description": "Realized fan-out / load / locality stats (advisory; not identity-defining)."},
+    "activation_profile": {"type": "string",
+      "description": "Value distribution of expert inputs (e.g. 'normal'); reserved for the value-sensitivity rig."},
+    "activation_identity": {"type": ["string", "null"],
+      "description": "Deterministic descriptor hash of the activation distribution (profile|seed|dims). Becomes a byte-hash once activations are serialized (model-trace)."}
+  }
+}
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
new file mode 100644
index 000000000..509d22cf8
--- /dev/null
+++ b/experimental/CollectiveX/summarize.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+"""CollectiveX — summarize a run's results.
+
+Two output modes over the same data:
+  (default)    a plain-text table for the Slurm/container log; ALSO the result
+               gate — exits non-zero if no valid results were produced, so a
+               failed/skipped benchmark doesn't get reported as a green job.
+  --markdown   GitHub-flavored markdown for a GitHub Actions job summary
+               (https://github.blog/.../supercharging-github-actions-with-job-summaries/);
+               reporting only, always exits 0. A workflow step appends this to
+               $GITHUB_STEP_SUMMARY so the run page shows a rendered table.
+
+    python summarize.py --results-dir results --runner gb200-nv_1 --ts <ts>
+    python summarize.py --results-dir results --markdown >> "$GITHUB_STEP_SUMMARY"
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+
+def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[dict]:
+    """Load every result doc of a recognized family (CLI_FAMILIES) from results_dir/*.json, sorted by path.
+
+    Skips env_* files, files not named "<runner>_*" (when runner is given), files whose name does not
+    contain ts (when ts is given), and files that are unreadable, not valid JSON, or not a JSON object."""
+    docs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
+        base = os.path.basename(path)
+        if base.startswith("env_") or (runner and not base.startswith(f"{runner}_")) or (ts and ts not in base):
+            continue
+        try:
+            with open(path) as fh:
+                d = json.load(fh)
+        except (ValueError, OSError):  # ValueError covers JSONDecodeError and undecodable bytes
+            continue
+        if isinstance(d, dict) and d.get("family") in CLI_FAMILIES:  # a JSON list/scalar has no .get
+            docs.append(d)
+    return docs
+
+
+# Families summarize.py recognizes: EP (moe), NCCL primitives, the single-process memcpy-family
+# collectives (offload/copy-engine/kv-cache), plus rl-mesh and allreduce-fw. A doc of any other family
+# is ignored; a run that produces ONLY recognized families must not be reported as "nothing".
+CLI_FAMILIES = ("nccl", "moe", "offload", "copy-engine", "kv-cache", "rl-mesh", "allreduce-fw")
+COLLECTIVE_FAMILIES = ("offload", "copy-engine", "kv-cache", "rl-mesh", "allreduce-fw")
+
+
+def _peak_busbw(rows):  # peak busbw_gbps over rows; a missing/None/0 value counts as 0.0, no rows -> 0.0
+    return max(map(lambda row: row.get("busbw_gbps") or 0.0, rows), default=0.0)
+
+
+def _coll_peak(d) -> float:
+    """Peak bandwidth (GB/s) reported by one collective-family doc.
+
+    A truthy top-level peak_bandwidth_gbps is returned as-is. Otherwise every row of rows[] and of each
+    groups[].rows[] is checked for bandwidth_gbps / bandwidth_gb_s / busbw_gbps (the field name varies
+    by family) and the largest truthy value wins; 0.0 when nothing is found.
+    """
+    top = d.get("peak_bandwidth_gbps")
+    if top:
+        return top
+
+    # Top-level rows first, then each group's rows; a missing or None rows entry scans as empty.
+    row_sets = [d.get("rows")] + [g.get("rows") for g in (d.get("groups", []) or [])]
+    best = 0.0
+    for rows in row_sets:
+        for r in rows or []:
+            for k in ("bandwidth_gbps", "bandwidth_gb_s", "busbw_gbps"):
+                v = r.get(k)
+                if v:
+                    best = max(best, v)
+    return best
+
+
+_OP_ORDER = ["all_reduce", "reduce_scatter", "all_gather", "alltoall"]
+
+
+def _row_lat(r):
+    """Smallest recorded time_us of a row's out_of_place / in_place variants, or None if neither has one."""
+    times = [t for t in ((r.get(k) or {}).get("time_us") for k in ("out_of_place", "in_place")) if t is not None]
+    return min(times, default=None)
+
+
+def _lat_floor(rows):
+    """Small-message latency floor: the _row_lat of the smallest row with size_bytes > 0, or NaN when
+    there is no such row or it carries no timing."""
+    # 0-byte rows are skipped: a sub-granularity no-op (~1 us), not a real latency.
+    sized = [row for row in rows if (row.get("size_bytes") or 0) > 0]
+    smallest = min(sized, key=lambda row: row["size_bytes"], default=None)
+    floor = None if smallest is None else _row_lat(smallest)
+    return float("nan") if floor is None else floor
+
+
+def _at_size(rows, size, fn):
+    """Apply fn to the first row whose size_bytes equals size; None when no row matches."""
+    # rows is scanned lazily, so fn runs at most once and only on the first match.
+    hit = next((row for row in rows if row.get("size_bytes") == size), None)
+    return None if hit is None else fn(hit)
+
+
+def _fmt_bytes(b):
+    """Render b as a whole number of GiB/MiB/KiB (largest unit that divides it exactly), else '<b> B'."""
+    units = ((2**30, "GiB"), (2**20, "MiB"), (2**10, "KiB"))
+    # First (largest) unit that b reaches and divides without remainder wins.
+    return next((f"{b // q} {name}" for q, name in units if b >= q and b % q == 0), f"{b} B")
+
+
+def _ops_sorted(nccl):
+    """Ops present in the docs: the _OP_ORDER ones first (in that order), then the rest sorted."""
+    seen = {doc.get("op") for doc in nccl}
+    return [op for op in _OP_ORDER if op in seen] + sorted(op for op in seen if op not in _OP_ORDER)
+
+
+def _ladder(nccl):
+    """Message sizes to tabulate: the fixed candidate sizes (16 KiB .. 4 GiB) that actually occur and are
+    below the largest measured size, followed by that largest size; [] if no row has size_bytes > 0."""
+    measured = {row["size_bytes"] for doc in nccl for row in doc.get("rows", []) if (row.get("size_bytes") or 0) > 0}
+    if not measured:
+        return []
+    top = max(measured)
+    candidates = (16384, 262144, 4194304, 67108864, 268435456, 1073741824, 4294967296)
+    return [s for s in candidates if s in measured and s < top] + [top]
+
+
+def _sweep_table(nccl, title, rowfn, fmt):
+    """Markdown table of rowfn(row) per op (columns) at each _ladder size (rows), numbers rendered
+    with fmt and anything non-numeric as an em dash. Returns the lines, or [] when no size is measured."""
+    sizes = _ladder(nccl)
+    if not sizes:
+        return []
+    ops = _ops_sorted(nccl)
+    rows_by_op = {doc.get("op"): doc.get("rows", []) for doc in nccl}  # later doc wins on a repeated op
+
+    def cell(op, size):
+        val = _at_size(rows_by_op.get(op, []), size, rowfn)
+        return format(val, fmt) if isinstance(val, (int, float)) else "—"
+
+    header = "| bytes/rank | " + " | ".join(f"`{op}`" for op in ops) + " |"
+    body = [f"| {_fmt_bytes(size)} | " + " | ".join(cell(op, size) for op in ops) + " |" for size in sizes]
+    return [f"\n**{title}**\n", header, "|---" + "|--:" * len(ops) + "|"] + body
+
+
+def _fnum(x, fmt):
+    return format(x, fmt) if isinstance(x, (int, float)) else "—"
+
+
+def _moe_sorted(moe):
+    return sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""), x.get("ep_size", 0)))
+
+
+def _moe_sweep_table(d):
+    """Markdown sweep table for one EP doc — the rows already ARE the ladder, so
+    emit one row per source-tokens-per-rank point. Skips old single-point docs
+    (no rows[])."""
+    rows = d.get("rows")
+    if not rows:
+        return []
+    sh = d.get("shape", {})
+    head = (f"\n**`{d.get('backend')}` · {d.get('phase')} · ep{d.get('ep_size')} · "
+            f"H{sh.get('hidden')} top{sh.get('topk')} E{sh.get('experts')} "
+            f"{sh.get('dispatch_dtype')} {sh.get('routing')}** — latency vs source tokens/rank\n")
+    out = [head,
+           "| tokens/rank | fan-out | dispatch µs | combine µs | serial µs (D+C) | tokens/s | recv max | correct |",
+           "|--:|--:|--:|--:|--:|--:|--:|:--:|"]
+    for r in rows:
+        out.append(f"| {r.get('tokens_per_rank')} | {_fnum(r.get('fanout_mean'), '.2f')} | "
+                   f"{_fnum(r.get('dispatch_us_p50'), '.2f')} | {_fnum(r.get('combine_us_p50'), '.2f')} | "
+                   f"{_fnum(r.get('serial_us_p50', r.get('roundtrip_us_p50')), '.2f')} | "
+                   f"{_fnum(r.get('tokens_per_second'), '.3e')} | "
+                   f"{r.get('recv_tokens_max', r.get('recv_tokens', '—'))} | {'✅' if r.get('correct') else '❌'} |")
+    return out
+
+
+def render_plain(nccl, moe, coll, n_valid, total) -> str:
+    out = []
+    hdr = "CollectiveX results"
+    anchor = (nccl + moe + coll)
+    if anchor:
+        d0 = anchor[0]
+        hdr += f" — runner={d0.get('runner')} topology={d0.get('topology_class')} transport={d0.get('transport')}"
+    out += ["=" * len(hdr), hdr, "=" * len(hdr)]
+    if coll:
+        out.append("\nMemcpy-family collectives (offload / copy-engine / kv-cache):")
+        out.append(f"  {'family':<13}{'status':<9}{'peak bw (GB/s)':>15}")
+        for d in sorted(coll, key=lambda x: x.get("family", "")):
+            out.append(f"  {d.get('family',''):<13}{d.get('status',''):<9}{_coll_peak(d):>15.1f}")
+    if nccl:
+        out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):")
+        out.append(f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'lat floor':>10}{'avg busbw':>11}")
+        for d in sorted(nccl, key=lambda x: x["op"]):
+            rows = d.get("rows", [])
+            avg = (d.get("summary") or {}).get("avg_busbw_gbps")
+            out.append(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
+                       f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
+    if moe:
+        out.append("\nMoE EP dispatch/combine (DeepEP / MoRI) — headline (* = headline tokens/rank):")
+        out.append(f"  {'backend':<9}{'phase':<8}{'ep':>3} {'status':<9}{'T*':>5}{'disp_p50':>10}{'comb_p50':>10}{'serial':>9}  correct")
+        for d in sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""))):
+            m, c = d.get("metrics", {}), d.get("correctness", {})
+            ser = m.get("serial_us_p50", m.get("roundtrip_us_p50"))
+            out.append(f"  {d.get('backend',''):<9}{d.get('phase',''):<8}{str(d.get('ep_size','')):>3} {d.get('status',''):<9}"
+                       f"{str(m.get('headline_tokens_per_rank','')):>5}"
+                       f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}{(m.get('combine_us_p50') or float('nan')):>10.1f}"
+                       f"{(ser or float('nan')):>9.1f}   {c.get('passed')}")
+    return "\n".join(out)
+
+
+def _emoji(status) -> str:
+    return "✅ valid" if status == "valid" else f"❌ {status}"
+
+
+def render_markdown(nccl, moe, coll, n_valid, total) -> str:  # GitHub job-summary markdown; n_valid is not read here
+    out = []
+    anchor = (nccl + moe + coll)  # the first doc of any family supplies the title fields
+    if anchor:
+        d0 = anchor[0]
+        out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}")
+    if coll:
+        out.append("\n### Memcpy-family collectives\n")
+        out.append("| family | status | peak bw (GB/s) |")
+        out.append("|---|---|--:|")
+        for d in sorted(coll, key=lambda x: x.get("family", "")):
+            out.append(f"| `{d.get('family','')}` | {_emoji(d.get('status'))} | {_coll_peak(d):.1f} |")
+    if nccl:
+        out.append(f"\n### NCCL/RCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n")
+        out.append("| op | status | peak busbw (GB/s) | lat floor (µs) |")
+        out.append("|---|---|--:|--:|")
+        for d in sorted(nccl, key=lambda x: _OP_ORDER.index(x["op"]) if x["op"] in _OP_ORDER else 99):  # unlisted ops last
+            rows = d.get("rows", [])
+            out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | {_lat_floor(rows):.2f} |")
+        out += _sweep_table(nccl, "Bus bandwidth vs bytes/rank (GB/s)", lambda r: r.get("busbw_gbps"), ".1f")
+        out += _sweep_table(nccl, "Latency vs bytes/rank (µs)", _row_lat, ".2f")
+        out.append("\n> bytes/rank = nccl/rccl-tests message size (= per-rank for all-reduce / "
+                   "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). Small "
+                   "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.")
+    if moe:
+        out.append("\n### MoE EP dispatch / combine (DeepEP / MoRI)\n")
+        out.append("Headline = the reference point (tokens/rank shown as `T*`); the per-line "
+                   "sweep tables below carry the full source-tokens-per-rank curve.\n")
+        out.append("| backend | phase | mode | dtype | resource | ep | routing (fan-out) | status | T\\* | dispatch p50 (µs) | combine p50 (µs) | serial p50 (µs) | tokens/s | correct |")
+        out.append("|---|---|---|---|---|--:|---|---|--:|--:|--:|--:|--:|:--:|")
+        for d in _moe_sorted(moe):
+            m, c = d.get("metrics", {}), d.get("correctness", {})  # NOTE(review): an explicit null here would raise — confirm docs never carry one
+            rp = d.get("routing_profile", {})  # NOTE(review): same null caveat as metrics/correctness
+            ser = m.get("serial_us_p50", m.get("roundtrip_us_p50"))  # serial_us_p50, else roundtrip_us_p50
+            sh = d.get("shape") or {}
+            fo = f"{sh.get('routing','?')} ({_fnum(rp.get('fanout_mean'), '.1f')})"
+            # dtype shows whether the fp8 cast was inside the timed dispatch (LL) or not.
+            dt = sh.get("dispatch_dtype", "?")
+            fit = (d.get("reproduction") or {}).get("fp8_quant_in_timing")
+            dt += "*" if fit else ("⁺" if fit is False else "")  # truthy -> "*", False -> "⁺", absent/None -> no marker
+            out.append(f"| `{d.get('backend')}` | {d.get('phase','')} | {d.get('mode','')} | {dt} | "
+                       f"{d.get('resource_mode','')} | {d.get('ep_size','')} | {fo} | {_emoji(d.get('status'))} | "
+                       f"{m.get('headline_tokens_per_rank','—')} | {_fnum(m.get('dispatch_us_p50'), '.1f')} | "
+                       f"{_fnum(m.get('combine_us_p50'), '.1f')} | {_fnum(ser, '.1f')} | "
+                       f"{_fnum(m.get('tokens_per_second'), '.3e')} | {'✅' if c.get('passed') else '❌'} |")
+        for d in _moe_sorted(moe):  # second pass: per-doc sweep tables, same order as the headline rows
+            out += _moe_sweep_table(d)
+        out.append("\n> EP sweep: only source tokens/rank varies along a line. **fan-out** = mean "
+                   "destination ranks/token (representativeness — top-k spread, not a permutation). "
+                   "Dispatch & combine timed **separately** (staging untimed); **serial = dispatch + "
+                   "combine** (a sum, not an independently-measured chained op). dtype `fp8*` = fp8 cast "
+                   "IS inside the timed dispatch (LL kernel); `fp8⁺` = cast is untimed preprocessing "
+                   "(normal mode). `mode` ll = DeepEP low-latency; `resource` = comm SM/CU regime.")
+    if not total:  # zero docs loaded: say so explicitly
+        out.append("\n> No result files found — the benchmark produced nothing.")
+    return "\n".join(out)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX result summary")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--runner", default=None)
+    ap.add_argument("--ts", default=None)
+    ap.add_argument("--markdown", action="store_true",
+                    help="emit GitHub job-summary markdown (reporting only; always exits 0)")
+    args = ap.parse_args()
+
+    docs = load_results(args.results_dir, args.runner, args.ts)
+    nccl = [d for d in docs if d["family"] == "nccl"]
+    moe = [d for d in docs if d["family"] == "moe"]
+    coll = [d for d in docs if d["family"] in COLLECTIVE_FAMILIES]
+    total = len(docs)
+    n_valid = sum(d.get("status") == "valid" for d in docs)
+
+    if args.markdown:
+        print(render_markdown(nccl, moe, coll, n_valid, total))
+        return 0  # reporting step — never fail the job here
+
+    print(render_plain(nccl, moe, coll, n_valid, total))
+    if total == 0:
+        print("ERROR: no result files found — benchmark produced nothing.")
+        return 1
+    if n_valid < total:
+        print(f"ERROR: {total - n_valid} result(s) invalid — failing the job.")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/sweep_matrix.py b/experimental/CollectiveX/sweep_matrix.py
new file mode 100644
index 000000000..2c0d98d14
--- /dev/null
+++ b/experimental/CollectiveX/sweep_matrix.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""CollectiveX — sweep matrix resolver (the `setup` job of collectivex-sweep.yml).
+
+Resolves the requested suites into the GHA matrix of SHARDS. A shard = one allocation that sweeps
+many cases sharing (sku, backend, deepep_v2, nodes); mode/resource_mode vary per case. Big shards
+are CHUNKED so no single matrix cell exceeds the GHA 6h job budget. Each case is enriched with its
+model dims (hidden/topk/experts from workloads.yaml) + token ladder + canonical flag, so the in-
+container shard loop (run_in_container.sh SHARD mode) needs no further config lookup.
+
+Knobs: --backends sweeps every EP library in ONE matrix; --backend remaps the deepep matrix onto a
+single other library (capability-filtered); --deepep-v2 threads kernel_gen=v2. Emits a JSON matrix for `fromJSON` in the
+workflow: {"include": [ {id, sku, backend, nodes, deepep_v2, n, cases:[...]}, ... ]} (--slim drops cases).
+
+  python3 sweep_matrix.py --suites all --out matrix.json
+  python3 sweep_matrix.py --suites all --backend uccl --max-cases 12 --out matrix.json
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+sys.path.insert(0, os.path.join(HERE, "tests"))
+import yaml  # noqa: E402
+import generate_matrix as gm  # noqa: E402
+import capability as cap  # noqa: E402
+
+# platform key -> workflow `sku` input value (must match the workflow's sku choices + runner label)
+SKU = {"h100": "h100-dgxc", "h200": "h200", "b300": "b300", "b200": "b200-dgxc",
+       "mi355x": "mi355x", "gb300": "gb300", "gb200": "gb200"}
+
+
+def _dims(wl_cfg, name):
+    for sec in ("synthetic", "model_derived"):
+        m = (wl_cfg.get(sec) or {}).get(name)
+        if m:
+            return m.get("hidden"), m.get("topk"), m.get("experts", m.get("routed_experts"))
+    return None, None, None
+
+
+def _ladder(suite_cfg, phase):
+    if phase == "decode" and suite_cfg.get("token_points_decode"):
+        return " ".join(map(str, suite_cfg["token_points_decode"]))
+    if phase == "prefill" and suite_cfg.get("token_points_prefill"):
+        return " ".join(map(str, suite_cfg["token_points_prefill"]))
+    if suite_cfg.get("token_points"):
+        return " ".join(map(str, suite_cfg["token_points"]))
+    return ""
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX sweep matrix resolver")
+    ap.add_argument("--suites", default="all", help="'all' or comma-list of suite names")
+    ap.add_argument("--backend", default="", help="remap deepep cases onto ONE EP lib (uccl/flashinfer/deepep-hybrid/nccl-ep)")
+    ap.add_argument("--backends", default="",
+                    help="combined multi-backend matrix in ONE run: 'all' or a comma-list "
+                         "(deepep,deepep-v2,uccl,flashinfer,deepep-hybrid,nccl-ep). Each deepep-origin "
+                         "case is emitted once per backend (capability-filtered); mori stays AMD-native. "
+                         "Supersedes per-backend dispatches. Overrides --backend/--deepep-v2 when set.")
+    ap.add_argument("--deepep-v2", action="store_true")
+    ap.add_argument("--only-sku", default="", help="restrict to one workflow sku value")
+    ap.add_argument("--min-nodes", type=int, default=0,
+                    help="keep only shards whose tray count (nodes, blank=1) is >= this; "
+                         "e.g. 2 = rack-scale EP8 only (skip the single-tray EP4 cells)")
+    ap.add_argument("--max-nodes", type=int, default=0,
+                    help="keep only shards whose tray count (nodes, blank=1) is <= this; "
+                         "e.g. 1 = single-tray EP4 only (skip the rack-scale EP8 cells)")
+    ap.add_argument("--max-cases", type=int, default=128, help="chunk shards larger than this into sub-cells (128 = effectively no chunking for current suites; each shard's cases run consecutively in ONE allocation, amortizing runner/enroot/build startup)")
+    ap.add_argument("--out", default="")
+    ap.add_argument("--slim", action="store_true",
+                    help="emit matrix WITHOUT the per-cell cases list (fits the GHA output size cap); "
+                         "cells re-resolve their own cases via --emit-shard")
+    ap.add_argument("--emit-shard", default="",
+                    help="write just this shard id's {cases:[...]} (the CX_SHARD_FILE for run_in_container)")
+    ap.add_argument("--shard-out", default="results/.shard.json")
+    a = ap.parse_args()
+
+    wl_cfg = yaml.safe_load(open(os.path.join(HERE, "configs", "workloads.yaml")))
+    suites_cfg = yaml.safe_load(open(os.path.join(HERE, "configs", "suites.yaml")))["suites"]
+    suite_names = list(suites_cfg) if a.suites == "all" else [s.strip() for s in a.suites.split(",")]
+
+    # Backend expansion targets for a deepep-origin case, as (backend, deepep_v2) pairs:
+    #  --backends "all"|comma-list -> COMBINED matrix (every backend in ONE run; supersedes the
+    #    per-backend dispatches). 'deepep-v2' is the from-source V2 kernel = deepep + v2 flag.
+    #  else -> the legacy single --backend (+ --deepep-v2) behavior.
+    NV_EP_ALL = ["deepep", "deepep-v2", "uccl", "flashinfer", "deepep-hybrid", "nccl-ep"]
+    if a.backends:
+        names = NV_EP_ALL if a.backends == "all" else [x.strip() for x in a.backends.split(",") if x.strip()]
+        targets = [("deepep", True) if n == "deepep-v2" else (n, False) for n in names]
+    else:
+        targets = [(a.backend or "deepep", a.deepep_v2)]
+
+    # collect enriched cases, deduped globally (a config shared by several suites appears once)
+    seen = set()
+    shards: dict = {}
+    for sname in suite_names:
+        scfg = suites_cfg[sname]
+        for c in gm.generate(sname)["cases"]:
+            plat = c["platform"]
+            beng0 = c["backend"]
+            if beng0 not in ("deepep", "mori"):
+                continue
+            sku = SKU.get(plat, plat)
+            if a.only_sku and sku != a.only_sku:
+                continue
+            phase = c["phase"]
+            rmode = c["resource_mode"]
+            lad = _ladder(scfg, phase)
+            h, t, e = _dims(wl_cfg, c["workload"])
+            # MoRI envelope guard: capped ladder (T=1..16) + tuned for BOTH phases. MoRI prefill IS
+            # supported (MORI-EP does intra+inter-node, both modes — ROCm/mori); prefill at the capped
+            # ladder is validated 5/5 (run 28461798511). It was an UNCAPPED ladder to T=128 that timed
+            # out, not prefill itself — so prefill is capped here, NOT skipped (correcting an earlier
+            # decode-only assumption).
+            if sku == "mi355x":
+                lad, rmode = "1 2 4 8 16", "tuned"
+            # rack-scale tray->nodes (gb200/gb300 = 4 GPU/tray): EP4 = 1 tray, EP8 = 2 trays. ALWAYS
+            # set an EXPLICIT count: the gb300 launcher does NODES="${CX_NODES:-2}", so an EMPTY
+            # CX_NODES coerces to 2 (EP8) — an EP4 cell with nodes="" silently ran EP8 (the rack
+            # multi-srun, which bypasses cx_build_deepep_v2 / cx_build_flashinfer_latest). nodes="1"
+            # makes EP4 actually run EP4 (run_in_container, which builds V2/quant-combine).
+            nodes = ""
+            if plat in ("gb200", "gb300"):
+                nodes = str(max(1, int(c.get("ep") or 8) // 4))
+            # The broad sweep runs SEEDED-runtime (comparable-experimental), NOT pre-staged canonical:
+            # a fixed seed + identical params already yields the same cross-SKU trace for a fair
+            # comparison, without the per-case canonical-manifest staging (overhead + a fragility — the
+            # official cohort is a separate targeted run). run_in_container also re-stages per case if
+            # canonical is ever re-enabled (the CX_WORKLOAD_DIR unset fix).
+            canonical = False
+            # mori cases stay AMD-native; deepep-origin cases expand across the requested backend set.
+            case_targets = [("mori", False)] if beng0 == "mori" else targets
+            for (beng, v2) in case_targets:
+                ok, _r = cap.resolve(plat, beng, mode=c["mode"], dtype=c["dtype"], contract=c["contract"],
+                                     routing=c["routing"], eplb=bool(c.get("eplb")),
+                                     activation_profile=c.get("activation_profile", "normal"))
+                if not ok:
+                    continue
+                # DeepEP V2 (from-source kernel_gen=v2) is genuine on aarch64 gb200/gb300 at BOTH EP4
+                # (single-tray, gb300 run 28429220764) AND EP8 rack (2-tray MNNVL, gb300 run 28434764062
+                # -> kernel_gen=v2/ws8/correct). The EP8 rack path builds V2 once-per-node into a persistent
+                # container (CX_BUILD_ONLY) and the harness passes allow_mnnvl=True (CX_ALLOW_MNNVL) so the
+                # NVL buffer spans trays — so v2 is now allowed on gb200/gb300 at every EP degree.
+                case = {
+                    "backend": beng, "deepep_v2": v2, "mode": c["mode"], "dtype": c["dtype"],
+                    "contract": c["contract"], "routing": c["routing"], "phase": phase,
+                    "eplb": bool(c.get("eplb")), "resource_mode": rmode,
+                    "activation_profile": c.get("activation_profile", "normal"),
+                    "placement": c.get("placement", "packed"), "routing_step": str(c.get("routing_step", 0)),
+                    "uneven_tokens": c.get("uneven_tokens", "none"),
+                    "hidden": "" if h in (None, 7168) else str(h),
+                    "topk": "" if t in (None, 8) else str(t),
+                    "experts": "" if e in (None, 256) else str(e),
+                    "ladder": lad, "canonical": canonical, "nodes": nodes,
+                }
+                sig = (sku, beng, v2, c["mode"], c["dtype"], c["contract"], c["routing"], phase,
+                       case["eplb"], rmode, case["activation_profile"], case["placement"],
+                       case["routing_step"], case["uneven_tokens"], case["hidden"], case["topk"],
+                       case["experts"], nodes)
+                if sig in seen:
+                    continue
+                seen.add(sig)
+                # shard key = the CONTAINER/allocation-determining fields only: (sku, backend, v2, nodes).
+                # mode + resource_mode are per-case runtime knobs (run_in_container reads CX_MODE/
+                # CX_RESOURCE_MODE per case), so they do NOT split shards — all modes/rmodes of one
+                # (sku,backend,v2,nodes) run consecutively in ONE allocation, paying the enroot import +
+                # from-source build ONCE (not once per mode).
+                key = (sku, beng, v2, nodes)
+                shards.setdefault(key, []).append(case)
+
+    # PER-BACKEND chunk size. Fast backends (deepep*/nccl-ep/mori/deepep-hybrid) run a whole build-group
+    # in ONE allocation (max_cases, ~no chunking). flashinfer is SLOW (~3.2 min/case, heavy per-case MNNVL
+    # workspace setup) and intermittently hits `CUDA error: unspecified launch failure` under rapid
+    # back-to-back cases — so chunk it small: bounded, PARALLEL jobs, fewer successive setups per
+    # allocation. (uccl is NOT chunked: it fit a 74-case allocation cleanly; its only misses were a few
+    # ll-mode per-case timeouts that chunking wouldn't change.)
+    SLOW_MAX_CASES = {"flashinfer": 12}   # 12 (not 16): flashinfer cases retry up to 3x for the intermittent
+                                          # MNNVL-barrier deadlock, so smaller chunks keep a chunk within --time.
+    include = []
+    for (sku, beng, v2, nodes), cases in sorted(shards.items()):
+        if a.min_nodes and max(1, int(nodes or 1)) < a.min_nodes:
+            continue   # --min-nodes: skip single-tray (EP4) shards, keep only rack-scale (EP8+)
+        if a.max_nodes and max(1, int(nodes or 1)) > a.max_nodes:
+            continue   # --max-nodes: skip rack-scale (EP8+) shards, keep only single-tray (EP4)
+        tag = beng + ("-v2" if v2 else "")   # distinct shard id/runner for the V2 kernel variant
+        mc = min(a.max_cases, SLOW_MAX_CASES.get(beng, a.max_cases))
+        for ci in range(0, len(cases), mc):
+            chunk = cases[ci:ci + mc]
+            part = ci // mc
+            sid = f"{sku}-{tag}" + (f"-n{nodes}" if nodes else "") + (f"-p{part}" if len(cases) > mc else "")
+            include.append({
+                "id": sid, "sku": sku, "backend": beng,
+                "nodes": nodes, "deepep_v2": v2,
+                "n": len(chunk), "cases": chunk,
+            })
+
+    # --emit-shard: write just one shard's cases (the per-cell CX_SHARD_FILE) and exit.
+    if a.emit_shard:
+        match = next((x for x in include if x["id"] == a.emit_shard), None)
+        if match is None:
+            print(f"ERROR: shard id '{a.emit_shard}' not found among {len(include)} cells", file=sys.stderr)
+            return 2
+        os.makedirs(os.path.dirname(a.shard_out) or ".", exist_ok=True)
+        with open(a.shard_out, "w") as fh:
+            json.dump({"id": match["id"], "sku": match["sku"], "backend": match["backend"],
+                       "nodes": match["nodes"], "deepep_v2": match["deepep_v2"],
+                       "cases": match["cases"]}, fh)
+        print(f"wrote shard {a.emit_shard} ({match['n']} cases) -> {a.shard_out}", file=sys.stderr)
+        return 0
+
+    n_cells = len(include)
+    n_cases = sum(x["n"] for x in include)
+    # slim: drop the heavy `cases` from each cell so the matrix fits the GHA job-output size cap;
+    # each cell re-derives its cases with --emit-shard <id>.
+    out_include = ([{k: v for k, v in x.items() if k != "cases"} for x in include]
+                   if a.slim else include)
+    matrix = {"include": out_include}
+    if a.out:
+        with open(a.out, "w") as fh:
+            json.dump(matrix, fh)
+    print(f"resolved {n_cells} shard-cells, {n_cases} cases "
+          f"(suites={len(suite_names)} backends={a.backends or a.backend or 'deepep'} v2={a.deepep_v2})",
+          file=sys.stderr)
+    # stdout = the matrix JSON (for `$(...)` capture in the workflow)
+    print(json.dumps(matrix))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/_gb300_ep_probe.py b/experimental/CollectiveX/tests/_gb300_ep_probe.py
new file mode 100644
index 000000000..3889c98f5
--- /dev/null
+++ b/experimental/CollectiveX/tests/_gb300_ep_probe.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""GB300 EP8 GO/NO-GO probe — does DeepEP work across 2 NVL72 trays (8 ranks / 2 nodes)?
+
+Read-only spike (no artifacts). One PATH per process (CX_PROBE_PATH), because NVSHMEM
+inits once per process and the internode/LL buffers each bootstrap it. Reports, on rank 0,
+which Buffer construction + a 1-shot dispatch/combine round-trip actually runs on this fabric:
+
+  intranode  Buffer(group, nvl, 0)                         (MNNVL-as-one-NVLink-domain hope)
+  internode  Buffer(group, nvl, rdma>0)                    (DeepEP NVSHMEM path, over NVLink/IB)
+  ll         Buffer(group, 0, rdma, low_latency_mode=True) (decode path; nvlink-LL allowed)
+
+Env (set per-rank by the srun wrapper): RANK WORLD_SIZE LOCAL_RANK MASTER_ADDR MASTER_PORT
+                                        CX_PROBE_PATH=intranode|internode|ll
+"""
+import os
+import socket
+import sys
+import traceback
+
+import torch
+import torch.distributed as dist
+
+RANK = int(os.environ["RANK"])  # process-group rank (passed to init_process_group); KeyError if unset
+WORLD = int(os.environ["WORLD_SIZE"])  # number of ranks in the process group; KeyError if unset
+LR = int(os.environ["LOCAL_RANK"])  # used as this process's CUDA device index; KeyError if unset
+PATH = os.environ.get("CX_PROBE_PATH", "intranode")  # which Buffer path to probe: intranode | internode | ll
+HOST = socket.gethostname()  # tagged onto every log line and gathered to count distinct nodes
+H = 7168  # second dimension of the probe input tensor (hidden size)
+TOPK = 8  # expert ids drawn per token
+EXPERTS = WORLD * 32          # 256 at world=8 — same as the real sweep
+T = 8                          # tiny: this is a does-it-run probe, not a timing run
+
+
+def log(msg):
+    print(f"[r{RANK}@{HOST} {PATH}] {msg}", flush=True)
+
+
+def main():  # returns 0 on success, 1 when the probed path raised, 2 for an unknown CX_PROBE_PATH
+    torch.cuda.set_device(LR)
+    dev = torch.device(f"cuda:{LR}")
+    dist.init_process_group("nccl", rank=RANK, world_size=WORLD)
+
+    import deep_ep  # imported inside main, after the process group exists
+    from deep_ep import Buffer
+    if RANK == 0:  # rank 0 only: log versions, the Buffer signature and which entry points exist
+        import inspect
+        try:
+            import importlib.metadata as md
+            ver = md.version("deep_ep")
+        except Exception:
+            ver = getattr(deep_ep, "__version__", "?")  # fall back to the module attribute, else "?"
+        log(f"deep_ep={ver} torch={torch.__version__} cuda={torch.version.cuda}")
+        log(f"Buffer.__init__{inspect.signature(Buffer.__init__)}")
+        log(f"caps: internode_dispatch={hasattr(Buffer,'internode_dispatch')} "
+            f"get_dispatch_config={hasattr(Buffer,'get_dispatch_config')} "
+            f"low_latency_dispatch={hasattr(Buffer,'low_latency_dispatch')} "
+            f"ll_rdma_hint={hasattr(Buffer,'get_low_latency_rdma_size_hint')}")
+
+    hosts = [None] * WORLD
+    dist.all_gather_object(hosts, HOST)  # every rank contributes its hostname
+    if RANK == 0:
+        uniq = sorted(set(hosts))
+        log(f"world={WORLD} over {len(uniq)} node(s): {uniq}")
+
+    group = dist.group.WORLD
+    x = torch.randn(T, H, dtype=torch.bfloat16, device=dev)
+    g = torch.Generator(device=dev).manual_seed(1234 + RANK)  # seed differs per rank
+    idx = torch.stack([torch.randperm(EXPERTS, device=dev, generator=g)[:TOPK]  # TOPK distinct expert ids per token
+                       for _ in range(T)]).to(torch.int64)
+    w = torch.rand(T, TOPK, device=dev, generator=g).to(torch.float32)
+
+    dist.barrier()
+    try:
+        if PATH == "intranode":
+            buf = Buffer(group, 1 * 1024**3, 0)  # 1 GiB first size, 0 second (nvl / rdma per the module docstring)
+            try:
+                Buffer.set_num_sms(24)
+            except Exception:  # best-effort: any failure to set the SM count is ignored
+                pass
+            ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS)
+            rx, _ri, rw, _nre, h, _ev = buf.dispatch(
+                x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr,
+                num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir,
+                num_tokens_per_expert=ntpe)
+            cx, _, _ = buf.combine(rx, h, topk_weights=rw)
+            rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape  # rx may come back as a tuple
+            log(f"RESULT intranode OK: recv={tuple(rxs)} combine={tuple(cx.shape)} "
+                f"rdma_rank_layout={'present' if ntrr is not None else 'None'}")
+
+        elif PATH == "internode":  # same sequence as intranode, but with a non-zero second buffer size
+            buf = Buffer(group, 1 * 1024**3, 1 * 1024**3)
+            try:
+                Buffer.set_num_sms(24)
+            except Exception:  # best-effort, as above
+                pass
+            ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS)
+            rx, _ri, rw, _nre, h, _ev = buf.dispatch(
+                x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr,
+                num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir,
+                num_tokens_per_expert=ntpe)
+            cx, _, _ = buf.combine(rx, h, topk_weights=rw)
+            rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape
+            log(f"RESULT internode OK: recv={tuple(rxs)} combine={tuple(cx.shape)} "
+                f"rdma_rank_layout={'present' if ntrr is not None else 'None'}")
+
+        elif PATH == "ll":
+            num_max = 128  # passed to the size hint and to low_latency_dispatch
+            rdma = Buffer.get_low_latency_rdma_size_hint(num_max, H, WORLD, EXPERTS)
+            nq = max(1, EXPERTS // WORLD)  # EXPERTS // WORLD, floored at 1
+            buf = Buffer(group, 0, rdma, low_latency_mode=True, num_qps_per_rank=nq,
+                         allow_nvlink_for_low_latency_mode=True)
+            rx, rc, h, _ev, _hook = buf.low_latency_dispatch(
+                x, idx, num_max, EXPERTS, use_fp8=False, return_recv_hook=False)
+            cx, _ev2, _hook2 = buf.low_latency_combine(rx, idx, w, h)
+            rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape
+            log(f"RESULT ll OK: recv={tuple(rxs)} combine={tuple(cx.shape)}")
+        else:
+            log(f"unknown CX_PROBE_PATH={PATH}")
+            return 2  # the finally block below still tears down the process group
+        dist.barrier()  # success path; presumably pairs with the barrier a failing rank issues below — TODO confirm
+    except Exception as exc:
+        if RANK == 0:
+            log(f"RESULT {PATH} FAIL: {exc!r}")
+            tb = traceback.format_exc().strip().splitlines()
+            for ln in tb[-8:]:  # only the last 8 traceback lines
+                log(f"  | {ln}")
+        # let other ranks print their error too (often the real one is rank-specific)
+        else:
+            log(f"FAIL(non0): {exc!r}")
+        try:
+            dist.barrier()
+        except Exception:  # a barrier failure after an error is ignored
+            pass
+        return 1
+    finally:
+        try:
+            dist.destroy_process_group()
+        except Exception:  # teardown errors are ignored
+            pass
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/experimental/CollectiveX/tests/allreduce_fw_bench.py b/experimental/CollectiveX/tests/allreduce_fw_bench.py
new file mode 100644
index 000000000..609c2c7b1
--- /dev/null
+++ b/experimental/CollectiveX/tests/allreduce_fw_bench.py
@@ -0,0 +1,531 @@
+#!/usr/bin/env python3
+"""CollectiveX — framework custom all-reduce benchmark (family=allreduce-fw).
+
+Goal P2 "Low-latency all-reduce suite", framework-integrated tier. The standardized
+NCCL all-reduce is already covered by run_nccl.py (nccl-tests); this benchmark times the
+CUSTOM all-reduce kernels the serving frameworks ship — the ones that beat NCCL in the
+small-to-medium, latency-bound regime (TP all-reduce of activations: a few KiB .. tens of
+MiB) by doing a single one-shot or two-shot NVLink reduction instead of a ring.
+
+It runs under torchrun (multi-process, one rank per GPU) and, for EACH importable
+framework, times an all-reduce-sum of a bf16/fp32 tensor across the whole world over a
+latency-focused size ladder, CUDA-event timed, validating the result against a known
+reference. NCCL (torch.distributed.all_reduce) is the always-present baseline.
+
+Implementations measured (each IMPORT-GUARDED — a framework that isn't importable in the
+container is recorded as skipped, never faked):
+  * nccl                 — torch.distributed.all_reduce (baseline)
+  * flashinfer-oneshot   } flashinfer custom all-reduce (trtllm fusion / vLLM-style
+  * flashinfer-twoshot   } custom-allreduce), one-shot and two-shot recorded separately
+  * sglang               — sgl_kernel / sglang custom all-reduce
+  * vllm, aiter          — vllm / AITER (AMD) custom all-reduce (either may or may not be in the image)
+
+Each measured impl is one group:
+  {impl, dtype, world_size, rows:[{size_bytes, latency_us, algbw_gbps, busbw_gbps, correct}]}
+busbw uses the all-reduce factor 2*(n-1)/n (same as nccl-tests) so framework and NCCL bus
+bandwidth are directly comparable. status=valid iff the nccl baseline produced rows with
+bw>0 (nccl alone suffices). A top-level frameworks_available dict records which frameworks were importable.
+
+Stdlib + torch; torch (and every framework) is imported lazily so `--help` works on a login
+node with no GPU. One provenance-tagged JSON like rl_mesh_bench.py / run_nccl.py.
+
+  torchrun --nproc_per_node=8 tests/allreduce_fw_bench.py --runner h200-dgxc \\
+      --topology-class h200-nvlink-island --transport nvlink \\
+      --env-json results/env.json --out results/h200_allreduce_fw.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1  # emitted as "schema_version" in the output JSON
+MEASUREMENT_CONTRACT = "allreduce-fw-v1"  # emitted in the JSON and hashed into every comparison_key
+FAMILY = "allreduce-fw"  # emitted as "family" in the output JSON
+
+# Latency-focused ladder: 1 KiB .. 64 MiB. This is the regime where a custom one-shot /
+# two-shot NVLink all-reduce beats the NCCL ring (small messages are latency-bound; the
+# ring's 2*(n-1) hops dominate). Above ~tens of MiB NCCL's bandwidth-optimal ring wins, so
+# we deliberately stop at 64 MiB — past the crossover the framework kernels stop being the
+# point. Geometric x4 keeps the sweep short (9 points) so per-impl warmup cost stays bounded.
+DEFAULT_MIN_BYTES = 1 << 10   # 1 KiB
+DEFAULT_MAX_BYTES = 64 << 20  # 64 MiB
+
+# Custom all-reduce kernels are written for fp16/bf16 activations (TP all-reduce); a few also
+# take fp32. bf16 is the headline serving dtype. Map to torch dtype lazily (torch imported in main).
+_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp32": 4}
+
+
+def _sizes(lo: int, hi: int, factor: int = 4):
+    """Geometric size ladder [lo, lo*factor, lo*factor**2, ...] up to and including hi ([] if lo > hi)."""
+    if lo <= 0 or factor <= 1:
+        # Such a ladder never climbs past hi (0*f == 0, s*1 == s), so it would grow without end.
+        raise ValueError(f"need lo > 0 and factor > 1 (got lo={lo}, factor={factor})")
+    return [] if lo > hi else [lo] + _sizes(lo * factor, hi, factor)
+
+
+def comparison_key(meta: dict) -> str:
+    """Return the 16-hex-char curve id of a row: the SHA-256 prefix of
+    impl|dtype|world_size|topology_class|measurement_contract. Rows whose keys differ
+    (another impl, another topology, ...) must never be overlaid on the same curve."""
+    fields = ("impl", "dtype", "world_size", "topology_class", "measurement_contract")
+    joined = "|".join(str(meta[f]) if f == "world_size" else meta[f] for f in fields)
+    return hashlib.sha256(joined.encode()).hexdigest()[:16]
+
+
+def _bench(fn, torch, warmup: int, iters: int) -> float:
+    """Mean milliseconds per fn() call: `warmup` untimed calls, then `iters` calls bracketed by two
+    CUDA events; `torch` is passed in because the module imports it lazily."""
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()  # drain warmup work before the timed window opens
+    t_begin, t_end = (torch.cuda.Event(enable_timing=True) for _ in range(2))
+    t_begin.record()
+    for _ in range(iters):
+        fn()
+    t_end.record()
+    torch.cuda.synchronize()  # wait for completion before the elapsed time is read
+    total_ms = t_begin.elapsed_time(t_end)
+    return total_ms / iters
+
+
+def _bandwidths(nbytes: int, ms: float, world: int):
+    """Return (algbw, busbw) in GB/s for one all-reduce of nbytes that took ms milliseconds, using the
+    nccl-tests convention: algbw = bytes/second, busbw = algbw * 2*(n-1)/n. ms <= 0 -> (0.0, 0.0)."""
+    if ms <= 0:
+        return 0.0, 0.0
+    alg_gbps = (nbytes / (ms / 1e3)) / 1e9
+    if world > 1:
+        return alg_gbps, alg_gbps * (2.0 * (world - 1) / world)
+    return alg_gbps, alg_gbps * 1.0
+
+
+# --------------------------------------------------------------------------------------
+# Implementation registry. Each entry is a builder: given (torch, dist, dev, world, rank,
+# dtype_str) it returns either None (framework/kernel not available -> skipped) or a dict
+#   {"runner": fn(tensor)->None in-place all-reduce-sum (or None plus a "skip" reason), "free": optional teardown, "note": description}.
+# Every builder is fully import-guarded and never raises out — an unavailable framework is a
+# recorded skip with a note, never a fake row. Several framework entrypoints are GUESSED
+# defensively across plausible API surfaces (flashinfer/sglang/vllm reorganize these often);
+# each guess is tried under try/except and simply yields "skipped" if absent, so a wrong guess
+# degrades to a skip rather than a crash.
+# --------------------------------------------------------------------------------------
+
+def _build_nccl(torch, dist, dev, world, rank, dtype):
+    """NCCL baseline impl: its runner sum-reduces the given tensor across ranks with dist.all_reduce."""
+    def _allreduce_sum(buf):
+        dist.all_reduce(buf, op=dist.ReduceOp.SUM)
+    return {"runner": _allreduce_sum, "note": "torch.distributed.all_reduce (NCCL ring)"}
+
+
+# FlashInfer custom AR works on a [token_num, hidden_dim] activation tensor (the TP all-reduce
+# shape), so the flashinfer impls sweep this fixed hidden and reshape the bench's flat buffer to
+# [numel/H, H]. Sizes below H or not a multiple of it (on the default ladder: 1 KiB for bf16/fp16; 1 KiB and 4 KiB
+# for fp32) raise _SkipSize -> the bench records a skipped row and continues (does NOT mark the impl failed).
+_FI_AR_HIDDEN = 2048
+
+
+class _SkipSize(Exception):
+    """Raised by an impl's run() for a size its kernel can't handle; main() logs a skipped row for that size and keeps the impl."""
+
+
+def _build_flashinfer(torch, dist, dev, world, rank, dtype, variant):
+    """FlashInfer custom all-reduce, one-shot vs two-shot as distinct impls (contract pinned on B300,
+    flashinfer 0.6.8.post1): trtllm_allreduce_fusion with pattern_code=AllReduceFusionPattern.kAllReduce
+    (pure AR, no fusion); use_oneshot True/False selects the variant. The IPC workspace comes from
+    trtllm_create_ipc_workspace_for_all_reduce_fusion(tp_rank, tp_size, max_token_num, hidden_dim, group)
+    -> (ipc_handles, workspace_ptrs) and is sized for DEFAULT_MAX_BYTES, so larger sizes are skipped.
+    Returns None if flashinfer is not importable, {"runner": None, "skip": reason} if the entrypoints or
+    the workspace are unavailable, else {"runner": run, "free": free, "note": ...}."""
+    try:
+        import flashinfer.comm as ficomm
+        from flashinfer.comm import trtllm_ar as fi_ar
+    except Exception:
+        return None
+    fusion = getattr(ficomm, "trtllm_allreduce_fusion", None)
+    mkws = getattr(ficomm, "trtllm_create_ipc_workspace_for_all_reduce_fusion", None)
+    rmws = getattr(ficomm, "trtllm_destroy_ipc_workspace_for_all_reduce_fusion", None)
+    Pat = getattr(fi_ar, "AllReduceFusionPattern", None) or getattr(ficomm, "AllReduceFusionPattern", None)
+    if fusion is None or mkws is None or Pat is None or not hasattr(Pat, "kAllReduce"):
+        return {"runner": None,
+                "skip": "flashinfer.comm lacks trtllm_allreduce_fusion / IPC workspace / "
+                        "AllReduceFusionPattern.kAllReduce"}
+    H = _FI_AR_HIDDEN
+    use_oneshot = (variant == "oneshot")
+    max_tok = max(1, (DEFAULT_MAX_BYTES // _DTYPE_BYTES[dtype]) // H)
+    try:
+        ws = mkws(rank, world, max_tok, H, group=dist.group.WORLD)
+    except Exception as exc:
+        return {"runner": None, "skip": f"fusion IPC workspace creation failed: {exc!r}"}
+    ipc_handles = ws[0] if isinstance(ws, (list, tuple)) else None
+    ws_ptrs = ws[1] if isinstance(ws, (list, tuple)) and len(ws) >= 2 else None
+    out_buf = {}  # token_num -> reusable output tensor, so the timed loop does not allocate
+
+    def run(t, _f=fusion, _pat=Pat.kAllReduce, _os=use_oneshot, _wp=ws_ptrs):
+        numel = t.numel()
+        if numel < H or (numel % H) != 0:
+            raise _SkipSize(f"size {numel} elems not a multiple of hidden {H}")
+        Tn = numel // H
+        if Tn > max_tok:  # workspace was created for max_tok tokens (--max-bytes may exceed that)
+            raise _SkipSize(f"token_num({Tn}) exceeds IPC workspace max_token_num({max_tok})")
+        # Two-shot splits the sequence dim across ranks -> it asserts token_num > tp_size. One-shot
+        # has no such floor. Skip (don't fail) the small sizes where two-shot can't run.
+        if not _os and Tn <= world:
+            raise _SkipSize(f"two-shot needs token_num({Tn}) > tp_size({world})")
+        inp = t.view(Tn, H)
+        if Tn not in out_buf:
+            out_buf[Tn] = torch.empty_like(inp)
+        out = out_buf[Tn]
+        _f(allreduce_in=inp, world_size=world, world_rank=rank, token_num=Tn, hidden_dim=H,
+           workspace_ptrs=_wp, launch_with_pdl=False, trigger_completion_at_end=True,
+           fp32_acc=True, pattern_code=_pat, use_oneshot=_os, allreduce_out=out,
+           residual_in=None, residual_out=None, norm_out=None, quant_out=None, scale_out=None,
+           rms_gamma=None, rms_eps=None, scale_factor=None, layout_code=None)
+        # The kernel is out-of-place; copy back so the bench's in-place run(t) contract + its correctness
+        # check (which reads t) hold. The copy sits inside the timed region; the group's note flags it.
+        t.copy_(out.view(-1))
+
+    def free():
+        if rmws is not None and ipc_handles is not None:
+            try:
+                rmws(ipc_handles, group=dist.group.WORLD)
+            except Exception:
+                pass
+
+    return {"runner": run, "free": free,
+            "note": f"flashinfer.comm.trtllm_allreduce_fusion kAllReduce use_oneshot={use_oneshot} "
+                    f"(hidden={H}, out-of-place + copy-back)"}
+
+
+def _sglang_vllm_ca_runner(ps, torch, dev, world, rank, fw):
+    """Shared by the sglang / vllm / aiter builders; `ps` is that framework's parallel_state module.
+    Unless ps.model_parallel_is_initialized(), replicates the serving init (init_distributed_environment
+    + initialize_model_parallel with TP size = world) over env://, because — per the original author's
+    note — the CustomAllreduce wrapper builds ca_comm only after initialize_model_parallel.
+    Returns {"runner": None, "skip": reason} when init raises or ca_comm is missing/disabled, otherwise
+    {"runner": run, "free": tp.destroy if present, "note": ...}; run() raises _SkipSize when should_custom_ar(t) is false."""
+    try:
+        if not ps.model_parallel_is_initialized():
+            ps.init_distributed_environment(world_size=world, rank=rank,
+                                            distributed_init_method="env://",
+                                            local_rank=local_device_index(dev), backend="nccl")
+            ps.initialize_model_parallel(tensor_model_parallel_size=world)
+        tp = ps.get_tp_group()
+    except Exception as e:
+        return {"runner": None, "skip": f"{fw} distributed init failed: {e!r}"}
+    # sglang/vllm expose ca_comm directly on the GroupCoordinator; aiter nests it under
+    # device_communicator.ca_comm — try both.
+    ca = getattr(tp, "ca_comm", None) or getattr(getattr(tp, "device_communicator", None), "ca_comm", None)
+    if ca is None or getattr(ca, "disabled", True):  # a ca_comm with no `disabled` attribute counts as disabled
+        return {"runner": None,
+                "skip": f"{fw} TP group ca_comm absent/disabled (no custom-AR at world={world}; "
+                        f"needs >1 rank + a supported topology/size)"}
+
+    def run(t, _ca=ca):
+        if hasattr(_ca, "should_custom_ar") and not _ca.should_custom_ar(t):
+            raise _SkipSize(f"{fw} ca_comm: size outside custom-AR range")
+        out = _ca.custom_all_reduce(t)
+        if out is not None and out.data_ptr() != t.data_ptr():  # result landed in a different buffer
+            t.copy_(out)  # honour the in-place run(t) contract the bench relies on
+    return {"runner": run, "free": getattr(tp, "destroy", None),
+            "note": f"{fw} GroupCoordinator.ca_comm.custom_all_reduce (serving init replicated)"}
+
+
+def _build_sglang(torch, dist, dev, world, rank, dtype):
+    """SGLang custom all-reduce: import sglang's parallel_state and hand it to the shared serving-init
+    helper, which uses the TP group's ca_comm. An import failure yields a skip dict, never an exception."""
+    try:
+        from sglang.srt.distributed import parallel_state as sgl_ps
+    except Exception as exc:
+        why = f"sglang.srt.distributed import failed (not in image?): {exc!r}"
+        return {"runner": None, "skip": why}
+    return _sglang_vllm_ca_runner(sgl_ps, torch, dev, world, rank, "sglang")
+
+
+def _build_vllm(torch, dist, dev, world, rank, dtype):
+    """vLLM in-tree custom all-reduce via its GroupCoordinator, driven by the shared serving-init helper
+    (vllm.distributed.parallel_state is passed as `ps`). Returns a skip dict when vllm is not importable
+    (it needs a vLLM container image); otherwise whatever the helper returns, with free() wrapped as below."""
+    try:
+        from vllm.distributed import parallel_state as ps
+    except Exception as e:
+        return {"runner": None, "skip": f"vllm.distributed import failed (not in image — needs a vLLM container): {e!r}"}
+    # vLLM's CustomAllreduce asserts an ACTIVE VllmConfig at instantiation ("Current vLLM config is not
+    # set" — observed on vllm/vllm-openai), so enter set_current_vllm_config PERSISTENTLY: init and the
+    # timed run() calls then all see it. Guarded: a vLLM without this API simply proceeds without it.
+    cm = None
+    try:
+        from vllm.config import VllmConfig, set_current_vllm_config
+        cm = set_current_vllm_config(VllmConfig())
+        cm.__enter__()
+    except Exception:
+        cm = None
+    built = _sglang_vllm_ca_runner(ps, torch, dev, world, rank, "vllm")
+    if cm is not None:
+        def _free(_of=built.get("free"), _cm=cm):
+            try:
+                if _of:
+                    _of()
+            finally:
+                try:
+                    _cm.__exit__(None, None, None)
+                except Exception:
+                    pass
+        if built.get("runner") is None:
+            _free()  # main() never calls free() on a skipped impl -> leave the config context right away
+        else:
+            built["free"] = _free
+    return built
+
+
+def _module_exists(name: str) -> bool:  # True iff find_spec locates `name`; any lookup error -> False
+    from importlib.util import find_spec
+    try:
+        return find_spec(name) is not None
+    except Exception:
+        return False
+
+
+def _build_aiter(torch, dist, dev, world, rank, dtype):
+    """AITER (AMD) custom all-reduce via its GroupCoordinator. aiter.dist.parallel_state is handed to
+    the shared serving-init helper, which also looks for ca_comm under device_communicator (where
+    aiter keeps it). Per the original note, a bare wrapper without that init produced nan.
+    An import failure (e.g. an NVIDIA image without aiter) yields a skip dict, never an exception."""
+    try:
+        from aiter.dist import parallel_state as aiter_ps
+    except Exception as exc:
+        why = f"aiter.dist import failed (not in image?): {exc!r}"
+        return {"runner": None, "skip": why}
+    return _sglang_vllm_ca_runner(aiter_ps, torch, dev, world, rank, "aiter")
+
+
+def local_device_index(dev) -> int:  # dev.index, or 0 when the attribute is missing or None
+    return 0 if getattr(dev, "index", None) is None else dev.index
+
+
+# (impl-name, builder, top-level framework key). flashinfer one/two-shot share the "flashinfer"
+# framework key; nccl's framework is "torch". The framework key drives frameworks_available.
+def _impl_registry():  # -> [(impl name, builder(torch, dist, dev, world, rank, dtype), framework key)]
+    def _fi(variant):  # flashinfer builder with the one-shot / two-shot choice bound in
+        return lambda *a: _build_flashinfer(*a, variant=variant)
+    return [("nccl", lambda *a: _build_nccl(*a), "torch"),
+            ("flashinfer-oneshot", _fi("oneshot"), "flashinfer"),
+            ("flashinfer-twoshot", _fi("twoshot"), "flashinfer"),
+            ("sglang", lambda *a: _build_sglang(*a), "sglang"),
+            ("vllm", lambda *a: _build_vllm(*a), "vllm"),
+            ("aiter", lambda *a: _build_aiter(*a), "aiter")]
+
+
+def main() -> int:
+    """Parse args, time every selected impl over the size ladder on all ranks, and have rank 0 write the JSON.
+    Returns 3 if torch is unavailable, 5 if WORLD_SIZE < 2; rank 0 returns 0/1 for status valid/invalid, other ranks 0."""
+    ap = argparse.ArgumentParser(description="CollectiveX framework custom all-reduce benchmark")
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--dtype", default="bf16", choices=sorted(_DTYPE_BYTES))
+    ap.add_argument("--warmup", type=int, default=10)
+    ap.add_argument("--iters", type=int, default=50)
+    ap.add_argument("--impls", default="",
+                    help="comma/space-separated subset of impls to run (default: all). "
+                         "e.g. 'nccl,flashinfer-oneshot' — nccl is always included as baseline.")
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="nvlink")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    if world < 2:
+        if rank == 0:
+            print(f"ERROR: allreduce-fw needs world_size >= 2 (got {world}); "
+                  f"launch under torchrun --nproc_per_node=N", file=sys.stderr)
+        return 5
+    torch.cuda.set_device(local_rank)
+    dev = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12359")
+    if not dist.is_initialized():
+        dist.init_process_group("nccl")
+
+    torch_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[args.dtype]
+    elem_bytes = _DTYPE_BYTES[args.dtype]
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+
+    # Which impls to attempt. nccl baseline is always included.
+    want = {s for s in args.impls.replace(",", " ").split() if s}
+    registry = _impl_registry()
+    unknown = want - {e[0] for e in registry}
+    if unknown and rank == 0:  # a typo would otherwise silently shrink the run to the baseline
+        print(f"WARNING: ignoring unknown --impls {sorted(unknown)}", file=sys.stderr)
+    if want:
+        registry = [e for e in registry if e[0] in want or e[0] == "nccl"]
+
+    # frameworks_available: framework key -> {available: bool, note/skip-reason}. Probed once.
+    frameworks_available: dict = {}
+
+    def _note_framework(fwkey: str, available: bool, detail: str):
+        prev = frameworks_available.get(fwkey)
+        # importable wins over a per-variant skip (flashinfer may import yet a variant be absent).
+        if prev is None or (available and not prev.get("available")):
+            frameworks_available[fwkey] = {"available": available, "detail": detail}
+
+    groups = []
+    peak_bw = 0.0
+    nccl_ok = False
+
+    for impl_name, builder, fwkey in registry:
+        # Build the impl on every rank (custom AR needs collective IPC setup on all ranks).
+        try:
+            built = builder(torch, dist, dev, world, rank, args.dtype)
+        except Exception as exc:
+            built = {"runner": None, "skip": f"builder raised: {exc!r}"}
+
+        if built is None:
+            _note_framework(fwkey, False, "framework not importable")
+            if rank == 0:
+                print(f"  {impl_name}: skipped (framework '{fwkey}' not importable)", file=sys.stderr)
+            continue
+        if built.get("runner") is None:
+            reason = built.get("skip", "no usable entrypoint")
+            # framework imported (we got past `is None`) but this impl/variant isn't wireable.
+            _note_framework(fwkey, fwkey == "torch", reason if fwkey != "torch" else "baseline")
+            if rank == 0:
+                print(f"  {impl_name}: skipped ({reason})", file=sys.stderr)
+            continue
+
+        _note_framework(fwkey, True, built.get("note", "available"))
+        run = built["runner"]
+        rows = []
+        impl_failed = False
+        for nbytes in sizes:
+            numel = max(1, nbytes // elem_bytes)
+            actual_bytes = numel * elem_bytes
+            # Known inputs so the reduced result has a closed form: every rank fills with its
+            # (rank+1); all-reduce-sum -> world*(world+1)/2 in every element. Lets us validate
+            # custom kernels against a reference without trusting the kernel to define "correct".
+            base = float(rank + 1)
+            expected = float(world * (world + 1) // 2)
+            try:
+                t = torch.full((numel,), base, dtype=torch_dtype, device=dev)
+
+                def step(_t=t):
+                    run(_t)
+                ms = _bench(step, torch, args.warmup, args.iters)
+            except _SkipSize as sk:
+                # The kernel can't shape this size (e.g. below the custom-AR hidden) — record a
+                # skipped row and CONTINUE; do NOT fail the impl (it works at the other sizes).
+                rows.append({"size_bytes": actual_bytes, "latency_us": None,
+                             "algbw_gbps": 0.0, "busbw_gbps": 0.0, "correct": None,
+                             "skipped": str(sk)})
+                continue
+            except Exception as exc:
+                rows.append({"size_bytes": actual_bytes, "latency_us": None,
+                             "algbw_gbps": 0.0, "busbw_gbps": 0.0, "correct": None,
+                             "error": repr(exc)})
+                impl_failed = True
+                break
+
+            # Correctness: re-run once on a fresh known buffer and compare to the reference.
+            correct = None
+            try:
+                chk = torch.full((numel,), base, dtype=torch_dtype, device=dev)
+                run(chk)
+                ref = torch.full((numel,), expected, dtype=torch_dtype, device=dev)
+                # bf16/fp16 accumulate with rounding; tolerance scales with the magnitude.
+                atol = 0.0 if args.dtype == "fp32" else max(1.0, expected * 0.02)
+                correct = bool(torch.allclose(chk, ref, atol=atol, rtol=0.0))
+            except Exception:
+                correct = None
+
+            # Reduce timing across ranks (max = slowest rank) for a stable cross-rank number,
+            # exactly like rl_mesh_bench. Done with the always-present NCCL collective on a tiny
+            # tensor (not the impl under test).
+            tt = torch.tensor([ms], device=dev)
+            dist.all_reduce(tt, op=dist.ReduceOp.MAX)
+            ms_max = float(tt.item())
+            algbw, busbw = _bandwidths(actual_bytes, ms_max, world)
+            peak_bw = max(peak_bw, busbw)
+            rows.append({"size_bytes": actual_bytes,
+                         "latency_us": round(ms_max * 1e3, 3),
+                         "algbw_gbps": round(algbw, 3),
+                         "busbw_gbps": round(busbw, 3),
+                         "correct": correct})
+
+        if built.get("free"):
+            try:
+                built["free"]()
+            except Exception:
+                pass
+
+        if impl_name == "nccl" and any((r.get("busbw_gbps") or 0.0) > 0.0 for r in rows):
+            nccl_ok = True
+        meta = {"impl": impl_name, "framework": fwkey, "dtype": args.dtype,
+                "world_size": world, "topology_class": args.topology_class,
+                "measurement_contract": MEASUREMENT_CONTRACT}
+        groups.append({**meta, "comparison_key": comparison_key(meta),
+                       "note": built.get("note"), "rows": rows,
+                       "incomplete": impl_failed})
+        if rank == 0:
+            mn = min((r["latency_us"] for r in rows if r.get("latency_us")), default=None)
+            print(f"  {impl_name}: {len(rows)} sizes, min latency "
+                  f"{mn if mn is not None else float('nan')} us, peak busbw "
+                  f"{max(((r.get('busbw_gbps') or 0.0) for r in rows), default=0.0):.1f} GB/s", file=sys.stderr)
+
+    if rank != 0:
+        dist.barrier()
+        dist.destroy_process_group()
+        return 0
+
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    # valid iff the NCCL baseline produced real (bw>0) rows — the all-reduce curve itself is the
+    # deliverable. Which framework custom kernels were importable on this image is recorded in
+    # frameworks_available (not all frameworks ship in every image); a run with only nccl is a valid
+    # latency/bandwidth baseline, not a failure.
+    status = "valid" if nccl_ok else "invalid"
+
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": FAMILY,
+        "generated_by": "allreduce_fw_bench.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+        "world_size": world, "dtype": args.dtype,
+        "size_min_bytes": args.min_bytes, "size_max_bytes": args.max_bytes,
+        "status": status,
+        "peak_busbw_gbps": round(peak_bw, 2),
+        "frameworks_available": frameworks_available,
+        "num_groups": len(groups), "groups": groups, "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+
+    avail = sorted(k for k, v in frameworks_available.items() if v.get("available"))
+    print(f"allreduce-fw: {len(groups)} impl group(s) -> {args.out} "
+          f"(status={status}, world={world}, dtype={args.dtype}, "
+          f"frameworks_available={avail}, peak_busbw={peak_bw:.1f} GB/s)")
+    dist.barrier()
+    dist.destroy_process_group()
+    return 0 if status == "valid" else 1
+
+
+if __name__ == "__main__":  # script entry point (the module docstring's example launches it under torchrun)
+    raise SystemExit(main())  # exit status = main()'s return code
diff --git a/experimental/CollectiveX/tests/capability.py b/experimental/CollectiveX/tests/capability.py
new file mode 100644
index 000000000..da689ec2a
--- /dev/null
+++ b/experimental/CollectiveX/tests/capability.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""CollectiveX capability resolver (stdlib-only — runs on a login node, no torch).
+
+A workflow that exposes backend x SKU x mode x dtype x contract can request combinations
+no backend supports, and 'all' is not the same backend set across vendors. This static
+table mirrors the adapters' SUPPORTED_* sets so the matrix compiler / a pre-flight step
+can REJECT or OMIT invalid combinations BEFORE consuming a runner (review #3). The
+adapters still reject at runtime — this just fails fast and keeps the matrix honest.
+
+  python3 tests/capability.py --sku b300 --backend deepep --mode ll --dtype fp8 \
+      --contract layout-and-dispatch-v1            # exit 0 if valid, 3 + reason if not
+  python3 tests/capability.py --list               # dump the table
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+
+# SKU -> vendor. The runner label's SKU prefix selects the launcher; vendor gates backend.
+SKU_VENDOR = {
+    "h100": "nvidia", "h200": "nvidia", "b200": "nvidia", "b300": "nvidia", "gb200": "nvidia",
+    "gb300": "nvidia", "h100-dgxc": "nvidia", "b200-dgxc": "nvidia", "b200-multinode": "nvidia",
+    "mi355x": "amd", "mi350x": "amd", "mi325x": "amd", "mi300x": "amd",
+}
+
+
+def _sku_arch(sku: str) -> str:
+    """GPU architecture family for a SKU label, matched case-insensitively by name prefix."""
+    label = (sku or "").lower()
+    families = (("blackwell", ("gb300", "gb200", "b300", "b200")),
+                ("hopper", ("h100", "h200")), ("cdna", "mi3"))
+    for arch, prefixes in families:
+        if label.startswith(prefixes):
+            return arch
+    return "unknown"
+
+
+# Dispatch dtypes that need a specific GPU arch. NVFP4 (e2m1 4-bit) is a Blackwell-native tensor
+# format — FlashInfer's fp4 quantize/dequantize does NOT round-trip correctly on Hopper sm90
+# (validated: nvfp4 dispatch correct=True on B300, correct=False on H100). mxfp8 (e4m3) is fine on
+# Hopper. Gated here so a Hopper nvfp4 dispatch is cleanly REJECTED, not run-and-marked-invalid.
+ARCH_ONLY_DTYPES = {"nvfp4": "blackwell", "mxfp4": "blackwell"}
+
+# Backend capability table — MIRRORS the adapter SUPPORTED_* sets (the runtime source of
+# truth). Keep in sync with ep_deepep.py / ep_mori.py. LL is decode-only; cached-layout is
+# normal-only; MoRI is bf16/normal/layout-and-dispatch only.
+# All synthetic routing distributions (trace transforms — backend-agnostic) + the temporal modes.
+ALL_ROUTINGS = ["uniform", "balanced", "balanced-rank-local", "zipf", "zipf-mild",
+                "zipf-moderate", "zipf-heavy", "hotspot-single", "hotspot-moving", "alternating-groups"]
+# Activation value profiles. Under bf16 combine all are RUNNABLE but latency-neutral; the
+# non-normal ones become latency-relevant only under a quantized combine (PR311 — see quant_modes).
+ALL_ACTIVATION_PROFILES = ["normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation"]
+CAP = {
+    "deepep": {
+        "vendors": ["nvidia"],
+        "modes": ["normal", "ll"],
+        # DISPATCH-side precision + fp8 scale-layout recipe variants (same kernel, different cast):
+        # fp8=per-block-128, fp8-pertoken=per-token scale, fp8-directcast=unscaled (no scale transport).
+        "dtypes": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast"],
+        "contracts": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"],
+        "transports": ["nvlink", "rdma"],
+        # Combine path is a SEPARATE axis from dispatch dtype (review): on this backend combine is
+        # bf16 with no quant regardless of dispatch_dtype (only flashinfer admits a quantized combine
+        # below). It is reserved here until a kernel is wired — capability rejects it, so it can't be faked.
+        "combine_dtypes": ["bf16"],           # quantized combine (mxfp8/mxfp4/nvfp4) is in flashinfer
+        "quant_modes": ["none"],              # moe_a2a_combine (PR3376/3643, merged) but MNNVL-gated on
+                                              # x86_64 — reserved, see docs/upstream_precision.md + gated.md
+        # routing/EPLB/activation semantics (goal P2 "distribution + quant-combine constraints in
+        # capabilities"): DeepEP honors any trace (routing is a pure trace transform) + EPLB.
+        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
+    },
+    "uccl": {
+        # UCCL EP (uccl.ep.Buffer) is a DeepEP-API clone on NVIDIA — mirror DeepEP's capability.
+        # bf16+fp8 dispatch, normal+ll modes, the same 3 contracts, bf16/none combine.
+        "vendors": ["nvidia"],
+        "modes": ["normal", "ll"],
+        "dtypes": ["bf16", "fp8"],
+        "contracts": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"],
+        "transports": ["nvlink", "rdma"],
+        "combine_dtypes": ["bf16"],
+        "quant_modes": ["none"],
+        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
+    },
+    "flashinfer": {
+        # FlashInfer EP = flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll (pre-installed) — the
+        # TRT-LLM throughput-backend one-sided A2A over an MNNVL symmetric workspace. The A2A is a
+        # dtype-agnostic byte-mover taking input_payloads as a LIST, so a quantized dispatch =
+        # move [q, scale_factor] + dequant in stage(). DISPATCH precisions:
+        #   bf16; fp8/fp8-pertoken/fp8-directcast (e4m3, DeepEP convention); mxfp8/mxfp4/nvfp4
+        #   (OCP-microscaling via FlashInfer's native quantize/dequantize kernels).
+        "vendors": ["nvidia"],
+        "modes": ["normal"],
+        "dtypes": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"],
+        "contracts": ["layout-and-dispatch-v1"],
+        "transports": ["nvlink", "mnnvl"],
+        # Combine: bf16 default, OR a quantized COMBINE OUTPUT (fp8 e4m3, nvfp4) via moe_a2a_combine
+        # output_dtype — present in a NEWER flashinfer (PR3376/3643), pulled in by the run's
+        # cx_build_flashinfer_latest upgrade (the bundled 0.6.8.post1 lacks it). mxfp8 combine is
+        # still reserved (e8m0 output packing) and is deliberately absent from the lists below.
+        "combine_dtypes": ["bf16", "fp8", "nvfp4"],
+        "quant_modes": ["none", "fp8", "nvfp4"],
+        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
+    },
+    "deepep-hybrid": {
+        # DeepEP hybrid-ep branch (NVIDIA TMA HybridEPBuffer), built from source by
+        # cx_build_deepep_hybrid. Intranode NVLink path (<=8 ranks, one NVLink domain). bf16 normal
+        # layout-and-dispatch only; fp8 (use_fp8) + internode NVLink<->RDMA forwarding are further lift.
+        "vendors": ["nvidia"],
+        "modes": ["normal"],
+        "dtypes": ["bf16"],
+        "contracts": ["layout-and-dispatch-v1"],
+        "transports": ["nvlink"],
+        "combine_dtypes": ["bf16"],
+        "quant_modes": ["none"],
+        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
+    },
+    "mori": {
+        "vendors": ["amd"],
+        "modes": ["normal"],
+        # DISPATCH-side precision. fp8 = e4m3fnuz DIRECT-CAST (the ROCm-native FNUZ format) via MoRI's
+        # quant_type=fp8_direct_cast — the only fp8 mode this MoRI build accepts (GHA introspection
+        # found the valid set is ['none','fp8_direct_cast']; the kernel casts bf16<->e4m3fnuz
+        # internally, scale_dim=0). bf16 combine OUTPUT unchanged (combine_dtypes below).
+        "dtypes": ["bf16", "fp8"],
+        "contracts": ["layout-and-dispatch-v1"],
+        "transports": ["xgmi", "rdma"],
+        "combine_dtypes": ["bf16"],           # + "fp8" via MoRI PR311 (merged): QuantType::Fp8BlockwiseQuant
+        "quant_modes": ["none"],              # + "fp8_blockwise" (MoRI PR311) once wired — see docs/upstream_precision.md
+        # MoRI also honors any trace + EPLB (a routing-trace transform), bf16 value-neutral.
+        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
+    },
+    "nccl-ep": {
+        # NCCL/RCCL all-to-all EP (tests/ep_nccl.py) — the canonical token-shuffle EP built on pure
+        # torch.distributed collectives (all_to_all_single), no custom RDMA. Runs on BOTH vendors
+        # (NCCL on NVIDIA, RCCL on AMD — identical API) and is the only EP backend that survives
+        # cross-node WITHOUT GPUDirect-RDMA: NCCL/RCCL host-stage the all-to-all, where UCCL's
+        # ibv_reg_mr (EINVAL) and MoRI's RDMA registration abort. bf16 / normal / layout-and-dispatch.
+        "vendors": ["nvidia", "amd"],
+        "modes": ["normal"],
+        "dtypes": ["bf16"],
+        "contracts": ["layout-and-dispatch-v1"],
+        "transports": ["nvlink", "rdma", "xgmi"],
+        "combine_dtypes": ["bf16"],
+        "quant_modes": ["none"],
+        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
+    },
+}
+# nccl/rccl are collective primitives, not EP dispatch/combine — phase is meaningless. The `nccl`
+# BENCHMARK runs on BOTH vendors: run_nccl_suite auto-selects nccl-tests on CUDA and rccl-tests on
+# ROCm (same binaries/output), so the All-reduce/All-gather tabs get an MI355X line too. (`rccl` is
+# kept as an explicit amd-only alias for direct dispatch.)
+COLLECTIVE = {"nccl": ["nvidia", "amd"], "rccl": ["amd"]}
+# Non-EP benchmarks (family != moe): memcpy-family (offload/copy-engine/kv-cache) + the RL
+# trainer<->generator mesh transfer (rl-mesh, multi-process NCCL send/recv). The EP capability
+# axes (mode/dtype/contract/phase) don't apply, so they pass validation unconditionally on their
+# vendors. (offload is NVIDIA-only; copy-engine, kv-cache + rl-mesh are admitted on both vendors.)
+HOST_GPU_BENCH = {"offload": ["nvidia"], "copy-engine": ["nvidia", "amd"],
+                  "kv-cache": ["nvidia", "amd"], "rl-mesh": ["nvidia", "amd"],
+                  "allreduce-fw": ["nvidia", "amd"],
+                  # nixl = the NIXL point-to-point transfer bench (kv-cache family) + the device-EP
+                  # build-probe; runs in the dynamo tensorrtllm-runtime container (NVIDIA-only).
+                  "nixl": ["nvidia"],
+                  # mori-io = MoRI-IO RDMA p2p transfer engine (mori.io); AMD MoRI image only.
+                  "mori-io": ["amd"],
+                  # nccl-kv = NCCL/RCCL p2p KV transfer (torch.distributed send/recv); both vendors.
+                  "nccl-kv": ["nvidia", "amd"],
+                  # mooncake = Mooncake transfer-engine RDMA KV transfer (pip-installed); both vendors
+                  # (transfer_write_on_cuda / _on_hip), needs an RDMA NIC.
+                  "mooncake": ["nvidia", "amd"]}
+
+# 'all' resolves to a DEFINED per-vendor backend set (not the same across vendors).
+VENDOR_BACKENDS = {"nvidia": ["nccl", "deepep", "uccl", "flashinfer"], "amd": ["rccl", "mori"]}
+
+
+def resolve(sku, backend, mode="normal", dtype="bf16",
+            contract="layout-and-dispatch-v1", combine_dtype="bf16", combine_quant_mode="none",
+            routing="uniform", eplb=False, activation_profile="normal"):
+    """Validate one SKU/backend/axes combination -> (ok: bool, reason: str). `sku` may be a runner name
+    ("<sku>_<suffix>", any letter case). dtype = DISPATCH precision; combine_dtype/combine_quant_mode are
+    the SEPARATE combine-path axes; routing/eplb/activation_profile gate distribution semantics (goal P2)."""
+    sku = (sku or "").split("_")[0].strip().lower()  # SKU_VENDOR keys are lowercase; _sku_arch lowercases too
+    vendor = SKU_VENDOR.get(sku)
+    if vendor is None:
+        return False, f"unknown SKU '{sku}'"
+    if backend in COLLECTIVE:  # collective primitives: only the vendor is checked
+        if vendor not in COLLECTIVE[backend]:
+            return False, f"{backend} is not the {vendor} collective backend"
+        return True, "collective primitive (phase/dtype/mode/contract not applicable)"
+    if backend in HOST_GPU_BENCH:  # non-EP benches: only the vendor is checked
+        if vendor not in HOST_GPU_BENCH[backend]:
+            return False, f"{backend} bench not available on {vendor}"
+        return True, f"{backend} host/GPU memcpy-family bench (EP axes not applicable)"
+    cap = CAP.get(backend)
+    if cap is None:
+        return False, f"unknown backend '{backend}'"
+    if vendor not in cap["vendors"]:
+        return False, f"{backend} runs on {cap['vendors']}, not {vendor} SKU '{sku}'"
+    if mode not in cap["modes"]:
+        return False, f"{backend} modes={cap['modes']} (got '{mode}')"
+    if dtype not in cap["dtypes"]:
+        return False, f"{backend} dispatch dtypes={cap['dtypes']} (got '{dtype}')"
+    need_arch = ARCH_ONLY_DTYPES.get(dtype)  # None for dtypes without an architecture requirement
+    if need_arch and _sku_arch(sku) != need_arch:
+        return False, (f"{dtype} dispatch requires {need_arch} (FP4 is Blackwell-native; FlashInfer's "
+                       f"fp4 kernels don't round-trip on Hopper); SKU '{sku}' is {_sku_arch(sku)}")
+    if contract not in cap["contracts"]:
+        return False, f"{backend} contracts={cap['contracts']} (got '{contract}')"
+    if mode == "ll" and contract == "cached-layout-comm-only-v1":
+        return False, "cached-layout-comm-only-v1 is meaningless for LL (layout is in-kernel)"
+    if combine_dtype not in cap.get("combine_dtypes", ["bf16"]):
+        return False, f"{backend} combine_dtypes={cap.get('combine_dtypes', ['bf16'])} (got '{combine_dtype}')"
+    if combine_quant_mode not in cap.get("quant_modes", ["none"]):
+        return False, (f"{backend} quant_modes={cap.get('quant_modes', ['none'])} "
+                       f"(got '{combine_quant_mode}') — quant combine not wired yet")
+    if routing not in cap.get("routings", ALL_ROUTINGS):
+        return False, f"{backend} routings={cap.get('routings', ALL_ROUTINGS)} (got '{routing}')"
+    if eplb and not cap.get("eplb", False):
+        return False, f"{backend} does not support EPLB"
+    if activation_profile not in cap.get("activation_profiles", ["normal"]):
+        return False, (f"{backend} activation_profiles={cap.get('activation_profiles', ['normal'])} "
+                       f"(got '{activation_profile}')")
+    # an activation profile that needs special scaling is only MEANINGFUL under a quantized combine
+    # (bf16 is value-independent) — runnable but flagged so it isn't read as a latency result.
+    if activation_profile != "normal" and combine_quant_mode == "none":
+        return True, (f"ok (note: activation_profile={activation_profile} is latency-neutral under "
+                      f"bf16/none combine — value sensitivity needs a quantized combine)")
+    return True, "ok"
+
+
+def main() -> int:
+    """CLI entry point: --list dumps the tables as JSON; otherwise print VALID/INVALID and return 0/3."""
+    ap = argparse.ArgumentParser(description="CollectiveX capability resolver")
+    ap.add_argument("--sku"); ap.add_argument("--backend")
+    ap.add_argument("--mode", default="normal"); ap.add_argument("--dtype", default="bf16")
+    ap.add_argument("--contract", default="layout-and-dispatch-v1")
+    ap.add_argument("--combine-dtype", default="bf16")
+    ap.add_argument("--combine-quant-mode", default="none")
+    ap.add_argument("--routing", default="uniform")
+    ap.add_argument("--eplb", action="store_true")
+    ap.add_argument("--activation-profile", default="normal")
+    ap.add_argument("--list", action="store_true")
+    a = ap.parse_args()
+    if a.list:  # resolve() also consults HOST_GPU_BENCH, so the dump has to show that table as well
+        print(json.dumps({"sku_vendor": SKU_VENDOR, "cap": CAP, "collective": COLLECTIVE,
+                          "host_gpu_bench": HOST_GPU_BENCH, "vendor_backends": VENDOR_BACKENDS}, indent=2))
+        return 0
+    ok, reason = resolve(a.sku, a.backend, a.mode, a.dtype, a.contract, a.combine_dtype,
+                         a.combine_quant_mode, a.routing, a.eplb, a.activation_profile)
+    print(f"{'VALID' if ok else 'INVALID'}: sku={a.sku} backend={a.backend} mode={a.mode} "
+          f"dtype={a.dtype} contract={a.contract} combine_dtype={a.combine_dtype} "
+          f"combine_quant_mode={a.combine_quant_mode} routing={a.routing} eplb={a.eplb} "
+          f"activation_profile={a.activation_profile} — {reason}")
+    return 0 if ok else 3
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/copy_engine_bench.py b/experimental/CollectiveX/tests/copy_engine_bench.py
new file mode 100644
index 000000000..4e2e0aea1
--- /dev/null
+++ b/experimental/CollectiveX/tests/copy_engine_bench.py
@@ -0,0 +1,533 @@
+#!/usr/bin/env python3
+"""CollectiveX — Copy-engine / SDMA collectives (goal P2).
+
+Compares the NVIDIA COPY-ENGINE (DMA) path against an SM-based copy:
+
+  * copy-engine path  — cudaMemcpyAsync (torch .copy_/Tensor copy that lowers to
+                        cudaMemcpyDeviceToDevice) issued on a DEDICATED copy
+                        stream. Hardware routes device-to-device memcpy through a
+                        copy engine (DMA), not the SMs.
+  * SM path           — an elementwise kernel (torch mul-add) that necessarily
+                        occupies SMs to move the same bytes.
+
+For each it reports latency + bandwidth across a size sweep (DtoD, and HtoD as a
+second op). It then VALIDATES that the copy-engine path uses ~0 SMs:
+
+  Primary  : if pynvml is importable, sample SM utilization (nvmlDeviceGetUtilization
+             / process-SM) during a sustained copy-engine loop vs a sustained SM-copy
+             loop. copy-engine should read near-zero, SM-copy should read high.
+  Fallback : a concurrent-kernel NON-INTERFERENCE probe. Run a long SM-bound
+             "victim" kernel alone (t_victim). Then run it concurrently with a
+             copy-engine copy on a separate stream (t_with_ce) and with an
+             SM-copy on a separate stream (t_with_sm). If the copy engine truly
+             uses no SMs, t_with_ce ~ t_victim (the copy is hidden), whereas
+             t_with_sm > t_victim (the SM-copy steals SM cycles from the victim).
+             The ratio is reported as evidence; the proxy is documented in the doc.
+
+family="copy-engine". Accelerator-aware: NVIDIA copy engine, or the SDMA engines on AMD/ROCm (see run_gpu).
+
+Stdlib + torch; --help / --parse-only work without torch (import-safe writer+CLI).
+
+Run (inside the container, 1 GPU is enough):
+    python tests/copy_engine_bench.py \\
+        --runner h200 --topology-class h200-nvlink-island --transport nvlink \\
+        --env-json results/env.json --out results/h200_copy_engine.json
+
+Verify offline (no GPU/torch needed):
+    python tests/copy_engine_bench.py --parse-only --runner h200 \\
+        --topology-class h200-nvlink-island --out /tmp/parsed.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1
+FAMILY = "copy-engine"
+MEASUREMENT_CONTRACT = "copy-engine-vs-sm-v1"
+GENERATED_BY = "copy_engine_bench.py"
+
+# (op, engine) sub-ops. engine = copy-engine (DMA) vs sm (kernel).
+SUBOPS = [
+    ("dtod", "copy-engine"),
+    ("dtod", "sm"),
+    ("htod", "copy-engine"),
+    ("htod", "sm"),
+]
+
+DEFAULT_MIN_BYTES = 64 * 1024
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024
+DEFAULT_FACTOR = 4
+
+
+# --------------------------------------------------------------------------- #
+# import-safe helpers (no torch)                                              #
+# --------------------------------------------------------------------------- #
+def size_ladder(min_bytes: int, max_bytes: int, factor: int) -> list[int]:
+    """Geometric sweep min_bytes, min_bytes*factor, ... <= max_bytes; ValueError if it cannot grow."""
+    s = int(min_bytes)
+    if s < 1 or factor < 2:  # such a ladder never passes max_bytes (the old while-loop spun forever)
+        raise ValueError(f"size_ladder: need min_bytes >= 1 and factor >= 2, got {min_bytes}, {factor}")
+    return [s] + size_ladder(s * factor, max_bytes, factor) if s <= int(max_bytes) else []
+
+
+def comparison_key(meta: dict) -> str:
+    """Derive a short, stable key from the identifying fields of `meta`.
+
+    The str() of each field below is joined with '|' and hashed with SHA-256; the first 16 hex
+    characters are returned. A field missing from `meta` raises KeyError.
+    """
+    fields = ("op", "engine", "dtype", "transport", "topology_class",
+              "comparison_class", "measurement_contract")
+    joined = "|".join(str(meta[name]) for name in fields)
+    digest = hashlib.sha256(joined.encode()).hexdigest()
+    return digest[:16]
+
+
+def _load_env(path: str | None) -> dict | None:
+    if path and os.path.exists(path):  # an empty/None path or a non-existent path falls through to None
+        with open(path) as fh:
+            return json.load(fh)  # a malformed file propagates json.load's exception to the caller
+    return None
+
+
+def _provenance() -> dict:
+    """Snapshot image + CI-run provenance from COLLECTIVEX_* / GITHUB_* environment variables."""
+    import platform as _plat
+    env, machine = os.environ.get, _plat.machine()
+    ci_run = {
+        "run_id": env("GITHUB_RUN_ID"),
+        "run_attempt": env("GITHUB_RUN_ATTEMPT"),
+        "ref": env("GITHUB_REF_NAME") or env("GITHUB_REF"),
+        "source_sha": env("COLLECTIVEX_SOURCE_SHA") or env("GITHUB_SHA"),
+        "repo": env("GITHUB_REPOSITORY"),
+        "job": env("GITHUB_JOB"),
+        "artifact": env("COLLECTIVEX_ARTIFACT_NAME"),
+    }
+    return {
+        "image": env("COLLECTIVEX_IMAGE", ""),
+        "image_digest": env("COLLECTIVEX_IMAGE_DIGEST", ""),
+        "image_arch": {"x86_64": "amd64", "aarch64": "arm64"}.get(machine, machine),  # others pass through
+        "squash_sha256": env("COLLECTIVEX_SQUASH_SHA256"),
+        "git_run": ci_run if any(ci_run.values()) else None,  # None when no CI variable is set
+    }
+
+
+# --------------------------------------------------------------------------- #
+# GPU path (torch only here)                                                  #
+# --------------------------------------------------------------------------- #
+def _copy_engine_copy(torch, dst, src, stream):
+    """DtoD/HtoD memcpy that lowers to cudaMemcpyAsync on `stream` (copy engine)."""
+    with torch.cuda.stream(stream):  # `stream` is the current stream only for the enclosed copy
+        dst.copy_(src, non_blocking=True)  # non_blocking=True requests an asynchronous copy
+
+
+def _sm_copy(torch, dst, src, stream):
+    """Move src -> dst with an elementwise add-0 KERNEL on `stream` (occupies SMs), unlike a memcpy."""
+    with torch.cuda.stream(stream):
+        if dst.dtype == src.dtype:
+            torch.add(src, 0, out=dst)  # same byte volume as .copy_, but written by a kernel
+        else:
+            dst.copy_(src)  # dtypes differ: fall back to a plain copy_
+
+
+def _time_loop(torch, fn, iters: int) -> float:
+    """Average milliseconds per call of `fn` over `iters` back-to-back calls, timed with CUDA events."""
+    start, stop = (torch.cuda.Event(enable_timing=True) for _ in range(2))
+    torch.cuda.synchronize()
+    start.record()
+    for _ in range(iters):
+        fn()
+    stop.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(stop) / iters  # ms/iter
+
+
+def _bench_one(torch, op: str, engine: str, nbytes: int, dtype,
+               warmup: int, iters: int, copy_stream) -> dict:
+    """Time one (op, engine) copy of ~nbytes issued on `copy_stream`; returns one result row (dict)."""
+    elem = torch.tensor([], dtype=dtype).element_size()
+    n = max(1, nbytes // elem)
+    dev_dst = torch.empty(n, dtype=dtype, device="cuda")
+    if op == "dtod":
+        src = torch.randn(n, dtype=dtype, device="cuda") if dtype.is_floating_point \
+            else torch.zeros(n, dtype=dtype, device="cuda")
+    else:  # htod
+        src = torch.empty(n, dtype=dtype, device="cpu", pin_memory=True)
+    if engine == "copy-engine":
+        fn = lambda: _copy_engine_copy(torch, dev_dst, src, copy_stream)
+    else:
+        # SM kernel copy. For HtoD an add kernel can't read host memory directly,
+        # so stage to device first then SM-copy device->device (still SM-bound).
+        if op == "htod":
+            staged = torch.empty(n, dtype=dtype, device="cuda")
+            staged.copy_(src)
+            torch.cuda.synchronize()
+            src = staged
+        fn = lambda: _sm_copy(torch, dev_dst, src, copy_stream)
+
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()  # device-wide, so it also drains copy_stream
+    # The timing events must be recorded on copy_stream, where fn() enqueues its work. Recorded on
+    # the default stream they would bracket only the host-side launches, not the copies themselves.
+    with torch.cuda.stream(copy_stream):
+        avg_ms = _time_loop(torch, fn, iters)
+    actual_bytes = n * elem
+    gbps = (actual_bytes / (avg_ms / 1e3)) / 1e9 if avg_ms > 0 else 0.0
+    return {
+        "op": op,
+        "engine": engine,
+        "size_bytes": actual_bytes,
+        "requested_bytes": nbytes,
+        "latency_us": round(avg_ms * 1e3, 4),
+        "bandwidth_gbps": round(gbps, 3),
+    }
+
+
+# ---- SM-utilization validation (primary: nvml; fallback: non-interference) -- #
+def _victim_kernel_factory(torch, device):
+    """Build the GEMM 'victim' used by the non-interference probe.
+
+    Returns (victim, [m, m, m, inner]). Each victim() call chains `inner` fp16 matmuls of m x m
+    operands on `device` and returns the last product; the probe times that chain with and without
+    a concurrent copy to see whether the copy slows it down."""
+    side, chain = 2048, 8
+    lhs = torch.randn(side, side, device=device, dtype=torch.float16)
+    rhs = torch.randn(side, side, device=device, dtype=torch.float16)
+
+    def victim():
+        acc = lhs
+        for _ in range(chain):
+            acc = torch.matmul(acc, rhs)
+        return acc
+
+    return victim, [side, side, side, chain]
+
+
+def _attention_victim_factory(torch, device):
+    """Build the ATTENTION victim for the copy-vs-attention interference probe (goal "Interference
+    with attention kernels"). Returns (victim, [batch, heads, seq, head_dim, inner]); victim() chains
+    `inner` fp16 scaled_dot_product_attention calls (which kernel backs SDPA is chosen by PyTorch)."""
+    import torch.nn.functional as _F
+    b_, h_, s_, d_ = 8, 32, 2048, 128  # batch, heads, seq, head_dim
+    q = torch.randn(b_, h_, s_, d_, device=device, dtype=torch.float16)
+    k = torch.randn(b_, h_, s_, d_, device=device, dtype=torch.float16)
+    v = torch.randn(b_, h_, s_, d_, device=device, dtype=torch.float16)
+    inner = 6  # attention calls chained per victim() invocation
+
+    def victim():
+        o = q
+        for _ in range(inner):
+            o = _F.scaled_dot_product_attention(o, k, v)  # each output is the next call's query
+        return o
+
+    return victim, [b_, h_, s_, d_, inner]
+
+
+def _probe_victim(torch, victim, copy_engine_copy, sm_copy, dst, src, copy_stream, iters):
+    """Time `victim` alone, then overlapped with a copy-engine copy, then overlapped with an SM-copy.
+    Returns (t_victim_us, t_with_ce_us, t_with_sm_us, ce_slowdown, sm_slowdown, near_zero)."""
+    for _ in range(3):  # warm up all three callables
+        victim(); copy_engine_copy(); sm_copy()
+    torch.cuda.synchronize()
+    base_ms, ce_ms, sm_ms = (_time_loop(torch, step, iters) for step in (
+        lambda: victim(), lambda: (copy_engine_copy(), victim()), lambda: (sm_copy(), victim())))
+    copy_stream.synchronize()
+    ce_ratio, sm_ratio = (ce_ms / base_ms, sm_ms / base_ms) if base_ms > 0 else (None, None)
+    # near-zero SM use: the copy-engine overlap stays under 1.15x and clearly below the SM-copy overlap
+    near_zero = (ce_ratio is not None and sm_ratio is not None
+                 and ce_ratio < 1.15 and (sm_ratio - ce_ratio) > 0.05)
+    as_us = lambda ms: round(ms * 1e3, 4)
+    return (as_us(base_ms), as_us(ce_ms), as_us(sm_ms), round(ce_ratio, 4) if ce_ratio else None,
+            round(sm_ratio, 4) if sm_ratio else None, bool(near_zero))
+
+
+def _sm_validation(torch, device, nbytes: int, iters: int) -> dict:
+    """Return evidence (a dict) that the copy-engine path uses ~0 SMs.
+
+    Tries pynvml SM utilization sampling first; always also runs the concurrent-kernel
+    non-interference probe (GEMM victim, then attention victim) and records ALL of them. Each
+    stage catches its own exceptions and records the error instead of aborting the bench."""
+    elem = 2  # float16
+    n = max(1, nbytes // elem)
+    src = torch.randn(n, dtype=torch.float16, device=device)
+    dst = torch.empty(n, dtype=torch.float16, device=device)
+    copy_stream = torch.cuda.Stream()
+    victim, gemm_shape = _victim_kernel_factory(torch, device)
+
+    result: dict = {
+        "method": None,
+        "nvml": None,
+        "non_interference": None,
+        "copy_engine_uses_near_zero_sms": None,
+        "proxy_doc": (
+            "Non-interference proxy: a long SM-bound victim kernel timed alone "
+            "(t_victim) vs concurrent with a copy-engine copy on a separate "
+            "stream (t_with_ce) vs concurrent with an SM-copy (t_with_sm). "
+            "ce_slowdown=t_with_ce/t_victim ~1.0 => the copy engine stole no SM "
+            "cycles; sm_slowdown=t_with_sm/t_victim >1.0 => the SM-copy did. "
+            "copy_engine_uses_near_zero_sms is asserted when ce_slowdown is "
+            "materially smaller than sm_slowdown (and < ce_slowdown_threshold)."
+        ),
+    }
+
+    # ---- primary: pynvml SM utilization while copying on the copy engine ----
+    try:
+        import pynvml  # type: ignore
+        pynvml.nvmlInit()
+        try:  # always pair nvmlInit with nvmlShutdown, even when a query or sample below raises
+            # NOTE(review): the torch device index equals the NVML index only when
+            # CUDA_VISIBLE_DEVICES does not remap/reorder GPUs — confirm for shared nodes.
+            handle = pynvml.nvmlDeviceGetHandleByIndex(torch.cuda.current_device())
+
+            def _sample_during(fn, n_samples=40) -> float:
+                # launch a long stream of the op, sample SM util repeatedly, take max
+                import time
+                for _ in range(3):
+                    fn()
+                samples = []
+                # keep the queue full while sampling
+                for _ in range(n_samples):
+                    for _ in range(8):
+                        fn()
+                    u = pynvml.nvmlDeviceGetUtilizationRates(handle)
+                    samples.append(u.gpu)
+                    time.sleep(0.001)
+                torch.cuda.synchronize()
+                return max(samples) if samples else 0.0
+
+            ce_util = _sample_during(lambda: _copy_engine_copy(torch, dst, src, copy_stream))
+            sm_util = _sample_during(lambda: _sm_copy(torch, dst, src, copy_stream))
+            result["nvml"] = {
+                "source": "pynvml nvmlDeviceGetUtilizationRates (whole-GPU SM util %)",
+                "copy_engine_max_sm_util_pct": ce_util,
+                "sm_copy_max_sm_util_pct": sm_util,
+                "note": "whole-GPU util is a coarse proxy; copy-engine should read low, SM-copy high",
+            }
+        finally:
+            pynvml.nvmlShutdown()
+    except Exception as exc:
+        result["nvml"] = {"available": False, "error": repr(exc)}
+
+    # ---- always: concurrent-kernel non-interference probe ----
+    try:
+        # warmup
+        for _ in range(3):
+            victim()
+            _copy_engine_copy(torch, dst, src, copy_stream)
+            _sm_copy(torch, dst, src, copy_stream)
+        torch.cuda.synchronize()
+
+        t_victim = _time_loop(torch, lambda: victim(), iters)
+
+        def _victim_with_ce():
+            _copy_engine_copy(torch, dst, src, copy_stream)
+            victim()
+
+        def _victim_with_sm():
+            _sm_copy(torch, dst, src, copy_stream)
+            victim()
+
+        t_with_ce = _time_loop(torch, _victim_with_ce, iters)
+        t_with_sm = _time_loop(torch, _victim_with_sm, iters)
+        copy_stream.synchronize()
+
+        ce_slow = (t_with_ce / t_victim) if t_victim > 0 else None
+        sm_slow = (t_with_sm / t_victim) if t_victim > 0 else None
+        threshold = 1.15
+        near_zero = (
+            ce_slow is not None and sm_slow is not None
+            and ce_slow < threshold and (sm_slow - ce_slow) > 0.05
+        )
+        result["non_interference"] = {
+            "victim_kernel": "matmul x8 (fp16 2048^3)",
+            "gemm_shape": gemm_shape,
+            "t_victim_us": round(t_victim * 1e3, 4),
+            "t_victim_with_copy_engine_us": round(t_with_ce * 1e3, 4),
+            "t_victim_with_sm_copy_us": round(t_with_sm * 1e3, 4),
+            "ce_slowdown": round(ce_slow, 4) if ce_slow else None,
+            "sm_slowdown": round(sm_slow, 4) if sm_slow else None,
+            "ce_slowdown_threshold": threshold,
+        }
+        result["copy_engine_uses_near_zero_sms"] = bool(near_zero)
+        result["method"] = ("nvml+non-interference"
+                            if result.get("nvml", {}).get("source") else "non-interference")
+    except Exception as exc:
+        result["non_interference"] = {"error": repr(exc)}
+        result["method"] = result["method"] or "failed"
+
+    # ---- copy-vs-ATTENTION interference (goal "Interference with attention kernels") ----
+    # Same probe with a flash-attention (scaled_dot_product_attention) victim instead of GEMM, so
+    # the copy engine's non-interference is shown against BOTH expert-GEMM and attention kernels.
+    try:
+        avictim, ashape = _attention_victim_factory(torch, device)
+        tv, tce, tsm, ce_s, sm_s, az = _probe_victim(
+            torch, avictim,
+            lambda: _copy_engine_copy(torch, dst, src, copy_stream),
+            lambda: _sm_copy(torch, dst, src, copy_stream),
+            dst, src, copy_stream, iters)
+        result["non_interference_attention"] = {
+            "victim_kernel": "scaled_dot_product_attention x6 (fp16 [8,32,2048,128])",
+            "attn_shape": ashape, "t_victim_us": tv,
+            "t_victim_with_copy_engine_us": tce, "t_victim_with_sm_copy_us": tsm,
+            "ce_slowdown": ce_s, "sm_slowdown": sm_s, "ce_slowdown_threshold": 1.15}
+        result["copy_engine_uses_near_zero_sms_attention"] = az
+    except Exception as exc:
+        result["non_interference_attention"] = {"error": repr(exc)}
+
+    return result
+
+
+def run_gpu(args) -> tuple[list[dict], dict, str | None]:
+    """Time every SUBOPS (op, engine) pair over the size ladder -> (rows, diagnostics, error)."""
+    try:
+        import torch
+    except Exception as exc:  # pragma: no cover
+        return [], {}, f"torch unavailable: {exc!r}"
+    if not torch.cuda.is_available():
+        return [], {}, "torch.cuda.is_available() is False (no GPU in this container)"
+    # Accelerator-aware labels: the bench body is identical on NVIDIA and AMD/ROCm (torch.cuda maps
+    # to HIP), but the off-SM DMA path is the copy engine on NVIDIA and the SDMA (System DMA)
+    # engines on ROCm, so the engine is labeled honestly per accelerator. pynvml is absent on ROCm,
+    # so _sm_validation falls back to the pure-torch non-interference path automatically.
+    hip_version = getattr(torch.version, "hip", None)
+    on_rocm = bool(hip_version)
+
+    dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[args.dtype]
+    sizes = size_ladder(args.min_bytes, args.max_bytes, args.factor)
+    copy_stream = torch.cuda.Stream()
+
+    rows: list[dict] = []
+    for op, engine in SUBOPS:
+        for nbytes in sizes:
+            try:
+                row = _bench_one(torch, op, engine, nbytes, dtype,
+                                 args.warmup, args.iters, copy_stream)
+            except RuntimeError as exc:  # keep the failed point in the sweep as a null row
+                row = {"op": op, "engine": engine, "size_bytes": nbytes,
+                       "requested_bytes": nbytes, "latency_us": None,
+                       "bandwidth_gbps": None, "error": repr(exc)}
+            rows.append(row)
+
+    sm_validation = _sm_validation(torch, torch.device("cuda"),
+                                   args.validation_bytes, max(10, args.iters))
+    diagnostics = {
+        "sm_validation": sm_validation,
+        "device_name": torch.cuda.get_device_name(0),
+        "multiprocessor_count": torch.cuda.get_device_properties(0).multi_processor_count,
+        "accelerator": "rocm" if on_rocm else "cuda",
+        "copy_engine_kind": "sdma" if on_rocm else "copy-engine",   # "sdma" on AMD/ROCm
+        "hip_version": hip_version,
+    }
+    return rows, diagnostics, None
+
+
+# --------------------------------------------------------------------------- #
+# document assembly + CLI                                                      #
+# --------------------------------------------------------------------------- #
+def build_doc(args, rows: list[dict], diagnostics: dict, error: str | None) -> dict:
+    """Assemble the result document; every row is tagged in place with its curve's comparison_key."""
+    measured = [row for row in rows if row.get("bandwidth_gbps")]
+    peak_bw = max((row["bandwidth_gbps"] for row in measured), default=0.0)
+    # gate: must have transferred on BOTH the copy-engine and SM paths with bw>0
+    ce_ok = any(row["engine"] == "copy-engine" and row.get("bandwidth_gbps") for row in rows)
+    sm_ok = any(row["engine"] == "sm" and row.get("bandwidth_gbps") for row in rows)
+    transferred = bool(measured) and peak_bw > 0.0 and ce_ok and sm_ok
+
+    meta = dict(
+        op="memcpy", engine="mixed", dtype=args.dtype,
+        transport=args.transport, topology_class=args.topology_class,
+        comparison_class=args.comparison_class,
+        measurement_contract=MEASUREMENT_CONTRACT,
+    )
+    # one comparison key per (op, engine) curve; the document-level key is built from `meta` itself
+    curve_keys = {f"{op}/{engine}": comparison_key(dict(meta, op=op, engine=engine))
+                  for op, engine in SUBOPS}
+    for row in rows:
+        row["comparison_key"] = curve_keys.get(f"{row['op']}/{row['engine']}")
+
+    sm_validation = diagnostics.get("sm_validation", {})
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "family": FAMILY,
+        "generated_by": GENERATED_BY,
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner,
+        "transport": args.transport,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+        "topology_class": args.topology_class,
+        "comparison_class": args.comparison_class,
+        "dtype": args.dtype,
+        "sub_ops": [f"{o}/{e}" for o, e in SUBOPS],
+        "comparison_key": comparison_key(meta),
+        "curve_keys": curve_keys,
+        "status": "valid" if transferred else "invalid",
+        "error": error,
+        # "copy-engine" on NVIDIA, "sdma" on AMD/ROCm (same off-SM DMA-engine role) — labeled so the
+        # AMD SDMA result is not conflated with the NVIDIA copy-engine result in the plot.
+        "accelerator": diagnostics.get("accelerator"),
+        "copy_engine_kind": diagnostics.get("copy_engine_kind"),
+        "peak_bandwidth_gbps": round(peak_bw, 3),
+        "copy_engine_uses_near_zero_sms": sm_validation.get("copy_engine_uses_near_zero_sms"),
+        "sweep": {"min_bytes": args.min_bytes, "max_bytes": args.max_bytes,
+                  "factor": args.factor, "warmup": args.warmup, "iters": args.iters},
+        "num_rows": len(rows),
+        "rows": rows,
+        "diagnostics": diagnostics,
+        "provenance": _provenance(),
+        "environment": _load_env(args.env_json),
+    }
+
+
+def main() -> int:
+    """CLI entry: parse args, run the sweep (skipped under --parse-only), write the doc; 0 iff valid."""
+    parser = argparse.ArgumentParser(description="CollectiveX copy-engine vs SM copy bench (NVIDIA)")
+    parser.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    parser.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    parser.add_argument("--factor", type=int, default=DEFAULT_FACTOR)
+    parser.add_argument("--dtype", default="float16", choices=["float16", "bfloat16", "float32"])
+    parser.add_argument("--warmup", type=int, default=5)
+    parser.add_argument("--iters", type=int, default=30)
+    parser.add_argument("--validation-bytes", type=int, default=16 * 1024 * 1024,
+                        help="copy size used by the SM-utilization validation probe")
+    parser.add_argument("--parse-only", action="store_true",
+                        help="emit a well-formed (status=invalid) doc with no GPU — schema check")
+    parser.add_argument("--runner", required=True)
+    parser.add_argument("--topology-class", required=True)
+    parser.add_argument("--transport", default="nvlink",
+                        help="DtoD transport: nvlink (intra-node) | pcie")
+    parser.add_argument("--comparison-class", default="standardized",
+                        choices=["standardized", "backend-optimized", "framework-integrated"])
+    parser.add_argument("--env-json")
+    parser.add_argument("--timestamp")
+    parser.add_argument("--out", required=True)
+    args = parser.parse_args()
+
+    # --parse-only never touches the GPU: the doc is built from an empty sweep plus an error note.
+    outcome = ([], {}, "parse-only (no GPU run)") if args.parse_only else run_gpu(args)
+    doc = build_doc(args, *outcome)
+
+    out_dir = os.path.dirname(os.path.abspath(args.out))
+    os.makedirs(out_dir, exist_ok=True)
+    with open(args.out, "w") as out_file:
+        json.dump(doc, out_file, indent=2)
+        out_file.write("\n")
+
+    method = doc["diagnostics"].get("sm_validation", {}).get("method")
+    summary = (
+        f"copy-engine: {doc['num_rows']} rows -> {args.out} "
+        f"(status={doc['status']}, peak_bw={doc['peak_bandwidth_gbps']} GB/s, "
+        f"ce_near_zero_sms={doc['copy_engine_uses_near_zero_sms']}, "
+        f"method={method}, key={doc['comparison_key']})"
+    )
+    print(summary, file=sys.stderr)
+    return 0 if doc["status"] == "valid" else 1
+
+
+if __name__ == "__main__":  # script entry point: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py
new file mode 100644
index 000000000..ee300b58f
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP (NVIDIA), normal mode.
+
+The harness owns the deterministic shared routing trace, the comm-only timing, and
+the doc; this file owns only DeepEP's API calls and its correctness reference.
+`make_problem` materializes the harness-provided rank slice (no RNG here), so every
+SKU runs the identical routed workload.
+
+Correctness, normal mode (per DeepEP's intranode test): a pure dispatch->combine round trip
+with no expert compute reconstructs x only after dividing by the number of ranks each token was
+sent to, so the harness expects combined ≈ x * is_token_in_rank.sum(dim=1). LL: x * sum(topk_weights).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    from deep_ep import Buffer  # type: ignore
+    import deep_ep  # for version/provenance
+except Exception as exc:  # pragma: no cover - needs the built DeepEP
+    print("ERROR: deep_ep import failed — DeepEP must be present/built at job setup. "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _deepep_version() -> str:  # dist metadata first, then deep_ep.__version__, then "unknown"
+    try:
+        from importlib.metadata import version as _dist_version
+        return _dist_version("deep_ep")
+    except Exception:
+        return getattr(deep_ep, "__version__", "unknown")
+
+
+# DeepEP's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token
+# block-128 scale (deep_ep 1.2.1 ships NO helper for this — utils is empty — so we
+# implement the exact convention its kernels expect: scales [T, H//128] float32, e4m3,
+# 448 = e4m3 max). Under layout-and-dispatch-v1 / cached-layout-comm-only-v1 both directions of
+# the cast run OUTSIDE the timed window (make_problem / stage); runtime-visible-v1 times them in dispatch.
+_FP8_MAX = 448.0    # e4m3 max magnitude: block/token amax is mapped onto this value
+_FP8_BLOCK = 128    # hidden-dim elements covered by one scale entry
+
+
+def _per_token_cast_to_fp8(x):
+    """Block-128 fp8 cast (DeepEP default): [T,H] (H % 128 == 0) -> (e4m3fn [T,H], f32 scales [T,H//128])."""
+    tokens, hidden = x.shape
+    blocks = x.float().view(tokens, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    block_amax = blocks.abs().amax(dim=2).clamp(min=1e-4)     # one amax per 128-elem block per token
+    quantized = (blocks * (_FP8_MAX / block_amax[:, :, None])).to(torch.float8_e4m3fn)
+    scales = (block_amax / _FP8_MAX).contiguous()
+    return quantized.view(tokens, hidden), scales
+
+
+def _per_token_cast_to_fp8_pertoken(x):
+    """Per-token fp8 cast: ONE amax per token (over all H), broadcast across the H//128 scale blocks."""
+    # Coarser than block-128 (slightly higher quant error) but the same scale transport cost.
+    tokens, hidden = x.shape
+    x_f32 = x.float()
+    token_amax = x_f32.abs().amax(dim=1, keepdim=True).clamp(min=1e-4)     # [T, 1]
+    scales = (token_amax / _FP8_MAX).expand(tokens, hidden // _FP8_BLOCK).contiguous()
+    return (x_f32 * (_FP8_MAX / token_amax)).to(torch.float8_e4m3fn), scales
+
+
+def _directcast_to_fp8(x):
+    """Direct-cast fp8: clamp to the e4m3 range and cast with NO learned scale (unit scales)."""
+    # Nothing is derived from the data, but activations above e4m3 max are truncated — the recipe
+    # MoRI PR311 replaced for accuracy. scales=ones so _per_block_dequant is the plain cast-back.
+    tokens, hidden = x.shape
+    unit_scales = torch.ones((tokens, hidden // _FP8_BLOCK), dtype=torch.float32, device=x.device)
+    clamped = x.float().clamp(-_FP8_MAX, _FP8_MAX)
+    return clamped.to(torch.float8_e4m3fn), unit_scales
+
+
+# dispatch_dtype value -> (scale_layout label, cast fn). Every recipe feeds DeepEP's same (fp8, scales)
+# kernel input; only the quant recipe differs, so they are distinct OPERATING POINTS, not dtypes.
+_FP8_RECIPES = dict((
+    ("fp8", ("per-block-128", _per_token_cast_to_fp8)),
+    ("fp8-pertoken", ("per-token", _per_token_cast_to_fp8_pertoken)),
+    ("fp8-directcast", ("direct-cast", _directcast_to_fp8)),
+))
+
+
+def _per_block_dequant(x_fp8, scales):
+    # inverse of the fp8 casts: [R,H] e4m3 + [R, H//128] f32 scales -> [R,H] bf16
+    rows, hidden = x_fp8.shape
+    blocks = x_fp8.float().view(rows, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    return (blocks * scales[:, :, None]).view(rows, hidden).to(torch.bfloat16)
+
+
+def _per_block_dequant_3d(x_fp8, scales):
+    # LL recv layout: [E, S, H] e4m3 + [E, S, H//128] f32 scales -> [E, S, H] bf16
+    experts, slots, hidden = x_fp8.shape
+    blocks = x_fp8.float().view(experts, slots, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    return (blocks * scales[..., None]).view(experts, slots, hidden).to(torch.bfloat16)
+
+
+def _mnnvl_buffer_kwargs() -> dict:
+    """Extra `Buffer` kwargs for a cross-tray (NVL72/MNNVL) allocation; {} everywhere else.
+
+    Returns {"allow_mnnvl": True} only when CX_ALLOW_MNNVL=1 (exported by the rack launcher on a
+    real multi-tray MNNVL allocation) AND the installed Buffer.__init__ accepts `allow_mnnvl`.
+    DeepEP V2's `Buffer` added that parameter (default False); left False, DeepEP itself sets
+    NVSHMEM_DISABLE_MNNVL=1 and the legacy buffer falls onto the intranode-only CUDA-IPC peer path,
+    which faults across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). The bundled
+    V1 `Buffer` predates the parameter (its NVL buffer already spans MNNVL trays), so it is never
+    passed there — x86 single-node and bundled-V1 rack paths stay byte-for-byte unchanged.
+    """
+    if os.environ.get("CX_ALLOW_MNNVL") == "1":
+        import inspect
+        try:
+            accepted = inspect.signature(Buffer.__init__).parameters
+        except (ValueError, TypeError):  # signature not introspectable -> pass nothing extra
+            accepted = ()
+        if "allow_mnnvl" in accepted:
+            return {"allow_mnnvl": True}
+    return {}
+
+
+class DeepEPBackend:
+    """CollectiveX EP backend adapter over deep_ep.Buffer: normal and low-latency (ll) modes."""
+    name = "deepep"
+    combine_needs_redispatch = False  # DeepEP combine reuses the handle (its own bench does too)
+    # Blackwell (B300) drops GPU clocks during the tiny small-T points, so the harness
+    # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100.
+    wants_warm_burst = True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
+    #   normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink.
+    #   ll mode: low_latency_dispatch/combine — verified RUNNING intranode over NVLink via
+    #   allow_nvlink_for_low_latency_mode (IBGDA not required intranode) on 8xH100.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8", "fp8-pertoken", "fp8-directcast"}
+    SUPPORTED_MODES = {"normal", "ll"}
+    # Three contracts (review #3 + goal P1 runtime-visible):
+    #   layout-and-dispatch-v1     — times get_dispatch_layout INSIDE dispatch; fp8 cast/dequant
+    #                                OUTSIDE (preprocessing mirrors a producer handing quantized x).
+    #   cached-layout-comm-only-v1 — layout hoisted out (untimed); dispatch = pure comm (DeepEP's
+    #                                own benchmark boundary). normal mode only.
+    #   runtime-visible-v1         — the serving-realistic boundary: dispatch INCLUDES the fp8
+    #                                quant (cast) + layout + comm + the recv-dequant that makes
+    #                                expert input consumable; combine starts from bf16 expert
+    #                                outputs. (normal mode; LL already times all of this in-kernel.)
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Resolve contract/precision flags from args, then build the mode's Buffer (local_rank unused)."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.ll = (args.mode == "ll")
+        self.contract = args.measurement_contract
+        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
+        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
+        # runtime-visible-v1: the fp8 cast + recv-dequant move INSIDE the timed dispatch (normal
+        # mode). LL already times cast+layout+comm in its single kernel, so it's runtime-visible
+        # by construction — the flag only changes normal mode's boundary.
+        self.runtime_visible = (self.contract == "runtime-visible-v1") and not self.ll
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
+        # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is
+        # recorded in the artifact so the looser fp8 gate is explicit, not hidden.
+        self.fp8 = args.dispatch_dtype.startswith("fp8")
+        # fp8 scale-layout recipe (per-block-128 default / per-token / direct-cast) — all use the
+        # same DeepEP fp8 kernel; only the cast differs. Recorded so they're distinct operating points.
+        self.fp8_recipe, self._fp8_cast = _FP8_RECIPES.get(
+            args.dispatch_dtype, ("per-block-128", _per_token_cast_to_fp8))
+        self.scale_layout = self.fp8_recipe if self.fp8 else None
+        # direct-cast truncates above e4m3 (no scale) -> a touch looser gate than scaled recipes.
+        self.tolerance = ((1.5e-1 if self.fp8_recipe == "direct-cast" else 1.25e-1)
+                          if self.fp8 else 5e-2)
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_version()
+        if self.ll:
+            self._init_ll(args, dev_sms, ver)
+        else:
+            self._init_normal(args, rank, dev_sms, ver)
+
+    def _init_normal(self, args, rank, dev_sms, ver):
+        """Normal mode: NVLink Buffer + SM budget + provenance. fp8 cast is UNTIMED (make_problem) under
+        layout-and-dispatch / cached-layout and TIMED (inside dispatch) under runtime-visible-v1."""
+        self.fp8_in_timing = (self.runtime_visible if self.fp8 else None)   # tracks the contract
+        self.combine_needs_redispatch = False  # normal combine reuses the handle
+        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points
+        # (review: a phase-dependent 2/4 GiB made the shared T=128 point differ between
+        # the decode and prefill sweeps). 4 GiB holds T up to 4096 (validated).
+        num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES", str(4 * 1024 * 1024 * 1024)))
+        mnnvl_kw = _mnnvl_buffer_kwargs()
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0, **mnnvl_kw)
+        rm = args.resource_mode
+        tuned_src = None
+        if rm == "normalized":
+            # ~same device fraction as MoRI, snapped to the nearest even count: set_num_sms asserts even.
+            num_sms = max(2, 2 * round(args.sm_fraction * dev_sms / 2))
+        elif rm == "tuned":
+            # Best-available for the installed DeepEP: its OWN default SM count (Buffer.num_sms — the
+            # library's analytic choice; it deliberately uses fewer SMs). get_dispatch_config(num_ranks)
+            # returns the recommended Config but doesn't expose num_sms to Python; the default reflects it.
+            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+            tuned_src = "deepep-default-num_sms"
+        else:  # default — the bring-up budget
+            num_sms = args.num_sms
+        try:
+            Buffer.set_num_sms(num_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            if rank == 0:
+                print(f"WARN: could not set num_sms={num_sms}: {exc!r}", file=sys.stderr)
+            num_sms = int(getattr(Buffer, "num_sms", num_sms))   # record the count actually in effect
+        self.backend_provenance = {
+            "deepep_version": ver,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
+            "mode": "normal", "resource_mode": rm, "num_sms": num_sms, "device_sms": dev_sms,
+            "sm_fraction": (num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
+            "num_nvl_bytes": num_nvl_bytes, "allow_mnnvl": bool(mnnvl_kw),
+            "fp8_recipe": self.fp8_recipe if self.fp8 else "n/a",
+            "scale_layout": self.scale_layout,
+        }
+
+    def _init_ll(self, args, dev_sms, ver):
+        """Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink
+        via allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch
+        so for fp8 the quantization IS inside the timed window (recorded honestly). The
+        buffer is sized for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical),
+        so LL is a decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop).
+        set_num_sms does NOT apply (the LL kernel picks its own occupancy) — num_sms recorded None."""
+        self.fp8_in_timing = (True if self.fp8 else None)
+        self.combine_needs_redispatch = True   # re-dispatch (untimed) before each timed combine
+        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))
+        self.experts = args.experts
+        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+            self.num_max, args.hidden, self.world_size, args.experts)
+        # one QP per local expert is the DeepEP convention for LL
+        self.num_qps = max(1, args.experts // self.world_size)
+        mnnvl_kw = _mnnvl_buffer_kwargs()
+        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,
+                             num_qps_per_rank=self.num_qps,
+                             allow_nvlink_for_low_latency_mode=True, **mnnvl_kw)
+        self.backend_provenance = {
+            "deepep_version": ver,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
+            "mode": "ll", "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
+            "num_max_dispatch_tokens_per_rank": self.num_max,
+            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
+            "low_latency_mode": True, "use_fp8": self.fp8, "allow_mnnvl": bool(mnnvl_kw),
+        }
+
+    def buffer_cap(self, args):
+        """LL is sized for a fixed num_max; cap the sweep there (reported, not silent). Else None."""
+        return self.num_max if self.ll else None
+
+    def make_problem(self, T, idx, weights, x):
+        """Wrap the shared trace slice: idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16."""
+        p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                  topk_weights=weights.to(torch.float32), layout=None)
+        if self.fp8 and not self.ll and not self.runtime_visible:
+            # layout-and-dispatch / cached-layout: per-token block-128 cast, UNTIMED (preprocessing,
+            # mirrors the real producer that hands the dispatcher already-quantized activations).
+            # runtime-visible does NOT pre-cast (the cast is timed inside dispatch); LL casts in-kernel.
+            p.x_fp8, p.x_scales = self._fp8_cast(x)
+        if self.cache_layout:
+            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed)
+            # so the timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None
+            # and dispatch computes it inside the timed window.)
+            ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+            p.layout = (ntr, ntpe, itir)
+        return p
+
+    def dispatch(self, p):
+        """Timed dispatch (layout unless cached; fp8 cast + recv-dequant too under runtime-visible-v1)."""
+        if self.ll:
+            return self._dispatch_ll(p)
+        if p.layout is not None:                       # cached-layout-comm-only-v1
+            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
+        else:                                          # layout-and-dispatch / runtime-visible (timed layout)
+            (num_tokens_per_rank, _, num_tokens_per_expert,
+             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        ref_fp8 = ref_scales = None
+        if self.fp8:
+            if self.runtime_visible:
+                # runtime-visible: the per-token block-128 cast is INSIDE the timed dispatch.
+                x_fp8, x_scales = self._fp8_cast(p.x)
+                ref_fp8, ref_scales = x_fp8, x_scales      # for the correctness reference
+            else:
+                x_fp8, x_scales = p.x_fp8, p.x_scales      # pre-cast (untimed)
+            x_in = (x_fp8, x_scales)
+        else:
+            x_in = p.x
+        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
+            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert)
+        out = types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
+            is_token_in_rank=is_token_in_rank, ref_fp8=ref_fp8, ref_scales=ref_scales)
+        if self.fp8 and self.runtime_visible:
+            # dispatch ENDS when expert input is consumable: dequant fp8 recv -> bf16 INSIDE the
+            # timed window (the contract's "expert input genuinely consumable" boundary). stage()
+            # then no-ops for this contract.
+            recv_fp8, recv_scales = recv_x
+            out.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+            out.rv_staged = True
+        return out
+
+    def _dispatch_ll(self, p):
+        """x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the
+        cast IS inside this timed op — fp8_in_timing=True). recv is the expert-major
+        3D layout [num_local_experts, num_max*world, hidden] (+scales when fp8)."""
+        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
+            p.x, p.topk_idx, self.num_max, self.experts,
+            use_fp8=self.fp8, return_recv_hook=False)
+        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)
+
+    def stage(self, p, h):
+        """comm-only contract: "expert outputs" already exist as recv_x. Dequantize fp8 recv to
+        bf16 HERE (untimed) — the expert-compute boundary — so combine moves bf16 in both
+        precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.)"""
+        if getattr(h, "rv_staged", False):
+            return None   # runtime-visible already produced bf16 combine_input inside dispatch (timed)
+        if self.fp8:
+            recv_fp8, recv_scales = h.recv_x
+            dequant = _per_block_dequant_3d if self.ll else _per_block_dequant
+            h.combine_input = dequant(recv_fp8, recv_scales)
+        else:
+            h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):  # timed combine of h.combine_input; returns the combined tensor
+        if self.ll:
+            # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones.
+            combined_x, _event, _hook = self.buffer.low_latency_combine(
+                h.combine_input, p.topk_idx, p.topk_weights, h.handle)
+            return combined_x
+        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle,
+                                               topk_weights=h.recv_topk_weights)
+        return combined_x
+
+    def expected(self, p, h):
+        """Return (expected combine output, p.T) for the identity-expert round trip."""
+        if self.ll:
+            # LL combine reduces each token's topk expert copies weighted by topk_weights;
+            # with no expert compute each copy is (the kernel's fp8 cast of) x, so
+            # combined ≈ x * sum(topk_weights). fp8 quant error is covered by self.tolerance.
+            wsum = p.topk_weights.sum(dim=1, keepdim=True)
+            return p.x.float() * wsum, p.T
+        # normal: round trip with no expert compute reconstructs x*(#destination ranks);
+        # for fp8 compare against the dequantized cast that was actually sent.
+        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+        ref = p.x.float()
+        if self.fp8:
+            # runtime-visible cast lives on the handle (no pre-cast on p); else use the pre-cast.
+            x_fp8 = getattr(h, "ref_fp8", None)
+            x_scales = getattr(h, "ref_scales", None)
+            if x_fp8 is None:
+                x_fp8, x_scales = p.x_fp8, p.x_scales
+            ref = _per_block_dequant(x_fp8, x_scales).float()
+        return ref * ranks_per_token, p.T
+
+    def recv_tokens(self, h):  # rows received by this rank (LL: sum of the per-expert recv counts)
+        if self.ll:
+            return int(h.recv_count.sum().item())  # token-copies received across local experts
+        rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x
+        return int(rx.shape[0])
+
+    def finalize(self, rc):  # best-effort process-group teardown; rc is always returned unchanged
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:
+            print(f"WARN: process-group teardown failed: {exc!r}", file=sys.stderr)
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_deepep_hybrid.py b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
new file mode 100644
index 000000000..594cae735
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP `hybrid-ep` branch (NVIDIA TMA-based HybridEPBuffer).
+
+The hybrid-ep branch (https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is NVIDIA's TMA +
+warp-pipeline implementation of expert-parallel all-to-all, exposing `deep_ep.HybridEPBuffer`
+(distinct from the mainline `deep_ep.Buffer`). HybridEP is NVIDIA's MoE backend built for NVL72
+rack-scale (Megatron `moe_flex_dispatcher_backend="hybridep"`). This adapter drives the single-
+NVLink-domain path (`num_of_hybrid_ep_ranks_per_nvlink_domain == world_size`, <=8 ranks). That domain
+is ONE node on x86 — but on a GB200/GB300 NVL72 the MNNVL fabric makes multiple trays a single NVLink
+domain, so the SAME path spans trays: gb300 EP8 (8 ranks / 2 trays) is validated `transport=mnnvl`,
+decode 8/8 + prefill 6/6 (run 28480519588). The container build is done by runtime/run_in_container.sh
+`cx_build_deepep_hybrid` (CUDA-13 cccl include + libnvshmem symlink fixes; pip-installed so it persists
+across the EP8 multi-srun's separate srun steps).
+
+API (pinned on B300, branch e0a5b1d):
+  HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...)
+  .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle)
+  .combine(hidden, handle=) -> [T, hidden]
+
+CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is
+reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8
+round trip gives relerr(combined, x) = 4.28, matching E[distinct ranks] ~ 5.26 exactly. So this uses
+the SAME "ranks" factor as ep_flashinfer (per-rank-sum combine, no gate re-weight). bf16 tol 5e-2.
+
+STATUS: bf16 / normal / layout-and-dispatch-v1. Single-NVLink-domain path (<=8 ranks) validated on x86
+single-node AND across GB300 NVL72 trays at EP8 via MNNVL (one NVLink domain, run 28480519588). fp8 and
+the cross-RACK (>1 NVL72, IBGDA/RDMA) path are further lift; docs/gated.md rack-scale.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    import deep_ep
+    HybridEPBuffer = deep_ep.HybridEPBuffer
+except Exception as exc:  # pragma: no cover - needs the hybrid-ep build
+    print("ERROR: deep_ep.HybridEPBuffer import failed — the hybrid-ep branch must be built at job "
+          "setup (cx_build_deepep_hybrid). "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _deepep_hybrid_version() -> str:
+    return os.environ.get("DEEPEP_COMMIT") or getattr(deep_ep, "__version__", "hybrid-ep")  # empty env == unset
+
+
+class DeepEPHybridBackend:
+    """EP adapter driving deep_ep.HybridEPBuffer (bf16, normal mode, one NVLink domain)."""
+    name = "deepep-hybrid"
+    # combine takes the recv payload + dispatch handle (no re-dispatch before a timed combine, like ep_deepep).
+    combine_needs_redispatch = False
+    wants_warm_burst = True
+    # Capabilities — run_ep.py REJECTS anything outside these before construction.
+    SUPPORTED_PRECISIONS = {"bf16"}        # fp8 = use_fp8 path, further lift
+    SUPPORTED_MODES = {"normal"}
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+    SUPPORTED_COMBINE_DTYPES = {"bf16"}
+    SUPPORTED_COMBINE_QUANT_MODES = {"none"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.contract = args.measurement_contract
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        self.tolerance = 5e-2
+        self.fp8_in_timing = None
+        self.top_k = int(args.topk)
+        self.num_experts = int(args.experts)
+        self.hidden = int(args.hidden)
+        self.local_experts = max(1, self.num_experts // world_size)
+        # Token cap (per rank) for the symmetric buffer; the sweep is capped here (buffer_cap).
+        self.max_tokens = int(os.environ.get("CX_HYBRIDEP_MAX_TOKENS", "4096"))
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_hybrid_version()
+
+        # Construct the HybridEPBuffer treating all ranks as ONE NVLink domain (default
+        # num_of_hybrid_ep_ranks_per_nvlink_domain == world_size). On x86 that domain is one node; on a
+        # GB200/GB300 NVL72 the MNNVL fabric makes 2 trays one NVLink domain, so EP8 (8 ranks) is covered
+        # by this same path (validated transport=mnnvl). SM counts default.
+        try:
+            self.buffer = HybridEPBuffer(
+                self.group, hidden_dim=self.hidden,
+                max_num_of_tokens_per_rank=self.max_tokens,
+                num_local_experts=self.local_experts, use_fp8=False)
+        except Exception as exc:
+            raise RuntimeError(
+                f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} "
+                f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc
+        if rank == 0:
+            print(f"[deepep-hybrid] HybridEPBuffer constructed (single NVLink domain, world={world_size}, "
+                  f"local_experts={self.local_experts}, hidden={self.hidden})", file=sys.stderr)
+
+        self.backend_provenance = {
+            "deepep_commit": ver, "branch": "hybrid-ep",
+            "impl": "deep_ep.HybridEPBuffer (NVIDIA TMA + warp-pipeline)",
+            "mode": "normal", "transport": "nvlink-domain",  # one node (x86) or one NVL72 MNNVL domain (gb300 EP8)
+            "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "fixed-kernel",
+            "max_num_tokens": self.max_tokens, "top_k": self.top_k,
+            "num_experts": self.num_experts, "local_experts": self.local_experts,
+            "routing_factor": "ranks",
+        }
+
+    def buffer_cap(self, args):
+        return self.max_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        return types.SimpleNamespace(
+            T=int(T), x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        # HybridEPBuffer.dispatch(...) -> (recv_hidden [n_recv, H], recv_x2, None, handle); the handle
+        # is the LAST tuple-typed element. Fail loud here if it is missing — combine cannot run without it.
+        out = self.buffer.dispatch(p.x, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+                                   num_of_experts=self.num_experts)
+        parts = out if isinstance(out, (tuple, list)) else (out,)
+        recv = parts[0]
+        handle = next((o for o in reversed(parts) if isinstance(o, tuple)), None)
+        if handle is None:
+            raise RuntimeError("deepep-hybrid: HybridEPBuffer.dispatch returned no tuple handle "
+                               f"(got {type(out).__name__}); combine() cannot run without it")
+        return types.SimpleNamespace(recv=recv, recv_payload=recv, handle=handle, combine_input=None)
+
+    def stage(self, p, h):
+        """Identity expert: the recv hidden IS the "expert output" that combine reduces per source token."""
+        h.combine_input = h.recv_payload
+        return None
+
+    def combine(self, p, h):
+        """combine(hidden, handle=) -> [T, H] per-source-token reduction (no gate re-weight: "ranks")."""
+        comb = self.buffer.combine(h.combine_input, handle=h.handle)
+        return comb[0] if isinstance(comb, (tuple, list)) else comb
+
+    def expected(self, p, h):
+        """Reference for the identity-expert round trip with a per-RANK-sum combine (no gate weights):
+        x * (distinct ranks among the token's top_k experts). Returns (reference, p.T)."""
+        ref = p.x.float()
+        epr = self.local_experts   # experts-per-rank: the same split the buffer was built with
+        ranks = (p.topk_idx.long() // epr).clamp_(0, self.world_size - 1)        # [T, topk]
+        present = torch.zeros(ranks.shape[0], self.world_size, device=ranks.device, dtype=torch.float32)
+        present.scatter_(1, ranks, 1.0)
+        factor = present.sum(dim=1, keepdim=True)                                # [T, 1] distinct ranks
+        return ref * factor, p.T
+
+    def recv_tokens(self, h):
+        rp = h.recv_payload
+        if torch.is_tensor(rp) and rp.dim() >= 1:
+            return int(rp.shape[0])
+        return 0
+
+    def finalize(self, rc):
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:  # best-effort teardown: report it, but never change rc
+            print(f"[deepep-hybrid] teardown error ignored: {exc!r}", file=sys.stderr)
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_flashinfer.py b/experimental/CollectiveX/tests/ep_flashinfer.py
new file mode 100644
index 000000000..f12b6c709
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_flashinfer.py
@@ -0,0 +1,805 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — FlashInfer EP (NVIDIA), normal mode.
+
+This file owns ONLY FlashInfer's MoE-AllToAll API calls + its correctness reference;
+the harness (ep_harness.py) owns the deterministic shared routing trace, the comm-only
+timing, the correctness gate, and the provenance-tagged doc. The adapter protocol
+(make_problem / dispatch / stage / combine / expected / buffer_cap / recv_tokens /
+finalize + backend_provenance + SUPPORTED_*) mirrors ep_deepep.py exactly.
+
+WHAT FLASHINFER PROVIDES (flashinfer 0.6.8.post1, NVIDIA container):
+  * `flashinfer.comm.MoeAlltoAll(mapping, max_num_tokens, top_k, num_experts)` — a class
+    holding an MNNVL symmetric workspace, with
+      .dispatch(token_selected_experts, input_payloads: list[Tensor],
+                runtime_max_tokens_per_rank, ...)  -> recv payload(s)
+      .combine(payload, runtime_max_tokens_per_rank, payload_in_workspace=False) -> combined
+  * module-level `flashinfer.comm.trtllm_moe_alltoall` and the lower-level
+    `moe_a2a_dispatch` / `moe_a2a_combine` / `moe_a2a_initialize` /
+    `get_workspace_size_per_rank` — the TensorRT-LLM one-sided path. Selected by
+    env CX_FLASHINFER_TRTLLM=1 (provenance trtllm=True); covers goal's
+    "TensorRT-LLM NVLink one-sided AllToAll EP".
+
+The exact kwarg names for dispatch/combine and the Mapping constructor differ across
+FlashInfer point releases. This adapter has NO GPU to validate against, so EVERY
+FlashInfer API call is wrapped to fail LOUD + SPECIFIC (the call site, the kwargs
+tried, and the underlying error) so the parent's GHA smoke shows precisely what to fix
+rather than a bare TypeError. See `_call_variants` and `_build_mapping`.
+
+CORRECTNESS (`expected`): FlashInfer's MoeAlltoAll is expert-centric (TensorRT-LLM MoE
+A2A): `dispatch` sends each token to its top_k selected experts; `combine` gathers the
+per-expert results back and reduces the top_k copies for each SOURCE token. With an
+identity expert (the harness does NO expert compute) and a combine that does NOT apply
+the gate weights (the public `combine(payload, ...)` takes no topk_weights — gate
+weighting is the MoE epilogue, not the comm), the round trip yields:
+      combined ≈ x * top_k          (sum of top_k identical copies of x)
+This is structurally DeepEP-LL-like (per-expert reduce) but WITHOUT LL's weight multiply.
+The alternative (combine applies softmax gate weights, like DeepEP LL) would give
+`x * sum(topk_weights)`. _ROUTING_FACTOR selects the reference ("ranks" | "topk" | "weight-sum",
+env CX_FLASHINFER_ROUTING_FACTOR); the shipped default is "ranks", not the `x * top_k` derived above.
+Tolerance: bf16 5e-2; quantized dispatch formats use the looser per-format values in _QUANT_TOL.
+
+STATUS: normal / layout-and-dispatch-v1. Dispatch precisions: bf16; fp8/fp8-pertoken/
+fp8-directcast (e4m3, DeepEP convention); mxfp8/mxfp4/nvfp4 (OCP-microscaling via
+FlashInfer's native quantizers — the A2A moves [q, scale_factor] as a payload LIST, dequant
+in stage()). Combine stays bf16 (MoeAlltoAll.combine has no output_dtype in 0.6.8.post1).
+The MoeAlltoAll workspace bootstraps inside the single torch.distributed NCCL group of
+same-user ranks (MNNVL symmetric memory) — the launcher/image owns CAP_SYS_PTRACE / FABRIC
+plumbing (docs/gated.md; H200 runner denies the ptrace cap the MNNVL fd-share needs).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    import flashinfer  # for version/provenance
+    import flashinfer.comm as fi_comm  # MoeAlltoAll / trtllm_moe_alltoall / moe_a2a_* live here
+except Exception as exc:  # pragma: no cover - needs the FlashInfer wheel on the container
+    print("ERROR: flashinfer import failed — FlashInfer must be present on the container at job "
+          "setup (cx_build_flashinfer: `pip install flashinfer-python`). "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _flashinfer_version() -> str:
+    """Installed FlashInfer version: wheel metadata first, then the module attribute."""
+    for dist_name in ("flashinfer-python", "flashinfer"):
+        try:
+            import importlib.metadata as _md
+            return _md.version(dist_name)
+        except Exception:
+            continue
+    # Neither distribution name resolved: fall back to flashinfer.__version__ ("unknown" if absent).
+    return getattr(flashinfer, "__version__", "unknown")
+
+
+# --- The round-trip routing factor (see module docstring). Read once at import from env
+# CX_FLASHINFER_ROUTING_FACTOR; the default is "ranks" (NOTE(review): presumably x * #distinct
+# destination ranks, as in ep_deepep_hybrid — confirm against expected()). "topk" = x * top_k
+# (combine sums unweighted copies); "weight-sum" = x * sum(topk_weights) (combine applies gates). ---
+_ROUTING_FACTOR = os.environ.get("CX_FLASHINFER_ROUTING_FACTOR", "ranks")  # "ranks" | "topk" | "weight-sum"
+
+
+def _loud(where: str, attempted, exc: Exception) -> RuntimeError:
+    """Return (not raise) a RuntimeError naming the failed FlashInfer call site, what was
+    attempted and the underlying error, plus how to inspect the installed signatures."""
+    summary = f"FlashInfer EP adapter: {where} failed against flashinfer {_flashinfer_version()}. "
+    detail = f"Attempted: {attempted}. Underlying error: {exc!r}. "
+    fix_hint = ("FIX: inspect the installed flashinfer.comm signatures "
+                "(python3 -c 'import flashinfer.comm as c; help(c.MoeAlltoAll)') and adjust the "
+                "kwarg names / Mapping construction in tests/ep_flashinfer.py.")
+    return RuntimeError(summary + detail + fix_hint)
+
+
+def _call_variants(where: str, fn, variants):
+    """Call `fn` with each candidate (args, kwargs) signature in order until one is accepted.
+    Returns (result, index_of_accepted_variant). A TypeError is read as a signature mismatch
+    and the next variant is tried; any other exception is re-raised at once, wrapped by _loud.
+    If every variant raises TypeError (or `variants` is empty) a _loud error lists each attempt."""
+    failures = []
+    for pos, (pargs, kw) in enumerate(variants):
+        try:
+            outcome = fn(*pargs, **kw)
+        except TypeError as sig_err:    # wrong kwarg name / arity — move on to the next signature
+            failures.append(f"  variant[{pos}] args={_shape_repr(pargs)} kwargs={list(kw)} -> {sig_err!r}")
+        except Exception as real_err:   # genuine runtime failure (e.g. CUDA) — never mask it
+            raise _loud(where, _shape_repr(pargs) + f" kwargs={list(kw)}", real_err)
+        else:
+            return outcome, pos
+    raise _loud(where, "all signature variants exhausted:\n" + "\n".join(failures),
+                TypeError("no matching signature"))
+
+
+def _shape_repr(args):
+    """Compact repr of positional args for error text: tensors show as Tensor<shape>:<dtype>
+    (also one level inside list/tuple args); every other value falls back to repr()."""
+    def _one(item):
+        return f"Tensor{tuple(item.shape)}:{item.dtype}" if torch.is_tensor(item) else repr(item)
+
+    def _render(arg):
+        if isinstance(arg, (list, tuple)):
+            return "[" + ",".join(_one(member) for member in arg) + "]"
+        return _one(arg)
+    return "(" + ", ".join(_render(arg) for arg in args) + ")"
+
+
+def _build_mapping(world_size, rank):
+    """Construct the FlashInfer Mapping for PURE EP. FlashInfer's Mapping REQUIRES
+    world_size == tp_size*pp_size*cp_size, and realizes MoE-EP as a VIEW over the TP dimension
+    (moe_ep_size ranks taken from the tp ranks). So pure EP across all ranks =
+    tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1 (pp=cp=1). The kwarg set varies
+    across releases, so the plausible constructors are tried in order via _call_variants.
+    Returns (mapping, index of the variant that constructed); raises a LOUD error if none do."""
+    Mapping = getattr(fi_comm, "Mapping", None) or getattr(flashinfer, "Mapping", None)
+    if Mapping is None:
+        raise _loud("Mapping lookup",
+                    "flashinfer.comm.Mapping / flashinfer.Mapping not found",
+                    AttributeError("Mapping"))
+    # tp_size=world_size so the world_size==tp*pp*cp invariant holds; moe_ep_size=world_size = full EP.
+    variants = [
+        ((), dict(world_size=world_size, rank=rank, gpus_per_node=world_size,
+                  tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
+        ((), dict(world_size=world_size, rank=rank,
+                  tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
+        ((), dict(world_size=world_size, rank=rank, tp_size=world_size, moe_ep_size=world_size)),
+        ((), dict(world_size=world_size, rank=rank, moe_ep_size=world_size, moe_tp_size=1,
+                  tp_size=world_size)),   # NOTE(review): same kwarg set as variant[1], reordered only
+        ((), dict(world_size=world_size, rank=rank, tp_size=world_size)),   # EP defaults from tp
+        # positional last-resort: (world_size, rank) with tp=world_size
+        ((world_size, rank), dict(tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
+    ]
+    mapping, idx = _call_variants("Mapping(...)", Mapping, variants)
+    return mapping, idx
+
+
+# --------------------------------------------------------------------------------------
+# Quantized dispatch recipes. FlashInfer's MoE A2A dispatch takes input_payloads as a LIST
+# of [local_num_tokens, *] tensors and moves them as bytes (dtype-agnostic) — so a quantized
+# dispatch = pass [q, scale_factor] as the payload list, recv [recv_q, recv_sf], then DEQUANT
+# in stage() (UNTIMED, outside the comm window — the quant/dequant mirrors a producer handing
+# already-quantized activations, exactly like ep_deepep's layout-and-dispatch-v1 contract).
+#
+# Two families:
+#   * e4m3 block-128 / per-token / direct-cast — pure-torch (identical convention to ep_deepep,
+#     so FlashInfer-fp8 and DeepEP-fp8 are the SAME operating point on different transports).
+#   * mxfp8 / mxfp4 / nvfp4 — FlashInfer's native OCP-microscaling quantizers (mxfp8_quantize,
+#     mxfp4_quantize, nvfp4_quantize) + their matching dequantizers. These check goal's
+#     "MXFP8 / MXFP4 / NVFP4 dispatch" — reachable here precisely because the A2A is a byte
+#     mover and FlashInfer ships the quantize/dequantize kernels (flashinfer 0.6.8.post1).
+# The comm-correctness gate compares against the DEQUANTIZED cast that was actually sent
+# (ref = dequant(quant(x)) * factor), so it verifies the COMM, not the quantizer — same as
+# ep_deepep.expected(). Tolerance per format (4-bit fp4 is far looser than 8-bit fp8).
+_FP8_MAX = 448.0    # e4m3 max magnitude: the value each block/token amax is scaled to
+_FP8_BLOCK = 128    # columns of the last dim covered by one e4m3 scale
+
+
+def _e4m3_block128_cast(x):
+    """Per-block-128 e4m3 cast (DeepEP default): returns (x_fp8 [T, H], f32 scales [T, H//128])."""
+    rows, width = x.shape
+    blocks = x.float().view(rows, width // _FP8_BLOCK, _FP8_BLOCK)
+    peak = blocks.abs().amax(dim=2).clamp(min=1e-4)   # floor keeps all-zero blocks finite
+    quantized = (blocks * (_FP8_MAX / peak.unsqueeze(2))).to(torch.float8_e4m3fn)
+    return quantized.view(rows, width), (peak / _FP8_MAX).contiguous()
+
+
+def _e4m3_pertoken_cast(x):
+    """Per-token e4m3 cast: one scale per row, repeated across the H//128 block columns."""
+    rows, width = x.shape
+    row_peak = x.float().abs().amax(dim=1, keepdim=True).clamp(min=1e-4)
+    row_scale = (row_peak / _FP8_MAX).expand(rows, width // _FP8_BLOCK).contiguous()
+    return (x.float() * (_FP8_MAX / row_peak)).to(torch.float8_e4m3fn), row_scale
+
+
+def _e4m3_directcast(x):
+    """Direct e4m3 cast: clamp to +-_FP8_MAX and convert, with all-ones f32 scales [T, H//128]."""
+    rows, width = x.shape
+    unit = torch.ones((rows, width // _FP8_BLOCK), dtype=torch.float32, device=x.device)
+    return x.float().clamp(-_FP8_MAX, _FP8_MAX).to(torch.float8_e4m3fn), unit
+
+
+def _e4m3_dequant_nd(x_fp8, scales):
+    """Undo a block-128 e4m3 cast for 2D [R,H] or 3D [E,S,H] input (scale per 128-block); bf16 out."""
+    *outer, width = x_fp8.shape
+    n_blocks = width // _FP8_BLOCK
+    tiled = x_fp8.float().reshape(*outer, n_blocks, _FP8_BLOCK)
+    scaled = tiled * scales.reshape(*outer, n_blocks, 1)
+    return scaled.reshape(*outer, width).to(torch.bfloat16)
+
+
+class _MicroscaleRecipe:
+    """FlashInfer-native mxfp8 / mxfp4 / nvfp4 quant+dequant, validated on the runner via the
+    library's own kernels. Quantize on a flat [N, H] view (the A2A moves per-token payloads),
+    keep the scale-factor (LINEAR layout, is_sf_swizzled_layout=False) as a SECOND payload,
+    dequant the 3D recv by flattening the [ep, max_tokens] dims to [N, H], then reshaping back.
+    Imports flashinfer in __init__ and fails LOUD if a required quantizer is missing."""
+
+    _MX_BLOCK = 32   # mxfp8 e8m0 block size
+    _NV_VEC = 16     # nvfp4 e4m3 scale block size (sf_vec_size)
+
+    _MXFP4_VEC = 32  # mxfp4 e8m0 block size (sf_vec_size)
+    # OCP e2m1 magnitudes indexed by (exp<<1)|mant (3 low bits); bit3 = sign.
+    _E2M1_MAG = (0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0)
+
+    def __init__(self, kind):
+        self.kind = kind  # "mxfp8" | "nvfp4" | "mxfp4"
+        # mxfp4 is reachable after all: mxfp4_quantize() forces a tile-padded SWIZZLED SF, but the
+        # lower-level fp4_quantize(sf_vec_size=32, sf_use_ue8m0=True, is_sf_swizzled_layout=False)
+        # emits e2m1 + e8m0 in a LINEAR per-token layout (movable through the A2A). dequant is a manual
+        # e2m1 LUT * 2^(e8m0-127) (no flashinfer linear-mxfp4 dequant exists; mxfp4_dequantize wants
+        # swizzled). The dispatch gate is consistency-based, so this validates the comm honestly.
+        import flashinfer as _fi
+        self._fi = _fi
+        need = {"mxfp8": ("mxfp8_quantize",),
+                "nvfp4": ("fp4_quantize", "e2m1_and_ufp8sf_scale_to_float"),
+                "mxfp4": ("fp4_quantize",)}[kind]
+        for fn in need:
+            if not hasattr(_fi, fn):
+                raise _loud(f"{kind} quantizer lookup", f"flashinfer.{fn} not found",
+                            AttributeError(fn))
+
+    def cast(self, x):
+        # Returns (q, sf) — BOTH per-token (first-dim == T) so the A2A moves them as a payload list.
+        # mxfp8: q [T,H] e4m3, sf [T, H/32] e8m0(uint8), LINEAR (is_sf_swizzled_layout=False).
+        # nvfp4: q [T, H/2] uint8 (packed e2m1), sf [T, H/16] uint8 (ufp8 e4m3), per-tensor global sf.
+        # mxfp4: q [T, H/2] uint8 (packed e2m1), sf [T, H/32] uint8 (e8m0), LINEAR — via fp4_quantize.
+        fi = self._fi
+        xt = x.contiguous()
+        T, H = xt.shape
+        if self.kind == "mxfp8":
+            q, sf = fi.mxfp8_quantize(xt, is_sf_swizzled_layout=False)
+            sf = sf.reshape(T, H // self._MX_BLOCK)
+        elif self.kind == "mxfp4":
+            q, sf = fi.fp4_quantize(xt, sf_vec_size=self._MXFP4_VEC, sf_use_ue8m0=True,
+                                    is_sf_swizzled_layout=False)
+            if sf.dim() == 1:
+                sf = sf.reshape(T, -1)
+        else:  # nvfp4: global_scale maps amax -> the max representable (e4m3max * e2m1max = 448*6);
+               # dequant divides by it. (the reciprocal — amax/(448*6) — yields ~0 output, relerr~1.)
+            gsf = ((_FP8_MAX * 6.0) / xt.float().abs().amax().clamp(min=1e-4)).reshape(1)
+            q, sf = fi.fp4_quantize(xt, global_scale=gsf, sf_vec_size=self._NV_VEC,
+                                    sf_use_ue8m0=False, is_sf_swizzled_layout=False)
+            self._gsf = gsf  # kept for dequant_nd; NOTE(review): this rank's scale — confirm peers' rows match
+            if sf.dim() == 1:
+                sf = sf.reshape(T, -1)
+        return q.contiguous(), sf.contiguous()
+
+    def dequant_nd(self, q, sf):
+        # q/sf are recv tensors — 2D [T,*] (the x_ref path) or 3D [E,S,*] (the stage recv path).
+        # Flatten leading dims to [N,*], dequant, reshape back. mxfp8/mxfp4 decode on q's device;
+        # the nvfp4 branch hands a CPU global scale to the library helper (see the note at the return).
+        lead = q.shape[:-1]
+        N = 1
+        for d in lead:
+            N *= d
+        if self.kind == "mxfp8":
+            # Manual DEVICE e8m0 dequant (FlashInfer ships only a CPU mxfp8_dequantize_host, too slow
+            # in the timing loop): x ~= q_e4m3 * 2^(sf_uint8 - 127), per block-32. Verified to match
+            # mxfp8_dequantize_host on the runner (see cx_fi_quant_smoke).
+            H = q.shape[-1]
+            B = self._MX_BLOCK
+            qf = q.reshape(N, H // B, B).float()
+            sff = sf.reshape(N, H // B).float()
+            out = (qf * torch.pow(torch.tensor(2.0, device=q.device), sff - 127.0).unsqueeze(-1)).reshape(N, H)
+        elif self.kind == "mxfp4":
+            # Manual e2m1 (LUT) + e8m0 block-32 decode (no flashinfer linear-mxfp4 dequant exists).
+            Hp = q.shape[-1]
+            H = Hp * 2
+            qb = q.reshape(N, Hp)
+            lut = torch.tensor(self._E2M1_MAG, device=q.device, dtype=torch.float32)
+            def _dec(nib):  # nib uint8 [N,Hp] 0..15 -> signed e2m1 magnitude
+                sign = 1.0 - 2.0 * ((nib >> 3) & 1).float()
+                return sign * lut[(nib & 0x7).long()]
+            lo = _dec(qb & 0xF)
+            hi = _dec((qb >> 4) & 0xF)          # byte packs [v_lo, v_hi]
+            vals = torch.stack([lo, hi], dim=-1).reshape(N, H)
+            blk = H // self._MXFP4_VEC
+            scale = torch.pow(torch.tensor(2.0, device=q.device), sf.reshape(N, blk).float() - 127.0)
+            out = (vals.view(N, blk, self._MXFP4_VEC) * scale.view(N, blk, 1)).reshape(N, H)
+        else:  # nvfp4 — library dequant (e2m1 + ufp8 e4m3 scale + per-tensor global), linear layout.
+            qf = q.reshape(N, q.shape[-1]).contiguous()
+            sff = sf.reshape(N, sf.shape[-1]).contiguous()
+            # dequant divides by the global scale -> pass its RECIPROCAL (verified on the runner:
+            # quant gsf=(448*6)/amax + dequant 1/gsf -> relerr ~0.09 = the 4-bit nvfp4 floor).
+            gsf = getattr(self, "_gsf", None)
+            out = self._fi.e2m1_and_ufp8sf_scale_to_float(
+                qf, sff, global_scale_tensor=(1.0 / gsf).cpu() if gsf is not None else None,
+                sf_vec_size=self._NV_VEC, is_sf_swizzled_layout=False)
+        H = out.shape[-1]
+        # The nvfp4 helper returns on CPU, so move back to q's device (already there for the mx paths).
+        return out.reshape(*lead, H).to(device=q.device, dtype=torch.bfloat16)
+
+
+# dispatch_dtype -> (label, kind). kind selects the cast/dequant path in make_problem/stage.
+# mxfp4 uses fp4_quantize(sf_use_ue8m0=True, is_sf_swizzled_layout=False) — a LINEAR e8m0 SF that
+# moves per-token through the A2A (mxfp4_quantize's tile-padded swizzled SF does NOT; that was the
+# old blocker). mxfp8/mxfp4/nvfp4 + the e4m3 fp8 recipes cover the OCP-microscaling dispatch goal.
+_QUANT_RECIPES = {
+    "fp8":            ("per-block-128", "e4m3"),
+    "fp8-pertoken":   ("per-token", "e4m3"),
+    "fp8-directcast": ("direct-cast", "e4m3"),
+    "mxfp8":          ("mxfp8-e8m0-block32", "mxfp8"),
+    "mxfp4":          ("mxfp4-e8m0-block32", "mxfp4"),
+    "nvfp4":          ("nvfp4-e4m3-block16", "nvfp4"),
+}
+_E4M3_CASTS = {"fp8": _e4m3_block128_cast, "fp8-pertoken": _e4m3_pertoken_cast,
+               "fp8-directcast": _e4m3_directcast}
+# Comm-correctness tolerance keyed by recipe KIND (2nd tuple field above); bf16 has no entry (5e-2 default).
+_QUANT_TOL = {"e4m3": 1.25e-1, "mxfp8": 1.5e-1, "mxfp4": 3.5e-1, "nvfp4": 3.0e-1}
+
+
+class FlashInferBackend:
+    name = "flashinfer"
+    # FlashInfer combine reuses the dispatch workspace/handle (no re-dispatch needed before
+    # a timed combine), mirroring DeepEP normal mode — combine consumes the recv payload.
+    # MoeAlltoAll is a stateful idle->dispatched->idle FSM (asserts "dispatch called twice without
+    # combine"). The harness times dispatch in isolation (loops it) AND combine in isolation. Setting
+    # this True makes the combine-timing loop run an untimed dispatch+stage (pre=) before each combine
+    # sample, so combine always sees a "dispatched" state; dispatch() resets the FSM to idle at its
+    # start so the dispatch-timing loop + the roundtrip (paired) timing all stay valid.
+    combine_needs_redispatch = True
+    # MoeAlltoAll's paired dispatch/combine FSM means isolated/looped dispatch timing corrupts the
+    # symmetric workspace (CUDA launch failure). Only the PAIRED roundtrip is measurable — the
+    # harness times the roundtrip and mirrors it into dispatch/combine (isolated_sum is N/A here).
+    # The roundtrip IS goal P0's headline metric, so this is the right measurement for this backend.
+    roundtrip_only = True
+    # Blackwell (B300/GB300) drops GPU clocks during the tiny small-T points, so the harness
+    # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100/H200.
+    wants_warm_burst = True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel).
+    #   bf16            : MoeAlltoAll keeps bf16 payloads end-to-end (no quant round trip).
+    #   fp8*            : e4m3 dispatch (per-block-128 / per-token / direct-cast) — SAME convention
+    #                     as ep_deepep, so FlashInfer-fp8 == DeepEP-fp8 operating point, different
+    #                     transport (the TRT-LLM throughput A2A vs DeepEP NVLink).
+    #   mxfp8/mxfp4/nvfp4: OCP-microscaling dispatch via FlashInfer's native quantizers. The A2A
+    #                     moves [q, scale_factor] as a payload LIST (byte-agnostic), dequant in
+    #                     stage(). Covers goal's "MXFP8 / MXFP4 / NVFP4 dispatch" — reachable on
+    #                     this working path because FlashInfer ships the quantize/dequantize kernels.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8", "fp8-pertoken", "fp8-directcast",
+                            "mxfp8", "mxfp4", "nvfp4"}
+    SUPPORTED_MODES = {"normal"}
+    # Only the contract whose timing boundary FlashInfer can honor: layout (the dispatch
+    # send-counts) is computed inside dispatch and cannot be hoisted to a separate untimed
+    # step the way DeepEP's get_dispatch_layout can — so cached-layout-comm-only-v1 and
+    # runtime-visible-v1 (fp8) are NOT offered.
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+    # Combine path: bf16 (default) OR a quantized COMBINE OUTPUT via the newer flashinfer
+    # moe_a2a_combine output_dtype (fp8 e4m3 wired; the bundled 0.6.8.post1 has no output_dtype, so
+    # a combine-quant run upgrades FlashInfer first via cx_build_flashinfer_latest). nvfp4/mxfp8
+    # combine reserved (fp4/e8m0 output packing — extend once fp8-combine is GHA-validated).
+    SUPPORTED_COMBINE_DTYPES = {"bf16", "fp8", "nvfp4"}
+    SUPPORTED_COMBINE_QUANT_MODES = {"none", "fp8", "nvfp4"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):  # local_rank is not used here
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.contract = args.measurement_contract
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # Quant recipe (None for bf16). e4m3 = pure-torch cast (DeepEP convention); mx/nvfp4 =
+        # FlashInfer-native quantizer. dispatch passes [q, sf]; stage() dequants (UNTIMED).
+        self.dispatch_dtype = args.dispatch_dtype
+        self.quant_label, self.quant_kind = _QUANT_RECIPES.get(args.dispatch_dtype, (None, None))
+        self._micro = None
+        if self.quant_kind in ("mxfp8", "mxfp4", "nvfp4"):
+            self._micro = _MicroscaleRecipe(self.quant_kind)   # lazy flashinfer import, LOUD if absent
+        elif self.quant_kind == "e4m3":
+            self._e4m3_cast = _E4M3_CASTS[args.dispatch_dtype]
+        # bf16 round-trip error ~5e-3 (tol 5e-2); fp8 e4m3 ~1/16; fp4 (4-bit) far looser. Per-format
+        # tolerance recorded in the artifact so the looser quant gate is explicit, not hidden.
+        self.tolerance = _QUANT_TOL.get(self.quant_kind, 5e-2)
+        # The quant CAST + recv-DEQUANT run in make_problem/stage (OUTSIDE the timed comm window) —
+        # the layout-and-dispatch-v1 contract (producer hands quantized activations). Recorded honestly.
+        self.fp8_in_timing = False if self.quant_kind else None
+        self.scale_layout = self.quant_label
+
+        # Combine-side quant (SEPARATE axis from dispatch): a quantized COMBINE OUTPUT via the newer
+        # flashinfer moe_a2a_combine output_dtype (the bundled 0.6.8.post1 has NO output_dtype, so a
+        # combine-quant run upgrades FlashInfer first — cx_build_flashinfer_latest). The combine
+        # kernel emits the per-source-token reduction already quantized (fp8 or nvfp4, plus scale
+        # factors); we dequant (cached, untimed) for the gate. The quantized reduction is what's TIMED.
+        self.combine_dtype = getattr(args, "combine_dtype", "bf16")
+        self.combine_quant = self.combine_dtype not in ("bf16", None, "")
+        self.combine_input_dtype = self.combine_dtype
+        self.combine_quant_mode = getattr(args, "combine_quant_mode", "none")
+        self.combine_quant_in_timing = True if self.combine_quant else None
+        self.combine_dequant_in_timing = False if self.combine_quant else None
+        self._qc_out_dtype = None
+        self._qc_scale_shape = None   # placeholder for an output_scales shape; not assigned elsewhere in the visible code
+        if self.combine_quant:
+            import inspect as _inspect
+            if "output_dtype" not in str(_inspect.signature(fi_comm.MoeAlltoAll.combine)):
+                raise RuntimeError(
+                    "combine-quant requested but flashinfer.comm.MoeAlltoAll.combine has NO output_dtype — "
+                    "this wheel (likely 0.6.8.post1) predates PR3376/3643. The run must upgrade FlashInfer "
+                    "first (CX_COMBINE_DTYPE!=bf16 triggers cx_build_flashinfer_latest in run_in_container.sh).")
+            # fp8 -> e4m3 output + UE8M0 uint8 vec-32 scales (= MXFP8). nvfp4 -> uint8 packed-e2m1
+            # output + e4m3 vec-16 scales + a per-tensor output_scalar_scale (the fp4 path).
+            self._qc_out_dtype = {"fp8": torch.float8_e4m3fn, "nvfp4": torch.uint8}.get(self.combine_dtype)
+            if self._qc_out_dtype is None:
+                raise RuntimeError(f"combine_dtype={self.combine_dtype} not wired (fp8|nvfp4)")
+            # quantized-combine round-trip is looser than the bf16 reconstruction (fp8 ~1/16 +
+            # whatever the dispatch added); never go below the dispatch tol, floor it at 1.6e-1.
+            self.tolerance = max(self.tolerance, 1.6e-1)
+
+        # TensorRT-LLM lineage: MoeAlltoAll LIVES IN flashinfer.comm.trtllm_moe_alltoall (the
+        # "throughput backend" — the TRT-LLM NVLink one-sided AllToAll over an MNNVL symmetric
+        # workspace). So this adapter's DEFAULT path IS the TRT-LLM one-sided EP; CX_FLASHINFER_TRTLLM
+        # only flips the provenance label (there is no separate functional path — both call the same
+        # moe_a2a_dispatch/combine kernels). Kept as a label so the artifact can be tagged trtllm.
+        self.trtllm = os.environ.get("CX_FLASHINFER_TRTLLM", "0") == "1"
+
+        self.top_k = int(args.topk)
+        self.num_experts = int(args.experts)
+        # Workspace/buffer ceiling. The MoeAlltoAll symmetric workspace is sized for
+        # max_num_tokens per rank; the sweep is capped at this (buffer_cap) so a too-large T
+        # is dropped (reported) rather than overflowing. 4096 holds the prefill ladder top.
+        self.max_num_tokens = int(os.environ.get("CX_FLASHINFER_MAX_TOKENS", "4096"))
+
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _flashinfer_version()
+
+        # Build the pure-EP Mapping (defensive over kwarg variants; logs which worked).
+        self.mapping, map_variant = _build_mapping(world_size, rank)
+        if rank == 0:
+            print(f"[flashinfer] Mapping constructed via variant #{map_variant} "
+                  f"(world={world_size} rank={rank} tp=1 moe_ep={world_size} moe_tp=1)",
+                  file=sys.stderr)
+
+        # Construct the comm object. MoeAlltoAll (in flashinfer.comm.trtllm_moe_alltoall) IS the
+        # TRT-LLM throughput-backend one-sided A2A — it allocates its MNNVL symmetric workspace
+        # internally and calls the same moe_a2a_dispatch/combine kernels the functional API exposes.
+        # So we ALWAYS construct it; the trtllm flag only tags provenance (no separate path).
+        self.path = "trtllm_moe_alltoall" if self.trtllm else "moe_alltoall"
+        self.a2a = None
+        self.workspace = None
+        self.ws_size = None
+        self._init_moe_alltoall(ver)
+
+        self.backend_provenance = {
+            "flashinfer_version": ver,
+            "flashinfer_commit": os.environ.get("FLASHINFER_COMMIT") or f"pkg-{ver}",
+            # exact upgraded library stack (flashinfer-python/cubin/jit-cache + cutlass-dsl + torch),
+            # set by cx_build_flashinfer_latest — the only record of post-env_capture upgrade versions.
+            "flashinfer_stack": os.environ.get("CX_FLASHINFER_STACK"),
+            "mode": "normal", "path": self.path, "trtllm": self.trtllm,
+            # MoeAlltoAll's home module — proves this EP path IS the TRT-LLM one-sided throughput A2A.
+            "backend_lineage": "flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll",
+            "transport": "trtllm-throughput-backend-onesided",
+            # quant provenance (None/bf16 path -> nulls). scale_layout + dispatch_dtype name the recipe.
+            "dispatch_dtype": self.dispatch_dtype, "quant_kind": self.quant_kind,
+            "scale_layout": self.scale_layout, "quant_in_timing": self.fp8_in_timing,
+            # combine-side quant (a SEPARATE axis): a quantized COMBINE OUTPUT (fp8 e4m3 or nvfp4) when set.
+            "combine_dtype": self.combine_dtype, "combine_quant": self.combine_quant,
+            "combine_quant_in_timing": self.combine_quant_in_timing,
+            "resource_mode": args.resource_mode,
+            # FlashInfer MoE A2A occupancy is fixed by the library (a symmetric-memory kernel, not
+            # an SM/CU budget we set) — like DeepEP LL. Recorded as a fixed-kernel run so the
+            # resource_profile maps it to resource_class=fixed-kernel (excluded from the Pareto).
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "fixed-kernel",
+            "max_num_tokens": self.max_num_tokens, "top_k": self.top_k,
+            "num_experts": self.num_experts,
+            "mapping_variant": map_variant,
+            "routing_factor": _ROUTING_FACTOR,
+            # MNNVL symmetric workspace — comm bootstrapped via torch.distributed (TorchDistBackend),
+            # NOT MPI, so it works under torchrun without mpi4py / an MPI launch.
+            "workspace": "mnnvl-symmetric", "mnnvl_comm": getattr(self, "_mnnvl_comm", "n/a"),
+        }
+
+    def _init_moe_alltoall(self, ver):
+        """Build self.a2a via flashinfer.comm.MoeAlltoAll (kwarg variants tried in order); ``ver`` is unused."""
+        MoeAlltoAll = getattr(fi_comm, "MoeAlltoAll", None)
+        if MoeAlltoAll is None:
+            raise _loud("MoeAlltoAll lookup", "flashinfer.comm.MoeAlltoAll not found",
+                        AttributeError("MoeAlltoAll"))
+        # The MNNVL symmetric workspace bootstraps its cross-rank comm via MPI by default
+        # (MnnvlMemory.get_comm -> MpiComm().Split) — which fails under torchrun (no mpi4py / no MPI
+        # launch). FlashInfer ships a TorchDistBackend; wrap it in an MnnvlConfig so the workspace
+        # uses the torch.distributed NCCL group torchrun already set up. This is the no-MPI path.
+        mnnvl_config = None
+        try:
+            from flashinfer.comm.mnnvl import MnnvlConfig, TorchDistBackend, MnnvlMemory
+            mnnvl_config = MnnvlConfig(comm_backend=TorchDistBackend(group=None))
+            # get_comm() returns the cached class-level comm if set, else MPI-Splits. Register the
+            # torch-dist comm explicitly so the workspace bootstrap NEVER touches MPI/mpi4py.
+            if MnnvlMemory.comm is None:
+                MnnvlMemory.set_comm_from_config(self.mapping, mnnvl_config)
+            if self.rank == 0:
+                print("[ep_flashinfer] MNNVL via TorchDistBackend (no MPI)", flush=True)
+        except Exception as exc:  # older flashinfer without TorchDistBackend -> fall back (will MPI-fail loudly)
+            if self.rank == 0:
+                print(f"[ep_flashinfer] WARN: no TorchDistBackend ({exc!r}); MoeAlltoAll will need MPI",
+                      flush=True)
+        self._mnnvl_comm = "torch-dist" if mnnvl_config else "mpi-default"  # provenance built later
+        # kwarg names have drifted across releases; hidden_size is REQUIRED (else MoeAlltoAll asserts
+        # "hidden_size must be provided if workspace_size_per_rank is not provided"); mnnvl_config
+        # supplies the torch-dist comm. Try with mnnvl_config first, then without (older releases).
+        hs = int(self.args.hidden)
+        mc = dict(mnnvl_config=mnnvl_config) if mnnvl_config is not None else {}
+        variants = [
+            ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs, **mc)),
+            ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs)),
+            ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs,
+                                   ep_size=self.world_size)),
+            ((self.mapping, self.max_num_tokens, self.top_k, self.num_experts, hs), {}),
+            ((self.mapping,), dict(max_num_tokens_per_rank=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs)),
+        ]
+        self.a2a, idx = _call_variants("MoeAlltoAll(...)", MoeAlltoAll, variants)
+        self.path = getattr(self, "path", None) or "moe_alltoall"  # keep a label the caller already chose
+        if self.rank == 0:
+            print(f"[flashinfer] MoeAlltoAll constructed via variant #{idx}", file=sys.stderr)
+
+    def buffer_cap(self, args):
+        """Return the sweep ceiling: the symmetric workspace is sized for max_num_tokens per rank
+        (the cap is reported by the harness, never silently truncated). ``args`` is not read."""
+        return self.max_num_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        """Package this rank's slice of the shared routing trace as a problem namespace.
+
+        The dispatch-side quantization runs here, i.e. as untimed preprocessing.
+
+        Args: T = token count; idx = [T, topk] expert ids; weights = [T, topk] gates;
+            x = [T, hidden] activations (shapes as stated by the original author's notes).
+        Returns: a SimpleNamespace with T, x, topk_idx (int64), topk_idx_i32 (the int32 copy
+            handed to the kernel call), topk_weights (float32), payloads ([x] for bf16, else
+            [q, scale_factor]) and x_ref (the dequantized cast, or x itself for bf16) so the
+            correctness gate checks the all-to-all rather than the quantizer.
+        """
+        if self.quant_kind == "e4m3":
+            q, sf = self._e4m3_cast(x)
+            payloads, x_ref = [q, sf], _e4m3_dequant_nd(q, sf)
+        elif self._micro is not None:
+            q, sf = self._micro.cast(x)
+            payloads, x_ref = [q, sf], self._micro.dequant_nd(q, sf)
+        else:  # bf16: the activations travel as-is and are their own reference
+            payloads, x_ref = [x], x
+        return types.SimpleNamespace(
+            T=int(T), x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_idx_i32=idx.to(torch.int32),
+            topk_weights=weights.to(torch.float32),
+            payloads=payloads, x_ref=x_ref)
+
+    def _reset_moe_fsm(self):
+        """Best-effort: put the MoeAlltoAll state machine back to "idle" before a dispatch.
+
+        The harness dispatches repeatedly without always combining, which would leave the
+        FSM in a non-idle phase; the private ``_state.phase`` attribute may not exist."""
+        state = getattr(getattr(self, "a2a", None), "_state", None)
+        if state is None or getattr(state, "phase", "idle") == "idle":
+            return
+        try:
+            state.phase = "idle"
+        except Exception:
+            pass  # phase could not be assigned; leave the FSM untouched
+
+    def dispatch(self, p):
+        self._reset_moe_fsm()  # a still-pending earlier dispatch must not block this one
+        # MoeAlltoAll.dispatch(token_selected_experts, input_payloads, runtime_max_tokens_per_rank)
+        # -> a LIST of recv tensors [ep_size, max_tokens, *] (one per input payload, same order).
+        # input_payloads = p.payloads ([x] bf16, or [q, scale_factor] for a quantized dispatch).
+        variants = [  # alternative call signatures, handed to _call_variants in this order
+            ((p.topk_idx_i32, p.payloads, p.T), {}),
+            ((p.topk_idx_i32, p.payloads), dict(runtime_max_tokens_per_rank=p.T)),
+            ((p.topk_idx_i32, p.payloads), dict(runtime_max_tokens=p.T)),
+            ((p.topk_idx, p.payloads, p.T), {}),                  # int64 idx fallback
+        ]
+        recv, idx = _call_variants("MoeAlltoAll.dispatch(...)", self.a2a.dispatch, variants)
+        recv_list = list(recv) if isinstance(recv, (list, tuple)) else [recv]
+        recv_q = recv_list[0]  # first returned entry: the payload (quantized or not)
+        recv_sf = recv_list[1] if len(recv_list) > 1 else None  # second entry, when one came back
+        return types.SimpleNamespace(recv=recv, recv_q=recv_q, recv_sf=recv_sf,
+                                     recv_payload=self._first_payload(recv),
+                                     dispatch_variant=idx, combine_input=None)  # stage() fills combine_input
+
+    @staticmethod
+    def _first_payload(recv):
+        """Unwrap a dispatch result to its first payload Tensor (used by recv_tokens/staging).
+        Accepts a bare Tensor, a list/tuple of payloads, or a nested (payloads, meta) pair; any
+        other shape of result is handed back unchanged (recv_tokens guards with is_tensor)."""
+        candidate = recv
+        for _ in range(2):  # peel at most two levels: (payloads, meta) -> payloads -> tensor
+            if torch.is_tensor(candidate):
+                return candidate
+            if not (isinstance(candidate, (list, tuple)) and candidate):
+                return recv
+            candidate = candidate[0]
+        return candidate if torch.is_tensor(candidate) else recv
+
+    def stage(self, p, h):
+        """Set h.combine_input for the following combine (identity expert: no compute).
+
+        bf16: hand the dispatch recv payload to combine as-is — NO clone (a clone of the
+        workspace-backed recv broke the layout and async-corrupted CUDA); combine is called with
+        payload_in_workspace=False so the kernel stages it.
+        Quantized dispatch: dequantize recv_q/recv_sf into a fresh tensor once and cache it on the
+        problem ``p``; every later call reuses the cache, so steady-state timing measures comm
+        only (the recv is deterministic for a fixed problem). Always returns None.
+        """
+        if self.quant_kind:
+            # FlashInfer is roundtrip_only, so stage() runs INSIDE the timed dispatch->combine loop;
+            # the cache keeps the dequant (incl. the CPU-side nvfp4 one) out of steady-state timing.
+            ci = getattr(p, "_combine_input_cache", None)
+            if ci is None:
+                ci = (_e4m3_dequant_nd(h.recv_q, h.recv_sf) if self.quant_kind == "e4m3"
+                      else self._micro.dequant_nd(h.recv_q, h.recv_sf))
+                p._combine_input_cache = ci
+            h.combine_input = ci
+        else:
+            h.combine_input = h.recv_payload
+        # One-shot rank-0 shape log. recv_q can be a nested (payloads, meta) head rather than a
+        # Tensor (see _first_payload), so both operands are checked before touching .shape.
+        if (self.rank == 0 and not getattr(self, "_shape_logged", False)
+                and torch.is_tensor(h.combine_input) and torch.is_tensor(h.recv_q)):
+            self._shape_logged = True
+            print(f"[ep_flashinfer] dtype={self.dispatch_dtype} recv_q={tuple(h.recv_q.shape)}:{h.recv_q.dtype}"
+                  f" combine_input={tuple(h.combine_input.shape)}:{h.combine_input.dtype}", flush=True)
+        return None
+
+    def combine(self, p, h):
+        if self.combine_quant:
+            return self._combine_quant(p, h)
+        # MoeAlltoAll.combine(payload, runtime_max_tokens_per_rank, payload_in_workspace=False)
+        # -> the per-source-token reduced result on this rank ([T, hidden] bf16).
+        # Every variant below passes payload_in_workspace=False or omits the flag; True is never
+        # attempted. Rationale recorded by the author: True enforces an exact-pointer requirement
+        # and failed with a RuntimeError rather than a TypeError, which _call_variants would not
+        # fall through on. With False the kernel copies combine_input into the workspace itself.
+        # combine_input is whatever stage() stored: the recv payload as-is (bf16, not cloned) or
+        # the cached dequantized tensor (quantized dispatch).
+        variants = [
+            ((h.combine_input, p.T), dict(payload_in_workspace=False)),
+            ((h.combine_input, p.T), {}),
+            ((h.combine_input,), dict(runtime_max_tokens_per_rank=p.T, payload_in_workspace=False)),
+            ((h.combine_input,), dict(runtime_max_tokens_per_rank=p.T)),
+        ]
+        combined, idx = _call_variants("MoeAlltoAll.combine(...)", self.a2a.combine, variants)
+        h.combine_variant = idx
+        return self._as_tensor(combined)
+
+    _QC_VEC = 32   # fp8 combine output uses UE8M0 scales, vector size 32 (flashinfer main source)
+
+    def _combine_quant(self, p, h):
+        # Quantized COMBINE OUTPUT (this is the TIMED reduction). kwargs by combine_dtype/CX_QC_SCALE:
+        #   nvfp4                   -> uint8 packed-e2m1 out, e4m3 output_scales [T, H/16] + scalar
+        #   fp8, CX_QC_SCALE=scalar -> output_scalar_scale only (direct-cast probe, no block scales)
+        #   fp8, otherwise          -> e4m3 out + UE8M0 uint8 output_scales (= MXFP8): [T, H/32] by
+        #                              default ("block32") or [T, 1] for "pertoken". The kernel
+        #                              WRITES the scales; _finish_qcombine dequantizes for the gate.
+        H = int(getattr(self, "hidden", 0)) or int(self.args.hidden)
+        T = p.T
+        if self.combine_dtype == "nvfp4":
+            # NVFP4 combine: uint8 packed-e2m1 output + e4m3 (float8) scales vec-16 + per-tensor scalar.
+            blocks = max(1, H // 16)
+            sc = torch.zeros(T, blocks, device=self.device, dtype=torch.float8_e4m3fn)
+            self._qc_scalar = float(os.environ.get("CX_QC_NVFP4_SCALAR", "1.0"))
+            kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype,
+                      output_scales=sc, output_scalar_scale=self._qc_scalar)
+            label = f"nvfp4 output_scales=e4m3[{T},{blocks}] scalar={self._qc_scalar}"
+        elif os.environ.get("CX_QC_SCALE") == "scalar":
+            # DIRECT-CAST fp8 combine: a single per-tensor output_scalar_scale, NO per-block
+            # output_scales (the unscaled/global-scaled e4m3 emit — goal "Direct-cast FP8 combine").
+            # The working mxfp8 path emits SCALED e4m3+e8m0; this probes whether the same kernel also
+            # supports the scalar-only mode. If the kernel REQUIRES per-block output_scales for fp8
+            # output, the call below raises and the run records that (the documented kernel limit).
+            sc = None
+            self._qc_scalar = float(os.environ.get("CX_QC_FP8_SCALAR", "1.0"))
+            kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype,
+                      output_scalar_scale=self._qc_scalar)
+            label = f"fp8-directcast output_scalar_scale={self._qc_scalar} (no per-block scales)"
+        else:
+            # MXFP8 combine: e4m3 output + UE8M0 uint8 scales vec-32 (the main-source spec).
+            mode = os.environ.get("CX_QC_SCALE", "block32")
+            blocks = 1 if mode == "pertoken" else max(1, H // self._QC_VEC)
+            sc = torch.zeros(T, blocks, device=self.device, dtype=torch.uint8)
+            kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype, output_scales=sc)
+            label = f"mxfp8 output_scales=uint8[{T},{blocks}]"
+        try:
+            out = self.a2a.combine(h.combine_input, T, **kw)
+        except Exception as exc:
+            raise _loud(f"MoeAlltoAll.combine({label})",
+                        f"quant-combine call failed ({self.combine_dtype}; per the main-source spec)", exc)
+        if self.rank == 0 and not getattr(self, "_qc_logged", False):
+            self._qc_logged = True
+            oq = out[0] if isinstance(out, (tuple, list)) else out
+            print(f"[ep_flashinfer] combine-quant {label} OK out={tuple(oq.shape)}:{oq.dtype}", flush=True)
+        return self._finish_qcombine(p, out, sc, H)
+
+    def _finish_qcombine(self, p, out, sc, H):
+        # Dequant the quantized combine output -> bf16 for the correctness gate; the result is
+        # cached on the problem (p._qc_dequant) and reused on later calls. Paths: nvfp4 (flashinfer
+        # e2m1 decoder, vec-16, global=1/scalar), sc is None (direct-cast fp8), else e8m0 blocks.
+        out_q = out[0] if isinstance(out, (tuple, list)) else out
+        cached = getattr(p, "_qc_dequant", None)
+        if cached is None:
+            T = out_q.shape[0]
+            if self.combine_dtype == "nvfp4":
+                gsf = torch.tensor([1.0 / max(1e-6, getattr(self, "_qc_scalar", 1.0))], dtype=torch.float32)
+                # nvfp4 dequant via the flashinfer e2m1 decoder (linear layout, vec-16)
+                import flashinfer as _fi
+                # the combine wrote the nvfp4 scales as float8_e4m3fn, but the e2m1 decoder wants the
+                # raw ufp8 bytes as uint8 — reinterpret (same 1-byte storage), don't cast.
+                sc_u8 = sc.reshape(T, -1).contiguous().view(torch.uint8)
+                o = _fi.e2m1_and_ufp8sf_scale_to_float(
+                    out_q.reshape(T, -1).contiguous(), sc_u8,
+                    global_scale_tensor=gsf, sf_vec_size=16, is_sf_swizzled_layout=False)
+                cached = o.reshape(T, H).to(device=out_q.device, dtype=torch.bfloat16)
+            elif sc is None:
+                # direct-cast fp8: x = e4m3 * scalar. NOTE(review): nvfp4 above uses 1/scalar — confirm direction
+                cached = (out_q.float() * float(getattr(self, "_qc_scalar", 1.0))).to(torch.bfloat16)
+                p._qc_dequant = cached
+                return cached
+            else:
+                of = out_q.float()
+                blocks = sc.shape[-1] if torch.is_tensor(sc) and sc.dim() >= 2 else 1
+                if blocks > 1 and (H % blocks) == 0:
+                    bs = H // blocks
+                    scale = torch.pow(torch.tensor(2.0, device=of.device), sc.float() - 127.0)  # e8m0
+                    cached = (of.view(T, blocks, bs) * scale.view(T, blocks, 1)).reshape(T, H).to(torch.bfloat16)
+                else:
+                    scale = torch.pow(torch.tensor(2.0, device=of.device), sc.float().reshape(T, 1) - 127.0)
+                    cached = (of * scale).to(torch.bfloat16)
+            p._qc_dequant = cached
+        return cached
+
+    @staticmethod
+    def _as_tensor(x):
+        """Return x if it is a Tensor, else its first element when that is one; otherwise raise."""
+        head = x[0] if isinstance(x, (list, tuple)) and x else None
+        for candidate in (x, head):
+            if torch.is_tensor(candidate):
+                return candidate
+        raise _loud("combine result", f"expected a Tensor, got {type(x)}", TypeError("non-tensor combine result"))
+
+    def expected(self, p, h):
+        """Reference output of the identity-expert round trip: ``(ref * factor, p.T)``.
+
+        ``ref`` is p.x_ref when set (the dequantized cast that was actually sent, so the gate
+        verifies the comm, not the quantizer), otherwise p.x. ``factor`` follows the module-level
+        _ROUTING_FACTOR, because combine takes no gate weights and sums over the rank axis:
+          "weight-sum" -> sum(topk_weights) per token      (if combine applied the gate)
+          "topk"       -> top_k                            (if combine summed every expert copy)
+          otherwise    -> number of DISTINCT ranks among the token's top_k experts ("ranks")
+        Experts map to ranks in contiguous blocks of num_experts // world_size. ``h`` is
+        not read.
+        """
+        ref = (p.x if p.x_ref is None else p.x_ref).float()
+        if _ROUTING_FACTOR == "weight-sum":
+            return ref * p.topk_weights.sum(dim=1, keepdim=True), p.T
+        if _ROUTING_FACTOR == "topk":
+            return ref * float(self.top_k), p.T
+        experts_per_rank = max(1, self.num_experts // self.world_size)
+        owner = (p.topk_idx.long() // experts_per_rank).clamp_(0, self.world_size - 1)
+        hits = torch.zeros(owner.shape[0], self.world_size,
+                           device=owner.device, dtype=torch.float32)
+        hits.scatter_(1, owner, 1.0)
+        return ref * hits.sum(dim=1, keepdim=True), p.T
+
+    def recv_tokens(self, h):
+        # Rows of the routed recv payload on this rank = product of every dim except the last
+        # (hidden) one, e.g. ep_size * max_tokens for the [ep_size, max_tokens, hidden] recv that
+        # dispatch documents; a 1-D payload counts its elements. Reported only, never gated on.
+        rp = h.recv_payload
+        if torch.is_tensor(rp) and rp.dim() >= 1:
+            return int((rp.shape[:-1] if rp.dim() >= 2 else rp.shape).numel())
+        return 0
+
+    def finalize(self, rc):  # barrier, tear down the process group, then pass rc through unchanged
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:  # best-effort teardown: report it, never mask the run's own rc
+            print(f"[ep_flashinfer] WARN: teardown failed ({exc!r})", file=sys.stderr, flush=True)
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py
new file mode 100644
index 000000000..90454dda3
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_harness.py
@@ -0,0 +1,1021 @@
+#!/usr/bin/env python3
+"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness.
+
+Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`)
+implement a small duck-typed protocol; this module owns the source-tokens-per-rank
+sweep, the timing, the correctness gate, and the provenance-tagged JSON doc.
+
+Fair-comparison contract (hardened after review — see notes.md / plan.md):
+  * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs +
+    gate weights are generated once from a fixed seed over the *global* batch and are
+    identical on every SKU; each rank materializes its slice. So every platform runs
+    the *same* problem (no per-rank/per-platform RNG in the adapters).
+  * **Explicit measurement contract** (review #3): adapters conform to a NAMED timing
+    boundary, they do not each choose their own. layout-and-dispatch-v1 times the
+    routing-layout step inside dispatch (the only contract MoRI can honor); cached-
+    layout-comm-only-v1 hoists it out (DeepEP). Combine excludes staging in both.
+    Serial = SUM of the two isolated medians (NOT a measured chained op).
+  * **Correct collective percentile**: each iteration's latency is reduced MAX across
+    ranks first (a collective finishes with its slowest rank), THEN percentiled —
+    `median_i(max_r)`, not `max_r(median_i)`.
+  * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and
+    `global_tokens = T * ep_size` are recorded for the weak/strong-scaling x toggle.
+
+stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported
+lazily inside run_sweep) so this file imports and `py_compile`s without torch installed.
+
+Backend protocol:
+    name, mode, combine_needs_redispatch, backend_provenance(dict)
+    buffer_cap(args) -> int|None
+    make_problem(T, idx, weights, x) -> problem   # materialize this rank's trace slice
+    dispatch(problem) -> handle                   # pure dispatch comm (timed)
+    stage(problem, handle)                        # untimed expert-output placement
+    combine(problem, handle) -> tensor            # pure combine comm (timed)
+    expected(problem, handle) -> (tensor, n_cmp)  # correctness reference
+    recv_tokens(handle) -> int                    # realized tokens received this rank
+    finalize(rc) -> int|NoReturn
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+
+SCHEMA_VERSION = 3  # v3: explicit contracts, pooled trials p50/p90/p99, routing-identity proof, separated logical bytes
+
+# Default tokens-per-rank sweeps, one per phase. The phases are token-size regimes (small-T "decode",
+# large-T "prefill") of the SAME normal-mode kernels, not distinct kernels. Powers of two give a clean
+# log x-axis; the ladder is clamped to the backend buffer ceiling (MoRI's registerable heap).
+DECODE_LADDER = [2 ** k for k in range(8)]        # 1 .. 128
+PREFILL_LADDER = [2 ** k for k in range(7, 13)]   # 128 .. 4096
+
+# Bytes per hidden element of the dispatch payload — feeds only the secondary derived-GB/s estimate
+# (the headline metric is measured LATENCY, which is dtype-independent). fp8/mxfp8 = 1B e4m3/e8m0;
+# mxfp4/nvfp4 = 0.5B (4-bit e2m1, 2 values/byte).
+_DTYPE_BYTES = {**dict.fromkeys(("bf16", "fp16"), 2), **dict.fromkeys(("mxfp4", "nvfp4"), 0.5),
+                **dict.fromkeys(("fp8", "fp8-pertoken", "fp8-directcast", "mxfp8"), 1)}
+
+# Phase profiles (goal P2 "decode/prefill representation"). A phase is a token-size REGIME that also
+# carries serving semantics, not just a ladder alias, so it is emitted into the doc: a T=128 point
+# launched under "prefill" must never be silently read as decode (at the shared T the kernel is the
+# same; the phase records which serving situation the point stands in). Every point is ONE MoE
+# layer, ONE step, a SINGLE dispatch+combine collective pair — not a whole model or several
+# concurrent layers.
+PHASE_PROFILE = {
+    "decode": dict(regime="decode", tokens_per_iter="1 (or few) per active sequence",
+                   microbatch="one decode step across the active sequences",
+                   routing_variability="varies step-to-step (temporal routing modes model this)",
+                   represents="one MoE layer · one decode step · one dispatch+combine collective"),
+    "prefill": dict(regime="prefill", chunk="chunked-prefill — many tokens/sequence per MoE layer",
+                    request_mixture="tokens of one chunk entering a single MoE layer at once",
+                    represents="one MoE layer · one prefill chunk · one dispatch+combine collective"),
+}
+
+
+def add_common_args(ap: argparse.ArgumentParser) -> None:
+    """Register on `ap` every CLI flag shared by all backends (the entrypoint adds --backend itself).
+    --runner, --topology-class and --out are required; every other flag is optional."""
+    add = ap.add_argument
+    add("--phase", default="decode", choices=["decode", "prefill"],
+        help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder")
+    add("--tokens-ladder", default="",
+        help="space/comma-separated source-tokens-per-rank sweep; blank = phase default")
+    add("--hidden", type=int, default=7168)
+    add("--topk", type=int, default=8)
+    add("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)")
+    add("--dispatch-dtype", default="bf16",
+        choices=["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"])
+    # Combine precision/quantization is its OWN axis (review: dispatch_dtype=fp8 must not imply the
+    # whole EP path is quantized). Today every backend combines bf16 with no quant
+    # (combine_quant_mode=none); a future quantized combine (e.g. ROCm/MoRI PR311) sets these flags
+    # WITHOUT touching --dispatch-dtype. The defaults reproduce today's behaviour exactly, and
+    # capability.py gates unsupported values.
+    add("--combine-dtype", default="bf16", choices=["bf16", "fp8", "nvfp4"],
+        help="combine OUTPUT precision (bf16 default; fp8=MXFP8 e4m3+e8m0, nvfp4=e2m1 — "
+             "quantized combine via flashinfer-main moe_a2a_combine output_dtype)")
+    add("--combine-quant-mode", default="none",
+        help="combine quantization mode; 'none' today. capability.py rejects unwired modes")
+    # VALUE distribution of the expert inputs (goal P2). normal = seeded N(0,1), the only one that
+    # matters for latency under a bf16 combine (bf16 is value-independent); the rest stress a FUTURE
+    # quantized combine's scale computation (amax/outliers/saturation). routing.py owns the
+    # generators; capability.py gates which ones a backend/mode admits.
+    add("--activation-profile", default="normal",
+        choices=["normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation"],
+        help="value distribution of expert inputs (routing.ACTIVATION_PROFILES)")
+    # Routing distributions: uniform = realistic top-k (fan-out ≈5.3 over EP8); balanced =
+    # load-equalized, one-expert-per-rank (fan-out = ep_size); balanced-rank-local = fan-out 1 (min
+    # comm) edge case; zipf = skewed; hotspot-* = adversarial single hot expert (static or moving
+    # across steps); alternating-groups = expert halves that toggle by step.
+    add("--routing", default="uniform",
+        choices=["uniform", "balanced", "balanced-rank-local", "zipf",
+                 "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single",
+                 "hotspot-moving", "alternating-groups"])
+    # Temporal snapshot index for the moving/alternating distributions (goal P2 "temporal routing
+    # changes"). One run = one step; a temporal suite launches steps 0..N and analyze_ep compares
+    # them. It folds into workload_id only when non-zero, preserving the existing canonical ids.
+    add("--routing-step", type=int, default=0,
+        help="temporal step for hotspot-moving / alternating-groups (0 = first/static)")
+    # Uneven source-token allocation (goal P2 "support uneven source-token allocation"): per-rank
+    # token counts vary (global may not divide EP), including the empty-source-rank case. The
+    # default 'none' gives every rank exactly the ladder T (perfectly even; source-token CV 0), so
+    # existing runs are unchanged. 'linear' ramps counts ~0.5T..1.5T; 'empty-rank' zeroes rank 0.
+    add("--uneven-tokens", default="none", choices=["none", "linear", "empty-rank"],
+        help="per-rank source-token allocation skew (records source_token_stats)")
+    # EPLB (Expert-Parallel Load Balancer): replicate hot experts onto redundant physical slots and
+    # place them so per-rank load equalizes. A pure routing-trace transform (tests/eplb.py);
+    # experts becomes num_logical+redundant. This is the remedy for `zipf` skew.
+    add("--eplb", action="store_true",
+        help="apply EPLB expert replication/placement to the routing trace")
+    add("--num-redundant-experts", type=int, default=32,
+        help="EPLB: redundant physical expert slots (rounded up to a multiple of ep_size)")
+    # Canonical serialized workload (goal P1): consume pre-generated trace bytes instead of the
+    # seeded runtime generator, so a result is provably the SAME workload as another machine's
+    # (checksum match). Points at a dir of <workload_id>.npz/.manifest.json (make_workloads.py).
+    add("--workload-dir", default="",
+        help="dir of canonical workload traces; empty = seeded runtime generation (dev)")
+    add("--mode", default="normal", choices=["normal", "ll"],
+        help="kernel path: normal or low-latency (LL); LL is backend-dependent")
+    # Measurement contract: the NAMED timing boundary an adapter must conform to (review #3 — no
+    # adapter picks its own). Backends declare SUPPORTED_CONTRACTS and run_ep.py rejects any
+    # contract a backend does not support.
+    #   layout-and-dispatch-v1     — dispatch timing INCLUDES routing-layout generation (the only
+    #                                contract MoRI can honor: its layout is computed inside the
+    #                                kernel and cannot be hoisted).
+    #   cached-layout-comm-only-v1 — layout computed ONCE, untimed; dispatch times pure comm
+    #                                (DeepEP-only; matches DeepEP's own benchmark).
+    #   Combine excludes staging in BOTH of the above (staging is untimed for every backend).
+    #   runtime-visible-v1         — the serving-realistic boundary: dispatch starts from what the
+    #                                runtime holds right after routing and INCLUDES required quant /
+    #                                scale creation / layout / packing / comm / sync; combine runs
+    #                                from expert outputs until token outputs are consumable.
+    #                                (DeepEP-only today; the FP8 cast moves INSIDE the timed window.)
+    add("--measurement-contract", default="layout-and-dispatch-v1",
+        choices=["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"])
+    add("--num-sms", type=int, default=24,
+        help="DeepEP comm-SM budget in 'default' resource-mode (MoRI uses block_num/warps)")
+    # Resource regime (review: budgets were neither normalized nor tuned):
+    #   normalized — every backend is held to ~sm_fraction of its device's units (DeepEP
+    #                set_num_sms(frac·SMs); MoRI block_num≈frac·CUs). Fraction-based and recorded:
+    #                an approximate apples-to-apples, not identical work.
+    #   tuned      — each backend's recommended/auto launch config (best achievable).
+    #   default    — DeepEP --num-sms / MoRI 80 blocks (the bring-up budget).
+    add("--resource-mode", default="normalized",
+        choices=["normalized", "tuned", "default"])
+    add("--sm-fraction", type=float, default=0.18,
+        help="normalized mode: fraction of device SMs/CUs dedicated to comms (~24/132)")
+    add("--num-ep-groups", type=int, default=1,
+        help="concurrent EP groups; >1 is REJECTED (real subgroup PGs unimplemented)")
+    add("--seed", type=int, default=67)
+    # Why 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks and establish
+    # NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us (cold), at warmup>=30 it
+    # settles to ~85us (faster than H100, reproducible within ~2.5%). H100/MI355X reach steady
+    # state much sooner; the extra iters are harmless there.
+    add("--warmup", type=int, default=32)
+    add("--iters", type=int, default=200,
+        help="timed iterations PER TRIAL; pooled across trials for percentiles")
+    # review #3: a p99 from ~50 samples is just the max. Pool iters x trials, randomize the token
+    # order each trial so warmup/clock drift doesn't correlate with T, and report p50/p90/p99 (p99
+    # is the headline). 3 trials x 200 iters = 600 pooled samples per point.
+    add("--trials", type=int, default=3,
+        help="independent timed trials, token-order randomized per trial; samples pooled")
+    add("--allow-unknown-provenance", action="store_true",
+        help="permit a run with unpinned backend commit/version (default: fail)")
+    # Anomaly waiver (goal P1: roundtrip/isolated_sum threshold -> diagnostic unless explicitly
+    # waived). Without this flag a measured roundtrip implausibly larger/smaller than its components
+    # (e.g. the open LL-FP8 anomaly) demotes the result to 'diagnostic'. Pass it to keep the result
+    # comparable-experimental/official AFTER the cause is understood + documented.
+    add("--waive-anomaly", action="store_true",
+        help="do not let a flagged timing anomaly demote publication_status to diagnostic")
+    add("--roundtrip-anomaly-threshold", type=float, default=3.0,
+        help="roundtrip p99 > threshold x isolated_sum p99 is flagged as an anomaly")
+    # provenance / output
+    add("--runner", required=True)
+    add("--topology-class", required=True)
+    add("--transport", default="")
+    add("--comparison-class", default="standardized")
+    # Structured placement metadata (goal P2 topology): GPUs/node + scale-up domain + placement kind
+    # make routing locality (local/same-node/cross-domain copy fractions) computable and keep
+    # packed/striped/adversarial distinguishable. gpus-per-node=0 -> single node (= ep_size).
+    add("--gpus-per-node", type=int, default=0)
+    add("--scale-up-domain", type=int, default=0, help="0 = gpus_per_node*ep (one domain)")
+    add("--placement", default="packed",
+        choices=["packed", "striped", "runtime-native", "adversarial"])
+    add("--env-json")
+    add("--timestamp")
+    add("--out", required=True)
+
+
+def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
+    """Resolve the sweep as (ladder, dropped). A non-blank `spec` (ints split on spaces/commas) wins over
+    the phase default; values are de-duplicated, sorted, positives only; those above `cap` are dropped."""
+    explicit = bool(spec) and bool(spec.strip())
+    if explicit:
+        requested = [int(tok) for tok in spec.replace(",", " ").split()]
+    else:
+        requested = DECODE_LADDER if phase == "decode" else PREFILL_LADDER
+    kept = sorted({t for t in requested if t > 0})
+    over = [] if cap is None else [t for t in kept if t > cap]
+    return [t for t in kept if cap is None or t <= cap], over
+
+
+def source_token_counts(nominal_T: int, ep_size: int, mode: str) -> list[int]:
+    """Per-rank source-token counts for the uneven-allocation study (goal P2); deterministic, so
+    every rank derives the same vector. The total need not divide ep_size.
+
+    mode 'none' (or ep_size <= 1): even — every rank gets nominal_T (global = nominal_T*ep_size).
+    mode 'empty-rank': rank 0 gets 0 and each remaining rank gets
+        max(1, nominal_T*ep_size // (ep_size-1)), keeping the global count ~constant.
+    any other mode ('linear'): a ramp from ~0.5T on rank 0 to ~1.5T on the last rank (mean ≈ T),
+        floored at 1 token per rank.
+    """
+    # Single-rank (and degenerate ep_size <= 0) runs cannot be skewed; this guard also keeps the
+    # (ep_size - 1) divisors below non-zero, so no further ep_size checks are needed.
+    if mode == "none" or ep_size <= 1:
+        return [nominal_T] * ep_size
+    others = ep_size - 1
+    if mode == "empty-rank":
+        # rank 0 empty; spread ep_size*T across the remaining ranks (keeps ~global constant).
+        return [0] + [max(1, nominal_T * ep_size // others)] * others
+    lo, hi = 0.5 * nominal_T, 1.5 * nominal_T
+    return [max(1, int(round(lo + (hi - lo) * r / others))) for r in range(ep_size)]
+
+
+def _stats_vec(xs: list[int]) -> dict:
+    """min/mean/max/CV (population std / mean; 0.0 when mean <= 0), zero-entry count, total and rank
+    count of a per-rank vector — a self-describing summary without dumping the full vector. An empty
+    vector yields all zeros, including ranks=0 (it used to be mis-reported as 1 rank)."""
+    count = len(xs)
+    mean = sum(xs) / count if count else 0.0
+    std = (sum((x - mean) ** 2 for x in xs) / count) ** 0.5 if count else 0.0
+    return {"min": min(xs, default=0), "mean": round(mean, 3), "max": max(xs, default=0),
+            "cv": round(std / mean, 4) if mean > 0 else 0.0,
+            "empty_ranks": sum(1 for x in xs if x == 0), "total": sum(xs), "ranks": count}
+
+
+def percentile(xs: list[float], q: float) -> float:
+    """Nearest-rank percentile: NaN for empty `xs`, else the sorted sample at round(q% of n-1), clamped."""
+    if not xs:
+        return float("nan")
+    ordered = sorted(xs)
+    return ordered[min(len(ordered) - 1, max(0, int(round(q / 100.0 * (len(ordered) - 1)))))]
+
+
+def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
+    """Run `warmup` untimed iterations, then return `iters` CUDA-event latencies (µs) for THIS rank.
+    `pre=None`: each sample times `fn()`. Otherwise `pre()` runs UNTIMED before every sample, the device
+    is synced ahead of the start event (so its GPU work can't bleed in) and `fn(pre_result)` is timed —
+    how combine gets a fresh untimed dispatch+stage. Callers reduce across ranks before percentiling."""
+    def timed_once():
+        state = pre() if pre is not None else None
+        if pre is not None:
+            torch.cuda.synchronize()
+        start = torch.cuda.Event(enable_timing=True)
+        stop = torch.cuda.Event(enable_timing=True)
+        start.record()
+        if pre is None:
+            fn()
+        else:
+            fn(state)
+        stop.record()
+        torch.cuda.synchronize()
+        return start.elapsed_time(stop) * 1000.0  # ms -> us
+
+    # Warm-up syncs after EVERY iteration (not once after the loop): the measured-roundtrip fn drives
+    # dispatch+combine on a persistent comm buffer, and un-synced back-to-back iterations let iter
+    # N+1's dispatch race iter N's combine (CUDA abort -> NCCL-watchdog SIGABRT). Warm-up is small.
+    for _ in range(max(0, warmup)):
+        if pre is None:
+            fn()
+        else:
+            state = pre()
+            torch.cuda.synchronize()
+            fn(state)
+        torch.cuda.synchronize()
+    return [timed_once() for _ in range(iters)]
+
+
+def comparison_key(meta: dict) -> str:
+    """16-hex-char SHA-256 prefix identifying which rows may share a curve. Hashes the FIXED config
+    only — op, backend, mode, phase, ep_size, nodes, resource_mode (absent -> "default"),
+    topology_class, comparison_class, measurement_contract and the key-sorted JSON of `shape` —
+    joined with "|". tokens_per_rank is the x-axis and is deliberately excluded, so EP4 vs EP8,
+    normal vs LL, decode vs prefill and different SKUs get distinct keys, never a silent overlay."""
+    fields = [meta[k] for k in ("op", "backend", "mode", "phase")]
+    fields += [str(meta["ep_size"]), str(meta["nodes"]), meta.get("resource_mode", "default")]
+    fields += [meta[k] for k in ("topology_class", "comparison_class", "measurement_contract")]
+    fields.append(json.dumps(meta["shape"], sort_keys=True))
+    digest = hashlib.sha256("|".join(fields).encode())
+    return digest.hexdigest()[:16]
+
+
+def _reduce_vec(torch, dist, device, vals, op):  # element-wise cross-rank reduce of a float vector
+    buf = torch.tensor(vals, dtype=torch.float64, device=device)
+    dist.all_reduce(buf, op=op)                    # `buf` is read back afterwards as the reduced result
+    return list(map(float, buf.tolist()))
+
+
+def _reduce_int(torch, dist, device, v: int, op) -> int:  # cross-rank reduce of ONE integer (int64)
+    cell = torch.tensor([int(v)], dtype=torch.int64, device=device)
+    dist.all_reduce(cell, op=op)                              # `cell` is read back as the reduced value
+    return int(cell.item())
+
+
+def _allgather_floats(torch, dist, device, v: float) -> list[float]:
+    """Collect one float64 scalar from every rank into a rank-indexed list (per-rank diagnostics such
+    as spotting the straggler or the rank spread — an all_reduce would collapse these to one value)."""
+    f64 = dict(device=device, dtype=torch.float64)
+    slots = [torch.zeros(1, **f64) for _ in range(dist.get_world_size())]
+    dist.all_gather(slots, torch.tensor([float(v)], **f64))
+    return [float(slot.item()) for slot in slots]
+
+
+def _histogram(xs: list[float], nbins: int = 40) -> dict:
+    """Compact distribution of pooled cross-rank-max samples (p99-spike debugging without storing every
+    sample): `nbins` equal-width bins over [min, max]; empty input -> {"n": 0}. Identical samples have
+    no width to split, so they are reported as ONE bin — "bins" always equals len(counts)."""
+    if not xs:
+        return {"n": 0}
+    lo, hi = min(xs), max(xs)
+    if hi <= lo:  # degenerate range: a single bin, min/max rounded like the regular path
+        return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": 1, "counts": [len(xs)]}
+    counts = [0] * nbins
+    span = hi - lo
+    for x in xs:
+        counts[min(nbins - 1, int((x - lo) / span * nbins))] += 1   # x == max lands in the last bin
+    return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": counts}
+
+
+def _provenance_unknown(prov: dict) -> list[str]:  # keys whose value is the string "unknown" (trimmed, any case)
+    return [name for name, val in prov.items() if isinstance(val, str) and "unknown" == val.strip().lower()]
+
+
+def _resource_profile(prov: dict, args) -> dict:
+    """Translate backend-specific provenance into the backend-INDEPENDENT resource vocabulary (goal
+    P3): requested vs achieved comm-unit fraction, configured units/warps, and a conformance class.
+    DeepEP counts units in SMs (num_sms); MoRI counts CU blocks (block_num)."""
+    num_sms = prov.get("num_sms")
+    units = num_sms if num_sms is not None else prov.get("block_num")
+    device_units = prov.get("device_sms") or prov.get("device_cus")
+    normalized = args.resource_mode == "normalized"
+    requested = args.sm_fraction if normalized else None   # a fraction is only requested when normalized
+    achieved = units / device_units if (units and device_units) else None
+    floored = bool(prov.get("block_num_floored"))
+    tuned_source = str(prov.get("tuned_source", ""))
+    # FIXED-KERNEL split (goal P3 / immediate P0): when the library fixes the comm occupancy (DeepEP LL:
+    # num_sms=None, low_latency_mode, tuned_source=ll-fixed-kernel) the run is NOT resource-constrained.
+    # It is tagged fixed-kernel / not-applicable and kept out of resource-Pareto comparisons.
+    fixed_kernel = bool(prov.get("low_latency_mode")) or "fixed-kernel" in tuned_source
+    if fixed_kernel:
+        resource_class, conformance = "fixed-kernel", "not-applicable"
+    elif floored:
+        resource_class, conformance = "resource-constrained", "minimum-functional"  # needed MORE than requested
+    elif normalized:
+        resource_class, conformance = "resource-constrained", "resource-conforming"
+    elif args.resource_mode == "tuned":
+        resource_class = "backend-tuned"
+        conformance = "backend-default" if "default" in tuned_source else "best-known"
+    else:
+        resource_class, conformance = "backend-default", "backend-default"
+    # Normalized runs only: did the achieved fraction land within `tol` of the requested one?
+    tol = 0.10
+    within_tol = (achieved is not None and abs(achieved - requested) <= tol) if requested else None
+    return {
+        "comm_units_kind": "sm" if num_sms is not None else "cu_block",
+        "requested_fraction": requested, "configured_units": units, "device_units": device_units,
+        "achieved_fraction": round(achieved, 4) if achieved else None,
+        "warps_dispatch": prov.get("dispatch_warps"), "warps_combine": prov.get("combine_warps"),
+        "qps_per_rank": prov.get("num_qps_per_rank"),
+        "persistent_bytes": prov.get("num_nvl_bytes") or prov.get("num_rdma_bytes") or prov.get("heap_size"),
+        "tuned_source": prov.get("tuned_source"),
+        # Only resource-constrained, un-floored runs enter the resource Pareto (fixed-kernel / backend-* do not).
+        "resource_class": resource_class,
+        "conformance_class": conformance, "tolerance": tol, "target_achieved_within_tol": within_tol,
+        "nonconforming": floored, "fixed_kernel": fixed_kernel,
+        "pareto_eligible": (resource_class == "resource-constrained" and not floored),
+    }
+
+
+def _derive_publication_status(v: dict) -> str:
+    """Machine-derive the publication state from the validity dimensions in `v` (goal P1); callers may
+    not hand-label a result 'official' — it has to pass every gate below. Outcomes: 'failed'
+    (execution incomplete), 'invalid' (incorrect, non-conformant or inconsistent workload),
+    'diagnostic' (resource-nonconforming, unwaived anomaly, or workload identity not 'consistent*'),
+    'official' (sound + complete provenance + canonical-serialized workload), else 'comparable-experimental'."""
+    if v["execution_status"] != "complete":
+        return "failed"
+    valid = (v["semantic_correctness"] == "pass" and v["measurement_conformance"] == "conformant"
+             and v["workload_identity"] != "inconsistent")
+    if not valid:
+        return "invalid"
+    # Correctness and conformance are settled above, so soundness now hinges on workload identity.
+    sound = v["workload_identity"].startswith("consistent")
+    # Otherwise-sound but resource-nonconforming runs are not a fair cross-platform point, and a
+    # flagged roundtrip/isolated_sum anomaly (goal P1-e/f) demotes too unless waived
+    # (validity.anomaly_free reflects the waiver).
+    if v["resource_conformance"].endswith("nonconforming") or not v.get("anomaly_free", True):
+        return "diagnostic"
+    if sound and v["provenance_complete"] and v["workload_source"] == "canonical-serialized":
+        return "official"
+    # Sound measurement that is merely missing a publication requirement; unsound -> diagnostic.
+    return "comparable-experimental" if sound else "diagnostic"
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+    """Drive the source-tokens-per-rank sweep for one fully-specified line."""
+    import routing  # torch-based; imported lazily so the module byte-compiles without torch
+    import eplb     # stdlib planner + torch remap (the EPLB transform)
+
+    ep_size = world_size  # num_ep_groups removed (was metadata-only; no real subgroups)
+    # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the
+    # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL
+    # experts then remapped to physical (build_trace), so the whole sweep runs over the
+    # balanced physical placement with no adapter change.
+    eplb_on = getattr(args, "eplb", False)
+    num_logical = getattr(args, "num_logical_experts", args.experts)
+    if args.experts % ep_size != 0:
+        if rank == 0:
+            print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})")
+        return 2
+    experts_per_rank = args.experts // ep_size
+    elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2)
+
+    # Provenance gate (review #1): refuse a comparison run with unpinned backend info.
+    unknown = _provenance_unknown(backend.backend_provenance)
+    if unknown and not args.allow_unknown_provenance:
+        if rank == 0:
+            print(f"ERROR: unpinned provenance {unknown} in {backend.backend_provenance}; "
+                  f"set the commit/version env or pass --allow-unknown-provenance.")
+        return 4
+
+    cap = backend.buffer_cap(args)
+    ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+    if rank == 0 and dropped:
+        print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+              f"(hidden={args.hidden}); not silently truncated.")
+    if not ladder:
+        if rank == 0:
+            print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+        return 2
+    # MoRI wedges on a COLD dispatch that jumps straight to a large T; it sets
+    # needs_gradual_ramp so the sweep approaches its max T via a geometric ramp from 1
+    # (validated on MI355X). A naturally-gradual ladder (decode) is unchanged.
+    if getattr(backend, "needs_gradual_ramp", False):
+        top, ramp, t = ladder[-1], [], 1
+        while t < top:
+            ramp.append(t); t *= 2
+        ramp.append(top)
+        if rank == 0 and ramp != ladder:
+            print(f"NOTE: {backend.name} sweep ramped gradually 1..{top} (cold-jump-safe): {ramp}")
+        ladder = ramp
+
+    MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM
+    # temporal snapshot index — defined BEFORE the EPLB block (which builds a reference trace with
+    # step=routing_step); the EPLB path runs only when eplb_on, so a late definition raised an
+    # UnboundLocalError on zipf+eplb canonical runs (caught as a preserved failed-case).
+    routing_step = int(getattr(args, "routing_step", 0))
+
+    # EPLB plan (once): estimate logical load from the global logical trace at the largest
+    # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB
+    # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps
+    # to physical when the plan is present; otherwise it's the identity (logical == physical).
+    eplb_plan = None
+    if eplb_on:
+        ref_idx, _ = routing.build_global_routing(max(ladder) * ep_size, num_logical, args.topk,
+                                                  args.routing, args.seed, num_logical // ep_size,
+                                                  step=routing_step)
+        load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist()
+        eplb_plan = eplb.build_plan(load, args.experts, ep_size)
+        if rank == 0:
+            print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); "
+                  f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> "
+                  f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts "
+                  f"replicated (hottest {eplb_plan['max_replicas']}x)")
+
+    canonical = bool(getattr(args, "workload_dir", ""))
+    uneven = getattr(args, "uneven_tokens", "none")
+    if canonical and uneven != "none":
+        if rank == 0:
+            print(f"ERROR: --uneven-tokens={uneven} is incompatible with --workload-dir "
+                  f"(canonical workloads are serialized at a fixed global-token count per id); "
+                  f"use seeded-runtime for the uneven-allocation study.")
+        return 2
+    loaded_workload_ids, loaded_checksums = [], {}
+    if canonical:
+        import workload as _wl
+
+    def build_trace(gt):
+        # canonical: load pre-serialized trace bytes (verified by checksum) so this run is
+        # provably the SAME workload as any other consuming the same files. else: seeded gen.
+        if canonical:
+            wid = _wl.compute_workload_id(args.routing, args.hidden, args.topk, num_logical, gt,
+                                          args.seed, step=routing_step)
+            idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True)
+            idx_l = torch.from_numpy(idx_np).to(torch.int64)
+            w = torch.from_numpy(w_np).to(torch.float32)
+            if wid not in loaded_workload_ids:
+                loaded_workload_ids.append(wid)
+                loaded_checksums[wid] = man.get("checksums")
+        else:
+            idx_l, w = routing.build_global_routing(gt, num_logical, args.topk, args.routing,
+                                                    args.seed, num_logical // ep_size, step=routing_step)
+        return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w
+
+    # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold
+    # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually
+    # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone
+    # and is also cold-jump-safe for MoRI.
+    warm_T = min(ladder[-1], 128)
+    warm_shapes = [t for t in ladder if t <= warm_T] or [ladder[0]]
+    for wt in warm_shapes:
+        wi, ww = build_trace(wt * ep_size)
+        wsi, wsw = routing.rank_slice(wi, ww, rank, wt)
+        wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16,
+                                      profile=args.activation_profile)
+        wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx)
+        for _ in range(8):
+            wh = backend.dispatch(wp); backend.stage(wp, wh); backend.combine(wp, wh)
+    torch.cuda.synchronize()
+    try:
+        dist.barrier()
+    except Exception:
+        pass
+    # Per-point clock-ramp burst (set up below, applied inside the loop): a ONE-TIME burst
+    # warms clocks, but on Blackwell (B300) the tiny small-T points let clocks drop again,
+    # so a mid-sweep T=64 reads ~20x cold. Re-ramping at EACH shape keeps every timed point
+    # steady-state. Gated by backend.wants_warm_burst — MoRI WEDGES on a sustained burst
+    # (and is already steady at warmup=8), so it opts out. CX_FABRIC_WARM_BURST overrides.
+    warm_burst = int(os.environ.get("CX_FABRIC_WARM_BURST", "40"))
+    do_burst = warm_burst > 0 and getattr(backend, "wants_warm_burst", False)
+
+    import random as _random
+    elem_dispatch = elem_bytes          # fp8=1 / bf16=2 (dispatch payload element size)
+    tol = getattr(backend, "tolerance", 5e-2)
+
+    # ---- Pass 1: build the per-T problem ONCE (deterministic trace + cached layout per
+    # contract), run the correctness gate ONCE. Timing is Pass 2 (pooled over trials). ----
+    problems, gate, gts = {}, {}, {}
+    routing_hashes = set()
+    for T in ladder:
+        # Per-rank source-token counts (goal P2 uneven allocation). mode 'none' => [T]*ep,
+        # gt = T*ep, offsets = 0,T,2T,... — byte-identical to the even path. Otherwise counts
+        # vary (global may not divide ep) and rank 0 may be empty.
+        counts = source_token_counts(T, ep_size, uneven)
+        offsets = [sum(counts[:r]) for r in range(ep_size)]
+        gt = sum(counts)
+        gts[T] = gt
+        idx_g, w_g = build_trace(gt)
+        rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g)
+        gpn = args.gpus_per_node or ep_size
+        # placement-aware locality (goal P2): packed/striped/adversarial change which physical
+        # node/domain a rank sits on, so the local/same-node/cross-domain copy fractions differ.
+        rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T),
+                                                      gpn, args.scale_up_domain or None,
+                                                      placement=args.placement)
+        rstats["source_token_stats"] = _stats_vec(counts)
+        routing_hashes.add(rstats["routing_hash"])
+        my_off, my_cnt = offsets[rank], counts[rank]
+        idx_s = idx_g[my_off:my_off + my_cnt].contiguous()
+        w_s = w_g[my_off:my_off + my_cnt].contiguous()
+        x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16,
+                                     profile=args.activation_profile)
+        problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x)
+        h = backend.dispatch(problem); backend.stage(problem, h)
+        combined = backend.combine(problem, h)
+        torch.cuda.synchronize()
+        recv_local = backend.recv_tokens(h)
+        exp, n_cmp = backend.expected(problem, h)
+        # empty source rank (my_cnt==0): nothing to reconstruct locally — gate passes vacuously.
+        if n_cmp > 0:
+            max_abs = (combined[:n_cmp].float() - exp[:n_cmp].float()).abs().max().item()
+            max_rel = max_abs / (exp[:n_cmp].float().abs().max().item() + 1e-6)
+        else:
+            max_rel = 0.0
+        problems[T] = problem
+        gate[T] = {"rstats": rstats, "recv_local": recv_local,
+                   "max_rel": max_rel, "local_ok": 1 if max_rel < tol else 0}
+
+    # ---- Pass 2: N timed trials. Token order is randomized PER TRIAL (seeded ⇒ identical
+    # on every rank, so collectives stay lock-step) so warmup/clock drift can't correlate
+    # with T. Per-iteration cross-rank MAX samples are POOLED across trials, then
+    # percentiled (review #3: p99 from one 50-iter run is just the max). MoRI keeps
+    # ascending order — it wedges on a cold jump to a large T. ----
+    disp_pool = {T: [] for T in ladder}     # pooled per-iteration cross-rank MAX (dispatch)
+    comb_pool = {T: [] for T in ladder}     # ... combine
+    rt_pool = {T: [] for T in ladder}       # ... INDEPENDENTLY-MEASURED round trip (goal P1)
+    disp_local = {T: [] for T in ladder}    # THIS rank's own dispatch samples (per-rank diag)
+    order = list(ladder)
+    rng = _random.Random(args.seed)
+    shuffle_ok = not getattr(backend, "needs_gradual_ramp", False)
+    for trial in range(max(1, args.trials)):
+        if shuffle_ok:
+            rng.shuffle(order)
+        for T in order:
+            problem = problems[T]
+            if do_burst:   # re-ramp clocks at THIS shape before timing (Blackwell)
+                for _ in range(warm_burst):
+                    bh = backend.dispatch(problem); backend.stage(problem, bh); backend.combine(problem, bh)
+                torch.cuda.synchronize()
+            # roundtrip_only backends (stateful paired dispatch/combine FSM, e.g. FlashInfer
+            # MoeAlltoAll): isolated/looped dispatch timing corrupts the symmetric workspace, so
+            # ONLY the paired roundtrip is measurable. Mirror rt into disp/comb (flagged) so the
+            # schema + plot have values; isolated_sum is meaningless for these (== 2x roundtrip).
+            roundtrip_only = getattr(backend, "roundtrip_only", False)
+
+            def rt_once(p=problem):
+                hh = backend.dispatch(p); backend.stage(p, hh); return backend.combine(p, hh)
+
+            if roundtrip_only:
+                rt_iters = time_us(torch, lambda p=problem: rt_once(p), args.warmup, args.iters)
+                disp_iters = comb_iters = rt_iters
+            else:
+                disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p),
+                                     args.warmup, args.iters)
+
+                def prep(p=problem):
+                    hh = backend.dispatch(p); backend.stage(p, hh); return hh
+                if backend.combine_needs_redispatch:
+                    comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+                                         args.warmup, args.iters, pre=prep)
+                else:
+                    hh = prep()
+                    comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+                                         args.warmup, args.iters)
+                # MEASURED round trip (goal P1: not a sum of percentiles): one timed region over
+                # dispatch -> stage (no-op "expert" transform) -> combine -> output ready. Captures
+                # shared sync / launch amortization / overlap that the isolated_sum cannot.
+                rt_iters = time_us(torch, lambda p=problem: rt_once(p), args.warmup, args.iters)
+            # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled.
+            disp_pool[T] += _reduce_vec(torch, dist, device, disp_iters, MAX)
+            comb_pool[T] += _reduce_vec(torch, dist, device, comb_iters, MAX)
+            rt_pool[T] += _reduce_vec(torch, dist, device, rt_iters, MAX)
+            disp_local[T] += disp_iters
+
+    # ---- Pass 3: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ----
+    def pcts(xs):
+        return {"p50": percentile(xs, 50), "p90": percentile(xs, 90),
+                "p95": percentile(xs, 95), "p99": percentile(xs, 99)}
+    rows = []
+    all_anomalies = []                                       # contract-level anomalies (goal P1)
+    thr_rt = float(getattr(args, "roundtrip_anomaly_threshold", 3.0))
+    for T in ladder:
+        gt = gts[T]
+        g = gate[T]; rstats = g["rstats"]
+        d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T]
+        dp, cp, rtp = pcts(d), pcts(c), pcts(rt)
+        # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op
+        # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput
+        # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency.
+        isum = {k: dp[k] + cp[k] for k in dp}
+        recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM)
+        recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX)
+        recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN)
+        global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN)
+        max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0]
+        point_ok = bool(global_ok) and recv_total > 0
+        # Per-rank diagnostics: gather each rank's own dispatch median -> spread + straggler.
+        per_rank_med = _allgather_floats(torch, dist, device, percentile(disp_local[T], 50))
+        slowest_rank = max(range(len(per_rank_med)), key=lambda i: per_rank_med[i])
+        rmean = sum(per_rank_med) / len(per_rank_med)
+        # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv
+        # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy
+        # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert.
+        token_rank_copies = rstats["routed_copies"]
+        token_expert_copies = gt * args.topk
+        H = args.hidden
+        # Bandwidth semantics (goal P1 "distinguish all bandwidth concepts"): the ONLY rates we can
+        # defensibly publish are logical-payload (canonical routed bytes / latency) and backend-
+        # buffer (recv-tensor bytes / latency). algorithm/bus/wire bandwidth are NULL — EP
+        # dispatch/combine have no standard busBW model and we have no transport counters, so we
+        # must NOT imply physical NVLink/XGMI/RDMA utilization.
+        def _rate(nbytes, us):
+            return round(nbytes / (us * 1e3), 3) if (us and us > 0) else None
+        disp_bytes_l = token_rank_copies * H * elem_dispatch
+        comb_bytes_l = token_rank_copies * H * 2
+        buf_disp = recv_max * H * elem_dispatch
+        buf_comb = recv_max * H * 2
+        bandwidth = {
+            "logical_payload_rate_gbps": {
+                "dispatch": _rate(disp_bytes_l, dp["p50"]), "combine": _rate(comb_bytes_l, cp["p50"]),
+                "roundtrip": _rate(disp_bytes_l + comb_bytes_l, rtp["p50"])},
+            "backend_buffer_rate_gbps": {
+                "dispatch": _rate(buf_disp, dp["p50"]), "combine": _rate(buf_comb, cp["p50"])},
+            "algorithm_bandwidth_gbps": None, "bus_bandwidth_gbps": None, "wire_utilization": None,
+            "basis": ("logical = canonical routed-payload copies x hidden x dtype / latency; "
+                      "buffer = backend recv tensor / latency; alg/bus/wire = null (no defined "
+                      "EP busBW formula, no transport counters) — NOT physical link utilization"),
+        }
+        # Contract-level anomaly checks (goal P1) — attached to the ROW and rolled into validity.
+        #   roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99 — a
+        #     chained op shouldn't be far larger than its parts (the open LL-FP8 case).
+        #   roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained
+        #     op can't finish faster than its slowest required component (sync semantics violated).
+        row_anoms = []
+        if isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]:
+            row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T,
+                              "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2),
+                              "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt})
+        floor = max(dp["p50"], cp["p50"])
+        if rtp["p50"] > 0 and floor > 0 and rtp["p50"] < 0.95 * floor:
+            row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T,
+                              "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)})
+        all_anomalies.extend(row_anoms)
+        rows.append({
+            "tokens_per_rank": T, "global_tokens": gt,
+            "dispatch": dp, "combine": cp, "roundtrip": rtp, "isolated_sum": isum,
+            # flat aliases kept for back-compat with v3 readers
+            "dispatch_us_p50": dp["p50"], "dispatch_us_p90": dp["p90"], "dispatch_us_p99": dp["p99"],
+            "combine_us_p50": cp["p50"], "combine_us_p90": cp["p90"], "combine_us_p99": cp["p99"],
+            "roundtrip_us_p50": rtp["p50"], "roundtrip_us_p90": rtp["p90"],
+            "roundtrip_us_p95": rtp["p95"], "roundtrip_us_p99": rtp["p99"],
+            "isolated_sum_us_p50": isum["p50"], "isolated_sum_us_p99": isum["p99"],
+            "samples_pooled": len(d), "trials": max(1, args.trials),
+            "percentile_interpolation": "nearest-rank",
+            "recv_tokens_max": recv_max, "recv_tokens_min": recv_min,
+            "recv_tokens_mean": recv_total / world_size, "recv_tokens_total": recv_total,
+            "per_rank_dispatch_us": {"min": min(per_rank_med), "mean": rmean,
+                                     "max": max(per_rank_med), "spread": max(per_rank_med) - min(per_rank_med),
+                                     "slowest_rank": slowest_rank},
+            # dispatch carries its dtype's element size; combine input is bf16 (2B).
+            "dispatch_logical_bytes": token_rank_copies * H * elem_dispatch,
+            "combine_logical_bytes": token_rank_copies * H * 2,
+            "byte_contracts": {
+                "token_rank_payload_copies": token_rank_copies,
+                "token_expert_payload_copies": token_expert_copies,
+                "dispatch_bytes": token_rank_copies * H * elem_dispatch,
+                "combine_bytes": token_rank_copies * H * 2,
+                "fp8_scale_bytes": (token_rank_copies * (H // 128) * 4) if elem_dispatch == 1 else 0,
+                "routing_index_bytes": token_expert_copies * 4,   # int32 topk_idx
+                "gate_weight_bytes": token_expert_copies * 4,     # f32 topk_weights
+            },
+            "byte_contract": "logical-routed-payload-v1",
+            # throughput from the MEASURED round trip ONLY (not isolated_sum).
+            "roundtrip_tokens_per_second": (gt / (rtp["p50"] * 1e-6)) if rtp["p50"] > 0 else None,
+            "raw_samples": {"dispatch": _histogram(d), "combine": _histogram(c), "roundtrip": _histogram(rt)},
+            # distinguished bandwidth concepts (goal P1) — logical + buffer real, alg/bus/wire null.
+            "bandwidth": bandwidth,
+            # full load + fanout statistics in EVERY row (goal P2 "report full load and fanout"):
+            "fanout_mean": rstats["fanout_mean"], "fanout_max": rstats["fanout_max"],
+            "fanout_min": rstats["fanout_min"], "fanout_hist": rstats["fanout_hist"],
+            "routed_copies": rstats["routed_copies"],
+            "expert_load_min": rstats["expert_load_min"], "expert_load_max": rstats["expert_load_max"],
+            "expert_load_mean": rstats["expert_load_mean"], "expert_load_cv": rstats["expert_load_cv"],
+            "rank_load_cv": rstats["rank_load_cv"], "hotspot_ratio": rstats["hotspot_ratio"],
+            "dest_rank_load_max": rstats["dest_rank_load_max"],
+            "dest_rank_load_mean": rstats["dest_rank_load_mean"],
+            "empty_expert_count": rstats["empty_expert_count"],
+            "empty_rank_count": rstats["empty_rank_count"],
+            "rank_load_hist": rstats["rank_load_hist"],
+            "source_token_stats": rstats.get("source_token_stats"),
+            "routing_hash": rstats["routing_hash"], "locality": rstats.get("locality"),
+            "anomalies": row_anoms,
+            "correct": point_ok, "max_rel_error": max_rel,
+        })
+        if rank == 0:
+            print(f"  T={T:<5} disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} comb {cp['p50']:6.1f}/{cp['p99']:6.1f} "
+                  f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(d)} fanout={rstats['fanout_mean']:.2f} "
+                  f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} "
+                  f"straggler=r{slowest_rank} correct={point_ok}")
+
+    # Cross-rank workload-identity proof: every rank must have built the SAME global routing
+    # (one hash per T here); hash the per-T hash set and MIN/MAX-reduce it — a mismatch means ranks
+    # in THIS run diverged (cross-hardware identity needs equal trace_signature across runs).
+    trace_sig = int(hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()[:15], 16)
+    sig_min = _reduce_int(torch, dist, device, trace_sig, MIN)
+    sig_max = _reduce_int(torch, dist, device, trace_sig, MAX)
+    routing_consistent = (sig_min == sig_max == trace_sig)
+
+    if rank != 0:
+        return 0
+
+    # status=valid requires correctness AND a proven-identical routing trace across ranks.
+    all_ok = bool(rows) and all(r["correct"] for r in rows) and routing_consistent
+
+    # ---- Multi-dimensional validity (goal P1) -> MACHINE-DERIVED publication_status. Adapters
+    # never self-label "official"; status is a pure function of these gates. ----
+    prov = backend.backend_provenance
+    prov_unknown = _provenance_unknown(prov)
+    repro = getattr(args, "reproduction_full", {})
+    git_run = getattr(args, "git_run", None)
+    provenance_complete = (not prov_unknown
+                           and bool(getattr(args, "image_digest", ""))
+                           and bool(git_run) and all((git_run or {}).get(k) for k in ("run_id", "source_sha")))
+    floored = bool(prov.get("block_num_floored"))
+    # fixed-kernel (DeepEP LL) is NOT a normalized resource-constrained run -> conformance N/A
+    # (immediate P0 "split LL fixed-kernel from normalized-resource"). Not a conformance failure.
+    fixed_kernel = bool(prov.get("low_latency_mode")) or ("fixed-kernel" in str(prov.get("tuned_source", "")))
+    resource_conformance = ("not-applicable" if fixed_kernel
+                            else "minimum-functional-nonconforming" if floored
+                            else ("resource-conforming" if args.resource_mode == "normalized"
+                                  else "backend-default" if args.resource_mode in ("tuned", "default")
+                                  else "unspecified"))
+    # record the canonical workload identity consumed (one trace per T -> set of ids/checksums).
+    if canonical and loaded_workload_ids:
+        args.workload_id = (loaded_workload_ids[0] if len(loaded_workload_ids) == 1
+                            else f"set:{len(loaded_workload_ids)}:{loaded_workload_ids[0]}")
+        args.workload_checksums = loaded_checksums
+    canonical_workload = bool(getattr(args, "workload_id", None))
+    # Activation-value identity (scaffold): today activations are seeded N(0,1) and NOT serialized,
+    # so identity is the deterministic descriptor (profile|seed|hidden|generator). When a value rig
+    # (lognormal / model-trace) lands, this becomes the byte-hash of the serialized activations.
+    activation_identity = hashlib.sha256(
+        f"{args.activation_profile}|seed={args.seed}|hidden={args.hidden}|gen=collectivex-activation-v1"
+        .encode()).hexdigest()[:16]
+    # EPLB mapping identity hash (goal P2) — over the replica placement, not just the counts.
+    eplb_mapping_hash = None
+    if eplb_plan is not None:
+        eplb_mapping_hash = hashlib.sha256(json.dumps(
+            {"phys2log": eplb_plan["phys2log"], "rank_of_phys": eplb_plan["rank_of_phys"],
+             "replicas": eplb_plan["replicas"]}, sort_keys=True).encode()).hexdigest()[:16]
+    # Anomaly roll-up (goal P1-e/f): any flagged row anomaly demotes publication_status to
+    # diagnostic, unless --waive-anomaly (set AFTER the cause is understood + documented).
+    waived = bool(getattr(args, "waive_anomaly", False))
+    anomaly_free = (len(all_anomalies) == 0) or waived
+    validity = {
+        "execution_status": "complete" if rows else "failed",
+        "semantic_correctness": "pass" if (rows and all(r["correct"] for r in rows)) else "fail",
+        "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent",
+        "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime",
+        "measurement_conformance": "conformant",   # run_ep gate rejects nonconformant pre-run
+        "resource_conformance": resource_conformance,
+        "provenance_complete": provenance_complete,
+        # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above).
+        "anomaly_free": anomaly_free,
+    }
+    publication_status = _derive_publication_status(validity)
+
+    shape = {  # FIXED line identity (no T, no per-backend resource knobs)
+        "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+        "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype,
+        "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical,
+        # DeepEP kernel generation (v1 = NVSHMEM, v2 = NCCL-Gin) — part of line identity so a V2 run
+        # is never conflated with V1 in comparison_key / plot / cohort. Derived from deepep_version;
+        # "n-a" for non-DeepEP backends. (Existing V1 docs lack this field -> read as "v1".)
+        "kernel_gen": ("v2" if str((backend.backend_provenance or {}).get("deepep_version", "")).startswith("2")
+                       else "v1" if str((backend.backend_provenance or {}).get("deepep_version", "")).startswith("1")
+                       else "n-a"),
+        # temporal snapshot + uneven allocation change the realized workload, so they are part of
+        # the line identity (fold into comparison_key). Default 0/none reproduce the prior key for
+        # non-temporal even runs in spirit (the value is recorded either way).
+        "routing_step": routing_step, "uneven_tokens": uneven,
+        # value distribution of expert inputs — part of the workload identity (review: quant
+        # combine can be value-sensitive). "normal" today; folds into comparison_key.
+        "activation_profile": args.activation_profile,
+        # Combine contract, SEPARATE from dispatch. Today bf16/none for every backend regardless
+        # of dispatch_dtype; a quant-combine backend (PR311) reports its actuals via attrs. In
+        # shape so it folds into comparison_key — a quant-combine run is never compared to a bf16 one.
+        "quant": {
+            "combine_input_dtype": getattr(backend, "combine_input_dtype", args.combine_dtype),
+            "combine_accum_dtype": getattr(backend, "combine_accum_dtype", "fp32"),
+            "combine_output_dtype": getattr(backend, "combine_output_dtype", "bf16"),
+            "combine_quant_mode": getattr(backend, "combine_quant_mode", args.combine_quant_mode),
+            "scale_layout": getattr(backend, "scale_layout", None),
+        },
+    }
+    meta = {
+        "op": "ep-dispatch-combine", "backend": backend.name, "mode": args.mode,
+        "phase": args.phase, "world_size": world_size, "ep_size": ep_size,
+        "resource_mode": args.resource_mode,
+        "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+        "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+        # honest contract name (was the misleading "comm-only-v1": dispatch INCLUDES layout
+        # under layout-and-dispatch-v1). Adapters declare which they conform to.
+        "measurement_contract": args.measurement_contract, "shape": shape,
+        # structured placement metadata (goal P2 topology) — replaces the bare topology string.
+        "placement": {
+            "kind": args.placement, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+            "gpus_per_node": args.gpus_per_node or ep_size,
+            "scale_up_domain": args.scale_up_domain or ((args.gpus_per_node or ep_size) * 1),
+            "ranks": ep_size, "transport": args.transport,
+        },
+    }
+    headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        # Multi-dimensional validity + machine-derived publication status (goal P1). `status`
+        # is a back-compat alias (legacy v3 readers) — publication_status is authoritative.
+        "validity": validity,
+        "publication_status": publication_status,
+        "status": "valid" if all_ok else "invalid",
+        "workload": {
+            "source": validity["workload_source"],
+            "workload_id": getattr(args, "workload_id", None),
+            "manifest_checksums": getattr(args, "workload_checksums", None),
+            "trace_signature": f"{trace_sig:015x}",
+            "distinct_per_T_hashes": sorted(routing_hashes),
+            # within-run (cross-rank) identity is PROVEN here; cross-hardware identity holds
+            # only if another run records the SAME trace_signature / workload_id.
+            "cross_rank_consistent": routing_consistent,
+            # value-distribution identity of the expert inputs (scaffold; see activation_identity above).
+            "activation_profile": args.activation_profile,
+            "activation_identity": activation_identity,
+        },
+        "comparison_key": comparison_key(meta),
+        "x_axis": {"primary": "tokens_per_rank",
+                   "global_relation": "global_tokens = tokens_per_rank * ep_size"},
+        "backend_provenance": backend.backend_provenance,
+        # backend-independent resource vocabulary + conformance class (goal P3).
+        "resource_profile": _resource_profile(backend.backend_provenance, args),
+        "reproduction": {
+            "command": getattr(args, "reproduction_command", ""),
+            "image": getattr(args, "image", "") or None,
+            "image_digest": getattr(args, "image_digest", "") or None,
+            "image_arch": getattr(args, "image_arch", None),
+            "squash_sha256": getattr(args, "squash_sha256", None),
+            "git_run": getattr(args, "git_run", None),   # repo/run/attempt/ref/sha/job/artifact
+            # redaction (goal P1): command + provenance carry NO hostnames/IPs/UUIDs/private paths;
+            # per-node env (hostnames, GPU UUIDs, NIC GUIDs) lives in the separate gitignored
+            # env_json (CI uploads it as a workflow artifact), never inlined into this record.
+            "redaction": "no hostnames/IPs/UUIDs/private-paths in command or provenance",
+            "seed": args.seed, "warmup": args.warmup, "iters": args.iters,
+            "trials": max(1, args.trials), "samples_per_point": (max(1, args.trials) * args.iters),
+            "measurement_contract": args.measurement_contract,
+            "dispatch_dtype": args.dispatch_dtype, "mode": args.mode,
+            "combine_dtype": args.combine_dtype, "combine_quant_mode": args.combine_quant_mode,
+            "activation_profile": args.activation_profile,
+            "routing_step": routing_step, "uneven_tokens": uneven,
+            "waive_anomaly": waived,
+            "roundtrip_anomaly_threshold": thr_rt,
+            # whether (de)quantization is inside the timed window. fp8_quant_in_timing kept as a
+            # back-compat alias (dispatch-side fp8); combine_* are the quant-combine generalization
+            # (None today — no quant combine is wired). A backend sets these when it quantizes.
+            "fp8_quant_in_timing": getattr(backend, "fp8_in_timing", None),
+            "combine_quant_in_timing": getattr(backend, "combine_quant_in_timing", None),
+            "combine_dequant_in_timing": getattr(backend, "combine_dequant_in_timing", None),
+        },
+        **meta,
+        "correctness": {"passed": all_ok,
+                        "max_rel_error": max((r["max_rel_error"] for r in rows), default=None),
+                        "tolerance": getattr(backend, "tolerance", 5e-2), "points": len(rows),
+                        # honest scope: round-trip reconstruction + non-silent recv, NOT a full
+                        # per-token routing/ordering/weight/padding proof (review #3).
+                        "scope": "roundtrip-reconstruction-smoke-v1"},
+        "routing_identity": {   # cryptographic workload-identity proof (review #3)
+            "consistent_across_ranks": routing_consistent,
+            "trace_signature": f"{trace_sig:015x}",
+            "distinct_per_T_hashes": sorted(routing_hashes),
+        },
+        # EPLB plan + the per-rank load imbalance it removes (the headline of the zipf+EPLB
+        # comparison). enabled=False when the run did not apply EPLB.
+        # EPLB mapping IDENTITY (goal P2): logical/physical counts + a hash of the replica
+        # placement (phys2log/rank_of_phys/replicas). Two EPLB runs are only an official comparison
+        # if their mapping_hash matches (cohort.py enforces); zipf vs zipf+eplb is a RECOVERY
+        # experiment, not the same raw workload.
+        "eplb": ({"enabled": True, "num_logical_experts": num_logical,
+                  "num_physical_experts": args.experts,
+                  "num_redundant": args.experts - num_logical,
+                  "imbalance_before": eplb_plan["imbalance_before"],
+                  "imbalance_after": eplb_plan["imbalance_after"],
+                  "replicated_experts": eplb_plan["replicated_experts"],
+                  "max_replicas": eplb_plan["max_replicas"],
+                  "mapping_hash": eplb_mapping_hash}
+                 if eplb_plan else {"enabled": False}),
+        "routing_profile": {
+            "routing": args.routing,
+            "fanout_mean": sum(r["fanout_mean"] for r in rows) / len(rows),
+            "fanout_max": max(r["fanout_max"] for r in rows),
+            "headline_hash": headline["routing_hash"],
+        },
+        "metrics": {   # p99 is the headline percentile (review #3); p50/p90/p95 also kept per row
+            "headline_tokens_per_rank": headline["tokens_per_rank"],
+            "headline_percentile": "p99",
+            "dispatch_us_p50": headline["dispatch_us_p50"], "dispatch_us_p99": headline["dispatch_us_p99"],
+            "combine_us_p50": headline["combine_us_p50"], "combine_us_p99": headline["combine_us_p99"],
+            "roundtrip_us_p50": headline["roundtrip_us_p50"], "roundtrip_us_p99": headline["roundtrip_us_p99"],
+            "isolated_sum_us_p50": headline["isolated_sum_us_p50"], "isolated_sum_us_p99": headline["isolated_sum_us_p99"],
+            "isolated_sum_label": "sum of isolated dispatch+combine percentiles — NOT a measured chained op",
+            "roundtrip_tokens_per_second": headline["roundtrip_tokens_per_second"],
+        },
+        # phase semantics (goal P2): decode/prefill are regimes with distinct serving meaning, not
+        # just ladder aliases — a point is one MoE layer / one step / one collective.
+        "phase_profile": PHASE_PROFILE.get(args.phase, {"regime": args.phase}),
+        # source-token allocation across ranks (goal P2 uneven allocation). 'none' = even.
+        "source_allocation": {
+            "mode": uneven, "routing_step": routing_step,
+            "note": ("even — every rank gets the ladder T (global = T*ep_size)" if uneven == "none"
+                     else "uneven — per-rank source-token counts vary; see rows[].source_token_stats "
+                          "(global may not divide ep_size; empty-source-rank possible)"),
+        },
+        # contract-level timing anomalies (goal P1) — aggregate of the per-row flags; demotes
+        # publication_status to diagnostic unless --waive-anomaly (validity.anomaly_free).
+        "anomalies": all_anomalies,
+        "anomaly_summary": {"count": len(all_anomalies), "waived": waived,
+                            "types": sorted({a["type"] for a in all_anomalies})},
+        "rows": rows, "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"{backend.name} ep-dispatch-combine [{args.phase}/{args.mode}/{args.measurement_contract}]: "
+          f"status={doc['status']} {len(rows)} pts, routing_consistent={routing_consistent}, "
+          f"headline T={headline['tokens_per_rank']} disp_p99={headline['dispatch_us_p99']:.1f}us "
+          f"-> {args.out}")
+    return 0 if all_ok else 1
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 000000000..eef42ee6d
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — MoRI (AMD ROCm), normal mode.
+
+The harness owns the deterministic shared routing trace and the comm-only timing;
+this file owns MoRI's API and the ionic_rdma-fabric constraints found on MI355X
+(validated on-node, see CONTAINERS.md): the whole symmetric heap is one RDMA MR
+capped at ~4 GiB (hold at 2 GiB; bound buffers via max_num_inp_token_per_rank ⇒
+buffer_cap); combine() resets recv_num (read it before combine; compare only the
+first T rows); and the post-shmem_finalize teardown asserts (finalize hard-exits).
+
+`make_problem` now materializes the harness-provided rank slice, so MoRI honors the
+requested routing (it is no longer always-uniform) and runs the identical workload to
+the NVIDIA SKUs. combine_needs_redispatch=True: combine consumes recv_num, so the
+harness re-dispatches (untimed) before each timed combine sample.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+# MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init — set BEFORE
+# `import mori`. 2 GiB registers on the MI355X ionic_rdma NICs; larger fails.
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
+                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+import torch
+import torch.distributed as dist
+
+try:
+    import mori  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the AMD MoRI image
+    print("ERROR: mori import failed — needs the AMD MoRI image "
+          f"(rocm/sgl-dev:...-mori-...). {exc!r}", file=sys.stderr)
+    raise
+
+# e4m3fnuz (the ROCm-native fp8) finite max. "fnuz" = finite-only (no Inf), a single NaN encoding,
+# unsigned zero (no -0); it saturates at 240.0 — the blockwise quantizer maps each block's amax to this.
+_FP8_FNUZ_MAX = 240.0
+_FP8_BLOCK = 128  # MoRI/DeepSeek blockwise fp8: one scale per 128-elem hidden block (7168%128==0)
+
+
+def _mori_quant_introspect():
+    """Describe MoRI's quant API (enum members + ctor/dispatch signatures + quant/scale helpers).
+
+    How EpDispatchCombineConfig.quant_type (PR311: QuantType::Fp8BlockwiseQuant) is EXPOSED to Python
+    (enum attr vs string vs int) differs by build; __init__ prints this dict to stderr so a GHA log
+    documents MoRI's exact surface without interactive SSH. Diagnostic only: member reads are guarded."""
+    import inspect
+    info = {}
+    ops = getattr(mori, "ops", None)
+    try:
+        info["config_sig"] = str(inspect.signature(mori.ops.EpDispatchCombineConfig.__init__))
+    except Exception as e:
+        info["config_sig"] = f"<err {e!r}>"
+    for meth in ("dispatch", "combine"):
+        try:
+            info[f"{meth}_sig"] = str(inspect.signature(getattr(mori.ops.EpDispatchCombineOp, meth)))
+        except Exception as e:
+            info[f"{meth}_sig"] = f"<err {e!r}>"
+    # Any enum / helper whose name mentions quant or scale (the QuantType enum + any quantize fn).
+    surface = {}
+    for nm in (dir(ops) if ops else []):
+        if nm.startswith("_") or not ("quant" in nm.lower() or "scale" in nm.lower()):
+            continue
+        obj = getattr(ops, nm)
+        members = {}
+        for m in (a for a in dir(obj) if not a.startswith("_")):
+            try:
+                attr = getattr(obj, m)   # read ONCE: a raising descriptor must not abort the dump
+            except Exception as e:
+                members[m] = f"<err {e!r}>"
+                continue
+            try:
+                members[m] = int(attr)   # enum members / int constants -> their numeric value
+            except Exception:
+                members[m] = type(attr).__name__   # anything else -> just its type name
+        surface[nm] = members or str(type(obj).__name__)
+    info["quant_surface"] = surface
+    return info
+
+
+def _mori_quant_type_validator():
+    """Return MoRI's own quant_type normalizer (mori.ops.dispatch_combine._normalize_quant_type), or
+    None when this build does not expose it. Calling the normalizer on a candidate raises for an
+    invalid value, which is a CHEAP check (no 2 GiB heap alloc). The config ctor stores any string
+    and only the OP normalizes it, so a config-only probe cannot tell a valid mode from a bad one."""
+    try:
+        from mori.ops.dispatch_combine import _normalize_quant_type as normalize  # type: ignore
+    except Exception:   # module or symbol missing in this build -> caller falls back to a probe
+        normalize = None
+    return normalize
+
+
+def _fp8_quant_type_candidates():
+    """Build the ordered list of (value, label) guesses for MoRI's fp8 quant_type. The string
+    'fp8_direct_cast' comes first: it is the mode validated on the mori-0227-2 image (GHA
+    self-introspection found the valid set is ['none','fp8_direct_cast']; 'fp8_blockwise' is in the
+    python map but THIS build's _normalize_quant_type rejects it). Typed enum members follow, then
+    string fallbacks — __init__ keeps the first one MoRI's _normalize_quant_type accepts."""
+    quant_enum = getattr(mori.ops, "EpDispatchCombineQuantType", None)
+    # typed enum members this build actually exposes (none when the enum itself is absent)
+    typed = [
+        (getattr(quant_enum, member), f"EpDispatchCombineQuantType.{member}")
+        for member in ("Fp8DirectCast", "Fp8BlockwiseQuant")
+        if quant_enum is not None and hasattr(quant_enum, member)
+    ]
+    # last-resort spellings, tried only after the validated string and the typed members
+    fallbacks = [(name, f"str:{name}") for name in ("fp8", "Fp8", "fp8_blockwise")]
+    return [("fp8_direct_cast", "str:fp8_direct_cast"), *typed, *fallbacks]
+
+
+def _quant_blockwise_fp8_fnuz(x, block=_FP8_BLOCK):
+    """Blockwise e4m3fnuz quantizer: [T,H] input -> (e4m3fnuz [T,H], f32 scales [T,H//block]).
+    Every `block`-wide slice of the hidden dim is scaled so its amax lands on the fnuz finite max
+    (caller-side quantization; multiply by the returned scales to dequantize)."""
+    rows, hidden = x.shape
+    assert hidden % block == 0, f"hidden {hidden} not a multiple of fp8 block {block}"
+    blocks = x.float().view(rows, hidden // block, block)
+    # f32 dequant scale per block [T, nb]; amax floored at 1e-8 so an all-zero block never divides by 0
+    scale = blocks.abs().amax(dim=2).clamp_min(1e-8) / _FP8_FNUZ_MAX
+    scaled = (blocks / scale.unsqueeze(2)).clamp(-_FP8_FNUZ_MAX, _FP8_FNUZ_MAX)
+    quantized = scaled.to(torch.float8_e4m3fnuz)
+    return quantized.view(rows, hidden), scale
+
+
+def _dequant_blockwise_fp8_fnuz(xq, scale, block=_FP8_BLOCK):
+    """Undo _quant_blockwise_fp8_fnuz: e4m3fnuz [T,H] * f32 scales [T,H//block] -> f32 [T,H]."""
+    rows, hidden = xq.shape
+    blocks = xq.float().view(rows, hidden // block, block)
+    return (blocks * scale.unsqueeze(2)).view(rows, hidden)
+
+
+class MoRIBackend:
+    name = "mori"
+    combine_needs_redispatch = True
+    # MoRI wedges on a COLD dispatch jumping straight to a large T (validated on
+    # MI355X); the harness ramps this backend's ladder geometrically from 1.
+    needs_gradual_ramp = True
+    # MoRI WEDGES under a sustained warm-up burst (the harness's Blackwell clock-ramp)
+    # and is already steady at a short warm-up (~44us, reproducible) — so it opts out.
+    wants_warm_burst = False
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). DISPATCH precision and the SEPARATE combine path are distinct axes
+    # (review: dispatch_dtype=fp8 must NOT imply quantized combine). bf16 is the default; fp8 routes
+    # the AMD-native DIRECT-CAST path (quant_type=fp8_direct_cast — the only fp8 mode this MoRI build
+    # accepts; GHA introspection found the valid set is ['none','fp8_direct_cast']): the kernel casts
+    # bf16<->e4m3fnuz internally for transport (scale_dim=0, no caller scales) and returns the recv
+    # buffer as bf16 again. The combine OUTPUT stays bf16 so SUPPORTED_COMBINE_DTYPES is unchanged.
+    # Keep in sync with capability.py CAP["mori"].
+    SUPPORTED_DISPATCH_DTYPES = {"bf16", "fp8"}  # fp8 = e4m3fnuz direct-cast (FNUZ dispatch variant)
+    SUPPORTED_COMBINE_DTYPES = {"bf16"}         # + "fp8" once the PR311 quant combine OUTPUT lands
+    SUPPORTED_COMBINE_QUANT_MODES = {"none"}    # + the PR311 mode id once validated
+    SUPPORTED_PRECISIONS = SUPPORTED_DISPATCH_DTYPES  # back-compat alias (run_ep.py / older refs)
+    SUPPORTED_MODES = {"normal"}           # MoRI has no separate low-latency entrypoint
+    # MoRI computes its routing layout INSIDE the dispatch kernel (block_num/warps launch);
+    # it cannot be hoisted, so MoRI honors only the layout-and-dispatch contract. Cross-
+    # vendor comparisons must therefore use layout-and-dispatch-v1 (the common contract).
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        assert (args.dispatch_dtype in self.SUPPORTED_DISPATCH_DTYPES
+                and args.mode in self.SUPPORTED_MODES
+                and getattr(args, "combine_dtype", "bf16") in self.SUPPORTED_COMBINE_DTYPES
+                and getattr(args, "combine_quant_mode", "none") in self.SUPPORTED_COMBINE_QUANT_MODES), \
+            "run_ep.py must reject unsupported dispatch/mode/combine before constructing the backend"
+        self.fp8_in_timing = None  # set when fp8 dispatch is used (whether the cast is timed)
+        # Combine-path quant timing (None today — no quant combine wired). PR311 sets these +
+        # the combine_* dtype attrs ep_harness reads via getattr; until then ep_harness records
+        # combine bf16 / none from the args defaults.
+        self.combine_quant_in_timing = None
+        self.combine_dequant_in_timing = None
+        self.ep_size = world_size
+        self.experts_per_rank = args.experts // self.ep_size
+        dev_cus = torch.cuda.get_device_properties(device).multi_processor_count
+        # Resource regime — map the comm budget onto CUs to mirror DeepEP's SM fraction.
+        #   normalized: block_num ≈ sm_fraction · CUs (≈ the same device fraction);
+        #   tuned: MoRI launch auto-tuning (API not present in this build — uses default,
+        #          labeled tuned_source); default: the 80-block bring-up budget.
+        # MoRI DEADLOCKS at T>=32 when block_num is reduced toward the normalized target
+        # (validated on MI355X g15: block_num=46 wedges, 80 completes T=32/64 with the
+        # realistic fan-out≈5.3 trace). So MoRI cannot be normalized down to DeepEP's
+        # device fraction; floor it at a known-functional minimum and record that the
+        # target fraction was NOT reached.
+        rm = args.resource_mode
+        floor = int(os.environ.get("CX_MORI_MIN_BLOCKS", "80"))  # functional minimum (deadlocks lower)
+        env_blocks = os.environ.get("CX_MORI_BLOCK_NUM")
+        self._block_floored = False
+        if env_blocks:
+            self.block_num = int(env_blocks)
+            self._block_target = self.block_num
+        elif rm == "normalized":
+            self._block_target = max(1, round(args.sm_fraction * dev_cus))
+            self.block_num = max(floor, self._block_target)
+            self._block_floored = self.block_num > self._block_target
+        else:  # tuned (no launch auto-tune API in mori-0227-2) / default
+            self.block_num = 80
+            self._block_target = 80
+        self._tuned_source = ("default-80" if rm == "tuned" else
+                              ("normalized-floored" if self._block_floored else "n/a"))
+        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
+        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))
+
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        self._cap = self.buffer_cap(args)
+        # Dispatch precision: bf16 (quant_type="none") or fp8 (e4m3fnuz DIRECT-CAST — the FNUZ
+        # variant). MoRI's only fp8 mode on this image is `fp8_direct_cast` (GHA self-introspection
+        # found the valid set is ['none','fp8_direct_cast']): the dispatch kernel direct-casts the
+        # bf16 input to e4m3fnuz for transport and returns the recv buffer as input.dtype (bf16) again
+        # — so NO caller scales (scale_dim=0; scale_dim>0 is only for caller FP4 dispatch scales). We
+        # DUMP MoRI's quant API to stderr (self-documenting GHA log — SSH to the cluster stalls) and
+        # pick the first quant_type MoRI's own _normalize_quant_type accepts (cheap; no heap alloc).
+        self._fp8 = (args.dispatch_dtype == "fp8")
+        self._quant_label = "none"
+        scale_dim = 0
+        quant_type = "none"
+        if self._fp8:
+            import json as _json
+            print("MORI_QUANT_API " + _json.dumps(_mori_quant_introspect()), file=sys.stderr, flush=True)
+            validator = _mori_quant_type_validator()
+            cands = _fp8_quant_type_candidates()
+            print(f"MORI_FP8_CANDIDATES {[l for _, l in cands]}", file=sys.stderr, flush=True)
+            for val, label in cands:
+                try:
+                    if validator is not None:
+                        validator(val)   # raises ValueError on an invalid value (no heap alloc)
+                    else:
+                        mori.ops.EpDispatchCombineConfig(   # fallback: config-construct probe
+                            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+                            hidden_dim=args.hidden, scale_dim=0,
+                            scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+                            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+                            max_num_inp_token_per_rank=max(512, self._cap),
+                            num_experts_per_rank=self.experts_per_rank,
+                            num_experts_per_token=args.topk,
+                            use_external_inp_buf=False, quant_type=val)
+                    quant_type, self._quant_label = val, label
+                    break
+                except Exception as e:
+                    print(f"MORI_FP8_REJECT {label}: {e!r}", file=sys.stderr, flush=True)
+            if quant_type == "none":
+                raise RuntimeError("no MoRI quant_type candidate accepted for fp8 — see "
+                                   "MORI_QUANT_API above for this build's actual quant surface")
+            print(f"MORI_FP8_QUANT_TYPE {self._quant_label}", file=sys.stderr, flush=True)
+            self.fp8_in_timing = True  # the e4m3fnuz direct-cast is internal to dispatch (in timing)
+        # scale_dim==0 in both bf16 and fp8-direct-cast paths -> the 1-byte sentinel element size.
+        _scale_elt = torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size()
+        # zero-copy mode = NOT use_external_inp_buf. MoRI ASSERTS "Fp8DirectCast is not supported in
+        # zero-copy mode" (dispatch_combine.cpp:454, evidenced on MI355X run 28318485335), and the
+        # source also gates Fp8BlockwiseQuant on --zero-copy 0. So fp8 MUST use the external-input-buf
+        # (non-zero-copy) path; the dispatch copies the input to its staging buffer internally
+        # (EpDispatchCopyToStaging). bf16 keeps the validated zero-copy path (use_external_inp_buf=False).
+        _use_ext_inp_buf = bool(self._fp8)
+        self.config = mori.ops.EpDispatchCombineConfig(
+            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+            hidden_dim=args.hidden, scale_dim=scale_dim,
+            scale_type_size=_scale_elt,
+            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+            max_num_inp_token_per_rank=max(512, self._cap),
+            num_experts_per_rank=self.experts_per_rank,
+            num_experts_per_token=args.topk,
+            use_external_inp_buf=_use_ext_inp_buf, quant_type=quant_type,
+        )
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+        # fp8 direct-cast carries e4m3fnuz rounding error -> loosen the correctness gate to the fp8
+        # class (the harness reads backend.tolerance; bf16 default 5e-2). The combine reduces the
+        # bf16 recv payload per rank, compared against x*unique_ranks within this tolerance class.
+        if self._fp8:
+            self.tolerance = 1.5e-1
+        # Provenance: MoRI has no pip version; pin via MORI_COMMIT, else the image tag
+        # the launcher exported (COLLECTIVEX_IMAGE carries the mori build tag), so the
+        # provenance gate has something real rather than "unknown".
+        img = os.environ.get("COLLECTIVEX_IMAGE", "")
+        mori_commit = os.environ.get("MORI_COMMIT") or (f"image:{img}" if img else "unknown")
+        self.backend_provenance = {
+            "mori_commit": mori_commit,
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "resource_mode": args.resource_mode, "block_num": self.block_num,
+            "block_num_target": self._block_target, "block_num_floored": self._block_floored,
+            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
+            "device_cus": dev_cus, "sm_fraction": (self.block_num / dev_cus),
+            "tuned_source": self._tuned_source,
+            "dispatch_dtype": args.dispatch_dtype,
+            "quant_type": self._quant_label,
+            "fp8_format": ("e4m3fnuz" if self._fp8 else None),
+            "fp8_mode": ("direct_cast" if self._fp8 else None),  # internal cast, scale_dim=0, no blocks
+        }
+
+    def buffer_cap(self, args):
+        # Largest tokens/rank the 2 GiB registerable heap holds at hidden=7168 (512,
+        # validated on-node). Override via CX_MORI_MAX_TOKENS.
+        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))
+
+    def make_problem(self, T, idx, weights, x):
+        # Wrap one shared-trace slice for MoRI: expert ids idx[T,topk] cast to int32 (what MoRI
+        # expects), weights[T,topk] cast to f32, x[T,hidden] passed through untouched. scale_dim==0
+        # on both the bf16 and fp8-direct-cast paths, so `scales` is only a (T,0) e4m3fnuz placeholder
+        # that dispatch ignores; real caller scales exist only for FP4 dispatch.
+        no_scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=self.device)
+        expert_ids, gate_w = idx.to(torch.int32), weights.to(torch.float32)
+        return types.SimpleNamespace(T=T, x=x, indices=expert_ids,
+                                     weights=gate_w, scales=no_scales)
+
+    def dispatch(self, p):
+        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
+            p.x, p.weights, p.scales, p.indices,
+            block_num=self.block_num, warp_per_block=self.dispatch_warps)
+        total_recv = int(recv_num[0].item())  # read BEFORE combine (combine resets recv_num)
+        # MoRI returns the recv buffer as input.dtype (bf16) for BOTH "none" and "fp8_direct_cast"
+        # (the e4m3fnuz cast is internal to the transport, dequantized back to bf16 on recv) -> a
+        # plain .to(bf16) is the combine input. fp8's e4m3 rounding shows up in the correctness gate
+        # against the looser fp8 tolerance class set in __init__.
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices, total_recv=total_recv,
+            combine_input=dispatch_output.to(torch.bfloat16))
+
+    def stage(self, p, h):
+        # comm-only contract: stage the "expert outputs" into MoRI's registered
+        # combine-input buffer UNTIMED (in a real MoE the expert FFN writes here).
+        buf = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1))
+        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])
+
+    def combine(self, p, h):
+        combined, _w = self.op.combine(
+            h.combine_input, h.dispatch_weights, h.dispatch_indices,
+            block_num=self.block_num, warp_per_block=self.combine_warps)
+        return combined
+
+    def expected(self, p, h):
+        # Reference for the correctness gate; returns (expected [T,hidden] f32, p.T). MoRI combine sums
+        # one copy per destination RANK ⇒ combined[i] ≈ ref[i] * (#unique dest ranks among its topk).
+        pes = p.indices.long() // self.experts_per_rank
+        unique_pes = torch.tensor(  # ONE host copy via pes.tolist(), not a device sync per token row
+            [len(set(row)) for row in pes.tolist()], device=self.device, dtype=torch.float32
+        ).unsqueeze(1)
+        ref = p.x.float()
+        if self._fp8:
+            # fp8_direct_cast transports e4m3fnuz, so gate against the SAME direct-cast reference
+            # (consistency — like the flashinfer mxfp8/nvfp4 paths): combined = reduce(e4m3fnuz(x)),
+            # ref = e4m3fnuz(x)*ranks, so the e4m3 rounding CANCELS. A bf16 reference instead carries
+            # the full e4m3 error into relErr, which spuriously fails the per-rank gate at T=1 (the
+            # relErr denominator there is a single token's magnitude — a near-zero token inflates it).
+            ref = p.x.to(torch.float8_e4m3fnuz).float()
+        return ref * unique_pes, p.T
+
+    def recv_tokens(self, h):
+        return int(h.total_recv)
+
+    def finalize(self, rc):
+        # MoRI's shmem teardown asserts after shmem_finalize(); results are already
+        # written, so sync and hard-exit past it.
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)
diff --git a/experimental/CollectiveX/tests/ep_nccl.py b/experimental/CollectiveX/tests/ep_nccl.py
new file mode 100644
index 000000000..f341100e7
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_nccl.py
@@ -0,0 +1,130 @@
+"""CollectiveX — NCCL all-to-all expert-parallel backend (cross-node EP, goal 182).
+
+The canonical "token-shuffle" EP built on torch.distributed's NCCL ``all_to_all_single``: dispatch
+routes each token-copy to the rank that owns its expert via an uneven all-to-all; combine reverses it
+and weighted-sums the top-k copies back into each origin token. With no expert compute the round trip
+reconstructs ``x * sum(topk_weights)`` per token.
+
+Why this exists alongside DeepEP/UCCL/MoRI: those use custom one-sided RDMA (DeepEP/NVSHMEM, UCCL's own
+ibv verbs, MoRI ionic_rdma). Cross-node, UCCL's ``ibv_reg_mr`` failed with EINVAL -> heap corruption ->
+SIGSEGV (run 28326528672) because the cluster's IB HCAs / container lack the GPUDirect-RDMA peer-memory
+that custom verbs registration needs. NCCL's collective transport, by contrast, negotiates IB and
+*gracefully host-stages* when GPUDirect RDMA is unavailable — so an EP built purely on NCCL collectives
+runs cross-node on the same fabric. It is also the reference baseline the fused EP kernels improve upon,
+so a same-shape NCCL number is a meaningful comparison point, not just a fallback.
+
+Scope: BF16, normal mode, layout-and-dispatch-v1 (the timed window includes the layout/argsort + both
+all-to-alls). RCCL exposes the identical API, so this backend also covers AMD (rccl) cross-node EP.
+"""
+import os
+import types
+
+import torch
+import torch.distributed as dist
+
+
+class NCCLBackend:
+    name = "nccl-ep"
+    combine_needs_redispatch = False   # dispatch saves the permutation + splits; combine reuses them
+    wants_warm_burst = False
+    # Pure-collective token shuffle: bf16 only (no fp8 dispatch path), normal mode, single contract.
+    SUPPORTED_PRECISIONS = {"bf16"}
+    SUPPORTED_MODES = {"normal"}
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.experts = args.experts
+        assert args.experts % world_size == 0, \
+            f"NCCL EP needs experts({args.experts}) divisible by world_size({world_size})"
+        self.experts_per_rank = args.experts // world_size
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            f"NCCL EP supports precisions={sorted(self.SUPPORTED_PRECISIONS)} modes={sorted(self.SUPPORTED_MODES)} only"
+        self.tolerance = 5e-2   # bf16 round-trip
+        try:
+            _nccl = ".".join(str(v) for v in torch.cuda.nccl.version())
+        except Exception:
+            _nccl = "unknown"
+        self.backend_provenance = {
+            "backend": "nccl-all2all",
+            "nccl_version": _nccl,
+            "transport": "nccl-all_to_all_single",
+            "resource_mode": args.resource_mode,
+            "num_sms": None,
+            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
+            "tuned_source": "nccl-collective",
+        }
+
+    def buffer_cap(self, args):
+        return None   # no fixed pre-allocated buffer; all-to-all sizes itself per step
+
+    def make_problem(self, T, idx, weights, x):
+        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared routing-trace slice.
+        return types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                     topk_weights=weights.to(torch.float32), layout=None)
+
+    def dispatch(self, p):
+        ws = self.world_size
+        x = p.x                                   # [T, H] bf16
+        idx = p.topk_idx                          # [T, topk]
+        T, H = int(x.shape[0]), int(x.shape[1])
+        topk = int(idx.shape[1])
+        dev = x.device
+        # Flatten the T*topk token-copies; each goes to the rank owning its expert.
+        flat_expert = idx.reshape(-1)                                       # [T*topk]
+        flat_dest = (flat_expert // self.experts_per_rank).to(torch.int64)  # dest rank per copy
+        flat_token = torch.arange(T, device=dev, dtype=torch.int64).repeat_interleave(topk)
+        # Group copies by destination rank (stable -> deterministic, invertible permutation).
+        order = torch.argsort(flat_dest, stable=True)
+        send_counts = torch.bincount(flat_dest, minlength=ws)               # [ws]
+        send_x = x.index_select(0, flat_token.index_select(0, order)).contiguous()  # [T*topk, H], send order
+        # Exchange per-rank counts so every rank can size its receive buffer.
+        recv_counts = torch.empty_like(send_counts)
+        dist.all_to_all_single(recv_counts, send_counts)
+        sc = send_counts.tolist()
+        rc = recv_counts.tolist()
+        total_recv = int(sum(rc))
+        recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
+        # The dispatch all-to-all (uneven splits). NCCL routes internode over IB (host-staged if no
+        # GPUDirect RDMA) — this is the line that runs cross-node where UCCL's ibv_reg_mr fails.
+        dist.all_to_all_single(recv_x, send_x, rc, sc)
+        return types.SimpleNamespace(recv_x=recv_x, combine_input=None, order=order,
+                                     flat_token=flat_token, flat_w=p.topk_weights.reshape(-1),
+                                     send_counts=sc, recv_counts=rc, T=T, H=H, total_recv=total_recv)
+
+    def stage(self, p, h):
+        # No expert compute: the expert "output" is the received tokens as-is (the round-trip identity).
+        h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):
+        # Reverse all-to-all: expert outputs travel back to their origin ranks (split lists swapped).
+        expert_out = h.combine_input
+        returned = expert_out.new_empty((int(h.order.shape[0]), h.H))
+        dist.all_to_all_single(returned, expert_out.contiguous(), h.send_counts, h.recv_counts)
+        # `returned` is in send (dest-sorted) order; scatter through the saved argsort to undo it.
+        copies = torch.empty_like(returned).index_copy_(0, h.order, returned)
+        # Weighted top-k reduction of each token's copies into [T, H], accumulated in fp32.
+        weighted = copies.float() * h.flat_w.unsqueeze(1)
+        out = torch.zeros((h.T, h.H), dtype=torch.float32, device=returned.device)
+        out.index_add_(0, h.flat_token, weighted)
+        return out.to(p.x.dtype)
+
+    def recv_tokens(self, h):
+        return int(h.total_recv)
+
+    def expected(self, p, h):
+        # Identity expert => the round trip returns each token scaled by its summed top-k gate
+        # weights: out[t] = sum_k w[t,k] * x[t]. Returns (that f32 reference, p.T).
+        return p.x.float() * p.topk_weights.sum(dim=1, keepdim=True).float(), p.T
+
+    def finalize(self, rc):
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_uccl.py b/experimental/CollectiveX/tests/ep_uccl.py
new file mode 100644
index 000000000..f13a77051
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_uccl.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — UCCL EP (NVIDIA), normal + LL modes. PRODUCING RESULTS:
+cx_build_uccl vendors UCCL's deep_ep_wrapper as `uccl_deepep` (its Buffer takes a torch
+ProcessGroup), so this adapter runs GENUINE uccl.ep dispatch/combine (uccl_version 0.1.1,
+intranode NVLink) — validated on h100/h200/b300/b200. See docs/gated.md "UCCL EP".
+
+IMPORTANT (empirically established on H100 via GHA): the LOW-LEVEL `uccl.ep.Buffer` is
+NOT a drop-in DeepEP clone. Its constructor is
+  Buffer(rank, num_ranks, num_nvl_bytes=0, num_rdma_bytes=0, low_latency_mode=False, …)
+— it takes rank/num_ranks ints, NOT a torch ProcessGroup, so with that low-level class the
+`Buffer(self.group, …)` calls below raise `TypeError: incompatible function arguments`. The DeepEP-identical
+`Buffer(group, …)` API is UCCL's separate ~1900-line `deep_ep_wrapper` package (packaged
+as `deep_ep`, colliding with the container's real DeepEP), whose __init__ runs a proxy +
+IPC-handle-exchange + runtime.sync + connect_atomic_buffer bootstrap. To finish UCCL:
+vendor `deep_ep_wrapper` under a non-colliding name (it uses relative imports + only needs
+`uccl.ep`) and import its Buffer here; then this file is a true ep_deepep.py clone. This is
+DONE: cx_build_uccl vendors `deep_ep_wrapper` as `uccl_deepep` and the import below uses it; if
+that wrapper is ever absent the import falls back to the low-level `uccl.ep.Buffer`, which then
+fails loudly (preserved failed-case) — never faked. With the wrapper present, results are genuine.
+
+The harness contract (make_problem/dispatch/stage/combine/expected/buffer_cap/recv_tokens/
+finalize + backend_provenance + SUPPORTED_*) mirrors ep_deepep.py and is correct with the
+wrapper Buffer wired (the preferred import below).
+
+Install (see launchers/run_in_container.sh cx_build_uccl): `pip install uccl` ships a
+prebuilt cp312 wheel; the UCCL EP kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH
+(pip install nvidia-cuda-runtime-cu12, prepend its lib dir) even on a cu13 image.
+
+Correctness (identical to DeepEP's intranode test): a pure dispatch->combine round trip
+with no expert compute reconstructs x only after dividing by the number of ranks each
+token was sent to, so the harness expects combined ~= x * is_token_in_rank.sum(dim=1).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    import uccl  # for version/provenance
+    try:
+        # PREFERRED: vendored deep_ep_wrapper (cx_build_uccl -> uccl_deepep). Buffer(group, ...)
+        # takes a torch ProcessGroup (matches DeepEP + this adapter's calls) + runs UCCL's full
+        # proxy/IPC/runtime.sync bootstrap. Fallback: low-level uccl.ep.Buffer(rank,num_ranks,...).
+        from uccl_deepep import Buffer  # type: ignore
+    except Exception:
+        from uccl.ep import Buffer  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the installed uccl wheel + cu12 runtime
+    print("ERROR: uccl.ep import failed — `pip install uccl nvidia-cuda-runtime-cu12` and "
+          "prepend the cu12 lib dir to LD_LIBRARY_PATH at job setup (cx_build_uccl). "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _uccl_version() -> str:  # installed-distribution version, else uccl.__version__, else "unknown"
+    try:
+        from importlib.metadata import version as _dist_version
+        return _dist_version("uccl")
+    except Exception:
+        return getattr(uccl, "__version__", "unknown")
+
+
+# UCCL's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token
+# block-128 scale — the SAME convention DeepEP's kernels expect (the vendored uccl_deepep
+# Buffer mirrors DeepEP's API): scales [T, H//128] float32, e4m3, 448 = e4m3 max. Both
+# directions of the cast run OUTSIDE the timed window (cast in make_problem, dequant in stage),
+# so fp8 quantization is NOT included in dispatch time (except under runtime-visible-v1).
+_FP8_MAX = 448.0
+_FP8_BLOCK = 128
+
+
+def _per_token_cast_to_fp8(x):
+    """[T, H] (H % 128 == 0) -> (e4m3fn values [T, H], float32 per-block scales [T, H // 128])."""
+    n_tok, hidden = x.shape
+    blocks = x.float().view(n_tok, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    block_amax = blocks.abs().amax(dim=2).clamp(min=1e-4)  # per-block |max|, floored at 1e-4
+    quant = blocks * (_FP8_MAX / block_amax.unsqueeze(2))  # rescale so a block's amax lands on _FP8_MAX
+    return quant.to(torch.float8_e4m3fn).view(n_tok, hidden), (block_amax / _FP8_MAX).contiguous()
+
+
+def _per_block_dequant(x_fp8, scales):
+    """Undo the block-128 cast: e4m3 [R, H] with float32 scales [R, H // 128] -> bf16 [R, H]."""
+    rows, hidden = x_fp8.shape
+    scaled = x_fp8.float().view(rows, hidden // _FP8_BLOCK, _FP8_BLOCK) * scales.unsqueeze(2)
+    return scaled.view(rows, hidden).to(torch.bfloat16)
+
+
+def _per_block_dequant_3d(x_fp8, scales):
+    """LL recv layout variant: e4m3 [E, S, H] with float32 scales [E, S, H // 128] -> bf16 [E, S, H]."""
+    n_exp, n_slot, hidden = x_fp8.shape
+    scaled = x_fp8.float().view(n_exp, n_slot, hidden // _FP8_BLOCK, _FP8_BLOCK) * scales.unsqueeze(-1)
+    return scaled.view(n_exp, n_slot, hidden).to(torch.bfloat16)
+
+
+class UCCLBackend:
+    name = "uccl"
+    combine_needs_redispatch = False  # UCCL combine reuses the handle (DeepEP-clone semantics)
+    # Blackwell (B300) drops GPU clocks during the tiny small-T points, so the harness
+    # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100/H200.
+    wants_warm_burst = True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
+    #   normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink on H200 (EP2).
+    #   ll mode: low_latency_dispatch/combine via allow_nvlink_for_low_latency_mode — validated
+    #   RUNNING intranode over NVLink on H200 (EP2); same DeepEP-clone LL kernel family.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8"}
+    SUPPORTED_MODES = {"normal", "ll"}
+    # Three contracts (mirror DeepEP — UCCL's Buffer is the same API):
+    #   layout-and-dispatch-v1     — times get_dispatch_layout INSIDE dispatch; fp8 cast/dequant
+    #                                OUTSIDE (preprocessing mirrors a producer handing quantized x).
+    #   cached-layout-comm-only-v1 — layout hoisted out (untimed); dispatch = pure comm. normal only.
+    #   runtime-visible-v1         — dispatch INCLUDES the fp8 quant (cast) + layout + comm + the
+    #                                recv-dequant that makes expert input consumable; combine starts
+    #                                from bf16 expert outputs. (normal mode; LL times all of it in-kernel.)
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Record run settings, derive the contract flags + tolerance, then build the mode's Buffer."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.ll = (args.mode == "ll")
+        self.contract = args.measurement_contract
+        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
+        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
+        # runtime-visible-v1 moves the fp8 cast + recv-dequant INSIDE the timed dispatch in normal mode only;
+        # LL already times cast+layout+comm in its single kernel, so the flag is forced off there.
+        self.runtime_visible = (self.contract == "runtime-visible-v1") and not self.ll
+        self.group = dist.group.WORLD  # passed to Buffer(...) by _init_normal / _init_ll
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
+        # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is
+        # recorded in the artifact so the looser fp8 gate is explicit, not hidden.
+        self.fp8 = (args.dispatch_dtype == "fp8")
+        self.tolerance = 1.25e-1 if self.fp8 else 5e-2  # ~2x the fp8 bound / ~10x the bf16 bound above
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _uccl_version()
+        if self.ll:
+            self._init_ll(args, dev_sms, ver)
+        else:
+            self._init_normal(args, rank, dev_sms, ver)
+
+    def _init_normal(self, args, rank, dev_sms, ver):  # normal-mode setup: buffer, SM budget, provenance
+        # fp8 cast: UNTIMED (make_problem) under layout-and-dispatch / cached-layout; TIMED (inside
+        # dispatch) under runtime-visible-v1. So fp8_in_timing tracks the contract honestly.
+        self.fp8_in_timing = (self.runtime_visible if self.fp8 else None)  # None when not fp8
+        self.combine_needs_redispatch = False  # normal combine reuses the handle
+        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points (the shared
+        # T=128 point must match between the decode and prefill sweeps). 4 GiB holds T up to 4096.
+        num_nvl_bytes = int(os.environ.get("CX_UCCL_NVL_BYTES",
+                                           os.environ.get("CX_DEEPEP_NVL_BYTES",
+                                                          str(4 * 1024 * 1024 * 1024))))  # lookup order: UCCL var, DeepEP var, 4 GiB
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0)  # 3rd positional is presumably num_rdma_bytes — TODO confirm
+        rm = args.resource_mode
+        tuned_src = None
+        if rm == "normalized":
+            num_sms = max(1, round(args.sm_fraction * dev_sms))   # ~same device fraction as MoRI
+        elif rm == "tuned":
+            # Best-available for the installed UCCL: its OWN default SM count (Buffer.num_sms —
+            # the library's analytic choice). get_dispatch_config(num_ranks) returns the
+            # recommended Config but doesn't expose num_sms to Python; the default reflects it.
+            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))  # falls back to args.num_sms if the attribute is absent
+            tuned_src = "uccl-default-num_sms"
+        else:  # default — the bring-up budget
+            num_sms = args.num_sms
+        try:
+            Buffer.set_num_sms(num_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            if rank == 0:  # only rank 0 prints the warning
+                print(f"WARN: could not set num_sms={num_sms}: {exc!r}", file=sys.stderr)
+        self.backend_provenance = {  # NOTE(review): records the REQUESTED num_sms even if set_num_sms failed above
+            "uccl_version": ver,
+            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{ver}",
+            "mode": "normal", "resource_mode": rm, "num_sms": num_sms, "device_sms": dev_sms,
+            "sm_fraction": (num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
+            "num_nvl_bytes": num_nvl_bytes,
+        }
+
+    def _init_ll(self, args, dev_sms, ver):  # low-latency setup: fixed-size buffer + provenance
+        # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink via
+        # allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch so for
+        # fp8 the quantization IS inside the timed window (recorded honestly). The buffer is sized
+        # for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical), so LL is a
+        # decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop). set_num_sms
+        # does NOT apply (the LL kernel picks its own occupancy) — provenance records num_sms=None.
+        self.fp8_in_timing = (True if self.fp8 else None)  # None when not fp8
+        self.combine_needs_redispatch = True   # re-dispatch (untimed) before each timed combine
+        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))  # env override; default 128
+        self.experts = args.experts
+        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+            self.num_max, args.hidden, self.world_size, args.experts)
+        # one QP per local expert is the DeepEP/UCCL convention for LL
+        self.num_qps = max(1, args.experts // self.world_size)  # floored at 1 when experts < world_size
+        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,  # 2nd positional (NVL bytes in _init_normal) is 0
+                             num_qps_per_rank=self.num_qps,
+                             allow_nvlink_for_low_latency_mode=True)
+        self.backend_provenance = {
+            "uccl_version": ver,
+            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{ver}",
+            "mode": "ll", "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
+            "num_max_dispatch_tokens_per_rank": self.num_max,
+            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
+            "low_latency_mode": True, "use_fp8": self.fp8,
+        }
+
+    def buffer_cap(self, args):
+        """Sweep cap: LL is sized for a fixed num_max, so cap there (reported, not silent); else None."""
+        return None if not self.ll else self.num_max
+
+    def make_problem(self, T, idx, weights, x):
+        """Bundle one shared trace slice (untimed): idx[T,topk] -> int64, weights[T,topk] -> f32, x[T,hidden] bf16."""
+        prob = types.SimpleNamespace(
+            T=T, x=x, topk_idx=idx.to(torch.int64), topk_weights=weights.to(torch.float32), layout=None)
+        if self.fp8 and not (self.ll or self.runtime_visible):
+            # layout-and-dispatch / cached-layout: per-token block-128 cast done here, UNTIMED
+            # (preprocessing that mirrors a producer handing over already-quantized activations).
+            # runtime-visible casts inside the timed dispatch instead; LL casts in-kernel.
+            prob.x_fp8, prob.x_scales = _per_token_cast_to_fp8(x)
+        if self.cache_layout:
+            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed) so the timed
+            # dispatch is pure comm; other contracts leave layout=None and compute it while timed.
+            per_rank, _, per_expert, in_rank, _ = self.buffer.get_dispatch_layout(
+                prob.topk_idx, self.args.experts)
+            prob.layout = (per_rank, per_expert, in_rank)
+        return prob
+
+    def dispatch(self, p):  # timed dispatch op; LL delegates to _dispatch_ll, the rest is normal mode
+        if self.ll:
+            return self._dispatch_ll(p)
+        if p.layout is not None:                       # cached-layout-comm-only-v1
+            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
+        else:                                          # layout-and-dispatch / runtime-visible (timed layout)
+            (num_tokens_per_rank, _, num_tokens_per_expert,
+             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        ref_fp8 = ref_scales = None  # stay None unless the cast happens here (runtime-visible fp8)
+        if self.fp8:
+            if self.runtime_visible:
+                # runtime-visible: the per-token block-128 cast is INSIDE the timed dispatch.
+                x_fp8, x_scales = _per_token_cast_to_fp8(p.x)
+                ref_fp8, ref_scales = x_fp8, x_scales      # for the correctness reference
+            else:
+                x_fp8, x_scales = p.x_fp8, p.x_scales      # pre-cast (untimed)
+            x_in = (x_fp8, x_scales)  # fp8 payload is handed over as a (values, scales) pair
+        else:
+            x_in = p.x
+        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
+            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert)  # 3 of the 6 returned values are discarded
+        out = types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
+            is_token_in_rank=is_token_in_rank, ref_fp8=ref_fp8, ref_scales=ref_scales)
+        if self.fp8 and self.runtime_visible:
+            # dispatch ENDS when expert input is consumable: dequant fp8 recv -> bf16 INSIDE the
+            # timed window (the contract's "expert input genuinely consumable" boundary). stage()
+            # then no-ops for this contract.
+            recv_fp8, recv_scales = recv_x
+            out.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+            out.rv_staged = True  # flag stage() checks to skip its own dequant
+        return out
+
+    def _dispatch_ll(self, p):
+        # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the cast IS
+        # inside this timed op — fp8_in_timing=True). recv is the expert-major 3D layout
+        # [num_local_experts, num_max*world, hidden] (+scales when fp8).
+        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
+            p.x, p.topk_idx, self.num_max, self.experts,
+            use_fp8=self.fp8, return_recv_hook=False)  # the event/hook results are discarded
+        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)  # read by stage/combine/recv_tokens
+
+    def stage(self, p, h):
+        """Untimed expert-compute boundary: set h.combine_input to what combine() will send back.
+
+        comm-only contract, so the "expert outputs" are just recv_x. fp8 recv is a (values,
+        scales) pair and is dequantized to bf16 here (LL recv is 3D; normal recv is 2D), so
+        combine moves bf16 in both precisions; bf16 recv is staged as-is.
+        """
+        if getattr(h, "rv_staged", False):
+            # runtime-visible already produced bf16 combine_input inside dispatch (timed).
+            return None
+        if not self.fp8:
+            h.combine_input = h.recv_x
+            return None
+        # fp8: pick the dequant helper matching the recv rank (3D for LL, 2D for normal).
+        values, block_scales = h.recv_x
+        dequant = _per_block_dequant_3d if self.ll else _per_block_dequant
+        h.combine_input = dequant(values, block_scales)
+        return None
+
+    def combine(self, p, h):
+        """Send h.combine_input back through the buffer and return only the combined tensor."""
+        if not self.ll:
+            # normal mode: reuse the dispatch handle together with the weights received in dispatch.
+            out, _, _ = self.buffer.combine(h.combine_input, h.handle, topk_weights=h.recv_topk_weights)
+            return out
+        # LL: weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones (from p).
+        out, _event, _hook = self.buffer.low_latency_combine(h.combine_input, p.topk_idx, p.topk_weights, h.handle)
+        return out
+
+    def expected(self, p, h):
+        """Reference the harness checks the combined output against.
+
+        Returns (reference tensor, p.T)."""
+        if self.ll:
+            # LL combine is a topk_weights-weighted reduce of each token's expert copies; with no
+            # expert compute every copy is (the kernel's fp8 cast of) x, so combined ~= x *
+            # sum(topk_weights). fp8 quant error is covered by self.tolerance.
+            return p.x.float() * p.topk_weights.sum(dim=1, keepdim=True), p.T
+        # normal: a no-compute round trip reconstructs x * (#destination ranks).
+        fanout = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+        if not self.fp8:
+            return p.x.float() * fanout, p.T
+        # fp8: compare against the dequantized cast that was actually sent. runtime-visible keeps
+        # its cast on the handle (ref_fp8/ref_scales); otherwise fall back to the pre-cast on p.
+        sent, sent_scales = getattr(h, "ref_fp8", None), getattr(h, "ref_scales", None)
+        if sent is None:
+            sent, sent_scales = p.x_fp8, p.x_scales
+        return _per_block_dequant(sent, sent_scales).float() * fanout, p.T
+
+    def recv_tokens(self, h):
+        """Received-token count: LL sums recv_count over local experts; normal counts recv_x rows."""
+        if self.ll:
+            return int(h.recv_count.sum().item())
+        return int((h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x).shape[0])
+
+    def finalize(self, rc):
+        # UCCL's symmetric-memory / proxy teardown SIGSEGVs after the sweep completes — but the
+        # result JSON is already written by run_sweep, so (like ep_mori) hard-exit past the crashy
+        # dist/uccl cleanup. The exit status is 0 when rc == 0 and 1 for ANY other rc (the exact
+        # value is not propagated). A clean teardown isn't worth a false 'failed' on a valid result.
+        try:
+            dist.barrier()
+        except Exception:
+            pass  # best-effort sync only; a failed barrier must not block the exit below
+        sys.stdout.flush()  # flush explicitly: os._exit does not flush stdio buffers
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)  # never returns; skips interpreter cleanup handlers
diff --git a/experimental/CollectiveX/tests/eplb.py b/experimental/CollectiveX/tests/eplb.py
new file mode 100644
index 000000000..2234fea96
--- /dev/null
+++ b/experimental/CollectiveX/tests/eplb.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+"""CollectiveX — EPLB (Expert-Parallel Load Balancer), the DeepSeek-style remedy for
+skewed (zipf) expert load.
+
+Under skewed routing, the ranks hosting hot logical experts receive far more token-copies
+than the rest; dispatch/combine latency is gated by that busiest rank (the cross-rank MAX
+the harness measures), so the whole collective stalls on it. EPLB REPLICATES hot experts
+onto extra physical slots and PLACES the slots so every rank carries ~equal load.
+
+This module is backend-agnostic: it is purely a transform of the deterministic routing
+trace. The trick that keeps every adapter unchanged — DeepEP/MoRI both route expert i to
+rank `i // experts_per_rank` (contiguous block placement) — is to number the physical slots
+RANK-MAJOR (rank r owns physical ids [r*spp, (r+1)*spp)), so the standard contiguous mapping
+reproduces EPLB's balanced placement. The harness then runs with `experts = num_physical`
+and the remapped (physical) trace; nothing else changes.
+
+  num_physical = num_logical + redundant   (redundant rounded up to a multiple of ep_size)
+  build_plan(): greedy replicate-by-load + equal-cardinality balanced packing onto ep_size ranks
+  remap_idx():  each token's logical targets -> physical replicas, spread by global token id
+
+Pure-Python planner (no torch) so it unit-tests on a login node; remap_idx needs torch.
+"""
+from __future__ import annotations
+
+
+def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
+    """Physical expert count: num_logical plus the redundant slots, where a negative request counts
+    as 0 and the request is rounded UP to a whole multiple of ep_size (equal slots per rank)."""
+    extra = max(0, num_redundant)
+    return num_logical + ((extra + ep_size - 1) // ep_size) * ep_size
+
+
+def _contiguous_rank_load(logical_load, ep_size):
+    """Baseline per-rank load with NO EPLB: rank r owns the r-th contiguous block of
+    len(logical_load) // ep_size logical experts and carries that block's summed load."""
+    block = len(logical_load) // ep_size
+    bounds = [(r * block, r * block + block) for r in range(ep_size)]
+    return [sum(logical_load[lo:hi]) for lo, hi in bounds]
+
+
+def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
+    """Plan EPLB replication + rank placement for logical_load (list[float], token-copies per logical
+    expert). Returns a dict of pure-Python lists + before/after balance; asserts the size preconditions."""
+    num_logical = len(logical_load)
+    assert num_physical >= num_logical, "num_physical must be >= num_logical"
+    assert num_physical % ep_size == 0, "num_physical must be a multiple of ep_size"
+    assert num_logical % ep_size == 0, "num_logical must be a multiple of ep_size"
+    spp = num_physical // ep_size                      # physical slots per rank (fixed)
+
+    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
+    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
+    replicas = [1] * num_logical
+    for _ in range(num_physical - num_logical):
+        best, best_lps = 0, -1.0
+        for e in range(num_logical):
+            lps = logical_load[e] / replicas[e]
+            if lps > best_lps:
+                best, best_lps = e, lps
+        replicas[best] += 1
+
+    # 2) Slots = (per-replica load, logical expert), one per replica.
+    slots = []
+    for e in range(num_logical):
+        lps = logical_load[e] / replicas[e]
+        slots.extend((lps, e) for _ in range(replicas[e]))
+
+    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
+    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
+    slots.sort(reverse=True)
+    rank_slots = [[] for _ in range(ep_size)]
+    rank_load = [0.0] * ep_size
+    for lps, e in slots:
+        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
+                key=lambda r: rank_load[r])
+        rank_slots[r].append(e)
+        rank_load[r] += lps
+
+    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
+    phys2log, rank_of_phys = [], []
+    for r in range(ep_size):
+        for e in rank_slots[r]:
+            phys2log.append(e)
+            rank_of_phys.append(r)
+    log2phys = [[] for _ in range(num_logical)]
+    for pid, e in enumerate(phys2log):
+        log2phys[e].append(pid)
+
+    before = _contiguous_rank_load(logical_load, ep_size)
+    total = sum(logical_load) or 1.0                   # avoid a zero mean when every load is 0
+    mean = total / ep_size
+    return {
+        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
+        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
+        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
+        "rank_load_after": rank_load, "rank_load_before": before,
+        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
+        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
+        "replicated_experts": sum(1 for r in replicas if r > 1),
+    }
+
+
+def remap_idx(idx_logical, plan):
+    """idx_logical: torch [gt, topk] int64 logical-expert ids (global trace).
+    Returns idx_physical [gt, topk]: each token's logical target -> one of that expert's
+    physical replicas, SPREAD by global token id (row) so a hot expert's tokens fan out
+    across its replicas (= across ranks). Replicas of distinct logical experts are disjoint,
+    so a token's top-k physical ids stay distinct (dispatch invariant preserved).
+    The lookup tables live on idx_logical's device, so the result is on that device too."""
+    import torch
+    dev = idx_logical.device
+    max_rc = plan["max_replicas"]
+    rc = torch.tensor(plan["replicas"], dtype=torch.int64, device=dev)
+    # [num_logical, max_rc] table of physical ids, built as nested lists and converted in one
+    # call instead of per-element tensor writes. Short rows are padded with replica 0; the pad
+    # is never selected because the replica index is taken mod rc[e].
+    table = [(list(phys) + [phys[0]] * (max_rc - len(phys)))[:max_rc]
+             for phys in plan["log2phys"]]
+    padded = torch.tensor(table, dtype=torch.int64, device=dev).reshape(len(table), max_rc)
+    e = idx_logical.to(torch.int64)                             # [gt,topk]
+    gt = e.shape[0]
+    rows = torch.arange(gt, dtype=torch.int64, device=dev).unsqueeze(1)   # [gt,1] global token id
+    ridx = rows % rc[e]                                         # [gt,topk] replica index
+    return padded[e, ridx]                                      # [gt,topk] physical ids
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    # Synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets. No torch needed.
+    import sys
+    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
+    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
+    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)  # 256 + 32 = 288 (32 is already a multiple of 8)
+    plan = build_plan(load, nphys, EP)
+    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
+    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
+          f"(hottest expert 0 replicas={plan['replicas'][0]})")
+    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
+    print(f"per-rank load AFTER  (EPLB):       {[round(x,3) for x in plan['rank_load_after']]}")
+    print(f"imbalance (max/mean)  BEFORE={plan['imbalance_before']:.2f}x  AFTER={plan['imbalance_after']:.2f}x")
+    # Gates: every logical expert placed, rank-major numbering, big imbalance cut. NOTE(review): equal per-rank slot cardinality is not asserted directly here.
+    assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
+    assert sum(plan["replicas"]) == nphys
+    assert len(plan["phys2log"]) == nphys
+    assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
+    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
+    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
+    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
+    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
+    # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
+    try:  # only a missing torch is tolerated; failed asserts inside still propagate
+        import torch
+        g = torch.Generator().manual_seed(0)  # fixed seed for the sampled trace
+        p = torch.tensor(load); p = (p / p.sum()).expand(4096, NUM_LOGICAL)
+        idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)  # 4096 tokens x top-8, distinct per token
+        idx_p = remap_idx(idx_l, plan)
+        assert idx_p.shape == idx_l.shape
+        # top-k physical ids distinct per token
+        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
+        spp = plan["slots_per_rank"]
+        recv_before = [0] * EP
+        recv_after = [0] * EP
+        per_log = NUM_LOGICAL // EP
+        for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
+            for e in row_l:
+                recv_before[e // per_log] += 1  # contiguous logical placement
+            for pid in row_p:
+                recv_after[pid // spp] += 1     # rank-major physical placement
+        ib = max(recv_before) / (sum(recv_before) / EP)
+        ia = max(recv_after) / (sum(recv_after) / EP)
+        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x  AFTER={ia:.2f}x")
+        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
+        print("remap self-test: OK")
+    except ImportError:
+        print("(torch absent — skipped remap self-test; planner gates passed)")
+    print("EPLB self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/failure_taxonomy.py b/experimental/CollectiveX/tests/failure_taxonomy.py
new file mode 100644
index 000000000..45782ee07
--- /dev/null
+++ b/experimental/CollectiveX/tests/failure_taxonomy.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""CollectiveX failure taxonomy (goal Part 3: failure & reliability characterization).
+
+A wedged or crashing EP run should become a CLASSIFIED, bounded record — not a silent hang or a
+bare rc=1. classify() maps an exception (or a process return code from the timeout-wrapped driver)
+onto a stable failure mode, so coverage/reliability views can keep failed cases instead of dropping
+them. Pure stdlib.
+"""
+from __future__ import annotations
+
+# Stable failure modes (goal Part 3). Vocabulary only: match precedence comes from classify()'s rc checks, then _SIGNATURES order.
+MODES = [
+    "unsupported",            # capability rejected the combo (run_ep exit 5)
+    "initialization-failure",  # process group / buffer / NVSHMEM bring-up failed
+    "out-of-memory",
+    "registration-failure",    # MR / symmetric-heap registration (e.g. MoRI errno 22)
+    "correctness-failure",     # ran but reconstruction gate failed
+    "timeout",                 # killed by the timeout wrapper (rc 124) — bounded hang
+    "deadlock",                # collective watchdog abort (NCCL SIGABRT / rc -6 after a stall)
+    "teardown-failure",        # post-finalize / shmem_finalize assertion
+    "infrastructure",          # slurm / container / FS / node failure
+    "unknown",
+]
+
+# (mode, signatures) pairs in match-precedence order. A signature is a lowercase substring, or a
+# tuple of substrings that must ALL occur; matching is plain `in`, never regex.
+_SIGNATURES = [
+    ("unsupported", ("unsupported", "rejects", "not supported", "no fallback")),
+    ("out-of-memory", ("out of memory", "outofmemory", "cuda oom", "cudaerrormemoryallocation")),
+    ("registration-failure", ("errno 22", "registration", "register", "ibv_reg", "mr ")),
+    ("initialization-failure", ("nvshmem", "init_process_group", "ncclcomminit", "bootstrap", "buffer(")),
+    # no bare "collective": it is a substring of the project name "CollectiveX" seen in paths/logs
+    ("deadlock", ("watchdog", "sigabrt", "signal 6", "collective operation timeout",
+                  "timed out waiting", "nccl timeout")),
+    ("teardown-failure", ("shmem_finalize", "destroy_process_group", "teardown", "finalize")),
+    ("correctness-failure", ("correct=false", "reconstruction", "max_rel", ("assertion", "tol"))),
+    ("infrastructure", ("srun: error", "slurm", "node fail", "container", "no such file")),
+]
+
+
+def classify(text: str = "", rc: int | None = None) -> str:
+    """Map captured stderr/stdout text and/or a process return code onto one of MODES (best effort).
+
+    A recognised return code wins (5, 124, 137/-9, 134/-6); otherwise the first _SIGNATURES entry
+    found in the lowercased text decides; anything else, including no evidence at all, is "unknown".
+    """
+    by_rc = {5: "unsupported", 124: "timeout", 137: "timeout", -9: "timeout", 134: "deadlock", -6: "deadlock"}
+    if rc in by_rc:  # 124 = GNU timeout, 137/-9 = SIGKILL (timeout -k), 134/-6 = SIGABRT
+        return by_rc[rc]
+    t = (text or "").lower()
+    for mode, sigs in _SIGNATURES:
+        if any(all(p in t for p in s) if isinstance(s, tuple) else s in t for s in sigs):
+            return mode
+    return "unknown"
+
+
+def record(text="", rc=None, case=None) -> dict:
+    """Build one failure record: classified mode, raw return code, the case, and the last 400 chars of text."""
+    mode = classify(text, rc)
+    return dict(failure_mode=mode, return_code=rc, case=case or {}, evidence=(text or "")[-400:])
+
+
+if __name__ == "__main__":
+    import sys
+    # Each case is (captured text, return code, expected failure mode).
+    cases = [
+        ("RuntimeError: Unsupported number of EP ranks", None, "unsupported"),
+        ("", 124, "timeout"),
+        ("Signal 6 (SIGABRT) received ... NCCL watchdog", None, "deadlock"),
+        ("", -6, "deadlock"),
+        ("cuda out of memory", None, "out-of-memory"),
+        ("ibv_reg_mr failed errno 22", None, "registration-failure"),
+        ("shmem_finalize teardown assertion", None, "teardown-failure"),
+        ("srun: error: node failed", None, "infrastructure"),
+    ]
+    failures = 0
+    for text, rc, want in cases:
+        got = classify(text, rc)
+        hit = got == want
+        failures += not hit
+        print(f"  [{'OK' if hit else 'FAIL'}] rc={rc} text={text[:40]!r} -> {got} (want {want})")
+    print("failure_taxonomy self-test:", "FAIL" if failures else "PASS")
+    sys.exit(1 if failures else 0)
diff --git a/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
new file mode 100644
index 000000000..c8825164e
--- /dev/null
+++ b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
@@ -0,0 +1,50 @@
+# nThread 1 nGpus 8 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
+#
+# Using devices
+#  Rank  0 Group  0 Pid  12345 on    b200-node device  0 [0x1b] NVIDIA B200
+#  Rank  1 Group  0 Pid  12345 on    b200-node device  1 [0x43] NVIDIA B200
+#  Rank  2 Group  0 Pid  12345 on    b200-node device  2 [0x52] NVIDIA B200
+#  Rank  3 Group  0 Pid  12345 on    b200-node device  3 [0x61] NVIDIA B200
+#  Rank  4 Group  0 Pid  12345 on    b200-node device  4 [0x9d] NVIDIA B200
+#  Rank  5 Group  0 Pid  12345 on    b200-node device  5 [0xc3] NVIDIA B200
+#  Rank  6 Group  0 Pid  12345 on    b200-node device  6 [0xd1] NVIDIA B200
+#  Rank  7 Group  0 Pid  12345 on    b200-node device  7 [0xdf] NVIDIA B200
+#
+#                                                              out-of-place                       in-place
+#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
+#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+           8             2     float     sum      -1     9.62    0.00    0.00      0     9.60    0.00    0.00      0
+          16             4     float     sum      -1     9.61    0.00    0.00      0     9.59    0.00    0.00      0
+          32             8     float     sum      -1     9.63    0.00    0.00      0     9.62    0.00    0.00      0
+          64            16     float     sum      -1     9.60    0.00    0.00      0     9.58    0.00    0.00      0
+         128            32     float     sum      -1     9.64    0.01    0.02      0     9.63    0.01    0.02      0
+         256            64     float     sum      -1     9.66    0.03    0.05      0     9.64    0.03    0.05      0
+         512           128     float     sum      -1     9.69    0.05    0.09      0     9.67    0.05    0.09      0
+        1024           256     float     sum      -1     9.74    0.11    0.18      0     9.72    0.11    0.18      0
+        2048           512     float     sum      -1     9.82    0.21    0.37      0     9.80    0.21    0.37      0
+        4096          1024     float     sum      -1     9.97    0.41    0.72      0     9.95    0.41    0.72      0
+        8192          2048     float     sum      -1    10.22    0.80    1.40      0    10.20    0.80    1.40      0
+       16384          4096     float     sum      -1    10.81    1.52    2.65      0    10.79    1.52    2.65      0
+       32768          8192     float     sum      -1    11.93    2.75    4.81      0    11.90    2.75    4.81      0
+       65536         16384     float     sum      -1    13.62    4.81    8.42      0    13.59    4.82    8.43      0
+      131072         32768     float     sum      -1    16.94    7.74   13.54      0    16.90    7.76   13.57      0
+      262144         65536     float     sum      -1    23.14   11.33   19.83      0    23.10   11.35   19.86      0
+      524288        131072     float     sum      -1    35.62   14.72   25.76      0    35.55   14.75   25.81      0
+     1048576        262144     float     sum      -1    60.40   17.36   30.38      0    60.30   17.39   30.43      0
+     2097152        524288     float     sum      -1    76.50   27.41   47.97      0    76.40   27.45   48.04      0
+     4194304       1048576     float     sum      -1   110.20   38.06   66.61      0   110.05   38.11   66.70      0
+     8388608       2097152     float     sum      -1   165.80   50.60   88.55      0   165.60   50.66   88.65      0
+    16777216       4194304     float     sum      -1   250.10   67.08  117.40      0   249.80   67.16  117.54      0
+    33554432       8388608     float     sum      -1   360.50   93.08  162.90      0   360.10   93.18  163.07      0
+    67108864      16777216     float     sum      -1   520.80  128.85  225.50      0   520.20  129.00  225.75      0
+   134217728      33554432     float     sum      -1   720.30  186.34  326.10      0   719.50  186.55  326.46      0
+   268435456      67108864     float     sum      -1  1080.50  248.43  434.80      0  1079.20  248.73  435.27      0
+   536870912     134217728     float     sum      -1  1990.20  269.76  472.10      0  1988.50  269.99  472.49      0
+  1073741824     268435456     float     sum      -1  3940.60  272.48  476.84      0  3938.10  272.65  477.14      0
+  2147483648     536870912     float     sum      -1  7850.10  273.56  478.73      0  7846.20  273.69  478.96      0
+  4294967296    1073741824     float     sum      -1 15680.50  273.91  479.34      0 15673.80  274.03  479.55      0
+  8589934592    2147483648     float     sum      -1 31250.80  274.87  481.02      0 31238.10  274.98  481.22      0
+#
+# Out of bounds values : 0 OK
+# Avg bus bandwidth    : 168.42
+#
diff --git a/experimental/CollectiveX/tests/kv_cache_transfer.py b/experimental/CollectiveX/tests/kv_cache_transfer.py
new file mode 100644
index 000000000..655ece58d
--- /dev/null
+++ b/experimental/CollectiveX/tests/kv_cache_transfer.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""CollectiveX — KV-cache transfer benchmark (family=kv-cache).
+
+Times raw CUDA memcpy of KV-cache-shaped buffers across the transfer paths a
+serving stack actually uses, with CUDA events (GPU-accurate). Adapted from
+experimental/kvcache_transfer_DtoH_HtoD/benchmark.py but WITHOUT the vLLM
+`swap_blocks` dependency — CollectiveX containers may not ship vLLM, and the goal
+asks for the raw CUDA/HIP memcpy + CPU pinned-memory path as the reference.
+
+Dimensions (goal P2 "KV-cache transfer suite"):
+  direction : dtoh | htod | dtod-local | dtod-remote (remote needs >=2 GPUs)
+  layout    : contiguous (one copy) | paged (N scattered block copies — the real
+              paged-KV pattern; captures per-block launch/scatter overhead)
+  size class: decode-sized (small per-token blocks) .. prefill/prefix-cache-sized (large)
+  backend   : memcpy (raw cudaMemcpy), pinned (CPU pinned host) — WIRED.
+              nixl / mooncake / mori-io / nccl — declared, NOT wired (stubs; never faked).
+
+Stdlib + torch; torch is imported lazily so `--help` / `--parse-only`-style use works
+without a GPU. One provenance-tagged JSON per run, matching run_nccl.py's structure.
+
+  python tests/kv_cache_transfer.py --direction all --runner h200-dgxc \\
+      --topology-class h200-nvlink-island --transport nvlink \\
+      --env-json results/env.json --out results/h200_kvcache.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1  # written as "schema_version" in the result JSON
+MEASUREMENT_CONTRACT = "kv-cache-memcpy-v1"  # stored per group and hashed into comparison_key
+FAMILY = "kv-cache"  # written as "family" in the result JSON
+
+# Backends: which transfer mechanism moves the bytes. Only the raw memcpy + pinned-host
+# paths are wired; the rest are declared so the axis is honest and a future adapter slots in.
+WIRED_BACKENDS = ("memcpy", "pinned")
+STUB_BACKENDS = ("nixl", "mooncake", "mori-io", "nccl")
+
+# KV block byte sizes: decode-sized (a few tokens' KV) .. prefill/prefix-cache-sized.
+# A DeepSeek-V3 layer KV block for a handful of tokens is ~tens of KiB; a prefill/prefix
+# chunk is MiB. Sweep geometric 16KiB -> 256MiB (x4 per step by default) and class each point.
+DECODE_MAX_BYTES = 512 * 1024          # <=512KiB == "decode-sized"; anything larger is "prefill"
+DEFAULT_MIN_BYTES = 16 * 1024  # default for --min-bytes
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024  # default for --max-bytes
+
+
+def size_class(nbytes: int) -> str:  # "decode" up to and including DECODE_MAX_BYTES, else "prefill"
+    return "prefill" if nbytes > DECODE_MAX_BYTES else "decode"
+
+
+def _sizes(min_bytes: int, max_bytes: int, factor: int = 4):
+    """Geometric ladder min_bytes, min_bytes*factor, ... of every value <= max_bytes (may be empty)."""
+    if min_bytes <= 0 or factor <= 1:
+        # such a ladder never climbs past max_bytes: a plain loop would spin and grow its list forever
+        raise ValueError(f"_sizes needs min_bytes > 0 and factor > 1, got {min_bytes} and {factor}")
+    return [] if min_bytes > max_bytes else [min_bytes] + _sizes(min_bytes * factor, max_bytes, factor)
+
+
+def comparison_key(meta: dict) -> str:
+    """First 16 hex chars of the SHA-256 over the group's identity fields joined with '|'."""
+    order = "direction layout backend dtype nodes topology_class measurement_contract".split()
+    return hashlib.sha256("|".join(meta[k] if k != "nodes" else str(meta[k]) for k in order).encode()).hexdigest()[:16]
+
+
+def _bench_one(torch, src, dst, total_bytes, block_bytes, layout, paged_blocks,
+               warmup: int, iters: int):
+    """Time src->dst copies with CUDA events; return (milliseconds per iteration, GB/s).
+
+    layout == "paged" issues one block_bytes-sized copy per (src_offset, dst_offset) pair in
+    paged_blocks; any other layout issues a single whole-buffer copy. Every copy is queued with
+    non_blocking=True. `warmup` untimed rounds run first, then `iters` rounds are timed as one span.
+    """
+    def _copy_once():
+        if layout != "paged":
+            dst.copy_(src, non_blocking=True)
+            return
+        for src_at, dst_at in paged_blocks:  # scattered, block-by-block
+            dst[dst_at:dst_at + block_bytes].copy_(src[src_at:src_at + block_bytes], non_blocking=True)
+
+    for _ in range(warmup):
+        _copy_once()
+    torch.cuda.synchronize()  # drain the warmup copies before the timed span starts
+    t0, t1 = (torch.cuda.Event(enable_timing=True) for _ in range(2))
+    t0.record()
+    for _ in range(iters):
+        _copy_once()
+    t1.record()
+    torch.cuda.synchronize()
+    per_iter_ms = t0.elapsed_time(t1) / iters
+    bandwidth = total_bytes / (per_iter_ms / 1e3) / 1e9 if per_iter_ms > 0 else 0.0
+    return round(per_iter_ms, 5), round(bandwidth, 2)
+
+
+def _alloc(torch, where, nbytes, pinned: bool):
+    """Uninitialised uint8 buffer of nbytes elements (1 element == 1 byte) on `where`; a "cpu" buffer is pinned on request."""
+    buf = torch.empty(nbytes, dtype=torch.uint8, device=where)
+    if where == "cpu" and pinned:
+        buf = buf.pin_memory()
+    return buf
+
+
+def run_direction(torch, direction, backend, layout, sizes, block_bytes, warmup, iters,
+                  ngpu: int):
+    """Benchmark one (direction, backend, layout) combination over every size in `sizes`.
+
+    Returns (rows, note): `rows` has one dict per attempted size; `note` is None, or the reason the
+    combination was not run at all (unknown direction, or dtod-remote with fewer than 2 GPUs).
+    A "pinned" backend with no host endpoint returns ([], None). An allocation RuntimeError is
+    recorded as an error row and ends the sweep.
+    """
+    # Endpoints depend only on the direction, so resolve them once instead of once per size.
+    endpoints = {"dtoh": ("cuda:0", "cpu"), "htod": ("cpu", "cuda:0"),
+                 "dtod-local": ("cuda:0", "cuda:0"), "dtod-remote": ("cuda:0", "cuda:1")}
+    if direction not in endpoints:
+        return [], f"unknown direction {direction}"
+    if direction == "dtod-remote" and ngpu < 2:
+        return [], "n/a (needs >=2 GPUs)"
+    src_dev, dst_dev = endpoints[direction]
+    pinned = (backend == "pinned")
+    if pinned and "cpu" not in (src_dev, dst_dev):
+        return [], None  # pinned is a host-memory property; nothing to measure for pure DtoD
+    rows = []
+    for nbytes in sizes:
+        try:
+            src = _alloc(torch, src_dev, nbytes, pinned and src_dev == "cpu")
+            dst = _alloc(torch, dst_dev, nbytes, pinned and dst_dev == "cpu")
+        except RuntimeError as exc:  # OOM at the largest sizes — stop, don't crash
+            rows.append({"transfer_bytes": nbytes, "error": f"alloc: {exc!r}", "correct": None})
+            break
+        nblk = max(1, nbytes // block_bytes)
+        bb = nbytes // nblk
+        # paged: shuffle destination block order (deterministic) to force scatter
+        paged = [((i * bb), (((i * 2654435761) % nblk) * bb)) for i in range(nblk)] \
+            if layout == "paged" else None
+        ms, gb_s = _bench_one(torch, src, dst, nbytes, bb, layout, paged, warmup, iters)
+        # NOTE: "correct" below is asserted, not checked — dst is never compared with src here.
+        rows.append({
+            "transfer_bytes": nbytes, "size_class": size_class(nbytes),
+            "block_bytes": bb if layout == "paged" else nbytes,
+            "num_blocks": nblk if layout == "paged" else 1,
+            "time_ms": ms, "bandwidth_gb_s": gb_s,
+            "correct": True,  # raw memcpy is exact (uint8); kept for schema parity
+        })
+        del src, dst
+        torch.cuda.empty_cache()
+    return rows, None
+
+
+def main() -> int:
+    """CLI entry point: run the selected sweeps and write one JSON document to --out.
+    Returns 0 when status is "valid", 1 when it is "invalid", 3 when torch or CUDA is unavailable;
+    invalid arguments exit through ap.error."""
+    ap = argparse.ArgumentParser(description="CollectiveX KV-cache transfer benchmark")
+    ap.add_argument("--direction", default="all",
+                    choices=["all", "dtoh", "htod", "dtod-local", "dtod-remote"])
+    ap.add_argument("--backends", default="memcpy,pinned",
+                    help="comma list from memcpy,pinned (wired) — stubs are recorded, not run")
+    ap.add_argument("--layouts", default="contiguous,paged")
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--block-bytes", type=int, default=64 * 1024,
+                    help="paged KV block size (a few tokens' KV); default 64KiB")
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=30)
+    # provenance (mirror run_nccl.py)
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+    backends = [b.strip() for b in args.backends.split(",") if b.strip()]
+    layouts = [l.strip() for l in args.layouts.split(",") if l.strip()]
+    # Reject inputs that would hang or crash mid-run: a non-positive --min-bytes never lets the
+    # size sweep terminate, --iters / --block-bytes of 0 divide by zero, and an unknown layout
+    # would be timed as a contiguous copy while being labelled with the bogus name.
+    if args.min_bytes <= 0 or args.block_bytes <= 0 or args.iters <= 0:
+        ap.error("--min-bytes, --block-bytes and --iters must all be > 0")
+    unknown_layouts = [l for l in layouts if l not in ("contiguous", "paged")]
+    if unknown_layouts:
+        ap.error(f"unknown --layouts value(s) {unknown_layouts}; choose from contiguous,paged")
+
+    try:
+        import torch
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+    if not torch.cuda.is_available():
+        print("ERROR: CUDA not available", file=sys.stderr)
+        return 3
+
+    ngpu = torch.cuda.device_count()
+    directions = (["dtoh", "htod", "dtod-local", "dtod-remote"]
+                  if args.direction == "all" else [args.direction])
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+
+    groups, notes, peak = [], [], 0.0
+    for backend in backends:
+        if backend not in WIRED_BACKENDS:
+            notes.append(f"backend '{backend}' not wired (declared only)")
+            continue
+        for direction in directions:
+            for layout in layouts:
+                rows, na = run_direction(torch, direction, backend, layout, sizes,
+                                         args.block_bytes, args.warmup, args.iters, ngpu)
+                if na:
+                    notes.append(f"{direction}/{backend}/{layout}: {na}")
+                    continue
+                if not rows:
+                    continue
+                peak = max(peak, max((r.get("bandwidth_gb_s") or 0.0) for r in rows))
+                meta = {"direction": direction, "layout": layout, "backend": backend,
+                        "dtype": "uint8", "nodes": args.nodes,
+                        "topology_class": args.topology_class,
+                        "measurement_contract": MEASUREMENT_CONTRACT}
+                groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})
+
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": FAMILY, "generated_by": "kv_cache_transfer.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        "measurement_contract": MEASUREMENT_CONTRACT, "nodes": args.nodes, "num_gpus_visible": ngpu,
+        "wired_backends": list(WIRED_BACKENDS), "declared_unwired_backends": list(STUB_BACKENDS),
+        "status": "valid" if (groups and peak > 0.0) else "invalid",
+        "num_groups": len(groups), "groups": groups, "notes": notes, "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"kv-cache: {len(groups)} (dir,backend,layout) groups -> {args.out} "
+          f"(status={doc['status']}, peak_bw={peak:.1f} GB/s, gpus={ngpu})")
+    if notes:
+        print("notes: " + "; ".join(notes), file=sys.stderr)
+    return 0 if doc["status"] == "valid" else 1
+
+
+if __name__ == "__main__":  # script entry: main()'s return value becomes the process exit status
+    sys.exit(main())
diff --git a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py
new file mode 100644
index 000000000..2e05c0537
--- /dev/null
+++ b/experimental/CollectiveX/tests/make_workloads.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Generate canonical serialized workloads (goal Part 1). Runs build_workload (needs torch) for
+each (routing, global_tokens) in a ladder and writes <workload_id>.npz + .manifest.json into a
+dir that runs then consume via `run_ep.py --workload-dir`. One trace per global-token count
+because the generator is not prefix-consistent across sizes.
+
+  python3 tests/make_workloads.py --out-dir /data/sa-shared/cx_workloads \\
+      --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\
+      --tokens-ladder "1 2 4 8 16 32 64 128 256 512"
+
+Or by NAMED model manifest (goal P1 model-shape coverage) — dims resolved from configs/workloads.yaml
+(synthetic + model_derived; experts <- experts|routed_experts). Explicit --hidden/--topk/--experts
+still override per field, so the env-var-driven in-container path (CX_HIDDEN/CX_TOPK/CX_EXPERTS) is
+unchanged; this just lets a SKU stage a model shape by name:
+
+  python3 tests/make_workloads.py --out-dir /data/cx_workloads --workload kimi-k2-v1 --routing uniform --ep 8
+
+--id-only prints the deterministic workload_id per ladder point WITHOUT torch/numpy (the id is a hash
+of the identity params, not the bytes) — runnable on a login node / in CI to prove cross-SKU identity:
+
+  python3 tests/make_workloads.py --workload kimi-k2-v1 --ep 8 --id-only
+
+Generate every routing the suites need by running once per --routing. Idempotent (same id => same
+file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import workload as wl   # noqa: E402
+
+# CollectiveX root (the parent of tests/, where this file lives) holds configs/. Used only for --workload name resolution.
+_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+def resolve_manifest(name):
+    """Resolve a named workload in configs/workloads.yaml to (hidden, topk, experts) as ints.
+    Searches `synthetic` then `model_derived`; expert count = `experts`, else `routed_experts`.
+    Raises SystemExit for an unknown name (listing the known ones) or an entry missing a dim."""
+    import yaml
+    path = os.path.join(_REPO, "configs", "workloads.yaml")
+    with open(path) as fh:  # context manager so the handle is closed deterministically
+        cfg = yaml.safe_load(fh) or {}  # an empty YAML document loads as None
+    sections = [cfg.get(section) or {} for section in ("synthetic", "model_derived")]
+    for sec in sections:
+        m = sec.get(name)
+        if m is None:
+            continue
+        experts = m.get("experts", m.get("routed_experts"))
+        if m.get("hidden") is None or m.get("topk") is None or experts is None:
+            raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}")
+        return int(m["hidden"]), int(m["topk"]), int(experts)
+    known = [key for sec in sections for key in sec]
+    raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}")
+
+
+def main() -> int:
+    """CLI entry point: write one canonical workload per ladder point (or, with --id-only, only
+    print the ids). Returns 0; invalid arguments exit through ap.error."""
+    ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads")
+    ap.add_argument("--out-dir", help="required unless --id-only")
+    ap.add_argument("--workload", help="named manifest in configs/workloads.yaml (sets hidden/topk/experts)")
+    ap.add_argument("--routing", default="uniform")
+    ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)")
+    ap.add_argument("--hidden", type=int, help="override (default 7168, or the --workload's hidden)")
+    ap.add_argument("--topk", type=int, help="override (default 8, or the --workload's topk)")
+    ap.add_argument("--experts", type=int, help="override (default 256, or the --workload's experts)")
+    ap.add_argument("--seed", type=int, default=67)
+    ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512")
+    ap.add_argument("--id-only", action="store_true",
+                    help="print deterministic workload_id per point WITHOUT torch/numpy (no files written)")
+    a = ap.parse_args()
+
+    # Dims: --workload supplies defaults, explicit --hidden/--topk/--experts override per field, else 7168/8/256.
+    base_h, base_t, base_e = (7168, 8, 256)
+    if a.workload:
+        base_h, base_t, base_e = resolve_manifest(a.workload)
+    hidden = a.hidden if a.hidden is not None else base_h
+    topk = a.topk if a.topk is not None else base_t
+    experts = a.experts if a.experts is not None else base_e
+
+    if not a.id_only and not a.out_dir:
+        ap.error("--out-dir is required unless --id-only")
+    if a.ep < 1:  # experts // ep divides by it, and global_tokens = T * ep needs it positive
+        ap.error("--ep must be >= 1")
+    try:  # parse each token once; a non-integer token becomes a usage error, not a traceback
+        ladder = sorted({t for t in map(int, a.tokens_ladder.replace(",", " ").split()) if t > 0})
+    except ValueError:
+        ap.error(f"--tokens-ladder must be integers separated by spaces/commas, got {a.tokens_ladder!r}")
+    label = f"workload={a.workload} " if a.workload else ""
+
+    if a.id_only:
+        # Identity-only: workload_id hashes (generator|routing|hidden|topk|experts|gt|seed); no trace is generated.
+        for T in ladder:
+            gt = T * a.ep
+            wid = wl.compute_workload_id(a.routing, hidden, topk, experts, gt, a.seed)
+            print(f"  T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}")
+        print(f"{label}id-only: {len(ladder)} workload_id(s) "
+              f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})")
+        return 0
+
+    os.makedirs(a.out_dir, exist_ok=True)
+    epr = experts // a.ep  # only build_workload needs it, so it is not computed on the --id-only path
+    for T in ladder:
+        gt = T * a.ep
+        idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr)
+        wid = wl.save_workload(a.out_dir, idx, w, man)
+        print(f"  T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}  "
+              f"(trace sha {man['checksums']['trace'][:12]})")
+    print(f"{label}wrote {len(ladder)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})")
+    return 0
+
+
+if __name__ == "__main__":  # script entry: main()'s return value becomes the process exit status
+    sys.exit(main())
diff --git a/experimental/CollectiveX/tests/mooncake_transfer.py b/experimental/CollectiveX/tests/mooncake_transfer.py
new file mode 100644
index 000000000..9cc8d2931
--- /dev/null
+++ b/experimental/CollectiveX/tests/mooncake_transfer.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""CollectiveX — Mooncake transfer-engine benchmark (family=kv-cache, backend=mooncake).
+
+Mooncake (kvcache-ai/Mooncake) is the disaggregated-KV transfer engine used by vLLM/SGLang PD
+setups. This benches its RDMA `transfer_write_on_cuda` the way a prefill->decode KV write uses it:
+one TransferEngine, P2PHANDSHAKE metadata (no etcd), src+dst GPU buffers registered for RDMA, the
+engine RDMA-writes src->dst (loopback to its own rpc endpoint) over a KV-block size sweep. CUDA-
+event timed on the transfer stream.
+
+This is the WIRED version of the kv-cache `mooncake` backend that the goal declared as a stub. Mooncake isn't in any CollectiveX
+container, so run_in_container pip-installs `mooncake-transfer-engine` first (the directive's "import
+a new one" — a pip import rather than a base-image swap). Needs an RDMA NIC (auto-detected from
+/sys/class/infiniband). The mooncake API surface + the chosen device are DUMPED to the log; absence
+of the package or an RDMA device is recorded, never faked.
+
+  python tests/mooncake_transfer.py --runner b300 --topology-class b300-nvlink-island \\
+      --transport rdma --env-json results/env.json --out results/b300_mooncake.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import socket
+import sys
+import time
+
+SCHEMA_VERSION = 1  # written as "schema_version" in the result JSON
+MEASUREMENT_CONTRACT = "mooncake-transfer-v1"  # written as "measurement_contract" in the result JSON
+FAMILY = "kv-cache"  # written as "family" in the result JSON
+BACKEND = "mooncake"  # the single entry of "wired_backends" in the result JSON
+
+DEFAULT_MIN_BYTES = 64 * 1024  # default for --min-bytes
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024  # default for --max-bytes
+DECODE_MAX_BYTES = 512 * 1024  # size_class(): sizes up to this are "decode", larger ones "prefill"
+
+
+def size_class(nbytes: int) -> str:  # "decode" up to and including DECODE_MAX_BYTES, else "prefill"
+    return "prefill" if nbytes > DECODE_MAX_BYTES else "decode"
+
+
+def _sizes(lo: int, hi: int, factor: int = 4):
+    """Geometric ladder lo, lo*factor, ... of every value <= hi (may be empty)."""
+    if lo <= 0 or factor <= 1:
+        # such a ladder never climbs past hi: a plain loop would spin and grow its list forever
+        raise ValueError(f"_sizes needs lo > 0 and factor > 1, got {lo} and {factor}")
+    return [] if lo > hi else [lo] + _sizes(lo * factor, hi, factor)
+
+
+def comparison_key(meta: dict) -> str:
+    """First 16 hex chars of the SHA-256 over the group's identity fields joined with '|'."""
+    order = "direction layout backend dtype nodes topology_class measurement_contract".split()
+    return hashlib.sha256("|".join(meta[k] if k != "nodes" else str(meta[k]) for k in order).encode()).hexdigest()[:16]
+
+
+def _get_ip() -> str:  # best-effort local IPv4; main() hands it to TransferEngine.initialize and the target string
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))  # UDP connect; presumably only selects the outbound route — TODO confirm
+            return s.getsockname()[0]   # first field of the socket's own (address, port) pair
+    except Exception:  # any failure above falls back to resolving this machine's hostname
+        return socket.gethostbyname(socket.gethostname())
+
+
+def _rdma_devices():
+    """RDMA device names to try, in order: detected bonds, the other detected devices, then common
+    fallbacks — each name listed once, so no device gets a second init attempt."""
+    try:
+        devs = sorted(os.listdir("/sys/class/infiniband"))
+    except OSError:  # directory missing or unreadable: rely on the fallback names alone
+        devs = []
+    # prefer a bond if present (the Mooncake test used mlx5_bond_0), then the raw devices.
+    bonds = [d for d in devs if "bond" in d]
+    return list(dict.fromkeys(bonds + devs + ["mlx5_bond_0", "mlx5_0", "rocep0s0"]))
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX Mooncake transfer benchmark")
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=30)
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="rdma")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    def _emit(groups, status, peak, notes):
+        doc = {"schema_version": SCHEMA_VERSION, "family": FAMILY,
+               "generated_by": "mooncake_transfer.py",
+               "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+               "runner": args.runner, "transport": args.transport,
+               "measurement_contract": MEASUREMENT_CONTRACT, "nodes": args.nodes,
+               "wired_backends": [BACKEND], "status": status,
+               "num_groups": len(groups), "groups": groups, "notes": notes, "environment": env}
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            json.dump(doc, fh, indent=2)
+            fh.write("\n")
+        print(f"mooncake: {len(groups)} groups -> {args.out} (status={status}, peak_bw={peak:.1f} GB/s)")
+        if notes:
+            print("notes: " + "; ".join(notes), file=sys.stderr)
+
+    try:
+        import torch
+    except Exception as exc:
+        _emit([], "invalid", 0.0, [f"torch unavailable: {exc!r}"])
+        return 3
+    if not torch.cuda.is_available():
+        _emit([], "invalid", 0.0, ["CUDA/ROCm not available"])
+        return 3
+    try:
+        from mooncake.engine import TransferEngine
+    except Exception as exc:
+        _emit([], "invalid", 0.0,
+              [f"mooncake import failed (run_in_container pip-installs mooncake-transfer-engine): {exc!r}"])
+        return 1
+    print("MOONCAKE_API methods=" + json.dumps([m for m in dir(TransferEngine) if not m.startswith("_")][:40]),
+          file=sys.stderr, flush=True)
+
+    is_rocm = bool(getattr(torch.version, "hip", None))
+    xfer = "transfer_write_on_hip" if is_rocm else "transfer_write_on_cuda"
+    eng = TransferEngine()
+    host = _get_ip()
+    init_note = None
+    for dev in _rdma_devices():
+        try:
+            ret = eng.initialize(host, "P2PHANDSHAKE", "rdma", dev)
+            if ret == 0:
+                init_note = f"initialized on rdma device {dev}"
+                break
+        except Exception as e:
+            init_note = f"init raised on {dev}: {e!r}"
+    if init_note is None or "initialized" not in init_note:
+        _emit([], "invalid", 0.0, [f"mooncake init failed on all RDMA devices: {init_note}"])
+        return 1
+    print(f"MOONCAKE_INIT {init_note}", file=sys.stderr, flush=True)
+    if not hasattr(eng, xfer):
+        _emit([], "invalid", 0.0, [f"mooncake engine has no {xfer} (methods dumped above)"])
+        return 1
+    rpc = eng.get_rpc_port()
+    target = f"[{host}]:{rpc}" if ":" in host else f"{host}:{rpc}"
+    transfer = getattr(eng, xfer)
+
+    dev0 = torch.device("cuda:0")
+    stream = torch.cuda.Stream(dev0)
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+    rows, peak = [], 0.0
+    for nbytes in sizes:
+        try:
+            src = torch.ones(nbytes, dtype=torch.uint8, device=dev0)
+            dst = torch.zeros(nbytes, dtype=torch.uint8, device=dev0)
+            if eng.register_memory(src.data_ptr(), src.nbytes) != 0 or \
+               eng.register_memory(dst.data_ptr(), dst.nbytes) != 0:
+                rows.append({"transfer_bytes": nbytes, "error": "register_memory != 0", "correct": None})
+                break
+
+            def _once():
+                transfer(target, src.data_ptr(), dst.data_ptr(), nbytes, stream.cuda_stream)
+            for _ in range(args.warmup):
+                _once()
+            torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            for _ in range(args.iters):
+                _once()
+            stream.synchronize()
+            torch.cuda.synchronize()
+            dt = time.perf_counter() - t0
+            eng.unregister_memory(src.data_ptr()); eng.unregister_memory(dst.data_ptr())
+        except Exception as exc:
+            rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None})
+            break
+        ms = (dt / args.iters) * 1e3
+        gb_s = (nbytes / (dt / args.iters)) / 1e9 if dt > 0 else 0.0
+        rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes),
+                     "block_bytes": nbytes, "num_blocks": 1,
+                     "time_ms": round(ms, 5), "bandwidth_gb_s": round(gb_s, 2), "correct": True})
+        peak = max(peak, gb_s)
+        del src, dst
+        torch.cuda.empty_cache()
+
+    groups = []
+    if any(r.get("bandwidth_gb_s") for r in rows):
+        meta = {"direction": "dtod-local", "layout": "contiguous", "backend": BACKEND,
+                "dtype": "uint8", "nodes": args.nodes,
+                "topology_class": args.topology_class,
+                "measurement_contract": MEASUREMENT_CONTRACT}
+        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})
+    status = "valid" if (groups and peak > 0.0) else "invalid"
+    _emit(groups, status, peak, [init_note, f"loopback target={target}"])
+    return 0 if status == "valid" else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/mori_io_transfer.py b/experimental/CollectiveX/tests/mori_io_transfer.py
new file mode 100644
index 000000000..572251b94
--- /dev/null
+++ b/experimental/CollectiveX/tests/mori_io_transfer.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""CollectiveX — MoRI-IO transfer benchmark (family=kv-cache, backend=mori-io).
+
+MoRI-IO (ROCm/mori `mori.io`) is AMD's RDMA point-to-point transfer engine — the AMD analog of
+NIXL, used for disaggregated-serving KV movement between GPUs/nodes. This benches its read path the
+way a prefill->decode KV handoff uses it: two IOEngines in one process (initiator + target, RDMA
+backend, mutual register_remote_engine), the initiator RDMA-reads the target's GPU buffer, swept
+over KV-block-sized payloads. Wall-clock latency + bandwidth (RDMA completion via InProgress()).
+
+This is the WIRED `mori-io` backend the goal's "KV-cache transfer backends" axis declared a stub.
+Runs only on the AMD MoRI image (CX_BENCH=mori-io on mi355x); elsewhere the import fails and the run
+records that — never faked. The mori.io API surface is DUMPED to stderr at startup so a GHA run's
+log is self-documenting (SSH into the MI355X container stalls on the shared cluster).
+
+  python tests/mori_io_transfer.py --runner mi355x --topology-class mi355x-xgmi \\
+      --transport rdma --env-json results/env.json --out results/mi355x_mori_io.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+import time
+
+SCHEMA_VERSION = 1
+MEASUREMENT_CONTRACT = "mori-io-transfer-v1"
+FAMILY = "kv-cache"
+BACKEND = "mori-io"
+
+DEFAULT_MIN_BYTES = 64 * 1024
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024
+DECODE_MAX_BYTES = 512 * 1024
+
+
+def size_class(nbytes: int) -> str:
+    return "prefill" if nbytes > DECODE_MAX_BYTES else "decode"  # sizes up to the cutoff are "decode"
+
+
+def _sizes(min_bytes: int, max_bytes: int, factor: int = 4):
+    """Geometric sweep min_bytes, min_bytes*factor, ... <= max_bytes; ValueError if it cannot grow."""
+    if min_bytes <= 0 or factor <= 1:  # such a sweep never passes max_bytes -> it would never end
+        raise ValueError(f"need min_bytes > 0 and factor > 1 (got {min_bytes}, {factor})")
+    steps = range(max(max_bytes, 1).bit_length())  # factor >= 2, so at most log2(max_bytes)+1 entries
+    return [min_bytes * factor ** k for k in steps if min_bytes * factor ** k <= max_bytes]
+
+
+def comparison_key(meta: dict) -> str:
+    order = ("direction", "layout", "backend", "dtype", "nodes", "topology_class", "measurement_contract")
+    joined = "|".join(str(meta[k]) if k == "nodes" else meta[k] for k in order)  # only `nodes` is stringified
+    return hashlib.sha256(joined.encode()).hexdigest()[:16]  # first 16 hex chars of the digest
+
+
+def _introspect(mod):  # log mod's first 40 public names + the installed "mori" version to stderr
+    exported = [name for name in dir(mod) if not name.startswith("_")]
+    try:
+        from importlib import metadata
+        version = metadata.version("mori")
+    except Exception as err:
+        version = f"<{err!r}>"
+    print("MORI_IO_API " + json.dumps({"mori_io_exports": exported[:40], "mori_version": version}), file=sys.stderr, flush=True)
+
+
+def _make_engines(io):
+    """Build the initiator and target IOEngines on 127.0.0.1 (ports 8080 / 8081), give each an RDMA
+    backend and register them with one another. Mirrors examples/io/example.py."""
+    def _engine(key, port):
+        return io.IOEngine(key=key, config=io.IOEngineConfig(host="127.0.0.1", port=port))
+    initiator = _engine("cx_initiator", 8080)
+    target = _engine("cx_target", 8081)
+    backend_cfg = io.RdmaBackendConfig(qp_per_transfer=1)
+    for engine in (initiator, target):
+        engine.create_backend(io.BackendType.RDMA, backend_cfg)
+    initiator.register_remote_engine(target.get_engine_desc())
+    target.register_remote_engine(initiator.get_engine_desc())
+    return initiator, target
+
+
+def _bench_one(initiator, target, src_t, dst_t, nbytes, warmup, iters):
+    """Register src (initiator) + dst (target), RDMA-read dst->src `iters` times after `warmup` untimed
+    reads, polling each to completion. Returns (latency_ms, gb_s); raises RuntimeError on a failed read."""
+    im = initiator.register_torch_tensor(src_t)
+    tm = target.register_torch_tensor(dst_t)
+
+    def _once():
+        uid = initiator.allocate_transfer_uid()
+        st = initiator.read(im, 0, tm, 0, nbytes, uid)
+        while st.InProgress():
+            pass
+        msg = st.Message() if hasattr(st, "Message") else ""
+        low = msg.lower()  # text matching is only the fallback: "ok" also hides in "broken"/"token"
+        by_text = any(w in low for w in ("fail", "error", "abort")) and not any(w in low for w in ("succ", "ok", "done"))
+        if (st.Failed() if callable(getattr(st, "Failed", None)) else by_text):  # prefer the status predicate
+            raise RuntimeError(f"mori-io read status: {msg}")
+
+    try:
+        for _ in range(warmup):
+            _once()
+        t0 = time.perf_counter()
+        for _ in range(iters):
+            _once()
+        dt = time.perf_counter() - t0
+    finally:
+        initiator.deregister_memory(im)
+        target.deregister_memory(tm)
+    ms = (dt / iters) * 1e3
+    gb_s = (nbytes / (dt / iters)) / 1e9 if dt > 0 else 0.0
+    return round(ms, 5), round(gb_s, 2)
+
+
+def main() -> int:  # CLI entry: sweep transfer sizes, write one JSON report to --out, return the exit code
+    ap = argparse.ArgumentParser(description="CollectiveX MoRI-IO transfer benchmark")
+    ap.add_argument("--direction", default="dtod-remote", choices=["dtod-remote"])  # accepted but not read below
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=30)
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="rdma")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    env = None  # stays None when --env-json is absent or the file does not exist
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    def _emit(groups, status, peak, notes):  # write the report (creating its directory) and print a summary line
+        doc = {"schema_version": SCHEMA_VERSION, "family": FAMILY,
+               "generated_by": "mori_io_transfer.py",
+               "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+               "runner": args.runner, "transport": args.transport,
+               "measurement_contract": MEASUREMENT_CONTRACT, "nodes": args.nodes,
+               "wired_backends": [BACKEND], "status": status,
+               "num_groups": len(groups), "groups": groups, "notes": notes, "environment": env}
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            json.dump(doc, fh, indent=2)
+            fh.write("\n")
+        print(f"mori-io: {len(groups)} groups -> {args.out} (status={status}, peak_bw={peak:.1f} GB/s)")
+        if notes:
+            print("notes: " + "; ".join(notes), file=sys.stderr)
+
+    try:
+        import torch
+    except Exception as exc:
+        _emit([], "invalid", 0.0, [f"torch unavailable: {exc!r}"])
+        return 3  # exit code 3 is used only for a missing torch; every other failure below returns 1
+    if not torch.cuda.is_available() or torch.cuda.device_count() < 2:  # the sweep places buffers on cuda:0 and cuda:1
+        _emit([], "invalid", 0.0,
+              [f"mori-io needs >=2 GPUs (RDMA p2p); have {torch.cuda.device_count() if torch.cuda.is_available() else 0}"])
+        return 1
+    try:
+        import mori.io as moriio
+    except Exception as exc:
+        _emit([], "invalid", 0.0, [f"mori.io import failed (needs the AMD MoRI image): {exc!r}"])
+        return 1
+    _introspect(moriio)
+    try:
+        if hasattr(moriio, "set_log_level"):
+            moriio.set_log_level("warning")
+        initiator, target = _make_engines(moriio)
+    except Exception as exc:
+        _emit([], "invalid", 0.0, [f"mori.io engine/backend init failed: {exc!r}"])
+        return 1
+
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+    notes = ["mori.io 2-engine RDMA loopback (GPU0<->GPU1)"]
+    rows, peak = [], 0.0
+    for nbytes in sizes:
+        try:
+            src = torch.empty(nbytes, dtype=torch.uint8, device="cuda:0")  # NOTE(review): both buffers are uninitialised
+            dst = torch.empty(nbytes, dtype=torch.uint8, device="cuda:1")  # and never compared after the transfer
+            ms, gb_s = _bench_one(initiator, target, src, dst, nbytes, args.warmup, args.iters)
+        except Exception as exc:
+            rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None})
+            break  # the first failing size ends the sweep; larger sizes are not attempted
+        rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes),
+                     "block_bytes": nbytes, "num_blocks": 1,
+                     "time_ms": ms, "bandwidth_gb_s": gb_s, "correct": True})  # True = transfer completed, contents unchecked
+        peak = max(peak, gb_s)
+        del src, dst
+        torch.cuda.empty_cache()
+
+    groups = []
+    if any(r.get("bandwidth_gb_s") for r in rows):  # a group is emitted only if some row has a non-zero bandwidth
+        meta = {"direction": "dtod-remote", "layout": "contiguous", "backend": BACKEND,
+                "dtype": "uint8", "nodes": args.nodes,
+                "topology_class": args.topology_class,
+                "measurement_contract": MEASUREMENT_CONTRACT}
+        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})
+    status = "valid" if (groups and peak > 0.0) else "invalid"
+    _emit(groups, status, peak, notes)
+    return 0 if status == "valid" else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/nccl_kv_transfer.py b/experimental/CollectiveX/tests/nccl_kv_transfer.py
new file mode 100644
index 000000000..0e77b88e1
--- /dev/null
+++ b/experimental/CollectiveX/tests/nccl_kv_transfer.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""CollectiveX — NCCL/RCCL KV-cache transfer benchmark (family=kv-cache, backend=nccl|rccl).
+
+The point-to-point KV handoff a disaggregated stack does over the collective library directly:
+torchrun with 2 ranks, rank 0 `dist.send`s KV-block-sized buffers to rank 1 (`dist.recv`), timed
+with CUDA events. NCCL on NVIDIA, RCCL on AMD/ROCm (same torch.distributed API) — so this is the
+WIRED `nccl`/`rccl` KV-cache backend the goal's "KV-cache transfer backends" axis declared a stub
+(the NCCL collective suite covers the all_reduce/all_gather primitives; this is the p2p KV path).
+
+Emits one kv-cache-family JSON (plots in the KV-cache tab next to memcpy/nixl/mori-io). Single
+(dir, backend, layout) group per run. Backend label = rccl on ROCm, nccl on CUDA.
+
+  torchrun --nproc_per_node=2 tests/nccl_kv_transfer.py --runner h200-dgxc \\
+      --topology-class h200-nvlink-island --transport nvlink \\
+      --env-json results/env.json --out results/h200_ncclkv.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1
+MEASUREMENT_CONTRACT = "nccl-kv-sendrecv-v1"
+FAMILY = "kv-cache"
+
+DEFAULT_MIN_BYTES = 64 * 1024
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024
+DECODE_MAX_BYTES = 512 * 1024
+
+
+def size_class(nbytes: int) -> str:
+    return "prefill" if nbytes > DECODE_MAX_BYTES else "decode"  # sizes up to the cutoff are "decode"
+
+
+def _sizes(min_bytes: int, max_bytes: int, factor: int = 4):
+    """Geometric sweep min_bytes, min_bytes*factor, ... <= max_bytes; ValueError if it cannot grow."""
+    if min_bytes <= 0 or factor <= 1:  # such a sweep never passes max_bytes -> it would never end
+        raise ValueError(f"need min_bytes > 0 and factor > 1 (got {min_bytes}, {factor})")
+    steps = range(max(max_bytes, 1).bit_length())  # factor >= 2, so at most log2(max_bytes)+1 entries
+    return [min_bytes * factor ** k for k in steps if min_bytes * factor ** k <= max_bytes]
+
+
+def comparison_key(meta: dict) -> str:
+    order = ("direction", "layout", "backend", "dtype", "nodes", "topology_class", "measurement_contract")
+    joined = "|".join(str(meta[k]) if k == "nodes" else meta[k] for k in order)  # only `nodes` is stringified
+    return hashlib.sha256(joined.encode()).hexdigest()[:16]  # first 16 hex chars of the digest
+
+
+def _bench_one(torch, dist, rank, send_buf, recv_buf, nbytes, warmup, iters):
+    """rank0 sends -> rank1 recvs `iters` times (after `warmup` untimed rounds), CUDA-event timed on the
+    calling rank; ranks >= 2 only join the barrier. Every rank returns (latency_ms, gb_s)."""
+    def _once():
+        if rank == 0:
+            dist.send(send_buf, dst=1)
+        elif rank == 1:  # rank 0 only ever sends to rank 1; a recv on any other rank would never return
+            dist.recv(recv_buf, src=0)
+    for _ in range(warmup):
+        _once()
+    torch.cuda.synchronize()
+    dist.barrier()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        _once()
+    end.record()
+    torch.cuda.synchronize()
+    ms = start.elapsed_time(end) / iters
+    gb_s = (nbytes / (ms / 1e3)) / 1e9 if ms > 0 else 0.0
+    return round(ms, 5), round(gb_s, 2)
+
+
+def main() -> int:  # per-rank entry point; only rank 0 collects rows and writes the JSON report
+    ap = argparse.ArgumentParser(description="CollectiveX NCCL/RCCL KV-cache transfer benchmark")
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=30)
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="nvlink")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3  # no report file is written on this path or the next one
+    if not torch.cuda.is_available():
+        print("ERROR: CUDA/ROCm not available", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))  # defaults describe a single, non-distributed process
+    world = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    dev = torch.device("cuda", local_rank)
+    # backend label tracks the actual transport library: RCCL on ROCm, NCCL on CUDA.
+    is_rocm = bool(getattr(torch.version, "hip", None))
+    backend_label = "rccl" if is_rocm else "nccl"
+
+    if world < 2:  # checked before the process group is created
+        if rank == 0:
+            _emit(args, [], "invalid", 0.0, [f"needs >=2 ranks (torchrun --nproc_per_node>=2); world={world}"],
+                  backend_label)
+        return 1
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl", init_method="env://", world_size=world, rank=rank)  # "nccl" is passed regardless of is_rocm
+
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+    rows = []
+    peak = 0.0
+    for nbytes in sizes:
+        try:
+            send_buf = torch.empty(nbytes, dtype=torch.uint8, device=dev) if rank == 0 else torch.empty(1, dtype=torch.uint8, device=dev)  # full payload only on the sender
+            recv_buf = torch.empty(nbytes, dtype=torch.uint8, device=dev) if rank == 1 else torch.empty(1, dtype=torch.uint8, device=dev)  # full payload only on the receiver
+            ms, gb_s = _bench_one(torch, dist, rank, send_buf, recv_buf, nbytes, args.warmup, args.iters)
+        except RuntimeError as exc:  # NOTE(review): only the raising rank leaves the loop; its peer may block in send/recv or the barrier below — confirm
+            if rank == 0:
+                rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None})
+            break
+        if rank == 0:
+            rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes),
+                         "block_bytes": nbytes, "num_blocks": 1,
+                         "time_ms": ms, "bandwidth_gb_s": gb_s, "correct": True})  # buffers are uninitialised; contents are not compared
+            peak = max(peak, gb_s)
+        del send_buf, recv_buf
+        torch.cuda.empty_cache()
+
+    dist.barrier()
+    if rank != 0:
+        dist.destroy_process_group()
+        return 0  # non-zero ranks exit 0 whatever status rank 0 reports
+
+    groups = []
+    if any(r.get("bandwidth_gb_s") for r in rows):  # a group is emitted only if some row has a non-zero bandwidth
+        meta = {"direction": "dtod-remote", "layout": "contiguous", "backend": backend_label,
+                "dtype": "uint8", "nodes": args.nodes,
+                "topology_class": args.topology_class,
+                "measurement_contract": MEASUREMENT_CONTRACT}
+        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})
+    status = "valid" if (groups and peak > 0.0) else "invalid"
+    _emit(args, groups, status, peak, [f"{backend_label} 2-rank send/recv (rank0->rank1)"], backend_label)
+    dist.destroy_process_group()
+    return 0 if status == "valid" else 1
+
+
+def _emit(args, groups, status, peak, notes, backend_label):
+    """Write the result document to args.out (parent directories created) and print a summary line."""
+    environment = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as env_file:
+            environment = json.load(env_file)
+    stamp = args.timestamp or _dt.datetime.now().astimezone().isoformat()
+    report = dict(schema_version=SCHEMA_VERSION, family=FAMILY, generated_by="nccl_kv_transfer.py",
+                  generated_at=stamp, runner=args.runner, transport=args.transport,
+                  measurement_contract=MEASUREMENT_CONTRACT, nodes=args.nodes,
+                  wired_backends=[backend_label], status=status, num_groups=len(groups),
+                  groups=groups, notes=notes, environment=environment)
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as out_file:
+        json.dump(report, out_file, indent=2)
+        out_file.write("\n")
+    print(f"{backend_label}-kv: {len(groups)} groups -> {args.out} (status={status}, peak_bw={peak:.1f} GB/s)")
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/nixl_transfer.py b/experimental/CollectiveX/tests/nixl_transfer.py
new file mode 100644
index 000000000..ef589ee2c
--- /dev/null
+++ b/experimental/CollectiveX/tests/nixl_transfer.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+"""CollectiveX — NIXL transfer benchmark (family=kv-cache, backend=nixl).
+
+NIXL (ai-dynamo/nixl) is the transfer fabric dynamo uses for disaggregated-serving KV movement.
+This benches its point-to-point transfer engine the way a prefill->decode KV handoff uses it: two
+NIXL agents in one process, one registers the source buffer and the other the destination, and the
+initiator posts a WRITE over the UCX backend (GPU<->GPU, GPU<->host). It sweeps KV-block-sized
+payloads and records wall-clock latency + bandwidth (NIXL transfers run on UCX's own streams, so
+CUDA events don't bound them — perf_counter around post+poll-to-DONE is the honest measure).
+
+This is the WIRED `nixl` backend for the goal's "KV-cache transfer backends" axis (kv_cache_transfer
+declared it a stub). It runs only in the NIXL/dynamo container (CX_BENCH=nixl switches CX_IMAGE to
+the tensorrtllm-runtime image); elsewhere the import fails and the run records that — never faked.
+
+The NIXL Python surface (version, Abseil, backends, agent methods) is DUMPED to stderr at startup so
+a GHA run's log is self-documenting even if the API drifted — SSH inspection of the NIXL container is
+not available. Emits one kv-cache-family JSON (plots in the KV-cache tab next to raw memcpy).
+
+  python tests/nixl_transfer.py --runner b300 --topology-class b300-nvlink-island \\
+      --transport nvlink --env-json results/env.json --out results/b300_nixl.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+import time
+
+SCHEMA_VERSION = 1
+MEASUREMENT_CONTRACT = "nixl-transfer-v1"
+FAMILY = "kv-cache"          # same family/schema as kv_cache_transfer.py -> plots in the KV-cache tab
+BACKEND = "nixl"
+
+DEFAULT_MIN_BYTES = 64 * 1024
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024
+DECODE_MAX_BYTES = 512 * 1024
+
+
+def size_class(nbytes: int) -> str:
+    return "prefill" if nbytes > DECODE_MAX_BYTES else "decode"  # sizes up to the cutoff are "decode"
+
+
+def _sizes(min_bytes: int, max_bytes: int, factor: int = 4):
+    """Geometric sweep min_bytes, min_bytes*factor, ... <= max_bytes; ValueError if it cannot grow."""
+    if min_bytes <= 0 or factor <= 1:  # such a sweep never passes max_bytes -> it would never end
+        raise ValueError(f"need min_bytes > 0 and factor > 1 (got {min_bytes}, {factor})")
+    steps = range(max(max_bytes, 1).bit_length())  # factor >= 2, so at most log2(max_bytes)+1 entries
+    return [min_bytes * factor ** k for k in steps if min_bytes * factor ** k <= max_bytes]
+
+
+def comparison_key(meta: dict) -> str:
+    order = ("direction", "layout", "backend", "dtype", "nodes", "topology_class", "measurement_contract")
+    joined = "|".join(str(meta[k]) if k == "nodes" else meta[k] for k in order)  # only `nodes` is stringified
+    return hashlib.sha256(joined.encode()).hexdigest()[:16]  # first 16 hex chars of the digest
+
+
+def _import_nixl():
+    """Return (nixl_agent, nixl_agent_config, import_path); tries nixl._api, then nixl, else raises."""
+    try:
+        from nixl._api import nixl_agent, nixl_agent_config  # canonical
+        return nixl_agent, nixl_agent_config, "nixl._api"
+    except Exception:  # any failure of the first path (not only ImportError) triggers the fallback
+        from nixl import nixl_agent, nixl_agent_config        # re-export; its error propagates to the caller
+        return nixl_agent, nixl_agent_config, "nixl"
+
+
+def _nixl_introspect(nixl_agent, nixl_agent_config):
+    """Log one `NIXL_API {json}` line to stderr and return the dict: installed nixl version, first 40
+    public names of nixl._bindings, public attributes of the agent class. Failed lookups become "<repr>"."""
+    report = {}
+    try:
+        from importlib import metadata
+        report["nixl_version"] = metadata.version("nixl")
+    except Exception as err:
+        report["nixl_version"] = f"<{err!r}>"
+    try:
+        import nixl._bindings as core   # only the module's public names are listed
+        report["bindings"] = [name for name in dir(core) if not name.startswith("_")][:40]
+    except Exception as err:
+        report["bindings"] = f"<{err!r}>"
+    report["agent_methods"] = [name for name in dir(nixl_agent) if not name.startswith("_")]
+    print("NIXL_API " + json.dumps(report), file=sys.stderr, flush=True)
+    return report
+
+
+def _make_agents(nixl_agent, nixl_agent_config):
+    """Two local agents (initiator + target) sharing one UCX-backend config. No metadata is exchanged
+    here; _bench_one does that after registering memory. Returns (initiator, target)."""
+    try:
+        cfg = nixl_agent_config(backends=["UCX"])
+    except TypeError:
+        cfg = nixl_agent_config(True, True, 0)   # positional fallback (older signature)
+    init = nixl_agent("cx_initiator", cfg)
+    targ = nixl_agent("cx_target", cfg)
+    return init, targ
+
+
+def _bench_one(init, targ, src_t, dst_t, nbytes, warmup, iters):
+    """Register src (initiator) + dst (target), post WRITE src->dst `iters` times, poll each to DONE.
+    Returns (latency_ms_per_xfer, gb_s); raises on a NIXL error. Both registrations are dropped on exit."""
+    src_reg = init.register_memory(src_t)   # kept so the buffers are deregistered before the caller frees them
+    dst_reg = targ.register_memory(dst_t)
+
+    def _once():
+        h = init.initialize_xfer("WRITE", src_descs, dst_descs, targ.name, b"cx")
+        try:
+            st, what = init.transfer(h), "post returned"
+            while st != "DONE":           # a post that already reports DONE needs no polling
+                if st == "ERR":
+                    raise RuntimeError(f"nixl transfer {what} ERR")
+                st, what = init.check_xfer_state(h), "state"
+        finally:
+            init.release_xfer_handle(h)   # released on success and on every error path
+
+    try:
+        init.add_remote_agent(targ.get_agent_metadata())
+        src_descs = init.get_xfer_descs([src_t])
+        dst_descs = init.get_xfer_descs([dst_t])
+        for _ in range(warmup):
+            _once()
+        t0 = time.perf_counter()
+        for _ in range(iters):
+            _once()
+        dt = time.perf_counter() - t0
+    finally:
+        init.deregister_memory(src_reg)
+        targ.deregister_memory(dst_reg)
+    ms = (dt / iters) * 1e3
+    gb_s = (nbytes / (dt / iters)) / 1e9 if dt > 0 else 0.0
+    return round(ms, 5), round(gb_s, 2)
+
+
+def _alloc(torch, where, nbytes):
+    buf = torch.empty(nbytes, dtype=torch.uint8, device=where)
+    # only the "cpu" placement goes through pin_memory(); any other device string is used as allocated
+    return buf.pin_memory() if where == "cpu" else buf
+
+
+def run_direction(torch, init, targ, direction, sizes, warmup, iters, ngpu):
+    """Sweep `sizes` for one direction. Returns (rows, None), or ([], reason) when the direction is
+    unknown or needs a second GPU. The first failing size gets an error row and ends the sweep."""
+    # (source device, destination device) per direction; resolved once instead of once per size
+    endpoints = {"dtod-local": ("cuda:0", "cuda:0"), "dtod-remote": ("cuda:0", "cuda:1"),
+                 "dtoh": ("cuda:0", "cpu"), "htod": ("cpu", "cuda:0")}
+    if direction not in endpoints:
+        return [], f"unknown direction {direction}"
+    if direction == "dtod-remote" and ngpu < 2:
+        return [], "n/a (needs >=2 GPUs)"
+    src_dev, dst_dev = endpoints[direction]
+
+    rows = []
+    for nbytes in sizes:
+        try:
+            src = _alloc(torch, src_dev, nbytes)
+            dst = _alloc(torch, dst_dev, nbytes)
+            ms, gb_s = _bench_one(init, targ, src, dst, nbytes, warmup, iters)
+        except Exception as exc:
+            # any failure (not only RuntimeError) ends the sweep but keeps the rows measured so far
+            rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None})
+            break
+        rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes),
+                     "block_bytes": nbytes, "num_blocks": 1,
+                     "time_ms": ms, "bandwidth_gb_s": gb_s, "correct": True})
+        del src, dst
+        torch.cuda.empty_cache()
+    return rows, None
+
+
+def main() -> int:  # CLI entry: run the requested directions, write one JSON report to --out, return the exit code
+    ap = argparse.ArgumentParser(description="CollectiveX NIXL transfer benchmark")
+    ap.add_argument("--direction", default="all",
+                    choices=["all", "dtod-local", "dtod-remote", "dtoh", "htod"])
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=30)
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--nodes", type=int, default=1)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    notes = []
+    env = None  # stays None when --env-json is absent or the file does not exist
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    def _emit(groups, status, peak, extra_notes):  # write the report (creating its directory) and print a summary line
+        doc = {"schema_version": SCHEMA_VERSION, "family": FAMILY,
+               "generated_by": "nixl_transfer.py",
+               "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+               "runner": args.runner, "transport": args.transport,
+               "measurement_contract": MEASUREMENT_CONTRACT, "nodes": args.nodes,
+               "wired_backends": [BACKEND], "status": status,
+               "num_groups": len(groups), "groups": groups,
+               "notes": extra_notes, "environment": env}
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            json.dump(doc, fh, indent=2)
+            fh.write("\n")
+        print(f"nixl-transfer: {len(groups)} groups -> {args.out} (status={status}, "
+              f"peak_bw={peak:.1f} GB/s)")
+        if extra_notes:
+            print("notes: " + "; ".join(extra_notes), file=sys.stderr)
+
+    try:
+        import torch
+    except Exception as exc:
+        _emit([], "invalid", 0.0, [f"torch unavailable: {exc!r}"])
+        return 3  # exit code 3: torch or CUDA missing; later failures return 1
+    if not torch.cuda.is_available():
+        _emit([], "invalid", 0.0, ["CUDA not available"])
+        return 3
+
+    try:
+        nixl_agent, nixl_agent_config, path = _import_nixl()
+        notes.append(f"nixl imported via {path}")
+    except Exception as exc:
+        _emit([], "invalid", 0.0,
+              [f"nixl import failed (needs the NIXL/dynamo container): {exc!r}"])
+        return 1
+    _nixl_introspect(nixl_agent, nixl_agent_config)
+    try:
+        init, targ = _make_agents(nixl_agent, nixl_agent_config)
+    except Exception as exc:
+        _emit([], "invalid", 0.0, [f"nixl agent init failed: {exc!r}"])
+        return 1
+
+    ngpu = torch.cuda.device_count()
+    directions = (["dtod-local", "dtod-remote", "dtoh", "htod"]
+                  if args.direction == "all" else [args.direction])
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+
+    groups, peak = [], 0.0
+    for direction in directions:
+        try:
+            rows, na = run_direction(torch, init, targ, direction, sizes, args.warmup, args.iters, ngpu)
+        except Exception as exc:  # an exception escaping run_direction drops that whole direction; only a note is kept
+            notes.append(f"{direction}: {exc!r}")
+            continue
+        if na:
+            notes.append(f"{direction}: {na}")
+            continue
+        timed = [r for r in rows if r.get("bandwidth_gb_s")]  # error rows carry no bandwidth and are excluded here
+        if not timed:
+            continue
+        peak = max(peak, max(r["bandwidth_gb_s"] for r in timed))
+        meta = {"direction": direction, "layout": "contiguous", "backend": BACKEND,
+                "dtype": "uint8", "nodes": args.nodes,
+                "topology_class": args.topology_class,
+                "measurement_contract": MEASUREMENT_CONTRACT}
+        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})  # stored rows include any error row
+
+    status = "valid" if (groups and peak > 0.0) else "invalid"
+    _emit(groups, status, peak, notes)
+    return 0 if status == "valid" else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/offload_bench.py b/experimental/CollectiveX/tests/offload_bench.py
new file mode 100644
index 000000000..a338a3a4d
--- /dev/null
+++ b/experimental/CollectiveX/tests/offload_bench.py
@@ -0,0 +1,446 @@
+#!/usr/bin/env python3
+"""CollectiveX — CPU<->GPU offload suite (goal P2 "CPU-GPU offload suite").
+
+Measures host<->device memcpy bandwidth + latency over a size sweep, for the
+four sub-ops {h2d, d2h} x {pinned, pageable}, plus two diagnostics that matter
+for real offload (KV spill, weight streaming, activation checkpointing):
+
+  * NUMA locality   — which NUMA node the host buffer landed on, and (best
+                      effort, if numactl/affinity is available) a node-pinned
+                      vs default comparison. Recorded, never required.
+  * overlap-w-compute — a copy stream running concurrently with a dummy GEMM on
+                      a separate compute stream; reports achieved overlap %
+                      (how much of the copy is hidden behind compute).
+
+Matches run_nccl.py's result CONVENTION (family/runner/op/rows/comparison_key/
+status/transport/environment/generated_at) and env_capture.py's provenance
+style, so the plot + collector consume it uniformly.
+
+Stdlib + torch. torch is needed ONLY at runtime on the GPU; --help and
+--parse-only work without it (the JSON writer + CLI are import-safe).
+
+Run (inside the container, 1 GPU is enough):
+    python tests/offload_bench.py \\
+        --runner h200 --topology-class h200-nvlink-island --transport pcie \\
+        --env-json results/env.json --out results/h200_offload.json
+
+Verify offline (no GPU/torch needed):
+    python tests/offload_bench.py --parse-only --runner h200 \\
+        --topology-class h200-nvlink-island --out /tmp/parsed.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+# Identity fields that build_doc() copies into every emitted result document.
+SCHEMA_VERSION = 1
+FAMILY = "offload"
+MEASUREMENT_CONTRACT = "host-device-memcpy-v1"
+GENERATED_BY = "offload_bench.py"
+
+# (direction, host_memory) sub-ops. h2d = host->device (CPU->GPU), d2h = the reverse.
+# run_gpu() sweeps the whole size ladder for each pair, in this order, and
+# build_doc() derives one curve key per pair.
+SUBOPS = [
+    ("h2d", "pinned"),
+    ("h2d", "pageable"),
+    ("d2h", "pinned"),
+    ("d2h", "pageable"),
+]
+
+# Default byte sweep: 4 KiB .. 256 MiB by x4. Covers decode-token-sized spills
+# up to prefix-cache / weight-shard sized streams.
+# These are the argparse defaults for --min-bytes, --max-bytes and --factor.
+DEFAULT_MIN_BYTES = 4 * 1024
+DEFAULT_MAX_BYTES = 256 * 1024 * 1024
+DEFAULT_FACTOR = 4
+
+
+# --------------------------------------------------------------------------- #
+# import-safe helpers (no torch)                                              #
+# --------------------------------------------------------------------------- #
+def _human(n: int) -> str:
+    """Format a byte count with a binary unit suffix (B, KiB, MiB or GiB).
+
+    The value is divided by 1024 until it drops below 1024 or GiB is reached,
+    then printed with no decimal places (so 1536 renders as "2KiB").
+    """
+    value = n
+    for suffix in ("B", "KiB", "MiB"):
+        if value < 1024:
+            return f"{value:.0f}{suffix}"
+        value = value / 1024
+    # Whatever is still >= 1024 MiB is reported in GiB, however large it is.
+    return f"{value:.0f}GiB"
+
+
+def size_ladder(min_bytes: int, max_bytes: int, factor: int) -> list[int]:
+    """Geometric size sweep: min_bytes, min_bytes*factor, ... up to max_bytes.
+
+    Args:
+        min_bytes: first size in the ladder (coerced with int()).
+        max_bytes: inclusive upper bound (coerced with int()).
+        factor: multiplier applied between consecutive sizes.
+
+    Returns:
+        The ascending list of sizes; [] when min_bytes > max_bytes.
+
+    Raises:
+        ValueError: if min_bytes < 1 or factor <= 1. In either case the running
+            size would never exceed max_bytes, so the loop below would spin
+            forever while the list grows without bound.
+    """
+    lo, hi = int(min_bytes), int(max_bytes)
+    if lo > hi:
+        return []
+    if lo < 1:
+        raise ValueError(f"min_bytes must be >= 1, got {min_bytes!r}")
+    if factor <= 1:
+        raise ValueError(f"factor must be > 1, got {factor!r}")
+    sizes, s = [], lo
+    while s <= hi:
+        sizes.append(s)
+        s *= factor
+    return sizes
+
+
+def comparison_key(meta: dict) -> str:
+    """Return the deterministic 16-hex-character curve key for one result series.
+
+    The key is the truncated SHA-256 of the "|"-joined identity fields listed
+    below. transport and topology_class take part so that, for example, a PCIe
+    H200 curve and an NVLink-C2C GB200 curve get different keys instead of being
+    silently overlaid.
+    """
+    identity_fields = (
+        "op",
+        "host_memory",
+        "dtype",
+        "transport",
+        "topology_class",
+        "comparison_class",
+        "measurement_contract",
+    )
+    # Field order is part of the key: reordering would change every digest.
+    joined = "|".join(str(meta[name]) for name in identity_fields)
+    return hashlib.sha256(joined.encode()).hexdigest()[:16]
+
+
+def _load_env(path: str | None) -> dict | None:
+    """Load the JSON file at path, or return None when there is nothing to load.
+
+    A falsy path, or a path that does not exist, yields None. A file that exists
+    but does not hold valid JSON still raises from json.load.
+    """
+    if not path or not os.path.exists(path):
+        return None
+    with open(path) as handle:
+        return json.load(handle)
+
+
+def _provenance() -> dict:
+    """Collect container-image and CI-run provenance from environment variables.
+
+    Returns a dict holding the image fields, the normalized machine architecture
+    and a "git_run" sub-dict, which is None when none of the run-related
+    variables is set.
+    """
+    import platform as _plat
+
+    env = os.environ.get
+    machine = _plat.machine()
+    # Map the kernel's machine name to amd64/arm64; anything else passes through.
+    image_arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(machine, machine)
+
+    git_run = {
+        "run_id": env("GITHUB_RUN_ID"),
+        "run_attempt": env("GITHUB_RUN_ATTEMPT"),
+        "ref": env("GITHUB_REF_NAME") or env("GITHUB_REF"),
+        "source_sha": env("COLLECTIVEX_SOURCE_SHA") or env("GITHUB_SHA"),
+        "repo": env("GITHUB_REPOSITORY"),
+        "job": env("GITHUB_JOB"),
+        "artifact": env("COLLECTIVEX_ARTIFACT_NAME"),
+    }
+    if not any(git_run.values()):
+        git_run = None
+
+    return {
+        "image": env("COLLECTIVEX_IMAGE", ""),
+        "image_digest": env("COLLECTIVEX_IMAGE_DIGEST", ""),
+        "image_arch": image_arch,
+        "squash_sha256": env("COLLECTIVEX_SQUASH_SHA256"),
+        "git_run": git_run,
+    }
+
+
+def _numa_locality() -> dict:
+    """Best-effort NUMA context. Never required; degrades to nulls off-NUMA.
+
+    Records the process's allowed NUMA node(s) and CPU affinity so a result that
+    happened to land cross-socket from the GPU is identifiable after the fact.
+
+    Returns a dict with the keys available, process_node, membind,
+    cpus_allowed_list, node_count and source. Every probe is optional: a missing
+    tool, a timeout or an OSError leaves the corresponding fields at None.
+    """
+    info: dict = {
+        "available": False,
+        "process_node": None,
+        "membind": None,
+        "cpus_allowed_list": None,
+        "node_count": None,
+        "source": None,
+    }
+    # numactl --show is the preferred read. Without it, only the sysfs node count
+    # and the scheduler affinity gathered further down are recorded.
+    import shutil
+    import subprocess
+
+    if shutil.which("numactl"):
+        try:
+            out = subprocess.run(
+                ["numactl", "--show"], capture_output=True, text=True, timeout=10, check=False
+            )
+            if out.returncode == 0:
+                info["available"] = True
+                info["source"] = "numactl --show"
+                # Values are kept as the raw text after the colon, not parsed into ints.
+                for line in out.stdout.splitlines():
+                    if line.startswith("nodebind:"):
+                        info["process_node"] = line.split(":", 1)[1].strip()
+                    elif line.startswith("membind:"):
+                        info["membind"] = line.split(":", 1)[1].strip()
+        except (OSError, subprocess.TimeoutExpired):
+            # Deliberately silent: this is a diagnostic, never a failure condition.
+            pass
+    # node count from sysfs (independent of numactl)
+    try:
+        nodes = [d for d in os.listdir("/sys/devices/system/node") if d.startswith("node")]
+        if nodes:
+            info["node_count"] = len(nodes)
+    except OSError:
+        pass
+    # CPU affinity of this process (which cores -> which socket -> NUMA hint)
+    try:
+        if hasattr(os, "sched_getaffinity"):
+            info["cpus_allowed_list"] = sorted(os.sched_getaffinity(0))
+            # Only claim to be the source when numactl did not already provide one.
+            if info["source"] is None:
+                info["available"] = True
+                info["source"] = "os.sched_getaffinity"
+    except OSError:
+        pass
+    return info
+
+
+# --------------------------------------------------------------------------- #
+# GPU path (torch only here)                                                  #
+# --------------------------------------------------------------------------- #
+def _bench_one(torch, direction: str, host_memory: str, nbytes: int,
+               dtype, warmup: int, iters: int) -> dict:
+    """Time a single (direction, host_memory, size) point with CUDA events.
+
+    Returns latency (us) and bandwidth (GB/s, decimal). Uses non_blocking=True so
+    pinned transfers actually go async on the copy engine; pageable is implicitly
+    synchronous (the staging copy serializes), which is the honest contrast.
+
+    Args:
+        torch: the imported torch module (passed in so this file imports without it).
+        direction: "h2d" copies host -> device; any other value copies device -> host.
+        host_memory: "pinned" allocates the host buffer with pin_memory=True;
+            any other value leaves it pageable.
+        nbytes: requested transfer size in bytes.
+        dtype: torch dtype of both buffers.
+        warmup: number of untimed copies issued before timing.
+        iters: number of timed copies; the reported figures are the per-copy mean.
+
+    Returns:
+        dict with size_bytes (bytes actually moved), requested_bytes, latency_us
+        and bandwidth_gbps. Despite its name, bandwidth_gbps holds decimal
+        gigabytes per second (bytes / seconds / 1e9), not gigabits.
+    """
+    elem = torch.tensor([], dtype=dtype).element_size()
+    # Whole elements only, and never fewer than one, so the moved size can differ
+    # from nbytes; both are reported.
+    n = max(1, nbytes // elem)
+    pin = host_memory == "pinned"
+
+    host = torch.empty(n, dtype=dtype, device="cpu", pin_memory=pin)
+    dev = torch.empty(n, dtype=dtype, device="cuda")
+    if direction == "h2d":
+        src, dst = host, dev
+    else:
+        src, dst = dev, host
+
+    non_blocking = pin  # pageable cannot be truly async
+
+    for _ in range(warmup):
+        dst.copy_(src, non_blocking=non_blocking)
+    torch.cuda.synchronize()
+
+    # One event pair brackets the whole loop; the mean per-copy time is derived below.
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        dst.copy_(src, non_blocking=non_blocking)
+    end.record()
+    torch.cuda.synchronize()
+
+    elapsed_ms = start.elapsed_time(end)
+    # NOTE(review): iters == 0 would divide by zero here; the CLI default is 20.
+    avg_ms = elapsed_ms / iters
+    actual_bytes = n * elem
+    gbps = (actual_bytes / (avg_ms / 1e3)) / 1e9 if avg_ms > 0 else 0.0
+    return {
+        "size_bytes": actual_bytes,
+        "requested_bytes": nbytes,
+        "latency_us": round(avg_ms * 1e3, 4),
+        "bandwidth_gbps": round(gbps, 3),
+    }
+
+
+def _overlap_with_compute(torch, nbytes: int, dtype, iters: int) -> dict:
+    """Run a pinned H2D copy concurrently with a dummy GEMM on a separate stream
+    and report achieved overlap %.
+
+    overlap_pct = (copy_alone + gemm_alone - concurrent) / min(copy_alone,
+    gemm_alone) * 100, clamped to [0, 100]. 100% means the shorter of the two
+    operations was fully hidden behind the other; ~0% means the copy stream and
+    compute stream serialized (e.g. PCIe contention or no copy engine free).
+    Best-effort and labelled — it is a diagnostic, not a curve point.
+
+    Args:
+        torch: the imported torch module.
+        nbytes: requested copy size in bytes (rounded down to whole elements, min 1).
+        dtype: torch dtype of the copy buffers; the GEMM always uses float16.
+        iters: repetitions per timed phase; every reported time is a per-iteration mean.
+
+    Returns:
+        dict with size_bytes, copy_alone_us, gemm_alone_us, concurrent_us,
+        serial_sum_us, overlap_pct and gemm_shape.
+    """
+    elem = torch.tensor([], dtype=dtype).element_size()
+    n = max(1, nbytes // elem)
+    host = torch.empty(n, dtype=dtype, device="cpu", pin_memory=True)
+    dev = torch.empty(n, dtype=dtype, device="cuda")
+
+    # Dummy GEMM operands (m x m, float16).
+    # NOTE(review): nothing here checks that the GEMM outlasts the copy; the
+    # overlap figure is normalized by the shorter of the two either way.
+    m = 2048
+    a = torch.randn(m, m, device="cuda", dtype=torch.float16)
+    b = torch.randn(m, m, device="cuda", dtype=torch.float16)
+
+    copy_stream = torch.cuda.Stream()
+    compute_stream = torch.cuda.Stream()
+
+    def _time(fn) -> float:
+        # Returns the mean milliseconds per iteration, so fn must run its
+        # operation exactly `iters` times.
+        torch.cuda.synchronize()
+        s = torch.cuda.Event(enable_timing=True)
+        e = torch.cuda.Event(enable_timing=True)
+        s.record()
+        fn()
+        e.record()
+        torch.cuda.synchronize()
+        return s.elapsed_time(e) / iters
+
+    # warmup both paths
+    for _ in range(3):
+        dev.copy_(host, non_blocking=True)
+        torch.matmul(a, b)
+    torch.cuda.synchronize()
+
+    # Baselines: each operation alone, issued without selecting a side stream.
+    copy_ms = _time(lambda: [dev.copy_(host, non_blocking=True) for _ in range(iters)])
+    gemm_ms = _time(lambda: [torch.matmul(a, b) for _ in range(iters)])
+
+    def _overlapped():
+        for _ in range(iters):
+            with torch.cuda.stream(copy_stream):
+                dev.copy_(host, non_blocking=True)
+            with torch.cuda.stream(compute_stream):
+                torch.matmul(a, b)
+        # Both side streams are drained before _time records its end event.
+        copy_stream.synchronize()
+        compute_stream.synchronize()
+
+    both_ms = _time(_overlapped)
+
+    serial = copy_ms + gemm_ms
+    # Hidden time = how much shorter "both concurrent" is than running them back to back.
+    hidden = max(0.0, serial - both_ms)
+    # As a fraction of the SMALLER of the two (the most that can be hidden is min).
+    hideable = min(copy_ms, gemm_ms)
+    overlap_pct = (hidden / hideable * 100.0) if hideable > 0 else 0.0
+    overlap_pct = max(0.0, min(100.0, overlap_pct))
+    return {
+        "size_bytes": n * elem,
+        "copy_alone_us": round(copy_ms * 1e3, 4),
+        "gemm_alone_us": round(gemm_ms * 1e3, 4),
+        "concurrent_us": round(both_ms * 1e3, 4),
+        "serial_sum_us": round(serial * 1e3, 4),
+        "overlap_pct": round(overlap_pct, 1),
+        "gemm_shape": [m, m, m],
+    }
+
+
+def run_gpu(args) -> tuple[list[dict], dict, str | None]:
+    """Run the full sub-op x size sweep plus diagnostics on the GPU.
+
+    Returns (rows, diagnostics, error). When torch cannot be imported or reports
+    no CUDA device, rows and diagnostics are empty and error explains why; the
+    caller turns that into status=invalid rather than fabricating a row. A point
+    that raises RuntimeError (e.g. OOM at the top of the ladder) is kept as a row
+    with null measurements and an "error" field.
+    """
+    try:
+        import torch
+    except Exception as exc:  # pragma: no cover - runtime/GPU only
+        return [], {}, f"torch unavailable: {exc!r}"
+    if not torch.cuda.is_available():
+        return [], {}, "torch.cuda.is_available() is False (no GPU in this container)"
+
+    torch_dtypes = {"float16": torch.float16, "bfloat16": torch.bfloat16,
+                    "float32": torch.float32, "uint8": torch.uint8}
+    dtype = torch_dtypes[args.dtype]
+    ladder = size_ladder(args.min_bytes, args.max_bytes, args.factor)
+
+    def _measure(direction: str, host_memory: str, nbytes: int) -> dict:
+        # One sweep point; a failed point is reported, never dropped.
+        try:
+            point = _bench_one(torch, direction, host_memory, nbytes, dtype,
+                               args.warmup, args.iters)
+        except RuntimeError as exc:  # OOM at the top of the ladder, etc.
+            return {
+                "op": direction, "host_memory": host_memory,
+                "size_bytes": nbytes, "requested_bytes": nbytes,
+                "latency_us": None, "bandwidth_gbps": None,
+                "error": repr(exc),
+            }
+        point["op"] = direction
+        point["host_memory"] = host_memory
+        return point
+
+    rows: list[dict] = [
+        _measure(direction, host_memory, nbytes)
+        for direction, host_memory in SUBOPS
+        for nbytes in ladder
+    ]
+
+    diagnostics: dict = {"numa": _numa_locality()}
+    if args.no_overlap:
+        return rows, diagnostics, None
+    try:
+        overlap = _overlap_with_compute(
+            torch, args.overlap_bytes, dtype, max(5, args.iters))
+    except Exception as exc:  # best-effort diagnostic
+        overlap = {"error": repr(exc)}
+    diagnostics["overlap_with_compute"] = overlap
+    return rows, diagnostics, None
+
+
+# --------------------------------------------------------------------------- #
+# document assembly + CLI                                                      #
+# --------------------------------------------------------------------------- #
+def build_doc(args, rows: list[dict], diagnostics: dict, error: str | None) -> dict:
+    """Assemble the result document that main() serializes to JSON.
+
+    Each row is tagged in place with the comparison_key of its (op, host_memory)
+    curve. status is "valid" only when at least one row reports a positive
+    bandwidth; a run with no such row did not actually transfer anything.
+    """
+    bandwidths = [r["bandwidth_gbps"] for r in rows if r.get("bandwidth_gbps")]
+    peak_bw = max(bandwidths, default=0.0)
+    status = "valid" if (bandwidths and peak_bw > 0.0) else "invalid"
+
+    # Whole-suite identity; the per-curve keys override op and host_memory.
+    suite_meta = {
+        "op": "host_device_copy",
+        "host_memory": "mixed",
+        "dtype": args.dtype,
+        "transport": args.transport,
+        "topology_class": args.topology_class,
+        "comparison_class": args.comparison_class,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+    }
+    # One key per (op, host_memory) so the plotter can overlay pinned-vs-pageable
+    # and h2d-vs-d2h as distinct curves.
+    curve_keys = {
+        f"{direction}/{host_memory}": comparison_key(
+            {**suite_meta, "op": direction, "host_memory": host_memory})
+        for direction, host_memory in SUBOPS
+    }
+    for row in rows:
+        row["comparison_key"] = curve_keys.get(f"{row['op']}/{row['host_memory']}")
+
+    generated_at = args.timestamp or _dt.datetime.now().astimezone().isoformat()
+    sweep = {"min_bytes": args.min_bytes, "max_bytes": args.max_bytes,
+             "factor": args.factor, "warmup": args.warmup, "iters": args.iters}
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "family": FAMILY,
+        "generated_by": GENERATED_BY,
+        "generated_at": generated_at,
+        "runner": args.runner,
+        "transport": args.transport,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+        "topology_class": args.topology_class,
+        "comparison_class": args.comparison_class,
+        "dtype": args.dtype,
+        "sub_ops": [f"{d}/{h}" for d, h in SUBOPS],
+        # Top-level key identifies the whole suite (op=host_device_copy); the
+        # per-row keys assigned above drive curve overlays.
+        "comparison_key": comparison_key(suite_meta),
+        "curve_keys": curve_keys,
+        "status": status,
+        "error": error,
+        "peak_bandwidth_gbps": round(peak_bw, 3),
+        "sweep": sweep,
+        "num_rows": len(rows),
+        "rows": rows,
+        "diagnostics": diagnostics,
+        "provenance": _provenance(),
+        "environment": _load_env(args.env_json),
+    }
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:
+    """Declare the offload-suite command line (sweep knobs, then provenance)."""
+    parser = argparse.ArgumentParser(description="CollectiveX CPU<->GPU offload suite")
+    add = parser.add_argument
+    # sweep knobs
+    add("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    add("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    add("--factor", type=int, default=DEFAULT_FACTOR, help="size step factor")
+    add("--dtype", default="float16",
+        choices=["float16", "bfloat16", "float32", "uint8"])
+    add("--warmup", type=int, default=5)
+    add("--iters", type=int, default=20)
+    add("--no-overlap", action="store_true",
+        help="skip the overlap-with-compute diagnostic")
+    add("--overlap-bytes", type=int, default=16 * 1024 * 1024,
+        help="copy size for the overlap-with-compute diagnostic")
+    add("--parse-only", action="store_true",
+        help="emit a well-formed (status=invalid) doc with no GPU — schema check")
+    # provenance
+    add("--runner", required=True, help="runner label, e.g. h200")
+    add("--topology-class", required=True,
+        help="e.g. h200-nvlink-island, gb200-nvl72-c2c")
+    add("--transport", default="pcie",
+        help="observed host<->device transport: pcie | nvlink-c2c")
+    add("--comparison-class", default="standardized",
+        choices=["standardized", "backend-optimized", "framework-integrated"])
+    add("--env-json", help="path to env_capture.py output to embed")
+    add("--timestamp", help="ISO timestamp (default now)")
+    add("--out", required=True)
+    return parser
+
+
+def main() -> int:
+    """Parse the CLI, run (or skip) the GPU sweep, write the JSON document.
+
+    Returns 0 when the document's status is "valid" and 1 otherwise, which
+    includes every --parse-only run because that path produces no rows.
+    """
+    args = _build_arg_parser().parse_args()
+
+    if not args.parse_only:
+        rows, diagnostics, error = run_gpu(args)
+    else:
+        rows, diagnostics, error = [], {"numa": _numa_locality()}, "parse-only (no GPU run)"
+
+    doc = build_doc(args, rows, diagnostics, error)
+
+    # Create the output directory on demand, then write pretty-printed JSON.
+    out_dir = os.path.dirname(os.path.abspath(args.out))
+    os.makedirs(out_dir, exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+
+    # One-line summary on stderr; overlap prints as None% when the diagnostic is absent.
+    ov = doc["diagnostics"].get("overlap_with_compute", {})
+    summary = (
+        f"offload: {doc['num_rows']} rows -> {args.out} "
+        f"(status={doc['status']}, peak_bw={doc['peak_bandwidth_gbps']} GB/s, "
+        f"overlap={ov.get('overlap_pct')}%, key={doc['comparison_key']})"
+    )
+    print(summary, file=sys.stderr)
+    return 0 if doc["status"] == "valid" else 1
+
+
+# Script entry point: exit 0 when main() reports a valid document, 1 otherwise.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/probe_mori_caps.py b/experimental/CollectiveX/tests/probe_mori_caps.py
new file mode 100644
index 000000000..19ae6e9ed
--- /dev/null
+++ b/experimental/CollectiveX/tests/probe_mori_caps.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Read-only MoRI capability probe (run under torchrun on MI355X, 8 ranks).
+
+Decides whether 'fp8' enters MoRIBackend.SUPPORTED_PRECISIONS: inspects
+EpDispatchCombineConfig for quant_type options + the scale plumbing, then attempts a
+small fp8 dispatch/combine. Prints MORI_FP8_OK (with the working quant_type + recon
+error) or MORI_FP8_FAIL (with the exception) — that verdict gates the reject matrix.
+LL is not probed: MoRI exposes no separate low-latency entrypoint (caps exclude it).
+"""
+import inspect
+import os
+import sys
+import traceback
+
+import torch
+import torch.distributed as dist
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import routing  # noqa: E402
+
+# Set MORI_SHMEM_HEAP_SIZE when the caller has not: use CX_MORI_HEAP_SIZE if it
+# is present, else "2G". This runs at module import, before main() imports mori.
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+
+def main() -> int:
+    """Probe whether MoRI accepts an fp8 quant_type for dispatch on this system.
+
+    Initializes torch.distributed from the RANK / WORLD_SIZE / LOCAL_RANK
+    environment, prints what mori.ops exposes (rank 0 only), then tries each
+    candidate quant_type with a small dispatch until one succeeds. The verdict is
+    MIN-reduced across ranks, so it is OK only if every rank found a working
+    candidate. Rank 0 prints MORI_FP8_OK or MORI_FP8_FAIL.
+
+    The function does not return: it ends the process with os._exit, using
+    status 0 on success and 7 on failure.
+    """
+    rank = int(os.environ.get("RANK", "0"))
+    world = int(os.environ.get("WORLD_SIZE", "1"))
+    local = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local)
+    device = torch.device(f"cuda:{local}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12399")
+    dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world,
+                            device_id=device)
+    # Imported here rather than at module top, after the process group exists.
+    import mori
+
+    if rank == 0:
+        p = torch.cuda.get_device_properties(0)
+        print(f"[mori] device={p.name} cus={p.multi_processor_count}")
+        print("[mori] EpDispatchCombineConfig sig:")
+        try:
+            print("   ", inspect.signature(mori.ops.EpDispatchCombineConfig))
+        except Exception as e:
+            print("    <no sig>", repr(e))
+        # surface any quant enum the module exposes
+        for name in dir(mori.ops):
+            if "quant" in name.lower() or "Quant" in name:
+                obj = getattr(mori.ops, name)
+                print(f"[mori] ops.{name} = {obj}")
+                if hasattr(obj, "__members__"):
+                    print("     members:", list(obj.__members__))
+
+    # Problem shape used for the probe: T tokens per rank, experts split evenly
+    # across ranks (epr experts per rank).
+    hidden, topk, experts = 7168, 8, 256
+    T = 8
+    epr = experts // world
+    world_group = torch.distributed.group.WORLD
+    # Register the default group under the name "default", which is the name
+    # handed to mori's shmem init on the next line.
+    torch._C._distributed_c10d._register_process_group("default", world_group)
+    mori.shmem.shmem_torch_process_group_init("default")
+
+    # candidate fp8 quant_type values to try (string and enum forms)
+    candidates = []
+    QT = getattr(mori.ops, "EpDispatchCombineQuantType", None) or getattr(mori.ops, "QuantType", None)
+    if QT is not None and hasattr(QT, "__members__"):
+        for mname in QT.__members__:
+            # Any enum member whose name contains "8" qualifies; the fp8/FP8 tests
+            # are subsumed by that first check.
+            if "8" in mname or "fp8" in mname.lower() or "FP8" in mname:
+                candidates.append((f"enum:{mname}", QT.__members__[mname]))
+    for s in ("fp8", "fp8_e4m3", "e4m3"):
+        candidates.append((f"str:{s}", s))
+
+    if rank == 0:
+        print(f"[mori] fp8 quant_type candidates: {[c[0] for c in candidates]}")
+
+    # Deterministic routing and activations from the sibling routing module (seed 67).
+    gi, gw = routing.build_global_routing(T * world, experts, topk, "uniform", 67, epr)
+    si, sw = routing.rank_slice(gi, gw, rank, T)
+    x = routing.rank_activations(T, hidden, 67, rank, device, torch.bfloat16)
+    indices = si.to(device).to(torch.int32)
+    weights = sw.to(device).to(torch.float32)
+
+    # working: label of the first candidate that dispatched, else None.
+    # detail: success summary, or the message of the last failure seen.
+    working = None
+    detail = ""
+    for label, qt in candidates:
+        try:
+            cfg = mori.ops.EpDispatchCombineConfig(
+                data_type=torch.bfloat16, rank=rank, world_size=world,
+                hidden_dim=hidden, scale_dim=hidden // 128,
+                scale_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+                max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+                max_num_inp_token_per_rank=512, num_experts_per_rank=epr,
+                num_experts_per_token=topk, use_external_inp_buf=False, quant_type=qt)
+            op = mori.ops.EpDispatchCombineOp(cfg)
+            scales = torch.ones((T, hidden // 128), dtype=torch.float32, device=device)
+            out = op.dispatch(x, weights, scales, indices, block_num=80, warp_per_block=16)
+            recv = int(out[-1][0].item())
+            # NOTE(review): this barrier sits inside the try. If only some ranks
+            # raise before reaching it, the remaining ranks may block here —
+            # confirm that failures are symmetric across ranks.
+            dist.barrier()
+            working = label
+            detail = f"quant_type={label} dispatched recv={recv}"
+            if rank == 0:
+                print(f"[mori] FP8 DISPATCH OK with {label}: recv={recv}")
+            break
+        except Exception as exc:
+            if rank == 0:
+                print(f"[mori] {label} failed: {type(exc).__name__}: {str(exc)[:160]}")
+            detail = f"{type(exc).__name__}: {str(exc)[:160]}"
+
+    # MIN across ranks: a single rank without a working candidate fails the probe.
+    v = torch.tensor([1 if working else 0], device=device)
+    dist.all_reduce(v, op=dist.ReduceOp.MIN)
+    if rank == 0:
+        print(("MORI_FP8_OK " + detail) if int(v.item()) == 1 else ("MORI_FP8_FAIL " + detail))
+    # Flush by hand because os._exit skips normal interpreter shutdown.
+    sys.stdout.flush(); sys.stderr.flush()
+    os._exit(0 if int(v.item()) == 1 else 7)
+
+
+# Script entry point. main() ends the process itself via os._exit, so SystemExit
+# is only raised if main() ever returns normally.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/reference_ep.py b/experimental/CollectiveX/tests/reference_ep.py
new file mode 100644
index 000000000..c19f854e0
--- /dev/null
+++ b/experimental/CollectiveX/tests/reference_ep.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""CollectiveX independent EP reference semantics (goal Part 3).
+
+A from-scratch model of MoE dispatch + combine, written WITHOUT DeepEP or MoRI, used ONLY for
+UNTIMED correctness validation. The point (goal: "avoid validating backend against itself"):
+expected outputs come from the canonical routing trace + this independent logic, never from the
+backend's own round trip. Pure numpy — runs anywhere, no torch.
+
+Model (ep_size ranks, experts_per_rank experts each; expert e lives on rank e // experts_per_rank):
+  dispatch:  token t selected for expert e contributes a copy of x[t] to (rank e//epr, expert e).
+  expert:    a deterministic per-expert transform f_e (default: scale x by (1 + e/E) — distinct
+             per expert so a mis-routed copy is detectable; identity is the degenerate case).
+  combine:   y[t] = sum over t's selected experts e of  topk_weight[t,e] * f_e(x[t]).
+             Reduction is over the token's experts; output is in SOURCE token order.
+
+validate_dispatch() checks every (token, selected-expert) maps to the right rank+expert and the
+right payload+gate weight, exactly once. validate_combine() checks the reduction, gate-weighting,
+source ordering, and multiple-experts-on-one-rank. reference_combine() returns y for comparing a
+backend's combined output against an independent oracle.
+"""
+from __future__ import annotations
+
+import numpy as np
+
+
+def expert_scale(e: int, experts: int) -> float:
+    """Scale factor applied by expert e under the default transform: 1 + e/experts.
+
+    Every expert gets a different factor, so a copy that reaches the wrong expert
+    yields a wrong value; an identity transform would hide such mis-routing.
+    """
+    fraction = e / float(experts)
+    return 1.0 + fraction
+
+
+def dispatch_plan(idx: np.ndarray, experts: int, experts_per_rank: int):
+    """Enumerate every routed copy implied by idx[T, topk].
+
+    Returns a list of (token, slot, expert, dest_rank) tuples in token-major,
+    slot-minor order, where dest_rank = expert // experts_per_rank. Asserts that
+    no token selects the same expert twice.
+    """
+    num_tokens, fanout = idx.shape
+    copies = []
+    for token in range(num_tokens):
+        chosen = set()  # experts already selected by this token
+        for slot in range(fanout):
+            expert = int(idx[token, slot])
+            assert expert not in chosen, f"token {token} selects expert {expert} twice (must be distinct)"
+            chosen.add(expert)
+            dest_rank = expert // experts_per_rank
+            copies.append((token, slot, expert, dest_rank))
+    return copies
+
+
+def reference_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale):
+    """Independent combine oracle: y[t] = sum_k weights[t,k] * f_{idx[t,k]}(x[t]).
+
+    The result is float64, has the shape of x, and is in source-token order.
+    experts_per_rank is accepted for signature parity and does not affect the value.
+    """
+    num_tokens, fanout = idx.shape
+    y = np.zeros_like(x, dtype=np.float64)
+    for token in range(num_tokens):
+        for slot in range(fanout):
+            expert = int(idx[token, slot])
+            gate = float(weights[token, slot])
+            # Same multiplication order as the accumulation in validate_combine.
+            y[token] += gate * transform(expert, experts) * x[token].astype(np.float64)
+    return y
+
+
+def validate_dispatch(idx, experts, experts_per_rank):
+    """Check the routing table idx[T, topk] against the independent dispatch model.
+
+    Returns a list of human-readable error strings, empty when routing is valid.
+    Reported conditions:
+      * a token selects the same expert more than once,
+      * an expert id lies outside [0, experts),
+      * a destination rank lies outside [0, ep), ep = ceil(experts / experts_per_rank).
+
+    A repeated expert is returned as an error rather than raised, so callers can
+    rely on the return-the-errors contract for every invalid input.
+    """
+    try:
+        plan = dispatch_plan(idx, experts, experts_per_rank)
+    except AssertionError as exc:
+        # dispatch_plan asserts on a repeated expert; convert that into a
+        # reported error instead of letting it escape the validator.
+        return [f"duplicate (token,expert) routed copy: {exc}"]
+    errs = []
+    # exactly-once: still checked explicitly because `python -O` strips the
+    # assert inside dispatch_plan.
+    pairs = [(t, e) for (t, _k, e, _r) in plan]
+    if len(pairs) != len(set(pairs)):
+        errs.append("duplicate (token,expert) routed copy")
+    # correct destination rank (guards against a future change to dispatch_plan)
+    for (t, _k, e, r) in plan:
+        if r != e // experts_per_rank:
+            errs.append(f"token {t} expert {e} -> rank {r}, expected {e // experts_per_rank}")
+    # expert id must name a real expert; when experts is not a multiple of
+    # experts_per_rank the rank check below alone would let some ids through.
+    for (t, _k, e, _r) in plan:
+        if not (0 <= e < experts):
+            errs.append(f"token {t} expert {e} out of range [0,{experts})")
+    ep = (experts + experts_per_rank - 1) // experts_per_rank
+    for (_t, _k, _e, r) in plan:
+        if not (0 <= r < ep):
+            errs.append(f"dest rank {r} out of range [0,{ep})")
+    return errs
+
+
+def validate_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale, tol=1e-9):
+    """Cross-check the combine reduction along two code paths.
+
+    y is computed once by reference_combine and once by accumulating over the
+    copies listed by dispatch_plan; the two must agree within tol. This exercises
+    reduction across experts, gate-weighting, source ordering, and the
+    multiple-experts-on-one-rank case (when a token's topk experts share a rank).
+
+    Returns:
+        (errs, info): errs is a list of error strings (empty on success); info
+        holds "has_multi_expert_per_rank", True when at least one token has two
+        or more selected experts on the same rank.
+    """
+    errs = []
+    y_ref = reference_combine(idx, weights, x, experts, experts_per_rank, transform)
+    # explicit accumulation over the dispatch plan (independent path)
+    T = idx.shape[0]
+    y_acc = np.zeros((T, x.shape[1]), dtype=np.float64)
+    for (t, k, e, _r) in dispatch_plan(idx, experts, experts_per_rank):
+        y_acc[t] += float(weights[t, k]) * transform(e, experts) * x[t].astype(np.float64)
+    diff = np.abs(y_ref - y_acc)
+    # .max() raises on a zero-size array (no tokens or zero hidden size); with
+    # nothing to compare there is nothing that can mismatch.
+    max_err = float(diff.max()) if diff.size else 0.0
+    if max_err > tol:
+        errs.append(f"combine reduction mismatch ({max_err:.2e})")
+    # multiple-experts-on-one-rank present?
+    multi = any(len({int(e) // experts_per_rank for e in idx[t]}) < idx.shape[1] for t in range(T))
+    return errs, {"has_multi_expert_per_rank": bool(multi)}
+
+
+# --------------------------------------------------------------------------- self-test
+# Running this file directly executes a seeded self-test of the functions above;
+# it prints one line per check and exits 0 when every assertion holds.
+if __name__ == "__main__":
+    import sys
+    rng = np.random.default_rng(0)
+    E, EPR, T, topk, H = 256, 32, 64, 8, 16
+    # permutation()[:topk] guarantees distinct experts per token, as dispatch_plan requires.
+    idx = np.stack([rng.permutation(E)[:topk] for _ in range(T)]).astype(np.int64)
+    w = rng.random((T, topk)).astype(np.float32)
+    x = rng.standard_normal((T, H)).astype(np.float32)
+    de = validate_dispatch(idx, E, EPR); assert not de, de
+    ce, info = validate_combine(idx, w, x, E, EPR); assert not ce, ce
+    print(f"dispatch+combine semantics OK (multi_expert_per_rank={info['has_multi_expert_per_rank']})")
+    # mis-routing is DETECTED: corrupt one expert id and confirm the oracle value changes
+    y0 = reference_combine(idx, w, x, E, EPR)
+    idx2 = idx.copy(); idx2[0, 0] = (idx2[0, 0] + 1) % E
+    y1 = reference_combine(idx2, w, x, E, EPR)
+    assert np.abs(y0[0] - y1[0]).max() > 1e-6, "per-expert transform must make mis-routing detectable"
+    print("mis-routing detectable via distinct per-expert transform OK")
+    # edge cases (goal Part 3): empty rank, repeated dest rank, non-divisible handled by callers
+    # Only the single-rank hotspot is exercised here.
+    idx_hot = np.zeros((4, topk), dtype=np.int64)
+    idx_hot[:] = np.arange(topk)               # all tokens -> experts 0..7 (all on rank 0) = hotspot
+    assert not validate_dispatch(idx_hot, E, EPR), "single-rank hotspot must validate"
+    print("edge case: single-rank hotspot (all topk on rank 0) OK")
+    print("reference_ep self-test: PASS"); sys.exit(0)
diff --git a/experimental/CollectiveX/tests/rl_mesh_bench.py b/experimental/CollectiveX/tests/rl_mesh_bench.py
new file mode 100644
index 000000000..d35c32a22
--- /dev/null
+++ b/experimental/CollectiveX/tests/rl_mesh_bench.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""CollectiveX — RL mesh-to-mesh transfer benchmark (family=rl-mesh).
+
+In RL post-training the TRAINER mesh (updated weights) must hand parameters to the
+GENERATOR/rollout mesh, and rollouts flow back — an NCCL M2N / "NCCL Xfer" pattern between
+two DISJOINT device meshes. This benchmark splits the world into a trainer half and a
+generator half and times weight-sized tensor transfer between them, both directions, under
+two redistribution patterns:
+
+  paired       : trainer rank i  -> generator rank i        (1:1 send/recv, matched ranks)
+  redistribute : every trainer rank -> every generator rank (disjoint all-to-all reshard,
+                 the realistic case when trainer-TP != generator-TP)
+
+Run under torchrun (multi-process); world is split in half (needs >=2 ranks, even count).
+CUDA-event timed; one provenance-tagged JSON like run_nccl.py. Stdlib + torch (torch only
+needed at runtime; --help works without it).
+
+  torchrun --nproc_per_node=8 tests/rl_mesh_bench.py --runner h200-dgxc \\
+      --topology-class h200-nvlink-island --transport nvlink \\
+      --env-json results/env.json --out results/h200_rl_mesh.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+# Written as "schema_version" in the output JSON document.
+SCHEMA_VERSION = 1
+# Tagged on every group and on the document; also hashed into each group's comparison_key.
+MEASUREMENT_CONTRACT = "rl-mesh-xfer-v1"
+# Written as "family" in the output JSON document.
+FAMILY = "rl-mesh"
+
+# Weight-shard byte sizes a trainer->generator handoff moves: a single large tensor (a fused
+# QKV / MLP weight) up to a whole layer's params. Sweep 1 MiB .. 1 GiB.
+# These are the defaults of --min-bytes / --max-bytes.
+DEFAULT_MIN_BYTES = 1 << 20  # 1 MiB
+DEFAULT_MAX_BYTES = 1 << 30  # 1 GiB
+
+
+def _sizes(lo, hi, factor=4):
+    """Return the geometric sweep ``lo, lo*factor, lo*factor**2, ...`` limited to values <= ``hi``.
+
+    Args:
+        lo: first size; must be > 0.
+        hi: inclusive upper bound. ``lo > hi`` yields an empty list.
+        factor: multiplier between consecutive sizes; must be > 1.
+
+    Raises:
+        ValueError: if ``lo <= 0`` or ``factor <= 1``. Such a sequence never grows past ``hi``,
+            so the loop below would spin forever while the list grows without bound.
+    """
+    if lo <= 0:
+        raise ValueError(f"size sweep needs lo > 0 (got {lo})")
+    if factor <= 1:
+        raise ValueError(f"size sweep needs factor > 1 (got {factor})")
+    out, s = [], lo
+    while s <= hi:
+        out.append(s)
+        s *= factor
+    return out
+
+
+def comparison_key(meta: dict) -> str:
+    """Return a 16-hex-character id for the group described by ``meta``.
+
+    The id is the truncated SHA-256 of direction, pattern, world size, topology class and
+    measurement contract joined with ``|``; other keys of ``meta`` are ignored.
+    """
+    direction, pattern = meta["direction"], meta["pattern"]
+    world = str(meta["world_size"])  # the only field converted to text before joining
+    topology, contract = meta["topology_class"], meta["measurement_contract"]
+    payload = "|".join((direction, pattern, world, topology, contract)).encode()
+    digest = hashlib.sha256(payload)
+    return digest.hexdigest()[:16]
+
+
+def _bench(fn, torch, warmup, iters):
+    """Time ``fn`` with CUDA events and return the mean milliseconds per call.
+
+    ``fn`` is called ``warmup`` times untimed, the device is synchronized, and then ``iters``
+    calls are bracketed by a start/end event pair. ``torch`` is passed in by the caller rather
+    than imported here.
+    """
+    cuda = torch.cuda
+    for _warm in range(warmup):
+        fn()
+    cuda.synchronize()  # finish warm-up work before the timed window opens
+    t_begin, t_end = (cuda.Event(enable_timing=True) for _ in range(2))
+    t_begin.record()
+    for _timed in range(iters):
+        fn()
+    t_end.record()
+    cuda.synchronize()  # the end event must have completed before elapsed_time is read
+    total_ms = t_begin.elapsed_time(t_end)
+    return total_ms / iters  # ms/iter
+
+
+def main() -> int:
+    """Run the RL mesh-to-mesh transfer sweep and write the result document on rank 0.
+
+    RANK / WORLD_SIZE / LOCAL_RANK are read from the environment. Ranks ``[0, world/2)`` form
+    the trainer half and the remaining ranks the generator half. For both directions the
+    "paired" and then the "redistribute" pattern is timed over the byte sweep; each per-size
+    time is MAX-reduced across ranks, so every rank records the slowest rank's time. Only
+    rank 0 loads ``--env-json`` and writes ``--out``.
+
+    Returns:
+        0 when the sweep completes (non-zero ranks) or the document is valid (rank 0);
+        1 when rank 0 measured no positive bandwidth; 3 when torch cannot be imported;
+        5 when WORLD_SIZE is not an even number >= 2.
+    """
+    ap = argparse.ArgumentParser(description="CollectiveX RL mesh-to-mesh transfer benchmark")
+    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
+    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--iters", type=int, default=20)
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="nvlink")
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+    # Reject sweeps that cannot run: a non-positive start size never grows past --max-bytes
+    # (the size sweep would not terminate), and zero iterations has no per-iteration time.
+    if args.min_bytes < 1:
+        ap.error(f"--min-bytes must be >= 1 (got {args.min_bytes})")
+    if args.iters < 1:
+        ap.error(f"--iters must be >= 1 (got {args.iters})")
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    if world < 2 or world % 2 != 0:
+        if rank == 0:
+            print(f"ERROR: rl-mesh needs an even world_size >= 2 (got {world})", file=sys.stderr)
+        return 5
+    torch.cuda.set_device(local_rank)
+    dev = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12357")
+    if not dist.is_initialized():
+        dist.init_process_group("nccl")
+
+    half = world // 2
+    is_trainer = rank < half
+    # peer for the paired (1:1) pattern: trainer i <-> generator (i+half)
+    paired_peer = (rank + half) if is_trainer else (rank - half)
+    sizes = _sizes(args.min_bytes, args.max_bytes)
+    groups = []
+    peak = 0.0
+
+    def _buf(nbytes):
+        return torch.empty(nbytes, dtype=torch.uint8, device=dev)
+
+    def _measure(step, nbytes):
+        """Time ``step`` and return the result row for ``nbytes``; updates ``peak``.
+
+        A RuntimeError from the timed calls becomes an error row (the caller then stops the
+        sweep for this group). The "correct" flag records that the transfer completed; the
+        payload contents are not checked.
+        NOTE(review): if only some ranks raise, the others still enter the all_reduce below
+        and the ranks fall out of step - confirm failures are expected on all ranks at once.
+        """
+        nonlocal peak
+        try:
+            ms = _bench(step, torch, args.warmup, args.iters)
+        except RuntimeError as exc:
+            return {"transfer_bytes": nbytes, "error": repr(exc), "correct": None}
+        # reduce timing across ranks (max = slowest rank) for a stable number
+        t = torch.tensor([ms], device=dev)
+        dist.all_reduce(t, op=dist.ReduceOp.MAX)
+        ms_max = float(t.item())
+        gb_s = (nbytes / (ms_max / 1e3)) / 1e9 if ms_max > 0 else 0.0
+        peak = max(peak, gb_s)
+        return {"transfer_bytes": nbytes, "time_ms": round(ms_max, 5),
+                "bandwidth_gb_s": round(gb_s, 2), "correct": True}
+
+    def _add_group(direction, pattern, rows):
+        """Append one (direction, pattern) group with its comparison key to ``groups``."""
+        meta = {"direction": direction, "pattern": pattern, "world_size": world,
+                "trainer_ranks": half, "generator_ranks": world - half,
+                "topology_class": args.topology_class, "measurement_contract": MEASUREMENT_CONTRACT}
+        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})
+
+    # PAIRED 1:1 send/recv: every rank times its own side; the row keeps the MAX over ranks.
+    for direction in ("trainer_to_generator", "generator_to_trainer"):
+        rows = []
+        sender_is_trainer = (direction == "trainer_to_generator")
+        i_send = (is_trainer == sender_is_trainer)  # this rank sends in this direction
+        for nbytes in sizes:
+            buf = _buf(nbytes)
+
+            def step():
+                if i_send:
+                    dist.send(buf, dst=paired_peer)
+                else:
+                    dist.recv(buf, src=paired_peer)
+
+            row = _measure(step, nbytes)
+            rows.append(row)
+            if "error" in row:
+                break
+        _add_group(direction, "paired", rows)
+
+    # REDISTRIBUTE: disjoint all-to-all (each rank of the sending half sends to every rank of
+    # the receiving half). Each sender sends nbytes/half to each receiver; the transfers are
+    # posted as individual isend/irecv requests and then all waited on.
+    for direction in ("trainer_to_generator", "generator_to_trainer"):
+        rows = []
+        senders = range(0, half) if direction == "trainer_to_generator" else range(half, world)
+        receivers = range(half, world) if direction == "trainer_to_generator" else range(0, half)
+        am_sender = rank in senders
+        am_receiver = rank in receivers
+        for nbytes in sizes:
+            chunk = max(1, nbytes // half)
+            sbuf = _buf(chunk)
+            # One receive buffer per source rank, allocated once per size so that allocator
+            # work stays outside the timed region.
+            rbufs = [_buf(chunk) for _ in senders] if am_receiver else []
+
+            def step():
+                reqs = []
+                if am_sender:
+                    for dst in receivers:
+                        reqs.append(dist.isend(sbuf, dst=dst))
+                if am_receiver:
+                    for src, rbuf in zip(senders, rbufs):
+                        reqs.append(dist.irecv(rbuf, src=src))
+                for r in reqs:
+                    r.wait()
+
+            # effective payload moved per receiver = nbytes (half chunks of nbytes/half)
+            row = _measure(step, nbytes)
+            rows.append(row)
+            if "error" in row:
+                break
+        _add_group(direction, "redistribute", rows)
+
+    if rank != 0:
+        dist.barrier()
+        dist.destroy_process_group()
+        return 0
+
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": FAMILY,
+        "generated_by": "rl_mesh_bench.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+        "world_size": world, "trainer_ranks": half, "generator_ranks": world - half,
+        "status": "valid" if (groups and peak > 0.0) else "invalid",
+        "peak_bandwidth_gb_s": round(peak, 2),
+        "num_groups": len(groups), "groups": groups, "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"rl-mesh: {len(groups)} (direction,pattern) groups -> {args.out} "
+          f"(status={doc['status']}, peak_bw={peak:.1f} GB/s, world={world} trainer={half})")
+    dist.barrier()
+    dist.destroy_process_group()
+    return 0 if doc["status"] == "valid" else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py
new file mode 100644
index 000000000..7c1f3458d
--- /dev/null
+++ b/experimental/CollectiveX/tests/routing.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""CollectiveX — deterministic, platform-independent MoE routing trace.
+
+Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated
+ONCE from a fixed seed over the *global* token batch, indexed by global token id, and
+is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k,
+experts_per_rank). Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations
+are per-rank (same rank ⇒ same x on any platform), so a given global token id has
+identical activation everywhere without materializing a global activation tensor.
+
+Trace classes (the rank fan-out — #destination ranks a token's top-k experts touch —
+is the property that makes an EP workload representative; review caught the old
+default having fan-out 1):
+
+  * uniform   — top-k distinct experts drawn uniformly per token. The DEFAULT.
+                Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈
+                8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson.
+  * balanced  — load-equalized AND maximally spread: token i, slot j →
+                (i + j·experts_per_rank) mod E, so the 8 experts sit one-per-rank
+                (fan-out = ep_size) and every expert is hit equally. The high-fan-out,
+                perfectly-balanced reference.
+  * balanced-rank-local — the OLD degenerate "balanced": (i·top_k + j) mod E, i.e.
+                top_k consecutive experts, which (top_k ≤ experts/rank, aligned) all
+                land on ONE rank ⇒ fan-out 1, minimum communication. Kept as an
+                explicit edge case, honestly named.
+  * zipf      — expert popularity ∝ 1/rank (skewed load), uniform-ish fan-out.
+
+Temporal classes (goal Part 2 "temporal routing changes" — the hot set MOVES across decode
+steps; selected by `step`, which every rank passes identically so the trace stays consistent):
+
+  * hotspot-single   — STATIC hotspot: expert 0 hot on every step (the adversarial baseline).
+  * hotspot-moving   — the hot expert is `step % experts` (a hotspot that migrates step-to-step).
+  * alternating-groups — tokens route within one of two disjoint expert halves, the active half
+                toggling with `step % 2` (models expert groups that alternate across steps).
+  * trace-replay     — RESERVED: captured per-step routing from real serving (needs a captured
+                trace loader; not yet wired — `build_global_routing` raises for it).
+
+Always publish the realized fan-out so the workload is never misread again
+(`routing_stats`).
+"""
+from __future__ import annotations
+
+import hashlib
+
+import torch
+
+# Multiplier applied to the user seed when deriving per-rank activation seeds:
+# rank_activations() seeds its generator with seed * _RANK_SUBSEED + rank + 1.
+_RANK_SUBSEED = 7919
+
+
+def _cpu_gen(seed: int) -> "torch.Generator":
+    """Return a fresh CPU ``torch.Generator`` seeded with ``int(seed)``."""
+    # manual_seed() returns the generator it was called on, so construction and seeding chain.
+    return torch.Generator(device="cpu").manual_seed(int(seed))
+
+
+def build_global_routing(global_tokens: int, experts: int, topk: int,
+                         routing: str, seed: int, experts_per_rank: int, step: int = 0):
+    """Build the global routing trace: (idx[gt, topk] int64, weights[gt, topk] float32) on CPU.
+
+    Deterministic for a given argument tuple: every random draw comes from one CPU generator
+    seeded with `seed`, and the function takes no world size, EP rank or device. `step` selects
+    the temporal snapshot for the moving/alternating distributions (0 = first step =
+    the static behavior; identical on every rank so the trace stays cross-rank consistent).
+
+    Args:
+        global_tokens: number of rows (tokens) in the trace; coerced with int().
+        experts: total expert count E; expert ids lie in [0, E).
+        topk: experts selected per token.
+        routing: trace class: uniform, balanced, balanced-rank-local, zipf, zipf-mild,
+            zipf-moderate, zipf-heavy, hotspot-single, hotspot-moving or alternating-groups
+            (trace-replay is reserved and always raises).
+        seed: seed of the CPU generator.
+        experts_per_rank: read only by "balanced", as the stride between a token's slots.
+        step: read only by hotspot-moving (`step % experts`) and alternating-groups (`step % 2`).
+
+    Returns:
+        (idx, weights). Each weights row is a softmax over N(0,1) draws from the same
+        generator, taken after the routing draws.
+
+    Raises:
+        ValueError: topk > experts; unknown routing name or zipf level; trace-replay;
+            alternating-groups with topk > experts // 2.
+
+    Experts are distinct within a token for the sampled classes and for balanced-rank-local.
+    NOTE(review): for "balanced" that holds only while (j * experts_per_rank) % experts differs
+    for every slot j < topk; e.g. topk > experts // experts_per_rank wraps onto an earlier
+    slot. Nothing here checks it - confirm callers guarantee topk <= EP size.
+    """
+    if topk > experts:
+        raise ValueError(f"topk ({topk}) > experts ({experts})")
+    gt = int(global_tokens)
+    # Single generator for the whole trace: the ORDER of draws below defines the trace.
+    g = _cpu_gen(seed)
+    if routing == "uniform":
+        # argsort of i.i.d. uniform keys = a random permutation per row; keep the first topk.
+        keys = torch.rand(gt, experts, generator=g)
+        idx = keys.argsort(dim=1)[:, :topk].contiguous().to(torch.int64)
+    elif routing == "balanced":
+        # one expert per rank ⇒ fan-out = ep_size, perfectly balanced load.
+        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
+        j = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
+        idx = (i + j * int(experts_per_rank)) % experts
+    elif routing == "balanced-rank-local":
+        # top_k consecutive (mod E) ⇒ all on ONE rank ⇒ fan-out 1 (min comm). Edge case.
+        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
+        j = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
+        idx = (i * topk + j) % experts
+    elif routing == "zipf" or routing.startswith("zipf-"):
+        # popularity ∝ 1/rank^s — s sets the skew. zipf == zipf-moderate (s=1).
+        s = {"zipf": 1.0, "zipf-mild": 0.5, "zipf-moderate": 1.0, "zipf-heavy": 2.0}.get(routing)
+        if s is None:
+            raise ValueError(f"unknown zipf level '{routing}'")
+        p = 1.0 / torch.arange(1, experts + 1, dtype=torch.float32).pow(s)
+        p = (p / p.sum()).expand(gt, experts)
+        # sampling without replacement keeps the topk experts of a token distinct
+        idx = torch.multinomial(p, topk, replacement=False, generator=g).to(torch.int64)
+    elif routing == "hotspot-single" or routing == "hotspot-moving":
+        # adversarial: ONE hot expert is in EVERY token's top-k (max single-rank load), the
+        # other topk-1 drawn uniformly from the rest. hotspot-single pins it at expert 0
+        # (STATIC); hotspot-moving migrates it to `step % experts` (the hot rank moves across
+        # decode steps). Identical math otherwise — `hot` is the only difference.
+        hot = 0 if routing == "hotspot-single" else (int(step) % experts)
+        others = [e for e in range(experts) if e != hot]
+        others_t = torch.tensor(others, dtype=torch.int64)
+        # one randperm per token over the non-hot experts; the hot expert goes in column 0
+        rest = torch.stack([others_t[torch.randperm(experts - 1, generator=g)[:topk - 1]]
+                            for _ in range(gt)]).to(torch.int64)
+        idx = torch.cat([torch.full((gt, 1), hot, dtype=torch.int64), rest], dim=1)
+    elif routing == "alternating-groups":
+        # tokens route ENTIRELY within one disjoint expert half; the active half toggles with
+        # `step % 2` (group A = [0, E/2), group B = [E/2, E)). Models expert groups that
+        # alternate across steps — half the ranks idle each step (a temporal load shift).
+        half = experts // 2
+        if topk > half:
+            raise ValueError(f"alternating-groups needs topk ({topk}) <= experts/2 ({half})")
+        base = 0 if (int(step) % 2 == 0) else half
+        keys = torch.rand(gt, half, generator=g)
+        idx = (keys.argsort(dim=1)[:, :topk].contiguous().to(torch.int64) + base)
+    elif routing == "trace-replay":
+        raise ValueError("trace-replay routing is reserved — needs a captured per-step trace "
+                         "loader (not yet wired); use make_workloads.py + --workload-dir to "
+                         "replay a serialized trace, or pick a synthetic temporal mode")
+    else:
+        raise ValueError(
+            f"unknown routing '{routing}' (uniform|balanced|balanced-rank-local|"
+            f"zipf[-mild|-moderate|-heavy]|hotspot-single|hotspot-moving|alternating-groups)")
+    # Gate weights are drawn AFTER the routing draws, from the same generator.
+    weights = torch.softmax(torch.randn(gt, topk, generator=g), dim=1).to(torch.float32)
+    return idx, weights
+
+
+# Activation VALUE distributions (goal Part 2 "activation-value sensitivity"). Under bf16 combine
+# these are latency-neutral (bf16 is value-independent — the ratio is ~1.0, the expected null
+# result); they become latency-relevant only under a quantized combine (PR311), where amax /
+# outliers / saturation drive scale computation. Kept here so the rig is ready + the value
+# identity (activation_identity) is honest about which distribution was used.
+# Each name below is a `profile` value accepted by rank_activations(); any other name raises.
+ACTIVATION_PROFILES = ("normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation")
+_FP8_E4M3_MAX = 448.0   # e4m3 max magnitude — fp8-saturation pushes values to/over this
+
+
+def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
+    """Return contiguous copies of one rank's rows of the global routing tensors.
+
+    Rank ``rank`` owns rows ``[rank * tokens_per_rank, (rank + 1) * tokens_per_rank)`` of both
+    ``idx`` and ``weights``; the pair ``(idx_rows, weight_rows)`` is returned.
+    """
+    start = rank * tokens_per_rank
+    window = slice(start, start + tokens_per_rank)
+    return idx[window].contiguous(), weights[window].contiguous()
+
+
+def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device,
+                     dtype=torch.bfloat16, profile: str = "normal"):
+    """Generate this rank's [tokens, hidden] expert-input activations.
+
+    Values are drawn in float32 from a CPU generator seeded from (seed, rank), then moved to
+    ``device`` and cast to ``dtype``. ``profile`` picks the value distribution:
+
+      normal             N(0,1)
+      zeros              all zeros (no random draw)
+      small-amplitude    N(0,1) scaled by 0.01
+      wide-dynamic-range N(0,1) plus a sparse (~1%) set of N(0,1)*250 outliers
+      fp8-saturation     uniform in [-1, 1) scaled to 1.5x the e4m3 max magnitude
+
+    Raises:
+        ValueError: for a profile name outside the list above.
+    """
+    shape = (tokens, hidden)
+    gen = _cpu_gen(int(seed) * _RANK_SUBSEED + int(rank) + 1)
+
+    def _gauss():
+        # one standard-normal draw of the full shape from the shared generator
+        return torch.randn(*shape, generator=gen, dtype=torch.float32)
+
+    if profile == "normal":
+        acts = _gauss()
+    elif profile == "zeros":
+        acts = torch.zeros(*shape, dtype=torch.float32)
+    elif profile == "small-amplitude":
+        acts = _gauss() * 0.01
+    elif profile == "wide-dynamic-range":
+        # Draw order matters for reproducibility: base values, spike mask, spike magnitudes.
+        base = _gauss()
+        spike_mask = (torch.rand(*shape, generator=gen) < 0.01).float()
+        acts = base + spike_mask * _gauss() * 250.0
+    elif profile == "fp8-saturation":
+        # about 1.5x the e4m3 max, so a naive fp8 cast clips/saturates
+        unit = torch.rand(*shape, generator=gen, dtype=torch.float32) * 2.0 - 1.0
+        acts = unit * (_FP8_E4M3_MAX * 1.5)
+    else:
+        raise ValueError(f"unknown activation profile '{profile}' (one of {ACTIVATION_PROFILES})")
+    return acts.to(device=device, dtype=dtype)
+
+
+def placement_perm(ep_size: int, gpus_per_node: int, placement: str) -> list:
+    """phys[logical_rank] -> physical slot, per placement kind (goal Part 2 placement matrix).
+    The physical slot's node = slot // gpus_per_node, domain = slot // scale_up_domain. Single
+    node (ep <= gpus_per_node) makes every placement identical (everything is same-node).
+
+      packed         identity — fill one node/domain before crossing (latency-oriented default).
+      runtime-native identity for now — reproduces the serving placement (link via recipe meta).
+      striped        round-robin logical ranks across nodes (exposes inter-node transport).
+      adversarial    the striped order reversed — pushes a rank's neighbors to far nodes.
+
+    The result is always a permutation of range(ep_size), including when ep_size is not a
+    multiple of gpus_per_node (the last node is then only partly filled). Unknown placement
+    names and gpus_per_node <= 0 fall back to the identity.
+    """
+    n = ep_size
+    if gpus_per_node <= 0 or gpus_per_node >= n or placement in ("packed", "runtime-native"):
+        return list(range(n))
+    if placement not in ("striped", "adversarial"):
+        return list(range(n))
+    nodes = (n + gpus_per_node - 1) // gpus_per_node
+    # Visit slots with the intra-node index outermost and the node innermost, so consecutive
+    # logical ranks land on different nodes. Slots >= n (the unfilled tail of the last node)
+    # are skipped rather than clamped: clamping would map several ranks to slot n - 1.
+    striped = [node * gpus_per_node + intra
+               for intra in range(gpus_per_node)
+               for node in range(nodes)
+               if node * gpus_per_node + intra < n]
+    if placement == "striped":
+        return striped
+    # adversarial: logical rank r takes the slot striped assigns to rank n - 1 - r.
+    return striped[::-1]
+
+
+def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
+                     gpus_per_node: int, scale_up_domain: int = None,
+                     placement: str = "packed") -> dict:
+    """Fractions of routed (token, dest-rank) copies that stay local, per placement.
+
+    Row g of ``idx`` is sent by logical rank ``g // tokens_per_rank``; each entry goes to
+    logical rank ``entry // experts_per_rank`` (both clamped to ``ep_size - 1``). Logical ranks
+    are mapped to physical slots with ``placement_perm``; a slot's node is
+    ``slot // gpus_per_node`` and its scale-up domain is ``slot // scale_up_domain``. A falsy
+    ``scale_up_domain`` defaults to ``gpus_per_node * ep_size``, i.e. a single domain.
+
+    Returns a dict with the placement name, the same-rank / same-node / same-domain fractions
+    and their cross-* complements, the node and domain sizes used, and the copy count.
+    """
+    import torch as _t
+    last_rank = ep_size - 1
+    num_tokens = idx.shape[0]
+    dst_rank = (idx // experts_per_rank).clamp(max=last_rank)        # [gt, topk]
+    owner = (_t.arange(num_tokens) // max(1, tokens_per_rank)).clamp(max=last_rank)
+    src_rank = owner.unsqueeze(1).expand_as(dst_rank)                # sender, repeated per slot
+    domain = scale_up_domain or (gpus_per_node * ep_size)
+    slot_of = _t.tensor(placement_perm(ep_size, gpus_per_node, placement), dtype=_t.int64)
+    dst_slot, src_slot = slot_of[dst_rank], slot_of[src_rank]
+
+    def _fraction(mask):
+        # share of True entries in a boolean mask
+        return float(mask.float().mean())
+
+    on_rank = dst_rank == src_rank
+    on_node = (dst_slot // gpus_per_node) == (src_slot // gpus_per_node)
+    in_domain = (dst_slot // domain) == (src_slot // domain)
+    copies = dst_rank.numel()
+    return {
+        "placement": placement,
+        "local_rank_fraction": _fraction(on_rank),
+        "same_node_fraction": _fraction(on_node),
+        "same_scaleup_domain_fraction": _fraction(in_domain),
+        "cross_node_fraction": _fraction(~on_node),
+        "cross_domain_fraction": _fraction(~in_domain),
+        "gpus_per_node": gpus_per_node, "scale_up_domain": domain, "copies": int(copies),
+    }
+
+
+def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
+    """Summarize the realized fan-out, load and identity of a global routing trace.
+
+    ``idx`` is the global [gt, topk] expert-id tensor and ``weights`` the optional matching
+    gate weights. The destination rank of an assignment is ``expert // experts_per_rank``,
+    clamped to the last rank. The returned dict carries fan-out mean/min/max and histogram,
+    the per-rank copy histogram, per-expert load statistics, imbalance summaries (CV and
+    hotspot ratio), empty expert/rank counts, and 16-hex SHA-256 prefixes of the int32 ids,
+    of the float32 weights, and of both concatenated.
+    """
+    num_ranks = max(1, experts // max(1, experts_per_rank))
+    dest = (idx // experts_per_rank).clamp(max=num_ranks - 1)     # [gt, topk] destination rank
+
+    # Fan-out: mark every rank a token touches, then count the marks per token.
+    touched = torch.zeros(idx.shape[0], num_ranks, dtype=torch.bool)
+    touched.scatter_(1, dest, True)
+    fanout = touched.sum(dim=1)                                   # [gt]
+    fanout_counts = torch.bincount(fanout, minlength=num_ranks + 1)
+    hist = fanout_counts[1:num_ranks + 1].tolist()                # counts for fan-out 1..ep
+
+    # Load: assignments per expert, and token-copies sent to each destination rank.
+    load = torch.bincount(idx.reshape(-1), minlength=experts).float()
+    rank_load_t = torch.bincount(dest.reshape(-1), minlength=num_ranks).float()
+    rank_load = [int(v) for v in rank_load_t.tolist()]
+
+    def _coeff_of_variation(t):
+        # population std over mean; defined as 0.0 when the mean is not positive
+        mean = float(t.mean())
+        if mean > 0:
+            return float(t.std(unbiased=False) / mean)
+        return 0.0
+
+    mean_load = load.mean()
+    hotspot_ratio = float(load.max() / mean_load) if float(mean_load) > 0 else 0.0
+    imbalance = {
+        "expert_load_cv": _coeff_of_variation(load),
+        "rank_load_cv": _coeff_of_variation(rank_load_t),
+    }
+    # Units that received zero token-copies, plus the rank histogram's max and mean.
+    empty_experts = int((load == 0).sum())
+    empty_ranks = int((rank_load_t == 0).sum())
+    rank_max = int(rank_load_t.max())
+    rank_mean = float(rank_load_t.mean())
+
+    def _digest(raw):
+        return hashlib.sha256(raw).hexdigest()[:16]
+
+    # Workload identity: hash the ids as int32 and the weights as float32.
+    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
+    idx_hash = _digest(idx_bytes)
+    if weights is None:
+        w_hash, routing_hash = None, idx_hash
+    else:
+        w_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
+        w_hash = _digest(w_bytes)
+        routing_hash = _digest(idx_bytes + w_bytes)               # combined identity
+    return {
+        "fanout_mean": float(fanout.float().mean()),
+        "fanout_min": int(fanout.min()), "fanout_max": int(fanout.max()),
+        "fanout_hist": hist,                               # index k-1 = #tokens with fan-out k
+        "rank_load_hist": rank_load,                       # token-copies sent to each dest rank
+        "routed_copies": int(fanout.sum()),                # total (token, dest-rank) pairs
+        "expert_load_min": int(load.min()), "expert_load_max": int(load.max()),
+        "expert_load_mean": float(load.mean()), "expert_load_cv": imbalance["expert_load_cv"],
+        "rank_load_cv": imbalance["rank_load_cv"], "hotspot_ratio": hotspot_ratio,
+        "dest_rank_load_max": rank_max, "dest_rank_load_mean": rank_mean,
+        "empty_expert_count": empty_experts, "empty_rank_count": empty_ranks,
+        "routing_hash": routing_hash, "idx_hash": idx_hash, "weights_hash": w_hash,
+    }
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":  # needs torch; verifies temporal modes + value profiles + new stats
+    import sys
+
+    E, TOPK, EPR, GT = 256, 8, 32, 4096
+
+    def _route(kind, step=0):
+        # expert-id tensor of one trace class at one step (seed fixed at 67)
+        return build_global_routing(GT, E, TOPK, kind, 67, EPR, step=step)[0]
+
+    # (1) static vs moving hotspot: the hot expert is 0 for static, step%E for moving.
+    si = _route("hotspot-single", step=5)
+    assert (si[:, 0] == 0).all(), "hotspot-single must pin expert 0 on every step"
+    mi = _route("hotspot-moving", step=5)
+    assert (mi[:, 0] == 5).all(), "hotspot-moving step=5 must pin expert 5"
+    mi0 = _route("hotspot-moving", step=0)
+    assert (mi0[:, 0] == 0).all(), "hotspot-moving step=0 == static origin"
+    # all topk distinct (hot + topk-1 from the rest, no collision)
+    first_rows = [row.tolist() for row in mi[:16]]
+    assert all(len(set(row)) == TOPK for row in first_rows), "moving-hotspot topk must stay distinct"
+
+    # (2) alternating-groups: even step -> lower half, odd step -> upper half.
+    a0 = _route("alternating-groups", step=0)
+    a1 = _route("alternating-groups", step=1)
+    assert int(a0.max()) < E // 2 and int(a1.min()) >= E // 2, "alternating-groups must toggle halves"
+
+    # (3) new stats: the hotspot ratio separates uniform from hotspot, and the new keys exist.
+    su = routing_stats(_route("uniform"), E, EPR)
+    sh = routing_stats(si, E, EPR)
+    assert su["hotspot_ratio"] < 1.5 and sh["hotspot_ratio"] > 5, "hotspot_ratio must separate uniform/hotspot"
+    assert sh["empty_expert_count"] >= 0 and "empty_rank_count" in sh and "dest_rank_load_max" in sh
+    print(f"routing temporal+stats OK (uniform hotspot_ratio={su['hotspot_ratio']:.2f} "
+          f"hotspot empty_experts={sh['empty_expert_count']} dest_rank_max={sh['dest_rank_load_max']})")
+
+    # (4) value profiles: distinct value shapes, all finite, fp8-saturation exceeds e4m3 max.
+    dev = torch.device("cpu")
+
+    def _acts(prof):
+        # 8x256 float32 activations for rank 0 under one value profile
+        return rank_activations(8, 256, 67, 0, dev, dtype=torch.float32, profile=prof)
+
+    z = _acts("zeros")
+    assert float(z.abs().max()) == 0.0, "zeros profile must be all-zero"
+    sat = _acts("fp8-saturation")
+    assert float(sat.abs().max()) > _FP8_E4M3_MAX, "fp8-saturation must exceed e4m3 max"
+    sm = _acts("small-amplitude")
+    assert float(sm.abs().max()) < 1.0, "small-amplitude must be tiny"
+    for prof in ACTIVATION_PROFILES:
+        v = _acts(prof)
+        assert torch.isfinite(v).all(), f"{prof} produced non-finite values"
+    print(f"activation profiles OK ({', '.join(ACTIVATION_PROFILES)})")
+    print("routing self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 000000000..217d9ca80
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).
+
+Picks a backend adapter (DeepEP, DeepEP-hybrid, MoRI, UCCL, NCCL-EP or FlashInfer), runs the source-tokens-per-rank sweep
+via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
+are timed SEPARATELY (see ep_harness); only T varies along the resulting line.
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \\
+      --phase decode --runner mi355x-amds --topology-class mi355x-xgmi \\
+      --transport xgmi --env-json results/env.json --out results/mi355x_mori_decode.json
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend deepep \\
+      --phase prefill --runner b200-dgxc --topology-class b200-nvlink-island \\
+      --transport nvlink --env-json results/env.json --out results/b200_deepep_prefill.json
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import ep_harness  # noqa: E402  (stdlib-only; safe before torch)
+
+
+def main() -> int:  # exit code: 3 = torch import failed, 5 = requested config rejected, else backend.finalize(rc)
+    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
+    ap.add_argument("--backend", required=True,
+                    choices=["deepep", "deepep-hybrid", "mori", "uccl", "nccl-ep", "flashinfer"])
+    ep_harness.add_common_args(ap)  # presumably declares the other args.* fields read below — defined in ep_harness
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))  # RANK / WORLD_SIZE / LOCAL_RANK fall back to a 1-process layout when unset
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)  # make cuda:<local_rank> the current device for this process
+    device = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")  # setdefault: an already-exported address/port is left untouched
+    os.environ.setdefault("MASTER_PORT", "12355")
+
+    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
+    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
+    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
+    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
+    if getattr(args, "eplb", False):
+        import eplb
+        args.num_logical_experts = args.experts  # keep the pre-bump count before args.experts is overwritten
+        args.experts = eplb.physical_count(args.experts, args.num_redundant_experts, world_size)
+
+    # Reproduction provenance (recorded in the artifact).
+    args.reproduction_command = (f"torchrun --nproc_per_node={world_size} tests/run_ep.py "
+                                 + " ".join(sys.argv[1:]))  # argv is joined verbatim (no shell quoting)
+    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
+    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
+    # Container provenance (goal P1): arch (amd64/arm64) + local squash hash for Enroot/Pyxis.
+    import platform as _plat
+    _arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine())  # unknown machines pass through
+    args.image_arch = _arch
+    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
+    # Complete GitHub provenance (goal P1): repo, run id, attempt, ref/branch, source SHA, job,
+    # artifact. A result is only publication-'official' when these are present (validity gate).
+    _run = {"run_id": os.environ.get("GITHUB_RUN_ID"),
+            "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+            "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
+            "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
+            "repo": os.environ.get("GITHUB_REPOSITORY"),
+            "job": os.environ.get("GITHUB_JOB"),
+            "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME")}
+    args.git_run = _run if any(_run.values()) else None  # None when none of the provenance env vars is set
+
+    # Import the backend CLASS (module-top imports torch + the backend lib; no process
+    # group needed) and REJECT unsupported combos BEFORE init — never fall back or
+    # mislabel (review/goal). All ranks reject identically.
+    if args.backend == "mori":
+        from ep_mori import MoRIBackend as Backend
+    elif args.backend == "nccl-ep":
+        from ep_nccl import NCCLBackend as Backend
+    elif args.backend == "uccl":
+        from ep_uccl import UCCLBackend as Backend
+    elif args.backend == "flashinfer":
+        from ep_flashinfer import FlashInferBackend as Backend
+    elif args.backend == "deepep-hybrid":
+        from ep_deepep_hybrid import DeepEPHybridBackend as Backend
+    else:  # "deepep" is the only --backend choice left
+        from ep_deepep import DeepEPBackend as Backend
+    if args.num_ep_groups != 1:
+        if rank == 0:
+            print(f"ERROR: num_ep_groups={args.num_ep_groups} REJECTED — real subgroup process "
+                  f"groups are unimplemented; not faking it.", file=sys.stderr)
+        return 5  # every rank returns 5; only rank 0 prints the reason
+    sp = getattr(Backend, "SUPPORTED_PRECISIONS", {"bf16"})  # a backend that declares nothing supports bf16 / normal only
+    sm = getattr(Backend, "SUPPORTED_MODES", {"normal"})
+    if args.dispatch_dtype not in sp or args.mode not in sm:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS dispatch-dtype={args.dispatch_dtype} / "
+                  f"mode={args.mode} — not supported on this build (no fallback). "
+                  f"supported precisions={sorted(sp)} modes={sorted(sm)}.", file=sys.stderr)
+        return 5
+    # Combine-path capability (review: dispatch_dtype=fp8 must NOT silently imply quantized
+    # combine). Defaults (bf16 / none) reproduce today's behavior; a quant-combine backend
+    # widens its SUPPORTED_COMBINE_* sets. getattr keeps backends that don't declare them at bf16/none.
+    scd = getattr(Backend, "SUPPORTED_COMBINE_DTYPES", {"bf16"})
+    sqm = getattr(Backend, "SUPPORTED_COMBINE_QUANT_MODES", {"none"})
+    cdt = getattr(args, "combine_dtype", "bf16")
+    cqm = getattr(args, "combine_quant_mode", "none")
+    if cdt not in scd or cqm not in sqm:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS combine-dtype={cdt} / combine-quant-mode={cqm} "
+                  f"— quant combine not wired (no fallback). supported combine_dtypes={sorted(scd)} "
+                  f"quant_modes={sorted(sqm)}.", file=sys.stderr)
+        return 5
+    # Measurement-contract capability (review #3): each adapter conforms to a declared
+    # contract; reject anything else rather than letting it pick its own timing boundary.
+    sc = getattr(Backend, "SUPPORTED_CONTRACTS", {"layout-and-dispatch-v1"})
+    if args.measurement_contract not in sc:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS measurement-contract="
+                  f"{args.measurement_contract} — supported={sorted(sc)}.", file=sys.stderr)
+        return 5
+    if args.measurement_contract == "cached-layout-comm-only-v1" and args.mode == "ll":
+        if rank == 0:
+            print("ERROR: cached-layout-comm-only-v1 is meaningless for LL (low_latency_dispatch "
+                  "computes its layout internally; nothing to hoist).", file=sys.stderr)
+        return 5
+
+    # MoRI inits its shmem on a process group it registers as "default" and wants
+    # the gloo+nccl combo with an explicit device_id (per its reference test);
+    # DeepEP uses a plain nccl group.
+    # Cross-node rendezvous: env:// (TCPStore at MASTER_ADDR:PORT) is the default and is byte-identical
+    # to single-node behavior. But on the H100/H200/MI355X fleets the rank-0 MASTER_ADDR (the scontrol
+    # management-subnet NodeAddr) is NOT reachable from a peer rank's enroot container net namespace, so
+    # the TCPStore bootstrap times out before any RDMA transport engages. When CX_RDZV_FILE points at a
+    # path on the COMPUTE-VISIBLE shared mount, init via a FileStore instead: ranks exchange the store
+    # (and NCCL's unique-id) through the shared file, and NCCL then connects peers over the IB fabric
+    # (which IS routable cross-node) rather than the unreachable management TCP. Opt-in; unset = today.
+    if not dist.is_initialized():
+        _rdzv = os.environ.get("CX_RDZV_FILE")  # unset or empty -> no file:// kwargs are passed below
+        _fstore = {"init_method": f"file://{_rdzv}", "rank": rank, "world_size": world_size} if _rdzv else {}
+        if args.backend == "mori":
+            dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world_size,
+                                    device_id=device,
+                                    **({"init_method": f"file://{_rdzv}"} if _rdzv else {}))  # rank/world_size already explicit
+        else:
+            dist.init_process_group("nccl", **_fstore)
+
+    # Construct + run inside a try so a backend exception (esp. a new adapter on GPU) prints its
+    # FULL traceback to STDOUT — torchrun captures per-rank stdout but only summarizes stderr, so an
+    # uncaught exception is otherwise invisible in CI. Print on every rank (prefixed) then re-raise.
+    try:
+        backend = Backend(args, rank, world_size, local_rank, device)
+        if rank == 0:
+            print(f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
+                  f"world={world_size} ep_size={world_size} hidden={args.hidden} "
+                  f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype} "
+                  f"routing={args.routing} seed={args.seed}")
+        rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
+    except Exception:
+        import traceback
+        print(f"[run_ep][rank{rank}] backend={args.backend} FAILED:\n" + traceback.format_exc(),
+              flush=True)
+        raise  # re-raised after printing, so the process still fails with the original exception
+    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
+    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
+    return backend.finalize(rc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/sensitivity.py b/experimental/CollectiveX/tests/sensitivity.py
new file mode 100644
index 000000000..b8c3d1a39
--- /dev/null
+++ b/experimental/CollectiveX/tests/sensitivity.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""CollectiveX distribution-sensitivity summary (stdlib-only — no torch/numpy).
+
+A single arbitrary routing distribution can't be published as "backend X latency" and implied
+to generalize (review): MoE combine cost depends on how tokens spread across experts/ranks. This
+collapses that into ONE number per (sku, backend, phase) instead of a 7th chart dimension:
+
+    distribution_sensitivity_ratio = p99(worst stressor distribution) / p99(headline = uniform)
+
+at MATCHED tokens/rank (anchor points). >1 means the backend degrades under skew; ~1 means robust.
+Stressors = balanced / zipf* / hotspot-single (NOT the degenerate balanced-rank-local best case,
+NOT EPLB-remedied runs). Also reports the best-case ratio and the EPLB recovery where present.
+
+Compares ONLY within an identical (sku, backend, phase, dispatch_dtype, mode, contract, ep,
+combine_quant_mode, activation_profile) group — the routing distribution is the only thing that
+varies, so the ratio is attributable to it and nothing else.
+
+  python3 tests/sensitivity.py --results-dir results            # markdown table to stdout
+  python3 tests/sensitivity.py --results-dir results --out results/sensitivity.json
+  python3 tests/sensitivity.py --results-dir results --anchors 1,8,32,128 --metric roundtrip
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+HEADLINE = "uniform"
+BEST_CASE = "balanced-rank-local"          # min-comm degenerate case (fan-out 1) — not a stressor
+
+
+def _routing_label(doc: dict) -> str:
+    """Routing name from doc["shape"] ("?" if absent), with "+eplb" appended for EPLB-enabled docs."""
+    suffix = "+eplb" if (doc.get("eplb") or {}).get("enabled") else ""
+    return (doc.get("shape", {}) or {}).get("routing", "?") + suffix
+
+
+def _group_key(doc: dict) -> tuple:
+    """Tuple of the config fields two docs must share for their routing distributions to be compared."""
+    shape = doc.get("shape", {}) or {}
+    quant = shape.get("quant", {}) or {}
+    sku = (doc.get("runner") or "?").partition("_")[0].partition("-")[0]  # text before the first "_" / "-"
+    return (sku, doc.get("backend"), doc.get("phase"), shape.get("dispatch_dtype"),
+            doc.get("mode"), doc.get("measurement_contract"), doc.get("ep_size"),
+            quant.get("combine_quant_mode", "none"), shape.get("activation_profile", "normal"))
+
+
+def _p99_by_T(doc: dict, metric: str) -> dict:
+    """Map int(tokens_per_rank) -> float p99 of `metric` over doc["rows"]; rows lacking either are skipped."""
+    series = {}
+    for row in doc.get("rows", []):
+        tokens, stats = row.get("tokens_per_rank"), (row.get(metric) or {})
+        if not (tokens is None or stats.get("p99") is None):
+            series[int(tokens)] = float(stats["p99"])  # a later row with the same T overwrites an earlier one
+    return series
+
+
+def analyze(results_dir: str, metric: str = "roundtrip", anchors=None) -> dict:
+    """Group the MoE result docs under `results_dir` by config and rate each routing's p99 against uniform."""
+    groups: dict = {}  # config key -> routing label -> {T: p99}
+    for path in sorted(glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)):
+        try:
+            with open(path) as fh:  # context manager closes the handle even when parsing fails
+                doc = json.load(fh)
+        except (ValueError, OSError):  # ValueError covers JSONDecodeError and undecodable bytes
+            continue
+        if not isinstance(doc, dict) or doc.get("family") != "moe" or not doc.get("rows"):
+            continue  # the recursive glob also matches JSON that is not a result doc (e.g. a top-level list)
+        # merge (not overwrite) so multiple files of the same config+routing — e.g. an anchor
+        # sensitivity run plus a full-ladder headline run — combine their T points.
+        groups.setdefault(_group_key(doc), {}).setdefault(_routing_label(doc), {}).update(_p99_by_T(doc, metric))
+
+    results = []  # key fields may be None (absent in a doc): sort on (is-None, value) so None is never compared to str
+    for gk, by_routing in sorted(groups.items(), key=lambda kv: tuple((v is None, v) for v in kv[0])):
+        sku, backend, phase, dtype, mode, contract, ep, cqm, act = gk
+        headline = by_routing.get(HEADLINE)
+        if not headline:
+            continue  # no uniform headline in this group -> can't form a ratio
+        def common_T(other):
+            ts = sorted(set(headline) & set(other))
+            return [t for t in ts if (anchors is None or t in anchors)]
+
+        per_dist, worst, best_case, eplb_recovery = {}, None, None, None
+        for rlabel, series in by_routing.items():
+            if rlabel == HEADLINE:
+                continue
+            ratios = {t: series[t] / headline[t] for t in common_T(series) if headline[t] > 0}
+            if not ratios:
+                continue
+            rmax_T = max(ratios, key=ratios.get)
+            per_dist[rlabel] = {"ratio_max": round(ratios[rmax_T], 4), "at_T": rmax_T,
+                                "ratio_by_T": {t: round(v, 4) for t, v in ratios.items()}}
+            base = rlabel.replace("+eplb", "")
+            is_eplb = rlabel.endswith("+eplb")
+            if base == BEST_CASE:
+                best_case = {"routing": rlabel, "ratio": round(min(ratios.values()), 4)}
+            elif not is_eplb:  # a genuine stressor (balanced / zipf* / hotspot-single)
+                cand = (ratios[rmax_T], rlabel, rmax_T)
+                if worst is None or cand[0] > worst[0]:
+                    worst = cand
+        # EPLB recovery: zipf vs zipf+eplb worst ratio (the remedy's effect), if both present
+        if "zipf" in per_dist and "zipf+eplb" in per_dist:
+            eplb_recovery = {"zipf": per_dist["zipf"]["ratio_max"],
+                             "zipf+eplb": per_dist["zipf+eplb"]["ratio_max"]}
+
+        results.append({
+            "sku": sku, "backend": backend, "phase": phase, "dispatch_dtype": dtype,
+            "mode": mode, "contract": contract, "ep": ep,
+            "combine_quant_mode": cqm, "activation_profile": act,
+            "metric": metric,
+            "headline_p99_range_us": [round(min(headline.values()), 2), round(max(headline.values()), 2)],
+            "distribution_sensitivity_ratio": round(worst[0], 4) if worst else None,
+            "worst_distribution": worst[1] if worst else None,
+            "worst_at_T": worst[2] if worst else None,
+            "best_case_ratio": best_case, "eplb_recovery": eplb_recovery,
+            "per_distribution": per_dist,
+        })
+    return {"metric": metric, "anchors": sorted(anchors) if anchors else None, "groups": results}
+
+
+def to_markdown(report: dict) -> str:
+    """Render the rated groups of `report` as a markdown table. Groups without a stressor-vs-uniform ratio
+    (uniform-only: other contracts / fp8 / LL that didn't run the routing sweep) are counted, not listed."""
+    rated = [r for r in report["groups"] if r["distribution_sensitivity_ratio"] is not None]
+    skipped = len(report["groups"]) - len(rated)
+    if not rated:
+        return "_no comparable (uniform + stressor) routing groups found_"
+    h = (f"### Distribution sensitivity ({report['metric']} p99; ratio = worst stressor / uniform)\n\n"
+         "| SKU | backend | phase | dtype·mode·contract | headline p99 µs | worst dist @T | "
+         "**sensitivity** | best-case | EPLB (zipf→+eplb) |\n"
+         "|---|---|---|---|---|---|---|---|---|\n")
+    sort_fields = ("sku", "backend", "phase", "dispatch_dtype")  # sorted as (is-None, value): None last, never vs str
+    for r in sorted(rated, key=lambda x: tuple((x[k] is None, x[k]) for k in sort_fields)):
+        cfg = f"{r['dispatch_dtype']}·{r['mode']}·{(r['contract'] or '').replace('-v1','')}"
+        worst = f"{r['worst_distribution']} @{r['worst_at_T']}"
+        rng = r["headline_p99_range_us"]
+        bc = f"{r['best_case_ratio']['ratio']:.2f}×" if r.get("best_case_ratio") else "—"
+        ev = (f"{r['eplb_recovery']['zipf']:.2f}→{r['eplb_recovery']['zipf+eplb']:.2f}×"
+              if r.get("eplb_recovery") else "—")
+        h += (f"| {r['sku']} | {r['backend']} | {r['phase']} | {cfg} | "
+              f"{rng[0]}–{rng[1]} | {worst} | **{r['distribution_sensitivity_ratio']:.2f}×** | {bc} | {ev} |\n")
+    if skipped:
+        h += f"\n_({skipped} uniform-only group(s) omitted — no stressor distribution run for them.)_\n"
+    return h
+
+
+def main() -> int:  # CLI entry: parse flags, build the report, optionally write JSON, print markdown; returns 0
+    parser = argparse.ArgumentParser(description="CollectiveX distribution-sensitivity summary")
+    parser.add_argument("--results-dir", default="results")
+    parser.add_argument("--metric", default="roundtrip", choices=["roundtrip", "dispatch", "combine"])
+    parser.add_argument("--anchors", default="", help="comma-separated tokens/rank to restrict to; blank = all common T")
+    parser.add_argument("--out", default="", help="write the JSON report here (markdown always goes to stdout)")
+    opts = parser.parse_args()
+    anchors = {int(tok) for tok in opts.anchors.replace(",", " ").split()} if opts.anchors.strip() else None
+    report = analyze(opts.results_dir, opts.metric, anchors)
+    if opts.out:
+        os.makedirs(os.path.dirname(opts.out) or ".", exist_ok=True)  # a bare filename has no dirname -> "."
+        with open(opts.out, "w") as fh:
+            json.dump(report, fh, indent=2, sort_keys=True)
+        print(f"wrote {opts.out}  ({len(report['groups'])} groups)")
+    print(to_markdown(report))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py
new file mode 100644
index 000000000..db68afb4c
--- /dev/null
+++ b/experimental/CollectiveX/tests/workload.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""CollectiveX — canonical, serialized MoE routing workloads (goal Part 1: workload identity).
+
+A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent
+file, and referenced by an immutable `workload_id`. Every official benchmark point consumes the
+SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a
+checksum match, not by trusting that two machines re-ran the same seeded generator.
+
+Layout on disk (one workload = two files, basename = workload_id):
+  <dir>/<workload_id>.npz            topk_idx [gt,topk] int32, topk_weights [gt,topk] float32
+  <dir>/<workload_id>.manifest.json  dims, routing profile, generator version, seed, SHA-256s
+
+Split by dependency so it runs where each step lives:
+  * build_workload()  needs torch (via routing.py) — run on a node/container.
+  * load/verify/manifest  need only numpy + stdlib — run on a login node or in CI.
+
+Seeded runtime generation (routing.build_global_routing) stays for local dev; canonical files
+are how cross-hardware comparisons are gated.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+
+WORKLOAD_SCHEMA_VERSION = 1
+# Bump when routing.build_global_routing's numerics change so a stale file can't masquerade as
+# current. The workload_id folds this in: same id <=> same generator + params.
+GENERATOR_VERSION = "collectivex-routing-v1"
+GATE_WEIGHT_FORMAT = "softmax-of-randn-f32"   # how topk_weights are produced (see routing.py)
+ACTIVATION_GENERATOR = "collectivex-activation-v1"  # bump if the activation value-generator changes
+ACTIVATION_PROFILE_DEFAULT = "normal"               # seeded N(0,1) per token; the only wired profile
+
+
+def _sha256(b: bytes) -> str:
+    return hashlib.new("sha256", b).hexdigest()  # lowercase hex SHA-256 of the given bytes
+
+
+def compute_workload_id(routing: str, hidden: int, topk: int, experts: int,
+                        global_tokens: int, seed: int, generator: str = GENERATOR_VERSION,
+                        step: int = 0) -> str:
+    """16-hex-char id: the SHA-256 prefix of the '|'-joined identity params, generator first.
+    `step` (temporal snapshot for moving/alternating routing) joins the hashed key only when it is
+    truthy, so a step=0 id is the same whether or not the caller passes `step`."""
+    fields = [f"{generator}", f"routing={routing}", f"hidden={hidden}", f"topk={topk}",
+              f"experts={experts}", f"gt={global_tokens}", f"seed={seed}"]
+    if step:
+        fields.append(f"step={step}")
+    return _sha256("|".join(fields).encode())[:16]
+
+
+def compute_activation_identity(activation_profile, seed, hidden,
+                                generator=ACTIVATION_GENERATOR) -> str:
+    """16-hex-char descriptor hash naming the activation VALUE distribution (scaffold).
+    Hashes the string "<profile>|seed=<seed>|hidden=<hidden>|gen=<generator>"; no activation bytes
+    are involved. Per the project notes the same formula is inlined in ep_harness and the two
+    must stay in sync so a manifest and a result doc agree — ep_harness is not visible here."""
+    descriptor = "|".join((f"{activation_profile}", f"seed={seed}", f"hidden={hidden}", f"gen={generator}"))
+    return _sha256(descriptor.encode())[:16]
+
+
+def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
+                   idx_np, weights_np, routing_stats=None,
+                   activation_profile=ACTIVATION_PROFILE_DEFAULT):
+    """Return the manifest dict for a trace: identity, dims, seed, SHA-256 checksums, routing stats.
+    Only calls .astype()/.tobytes() on the two arrays, so numpy + stdlib suffice (no torch)."""
+    raw_idx = idx_np.astype("int32").tobytes()
+    raw_w = weights_np.astype("float32").tobytes()
+    manifest = {
+        "schema_version": WORKLOAD_SCHEMA_VERSION,
+        # `step` is not forwarded here, so this is always the step=0 id
+        "workload_id": compute_workload_id(routing, hidden, topk, experts, global_tokens, seed),
+        "generator_version": GENERATOR_VERSION,
+        "gate_weight_format": GATE_WEIGHT_FORMAT,
+    }
+    manifest["dims"] = dict(hidden=hidden, topk=topk, experts=experts,
+                            global_tokens=int(global_tokens), experts_per_rank=experts_per_rank)
+    manifest["routing_profile"] = routing
+    manifest["seed"] = seed
+    # SHA-256 over the array bytes after the int32 / float32 cast; "trace" hashes idx bytes then weight bytes
+    manifest["checksums"] = {name: _sha256(raw) for name, raw in
+                             (("topk_idx", raw_idx), ("topk_weights", raw_w), ("trace", raw_idx + raw_w))}
+    manifest["routing_stats"] = routing_stats or {}
+    # Activation value distribution (scaffold): kept outside "checksums" because no activation bytes
+    # are hashed — only the profile name and its descriptor identity are recorded.
+    manifest["activation_profile"] = activation_profile
+    manifest["activation_identity"] = compute_activation_identity(activation_profile, seed, hidden)
+    return manifest
+
+
+def build_workload(hidden, topk, experts, routing, global_tokens, seed, experts_per_rank,
+                   activation_profile=ACTIVATION_PROFILE_DEFAULT):
+    """Generate a canonical trace via routing.py (which needs torch). Returns (idx_np, weights_np, manifest)."""
+    import numpy as np
+    import routing as routing_mod
+    trace_idx, trace_w = routing_mod.build_global_routing(
+        global_tokens, experts, topk, routing, seed, experts_per_rank)
+    stats = routing_mod.routing_stats(trace_idx, experts, experts_per_rank, weights=trace_w)
+    idx_np = trace_idx.detach().cpu().numpy().astype(np.int32)   # host-side int32 copy of the expert indices
+    w_np = trace_w.detach().cpu().numpy().astype(np.float32)     # host-side float32 copy of the gate weights
+    manifest = build_manifest(
+        routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
+        idx_np, w_np, stats, activation_profile=activation_profile)
+    return idx_np, w_np, manifest
+
+
+def save_workload(out_dir, idx_np, weights_np, manifest) -> str:
+    """Write <out_dir>/<workload_id>.npz and .manifest.json (creating out_dir); return the workload_id."""
+    import numpy as np
+    os.makedirs(out_dir, exist_ok=True)
+    stem = os.path.join(out_dir, f"{manifest['workload_id']}")
+    np.savez_compressed(stem + ".npz", topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32))
+    with open(stem + ".manifest.json", "w") as fh:
+        json.dump(manifest, fh, indent=2, sort_keys=True)
+    return manifest["workload_id"]
+
+
+def load_workload(npz_path, verify=True):
+    """Load a canonical trace (numpy + stdlib only). Returns (idx_np, weights_np, manifest).
+    Raises ValueError if verify=True and the on-disk bytes don't match the manifest checksums."""
+    import numpy as np
+    base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path  # accept the path with or without ".npz"
+    with open(base + ".manifest.json") as fh:
+        manifest = json.load(fh)
+    with np.load(base + ".npz") as z:  # NpzFile holds the archive open until closed; indexing reads each array in full
+        idx_np, w_np = z["topk_idx"], z["topk_weights"]
+    if verify:
+        ok, reason = verify_workload(manifest, idx_np, w_np)
+        if not ok:
+            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
+    return idx_np, w_np, manifest
+
+
+def verify_workload(manifest, idx_np, weights_np):
+    """Recompute the checksums and the workload_id and compare them to the manifest. Returns (ok, reason).
+    A manifest lacking dims / seed / routing_profile / workload_id yields (False, reason), not a KeyError."""
+    ib = idx_np.astype("int32").tobytes()
+    wb = weights_np.astype("float32").tobytes()
+    cs = manifest.get("checksums", {})
+    for name, digest in (("topk_idx", _sha256(ib)), ("topk_weights", _sha256(wb)), ("trace", _sha256(ib + wb))):
+        if digest != cs.get(name):
+            return False, f"{name} hash differs"
+    try:
+        dims, recorded = manifest["dims"], manifest["workload_id"]
+        wid = compute_workload_id(manifest["routing_profile"], dims["hidden"], dims["topk"],
+                                  dims["experts"], dims["global_tokens"], manifest["seed"],
+                                  manifest.get("generator_version", GENERATOR_VERSION))
+    except (KeyError, TypeError) as exc:  # truncated / hand-edited manifest: report it like any other mismatch
+        return False, f"manifest missing identity field: {exc!r}"
+    if wid != recorded:
+        return False, f"workload_id mismatch (recomputed {wid} != {recorded})"
+    return True, "ok"
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    import sys
+    import tempfile
+    # (1) workload_id determinism + sensitivity — pure stdlib, always runs.
+    a = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
+    b = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
+    c = compute_workload_id("uniform", 7168, 8, 256, 4096, 67)
+    assert a == b, "workload_id must be deterministic"
+    assert a != c, "workload_id must depend on routing"
+    print(f"workload_id determinism OK (zipf={a} uniform={c})")
+    # (2) build/save/load/verify roundtrip + cross-build identity — needs torch+numpy.
+    try:
+        import numpy as np  # noqa: F401
+        try:
+            idx, w, man = build_workload(7168, 8, 256, "zipf", 512, 67, 32)
+            built = True
+        except (ImportError, OSError) as exc:   # torch missing/unloadable only; a generator bug must fail the test
+            print(f"(torch unavailable — synthesizing arrays to test load/verify: {exc!r})")
+            idx = np.random.default_rng(0).integers(0, 256, size=(512, 8)).astype(np.int32)
+            w = np.random.default_rng(1).random((512, 8)).astype(np.float32)
+            man = build_manifest("zipf", 7168, 8, 256, 512, 67, 32, idx, w)
+            built = False
+        with tempfile.TemporaryDirectory() as d:
+            wid = save_workload(d, idx, w, man)
+            idx2, w2, man2 = load_workload(os.path.join(d, f"{wid}.npz"), verify=True)
+            assert (idx2 == idx).all() and (w2 == w).all(), "roundtrip array mismatch"
+            ok, reason = verify_workload(man2, idx2, w2)
+            assert ok, reason
+            # tamper -> must fail (flip one expert index, staying inside [0, 256))
+            idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256
+            tampered_ok, _ = verify_workload(man2, idx2, w2)
+            assert not tampered_ok, "verify must catch tampering"
+        print(f"save/load/verify roundtrip OK (workload_id={wid}, built_via_torch={built})")
+    except ImportError:
+        print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
+    print("workload self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/validate_results.py b/experimental/CollectiveX/validate_results.py
new file mode 100644
index 000000000..9128c8a20
--- /dev/null
+++ b/experimental/CollectiveX/validate_results.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+"""CollectiveX result validator (goal Part 1: schema + validation tooling).
+
+Validates EP result JSON docs against ep-result-v4 and the project's semantic gates:
+schema shape, provenance completeness, workload identity (incl. cross-run trace-signature
+agreement within a comparison_key), measurement-contract membership, byte-contract presence,
+sample counts, and — crucially — that `publication_status` is the MACHINE-DERIVED function of
+`validity` (no doc may hand-label itself official). Exits non-zero when any doc claims
+`official` but fails a gate (or, with --require-official, when any doc isn't official).
+
+Pure stdlib; uses `jsonschema` if importable, else a built-in required-key/enum check.
+v3 docs (no publication_status) load as legacy/experimental and are reported, not failed.
+
+  python3 validate_results.py results/*.json
+  python3 validate_results.py --require-official --schema schemas/ep-result-v4.schema.json results/
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+import sys
+
+# Minimum samples_pooled every row needs when a doc is recorded as "official" (see validate_doc).
+MIN_SAMPLES_OFFICIAL = 100
+# Accepted values of a doc's measurement_contract field.
+KNOWN_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
+# Accepted values of a doc's publication_status field (used by the built-in schema fallback).
+PUB_STATES = {"official", "comparable-experimental", "diagnostic", "invalid", "failed"}
+
+
+def derive_publication_status(v: dict) -> str:
+    """Map a result's `validity` block to its publication status.
+
+    MUST stay in lock-step with ep_harness._derive_publication_status: the validator compares the
+    status recorded in a doc against the value returned here.
+
+    Precedence, first match wins:
+      "failed"      execution_status is not "complete"
+      "invalid"     semantic/measurement gate broken, or workload_identity == "inconsistent"
+      "diagnostic"  resource_conformance ends with "nonconforming", anomaly_free is false, or
+                    workload_identity does not start with "consistent"
+      "official"    provenance_complete AND workload_source == "canonical-serialized"
+      "comparable-experimental" otherwise
+    """
+    semantic_ok = v.get("semantic_correctness") == "pass"
+    measurement_ok = v.get("measurement_conformance") == "conformant"
+    identity = v.get("workload_identity", "")
+
+    if v.get("execution_status") != "complete":
+        return "failed"
+    if not (semantic_ok and measurement_ok) or identity == "inconsistent":
+        return "invalid"
+
+    # Past the gate above both semantic and measurement checks hold, so "sound" hinges only on
+    # the workload identity being one of the "consistent*" values.
+    sound = str(identity).startswith("consistent")
+    resource = str(v.get("resource_conformance", ""))
+    # contract-level anomaly (goal P1-e/f): demotes to diagnostic unless waived (anomaly_free).
+    if resource.endswith("nonconforming") or not v.get("anomaly_free", True):
+        return "diagnostic"
+    if not sound:
+        return "diagnostic"
+
+    canonical = v.get("workload_source") == "canonical-serialized"
+    if v.get("provenance_complete") and canonical:
+        return "official"
+    return "comparable-experimental"
+
+
+def _schema_check(doc, schema):
+    """Return a list of schema problems for `doc` (empty when it conforms).
+
+    Delegates to `jsonschema.validate` when that package can be imported. Otherwise falls back to
+    a cheap built-in check: top-level required keys, the measurement_contract and
+    publication_status enums, and the presence of at least one row.
+    """
+    try:
+        import jsonschema
+        jsonschema.validate(doc, schema)
+    except ImportError:
+        problems = [f"missing required field '{name}'"
+                    for name in schema.get("required", []) if name not in doc]
+        # enum spot-checks the built-in path can do cheaply
+        contract = doc.get("measurement_contract")
+        if not (contract is None or contract in KNOWN_CONTRACTS):
+            problems.append(f"unknown measurement_contract '{contract}'")
+        status = doc.get("publication_status")
+        if not (status is None or status in PUB_STATES):
+            problems.append(f"unknown publication_status '{status}'")
+        if not doc.get("rows"):
+            problems.append("no rows")
+        return problems
+    except Exception as exc:   # jsonschema.ValidationError
+        # prefer the exception's short .message when it has one, else its str() form
+        detail = getattr(exc, "message", exc)
+        return [f"schema: {detail}"]
+    else:
+        return []
+
+
+def validate_doc(doc, schema, path):
+    """Run the schema check and the semantic gates on one parsed result document.
+
+    Args:
+        doc: parsed result JSON object (dict).
+        schema: parsed JSON schema; a falsy value skips the schema check.
+        path: source path of the document. Not used by the checks; kept so the call signature
+            stays stable.
+
+    Returns:
+        (errs, warns, status): lists of error / note strings, plus the recorded
+        publication_status ("legacy-experimental" for docs without a publication_status key).
+    """
+    errs, warns = [], []
+    if "publication_status" not in doc:
+        warns.append("legacy (v3, no publication_status) — loads as experimental, not comparable as official")
+        return errs, warns, "legacy-experimental"
+    if schema:
+        errs += _schema_check(doc, schema)
+    # dict.get's default only covers a MISSING key: an explicit JSON null (or any non-object)
+    # would crash the .get() calls below, so normalise it to {} the same way anomalies,
+    # anomaly_summary and workload are normalised further down.
+    v = doc.get("validity")
+    if not isinstance(v, dict):
+        v = {}
+    recorded = doc.get("publication_status")
+    derived = derive_publication_status(v)
+    if recorded != derived:
+        errs.append(f"publication_status '{recorded}' != machine-derived '{derived}' (validity tampered or stale)")
+    # byte + contract + sample gates
+    if doc.get("measurement_contract") not in KNOWN_CONTRACTS:
+        errs.append(f"unknown measurement_contract {doc.get('measurement_contract')}")
+    # Tolerate "rows": null / a non-list, and report (rather than crash on) non-object entries.
+    raw_rows = doc.get("rows")
+    rows = []
+    for i, r in enumerate(raw_rows if isinstance(raw_rows, list) else []):
+        if isinstance(r, dict):
+            rows.append(r)
+        else:
+            errs.append(f"rows[{i}] is not an object")
+    for r in rows:
+        if "byte_contracts" not in r:
+            # one report is enough: stop scanning rows at the first one without byte_contracts
+            errs.append(f"T={r.get('tokens_per_rank')}: missing byte_contracts")
+            break
+        for op in ("dispatch", "combine", "roundtrip"):
+            stats = r.get(op)
+            # a null / non-object op entry counts as "missing percentiles" instead of raising
+            if not isinstance(stats, dict) or "p99" not in stats:
+                # report only the first incomplete op of this row, then go on to the next row
+                errs.append(f"T={r.get('tokens_per_rank')}: missing {op} percentiles")
+                break
+    # anomaly self-consistency (goal P1-e): validity.anomaly_free must equal (no anomalies or waived).
+    anoms = doc.get("anomalies") or []
+    waived = (doc.get("anomaly_summary") or {}).get("waived", False)
+    expect_anomaly_free = (len(anoms) == 0) or bool(waived)
+    if v.get("anomaly_free", True) != expect_anomaly_free:
+        errs.append(f"validity.anomaly_free={v.get('anomaly_free')} but {len(anoms)} anomalies "
+                    f"(waived={waived}) imply {expect_anomaly_free}")
+    if anoms and not waived and recorded not in ("diagnostic", "invalid", "failed"):
+        errs.append(f"{len(anoms)} unwaived timing anomaly(ies) but status={recorded} (must be diagnostic)")
+    # official-grade gates
+    if recorded == "official":
+        if not v.get("provenance_complete"):
+            errs.append("official but provenance_complete=false")
+        if v.get("workload_source") != "canonical-serialized":
+            errs.append("official but workload not canonical-serialized")
+        # goal P1: official requires NON-NULL workload identity (id + signature).
+        wl = doc.get("workload") or {}
+        if not wl.get("workload_id"):
+            errs.append("official but workload_id is null (non-null workload identity required)")
+        if not wl.get("trace_signature"):
+            errs.append("official but trace_signature is null")
+        if anoms and not waived:
+            errs.append("official but has unwaived timing anomalies")
+        # a null samples_pooled counts as 0 samples rather than breaking the comparison
+        if rows and min((r.get("samples_pooled") or 0) for r in rows) < MIN_SAMPLES_OFFICIAL:
+            errs.append(f"official but a point has <{MIN_SAMPLES_OFFICIAL} pooled samples")
+        if not all(r.get("correct") for r in rows):
+            errs.append("official but a point failed correctness")
+    return errs, warns, recorded
+
+
+def main() -> int:
+    """CLI entry point: validate the given result files and return the process exit code.
+
+    Returns:
+        0 when no problem was found, 1 otherwise. A problem is a doc with validation errors, a
+        non-official non-legacy doc under --require-official, a cross-run routing-hash conflict,
+        or (with --regression) a hard regression reported by regression.py.
+    """
+    ap = argparse.ArgumentParser(description="CollectiveX EP result validator")
+    ap.add_argument("paths", nargs="+", help="result JSON files or dirs")
+    ap.add_argument("--schema", default=os.path.join(os.path.dirname(__file__), "schemas", "ep-result-v4.schema.json"))
+    ap.add_argument("--require-official", action="store_true",
+                    help="fail if any non-legacy doc is not 'official'")
+    ap.add_argument("--regression", action="store_true",
+                    help="also run threshold-based performance-regression detection (regression.py) "
+                         "over the same files and fail if any hard regression (outside run-to-run "
+                         "noise) is found, so one CI step gates on validity AND performance")
+    ap.add_argument("--regression-metric", default="roundtrip", help="regression op (default roundtrip)")
+    ap.add_argument("--regression-pct", default="p99", help="regression percentile (default p99)")
+    ap.add_argument("--regression-threshold", type=float, default=0.10,
+                    help="regression fractional threshold (default 0.10)")
+    a = ap.parse_args()
+    schema = None
+    if a.schema and os.path.exists(a.schema):
+        # context manager so the schema file handle is closed deterministically
+        with open(a.schema) as fh:
+            schema = json.load(fh)
+    files = []
+    for p in a.paths:
+        if os.path.isdir(p):
+            files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True)
+        else:
+            files.append(p)
+    files = sorted(f for f in files if not os.path.basename(f).startswith("env_"))
+
+    # cross-run workload identity: within a comparison_key, the realized routing must be the SAME
+    # workload. We check PER-TOKEN routing_hash agreement (not the whole trace_signature) so two
+    # runs of the same config at DIFFERENT ladders (e.g. a capped cross-vendor sweep 1..16 vs a full
+    # 1..128 headline) are NOT falsely flagged — only a genuine conflict (same T, different routing
+    # bytes) is a different workload.
+    by_ck = {}   # ck -> {T: {routing_hash: [files]}}
+    bad = 0
+    for f in files:
+        try:
+            with open(f) as fh:
+                doc = json.load(fh)
+        except (json.JSONDecodeError, OSError) as exc:
+            # still best-effort (not counted as a problem), but say so instead of skipping silently
+            print(f"[SKIP] {os.path.basename(f):70s} unreadable or not valid JSON ({exc})")
+            continue
+        # Directories are globbed for every *.json, so a file whose top level is not an object
+        # (e.g. a JSON array) can show up here; it is not a result doc, so skip it like non-moe docs.
+        if not isinstance(doc, dict) or doc.get("family") != "moe":
+            continue
+        # preserved failed-case record (goal immediate P2): a classified failure (run_in_container
+        # emitted it on a wedge/timeout/crash). Report it as a preserved case, NOT a validation error.
+        if doc.get("record_type") == "failed-case":
+            fm = (doc.get("failure") or {}).get("failure_mode", "?")
+            print(f"[FAILED-CASE] {os.path.basename(f):68s} mode={fm}  (preserved, not a validation error)")
+            continue
+        errs, warns, status = validate_doc(doc, schema, f)
+        ck = doc.get("comparison_key")
+        # routing_step (temporal) + uneven_tokens change the realized workload but are NOT in the
+        # comparison_key (they live in reproduction) — include them in the cross-run grouping so a
+        # moving-hotspot step / uneven-allocation variant isn't falsely flagged as a conflicting
+        # same-config workload.
+        repro = doc.get("reproduction") or {}
+        gk = (ck, repro.get("routing_step", 0), repro.get("uneven_tokens", "none")) if ck else None
+        if gk:
+            for r in doc.get("rows") or []:
+                if not isinstance(r, dict):
+                    continue   # malformed row: already reported by validate_doc
+                T, rh = r.get("tokens_per_rank"), r.get("routing_hash")
+                if T is not None and rh:
+                    by_ck.setdefault(gk, {}).setdefault(T, {}).setdefault(rh, []).append(os.path.basename(f))
+        tag = "OK" if not errs else "FAIL"
+        if errs:
+            bad += 1
+        # --require-official is documented as applying to NON-LEGACY docs only, so a legacy (v3)
+        # doc is reported with its note but does not fail the run.
+        if a.require_official and not errs and status not in ("official", "legacy-experimental"):
+            tag = "FAIL"
+            bad += 1
+            errs = [f"not official (status={status})"]
+        print(f"[{tag}] {os.path.basename(f):70s} status={status}")
+        for e in errs:
+            print(f"        ERROR: {e}")
+        for w in warns:
+            print(f"        note: {w}")
+    # report cross-run identity CONFLICTS: same comparison_key + same T but DIFFERENT routing bytes
+    # (a genuine "not the same workload" — different hardware ran different routing for one point).
+    for gk, perT in by_ck.items():
+        ck = gk[0]
+        conflicts = {T: hs for T, hs in perT.items() if len(hs) > 1}
+        if conflicts:
+            bad += 1
+            # str() so a non-string comparison_key is still printable in truncated form
+            print(f"[FAIL] comparison_key {str(ck)[:12]} (step={gk[1]},uneven={gk[2]}): per-T routing-hash CONFLICT — not the same workload:")
+            for T, hs in sorted(conflicts.items()):
+                print(f"        T={T}: " + "; ".join(f"{h[:10]}=[{', '.join(fs)}]" for h, fs in hs.items()))
+    print(f"\n{'FAILED' if bad else 'PASS'}: {len(files)} files, {bad} problem(s)")
+
+    # Optional performance-regression gate (goal P1 "Add regression thresholds"). Imported lazily so
+    # validation carries no new dependency/behavior unless --regression is passed. A hard regression
+    # (a >threshold slowdown outside this point's run-to-run noise) folds into the non-zero exit.
+    if a.regression:
+        import regression as _reg
+        rep = _reg.analyze(a.paths, metric=a.regression_metric, pct=a.regression_pct,
+                           threshold=a.regression_threshold)
+        print()
+        print(_reg.to_markdown(rep))
+        if rep["hard_regressions"]:
+            bad += rep["hard_regressions"]
+    return 1 if bad else 0
+
+
+if __name__ == "__main__":
+    # sys.exit raises SystemExit carrying main()'s return value as the process exit status.
+    sys.exit(main())