diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml new file mode 100644 index 000000000..e6da75312 --- /dev/null +++ b/.github/workflows/collectivex-experimental.yml @@ -0,0 +1,353 @@ +name: CollectiveX Experimental + +# Orchestration only — all benchmark logic lives in experimental/CollectiveX/. +# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no +# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane +# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each job lands on the SKU's +# self-hosted runner and invokes that SKU's launch script — the same +# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use. + +on: + push: + branches: + - collectivex + paths: + - 'experimental/CollectiveX/**' + - '.github/workflows/collectivex-experimental.yml' + workflow_dispatch: + inputs: + sku: + # Only SKUs with a matching launchers/launch_<sku>.sh are offered — + # runner.name's prefix selects the script, so an SKU without one fails. + description: Self-hosted runner pool (must have a CollectiveX launcher) + type: choice + default: gb200 + options: [gb200, b200-dgxc, b200-multinode, mi355x, h100-dgxc, h200, b300, gb300] + benchmark: + # mori runs only on mi355x; nccl/deepep/uccl/all + the collective benches on NVIDIA SKUs. + # offload/copy-engine/kv-cache are single-process memcpy-family collectives (family!=moe). 
+ description: Which benchmark to run + type: choice + default: nccl + options: [nccl, deepep, deepep-hybrid, mori, uccl, nccl-ep, flashinfer, flashinfer-combine-fp8, flashinfer-combine-fp8-directcast, flashinfer-combine-nvfp4, nixl, mori-io, nccl-kv, mooncake, offload, copy-engine, kv-cache, rl-mesh, allreduce-fw, allreduce-fw-vllm, all] + ops: + description: NCCL ops (space-separated); blank = default set + type: string + default: '' + min_bytes: + description: nccl-tests min message size + type: string + default: '8' + max_bytes: + description: nccl-tests max message size + type: string + default: '8G' + nodes: + description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node. + type: string + default: '' + phase: + # EP only. 'both' fans out to one job per phase (decode + prefill). + description: EP phase — decode (small T) / prefill (large T); 'both' = a job each + type: choice + default: both + options: [both, decode, prefill] + timing: + # Combined timing knobs "iters:trials:warmup" (GitHub caps workflow_dispatch at 25 inputs, + # so these share one). Blank = harness defaults (200:3:32). LOWER all three for the MoRI/ + # MI355X large-T probe (e.g. "8:1:4"): MoRI wedges (unkillable D-state) under SUSTAINED + # collectives at T>=32; minimal iters/trials/warmup is the only way to reach >64 tok/rank. + description: 'Timing "iters:trials:warmup" (blank = 200:3:32; e.g. 8:1:4 for the MoRI large-T probe)' + type: string + default: '' + tokens_ladder: + description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default + type: string + default: '' + dispatch_dtype: + description: EP dispatch payload precision (fp8 scale-layout recipes + FlashInfer OCP-microscaling mxfp8/nvfp4) + type: choice + default: bf16 + options: [bf16, fp8, fp8-pertoken, fp8-directcast, mxfp8, mxfp4, nvfp4] + mode: + # normal = high-throughput kernels (decode+prefill); ll = DeepEP low-latency + # (decode-shaped, fp8 cast in-kernel). 
LL is rejected on backends without it + # (MoRI) and aborts on fabrics that lack it (B300) — run only where supported. + description: EP kernel path — normal or low-latency (LL) + type: choice + default: normal + options: [normal, ll] + resource_mode: + # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples); + # tuned = each backend's own recommended/default launch config. + description: Comm resource regime + type: choice + default: normalized + options: [normalized, tuned, default] + contract: + # layout-and-dispatch-v1 = dispatch timing includes routing-layout gen (the only + # contract MoRI honors; use for cross-vendor). cached-layout-comm-only-v1 = layout + # hoisted out, pure-comm dispatch (DeepEP normal only). + # runtime-visible-v1 = serving-realistic boundary (DeepEP times fp8 cast + layout + comm + + # recv-dequant inside dispatch). cached-layout = pure-comm (DeepEP normal only). + description: Measurement contract (timing boundary) + type: choice + default: layout-and-dispatch-v1 + options: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1] + routing: + # Routing distribution of the shared trace. uniform=realistic; balanced=load-equalized; + # zipf*=skewed; hotspot-*=one hot expert (static/moving); alternating-groups=toggling halves. + description: EP routing distribution + type: choice + default: uniform + options: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, + zipf-heavy, hotspot-single, hotspot-moving, alternating-groups] + eplb: + # EPLB = replicate hot experts + balanced-place (the remedy for skewed routing). A pure + # routing-trace transform; experts -> num_logical+redundant. Meaningful with zipf*. + description: Apply EPLB expert replication/placement + type: boolean + default: false + canonical: + # Consume a CANONICAL serialized workload (generated deterministically in-container) instead + # of seeded-runtime. 
A canonical-serialized run with full GHA provenance is publication + # 'official' — this is the switch that promotes a cohort past comparable-experimental. + description: Use canonical serialized workload (official-grade workload identity) + type: boolean + default: false + deepep_v2: + # Build DeepEP V2 (NCCL Gin backend) from source in-container, overriding the image's V1 + # (1.2.1). Hopper(SM90)+Blackwell(SM100) only. Needs compute-node network + NCCL>=2.30.4. + description: Use DeepEP V2 kernels (build from source; NVIDIA SM90+ only) + type: boolean + default: false + activation_profile: + # Activation VALUE distribution of expert inputs. normal = headline; the others stress a + # future quantized combine (latency-neutral under bf16 — the expected null result). + description: Activation value profile + type: choice + default: normal + options: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation] + placement: + # Rank->node/domain placement (locality). Single-node SKUs make these identical; meaningful + # on multi-domain SKUs (GB300 NVL72). packed=fill a domain first; striped=spread; adversarial. + description: Rank placement + type: choice + default: packed + options: [packed, striped, runtime-native, adversarial] + sm_fraction: + # normalized comm-resource fraction (DeepEP sm_fraction*SMs / MoRI ~*CUs). Sweep this with + # resource_mode=normalized to build the resource-Pareto (latency vs comm fraction). Blank = + # harness default 0.18. + description: Normalized comm-resource fraction (resource_mode=normalized) + type: string + default: '' + hidden: + # MoE hidden dim — set (with topk/experts) for model-derived workloads (ep-models-v1). + # Blank = ds-like-ref default 7168. 
+ description: MoE hidden dim (model-derived workloads); blank = 7168 + type: string + default: '' + topk: + description: MoE top-k (model-derived workloads); blank = 8 + type: string + default: '' + experts: + description: MoE total experts (model-derived workloads); blank = 256 + type: string + default: '' + routing_step: + # temporal snapshot index for hotspot-moving / alternating-groups (ep-temporal-v1). + description: Temporal routing step (hotspot-moving / alternating-groups) + type: string + default: '' + uneven_tokens: + # per-rank source-token allocation skew (ep-uneven-tokens-v1). + description: Uneven source-token allocation + type: choice + default: none + options: [none, linear, empty-rank] + +concurrency: + # Group per (SKU + FULL config): GitHub keeps only one running + one pending per group and + # cancels the rest, so a coarse per-SKU group made a fan-out of many configs on one SKU + # self-cancel down to ~2. Including dtype/mode/contract/routing/eplb/phase gives each config + # its OWN group -> all configs survive; they queue only on the runner's own capacity, not on + # GitHub concurrency. cancel-in-progress FALSE so a re-dispatch of the SAME config queues. + # The group includes the resource/value/placement axes (sm_fraction, resource_mode, + # activation_profile, placement) too — otherwise a Pareto sm-fraction sweep or an activation/ + # placement sweep (same dtype/mode/contract/routing/phase) would self-cancel down to ~2 runs. 
+ group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}-${{ inputs.benchmark }}-${{ inputs.dispatch_dtype }}-${{ inputs.mode }}-${{ inputs.contract }}-${{ inputs.routing }}-${{ inputs.eplb }}-${{ inputs.phase }}-${{ inputs.resource_mode }}-${{ inputs.sm_fraction }}-${{ inputs.activation_profile }}-${{ inputs.placement }}-${{ inputs.hidden }}-${{ inputs.topk }}-${{ inputs.experts }}-${{ inputs.routing_step }}-${{ inputs.uneven_tokens }}-${{ inputs.nodes }} + cancel-in-progress: false + +permissions: + contents: read + +jobs: + # Push -> MI355X MoRI dispatch/combine. Lands on a free mi355x-amds runner and + # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute- + # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs. + experimental: + name: CollectiveX Experimental (${{ matrix.phase }}) + if: github.event_name == 'push' + runs-on: mi355x + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + # Push = a fast MoRI SMOKE only (decode). The full sweep is workflow_dispatch. + phase: [decode] + env: + CX_BENCH: mori + CX_PHASE: ${{ matrix.phase }} + # SMOKE ladder capped at T<=16: MoRI + realistic (fan-out≈5.3) routing currently + # WEDGES at T>=32 (under investigation; DeepEP is fine), and an unguarded run hung + # ~1 h before the job timeout. Keep the push smoke in the known-good range; run the + # full sweep via workflow_dispatch (timeout-guarded). Remove the cap once fixed. + CX_TOKENS_LADDER: "1 2 4 8 16" + CX_RUN_TIMEOUT: "600" + # Pin to the MI355X nodes that hold the node-local squash and have a writable + # /var/lib/squash; other nodes need a slow cold import that can fail on lock/ + # cache permissions. Widen once the squash is staged cluster-wide. 
+ CX_NODELIST: mia1-p01-g10,mia1-p01-g15 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - name: Launch MI355X MoRI (${{ matrix.phase }}) + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Results summary + if: always() + run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" + - name: Upload results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }} + path: experimental/CollectiveX/results/*.json + if-no-files-found: warn + + # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner. + dispatch: + if: github.event_name == 'workflow_dispatch' + # The bare `h200` label spans TWO clusters: 14 h200-dgxc runners (login-0; the EP + # path is validated there) and 2 h200-cw (CoreWeave) runners that have no + # launch_h200-cw.sh and die exit 127. Pin h200 to the h200-dgxc pool so every + # dispatch lands where the launcher + FS + partition are known-good. Other SKUs are + # single-pool, so pass the sku through unchanged. + runs-on: ${{ inputs.sku == 'h200' && 'h200-dgxc' || inputs.sku }} + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + # nccl/rccl are collective primitives — phase is meaningless, so run ONE job (not + # the same work twice). EP backends: 'both' -> decode + prefill; else a single job. + phase: ${{ fromJSON((inputs.benchmark == 'nccl' || inputs.benchmark == 'rccl') && '["na"]' || (inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase))) }} + env: + # flashinfer-combine-{fp8,nvfp4} = the flashinfer EP backend with a QUANTIZED COMBINE OUTPUT + # (MXFP8 e4m3+e8m0, or NVFP4 e2m1, via the flashinfer-main moe_a2a_combine output_dtype). 
Map to + # CX_BENCH=flashinfer + CX_COMBINE_DTYPE (run_flashinfer_suite builds flashinfer-main when + # CX_COMBINE_DTYPE!=bf16). Input-cap-safe (a benchmark CHOICE, not a new input). + CX_BENCH: ${{ startsWith(inputs.benchmark, 'flashinfer-combine') && 'flashinfer' || (inputs.benchmark == 'allreduce-fw-vllm' && 'allreduce-fw' || inputs.benchmark) }} + # allreduce-fw-vllm = the framework all-reduce bench in a vLLM container (container switch for + # the vLLM custom-AR, goal 215) — set CX_IMAGE to a vLLM cuda image; the launcher uses CX_IMAGE + # when non-empty, else cx_default_image. Input-cap-safe (a benchmark CHOICE). + CX_IMAGE: ${{ inputs.benchmark == 'allreduce-fw-vllm' && 'vllm/vllm-openai:latest' || '' }} + # startsWith catches both flashinfer-combine-fp8 and -fp8-directcast (both fp8 combine output; + # the -directcast variant differs only in CX_QC_SCALE=scalar below — a single output_scalar_scale, + # no per-block scales = the unscaled direct-cast fp8 combine). + CX_COMBINE_DTYPE: ${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'bf16') }} + CX_COMBINE_QUANT_MODE: ${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'none') }} + CX_QC_SCALE: ${{ inputs.benchmark == 'flashinfer-combine-fp8-directcast' && 'scalar' || '' }} + CX_OPS: ${{ inputs.ops }} + CX_MIN_BYTES: ${{ inputs.min_bytes }} + CX_MAX_BYTES: ${{ inputs.max_bytes }} + CX_NODES: ${{ inputs.nodes }} + CX_PHASE: ${{ matrix.phase }} + CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }} + CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }} + CX_MODE: ${{ inputs.mode }} + CX_RESOURCE_MODE: ${{ inputs.resource_mode }} + CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }} + CX_ROUTING: ${{ inputs.routing }} + CX_EPLB: ${{ inputs.eplb && '1' || '' }} + # canonical serialized workload (official-grade identity) + value/placement axes (goal P1/P2). 
+ CX_CANONICAL: ${{ inputs.canonical && '1' || '' }} + CX_DEEPEP_V2: ${{ inputs.deepep_v2 && '1' || '' }} + CX_ACTIVATION_PROFILE: ${{ inputs.activation_profile }} + CX_PLACEMENT: ${{ inputs.placement }} + CX_SM_FRACTION: ${{ inputs.sm_fraction }} + # model-derived workload dims (blank = ds-like-ref defaults) + temporal/uneven axes. + CX_HIDDEN: ${{ inputs.hidden }} + CX_TOPK: ${{ inputs.topk }} + CX_EXPERTS: ${{ inputs.experts }} + CX_ROUTING_STEP: ${{ inputs.routing_step }} + CX_UNEVEN_TOKENS: ${{ inputs.uneven_tokens }} + CX_TIMING: ${{ inputs.timing }} + # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job) -> a GHA result + # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical). + COLLECTIVEX_SOURCE_SHA: ${{ github.sha }} + COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }} + # GB200/watchtower needs a compute-visible workspace; harmless elsewhere. + CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + # MI355X: pin to the warm-squash, writable nodes (see the push job). + CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + # Reject an unsupported backend/SKU/mode/dtype/contract BEFORE consuming the runner + # (review #3): fail fast on the login node, not after a salloc. 'all' fans out per + # vendor in-container, so skip the single-combo check for it. 
+ - name: Validate capability + if: inputs.benchmark != 'all' + run: | + python3 experimental/CollectiveX/tests/capability.py \ + --sku "${{ inputs.sku }}" \ + --backend "${{ startsWith(inputs.benchmark, 'flashinfer-combine') && 'flashinfer' || (inputs.benchmark == 'allreduce-fw-vllm' && 'allreduce-fw' || inputs.benchmark) }}" \ + --mode "${{ inputs.mode }}" --dtype "${{ inputs.dispatch_dtype }}" \ + --contract "${{ inputs.contract }}" \ + --combine-dtype "${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'bf16') }}" \ + --combine-quant-mode "${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'none') }}" + - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }}) + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Results summary + if: always() + run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" + - name: Upload results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }} + path: experimental/CollectiveX/results/*.json + if-no-files-found: warn + + update-frontend-snapshot: + name: Update InferenceX-app snapshot + needs: [experimental, dispatch] + if: >- + always() && + ( + (github.event_name == 'push' && needs.experimental.result == 'success') || + (github.event_name == 'workflow_dispatch' && needs.dispatch.result == 'success') + ) + runs-on: ubuntu-latest + steps: + - name: Trigger CollectiveX snapshot update + env: + FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }} + run: | + set -euo pipefail + curl -sSf -X POST \ + -H "Authorization: Bearer $FRONTEND_PAT" \ + -H 
"Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \ + -d '{ + "event_type": "update-collectivex-data", + "client_payload": { + "source_run_id": "${{ github.run_id }}" + } + }' diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml new file mode 100644 index 000000000..76a91b4ad --- /dev/null +++ b/.github/workflows/collectivex-sweep.yml @@ -0,0 +1,215 @@ +# CollectiveX Sweep — one structured run instead of thousands of dispatches. +# +# Shape (mirrors the InferenceX CI tracker): setup -> sweep (a MATRIX job = "a job with other jobs +# in it") -> aggregate (the collector "at the end"). The matrix unit is a SHARD = one allocation that +# sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked +# so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job +# collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't +# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer / +# deepep-hybrid / nccl-ep, + deepep_v2) for full parity. 
+name: CollectiveX Sweep +on: + workflow_dispatch: + inputs: + backend: + description: "EP library to sweep — 'all' = every backend in ONE combined matrix run (recommended)" + type: choice + default: all + options: [all, deepep, uccl, flashinfer, deepep-hybrid, nccl-ep] + deepep_v2: + description: DeepEP V2 from-source kernels (kernel_gen=v2; only for a single-backend deepep run — 'all' already includes a deepep-v2 variant) + type: boolean + default: false + suites: + description: "'all' or comma-list of suite names" + type: string + default: all + only_sku: + description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all + type: string + default: '' + min_nodes: + description: Keep only shards with >= this tray count (2 = rack-scale EP8 only; blank = all) + type: string + default: '' + max_nodes: + description: Keep only shards with <= this tray count (1 = single-tray EP4 only; blank = all) + type: string + default: '' + max_cases: + description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites) + type: string + default: '128' + flashinfer_upgrade: + description: Upgrade FlashInfer to the newer (MNNVL-fixed) wheel for plain flashinfer runs too (fixes h100 completion-flag deadlock) + type: boolean + default: false + +concurrency: + group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }} + cancel-in-progress: false + +jobs: + # ---- setup: resolve the suites into the shard matrix (the "pending jobs" node) ---- + setup: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.gen.outputs.matrix }} + n: ${{ steps.gen.outputs.n }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - run: pip install --quiet pyyaml + - id: gen + working-directory: experimental/CollectiveX + run: | + set -euo pipefail + # backend='all' or a comma-list -> ONE combined multi-backend matrix; else a 
single backend. + case "${{ inputs.backend }}" in + all|*,*) bk="--backends ${{ inputs.backend }}" ;; + deepep) bk="" ;; + *) bk="--backend ${{ inputs.backend }}" ;; + esac + v2=""; [ "${{ inputs.deepep_v2 }}" = "true" ] && v2="--deepep-v2" + os=""; [ -n "${{ inputs.only_sku }}" ] && os="--only-sku ${{ inputs.only_sku }}" + mn=""; [ -n "${{ inputs.min_nodes }}" ] && mn="--min-nodes ${{ inputs.min_nodes }}" + xn=""; [ -n "${{ inputs.max_nodes }}" ] && xn="--max-nodes ${{ inputs.max_nodes }}" + # full matrix (with cases) -> artifact for the cells; slim (no cases) -> the strategy output. + python3 sweep_matrix.py --suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}" $bk $v2 $os $mn $xn --out matrix_full.json >/dev/null + SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))") + echo "matrix=$SLIM" >> "$GITHUB_OUTPUT" + echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT" + python3 -c "import json;m=json.load(open('matrix_full.json'));print('shard-cells:',len(m['include']),'cases:',sum(x['n'] for x in m['include']))" + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: cxsweep-matrix-${{ github.run_id }} + path: experimental/CollectiveX/matrix_full.json + if-no-files-found: error + + # ---- sweep: ONE matrix cell per shard (the parent job with child jobs) ---- + sweep: + needs: setup + if: ${{ fromJSON(needs.setup.outputs.n) > 0 }} + strategy: + fail-fast: false + max-parallel: 10 # don't saturate the ~20-runner fleet; cells queue as slots free + matrix: ${{ fromJSON(needs.setup.outputs.matrix) }} + # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental). 
+ runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }} + timeout-minutes: 350 + env: + CX_BENCH: ${{ matrix.backend }} + CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }} + CX_NODES: ${{ matrix.nodes }} + CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json + COLLECTIVEX_SOURCE_SHA: ${{ github.sha }} + # Consolidated shards run a whole build-group (up to ~74 cases) + one from-source build in ONE + # slurm allocation, so the launcher's default 45-min --time is too short. 120 min gives headroom; + # the allocation releases early when the shard finishes, so short shards don't waste it. + CX_TIME: '120' + CX_FLASHINFER_UPGRADE: ${{ inputs.flashinfer_upgrade && '1' || '' }} + CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }} + CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: cxsweep-matrix-${{ github.run_id }} + path: experimental/CollectiveX + - name: Extract this shard's cases (stdlib only — no runner deps) + working-directory: experimental/CollectiveX + run: | + set -euo pipefail + python3 -c " + import json + m=json.load(open('matrix_full.json')) + s=[x for x in m['include'] if x['id']=='${{ matrix.id }}'] + assert s, 'shard ${{ matrix.id }} not in matrix' + s=s[0] + json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'deepep_v2':s['deepep_v2'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w')) + print('shard ${{ matrix.id }}:', len(s['cases']), 'cases') + " + - name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation) + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Shard summary + if: always() + run: python3 
experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true + - name: Upload shard results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: cxshard-${{ matrix.id }}-${{ github.run_id }} + path: experimental/CollectiveX/results/*.json # glob skips the hidden .shard_*.json + if-no-files-found: warn + + # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ---- + aggregate: + needs: sweep + if: always() + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + pattern: cxshard-*-${{ github.run_id }} + path: _shards + merge-multiple: true + - name: Aggregate shards -> one ndjson + working-directory: experimental/CollectiveX + run: | + set -euo pipefail + tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}" + python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson" + { + echo "## CollectiveX sweep aggregate (${tag})" + echo '```' + wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + - name: Upload aggregate + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }} + path: experimental/CollectiveX/results/aggregate/*.ndjson + if-no-files-found: warn + + update-frontend-snapshot: + name: Update InferenceX-app snapshot + needs: aggregate + if: always() && needs.aggregate.result == 'success' + runs-on: ubuntu-latest + steps: + - name: Trigger CollectiveX snapshot update + env: + FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }} + run: | + set -euo pipefail + 
tmp="$(mktemp -d)" + trap 'rm -rf "$tmp"' EXIT + git clone --quiet --depth 1 --branch collectivex \ + "https://x-access-token:${FRONTEND_PAT}@github.com/SemiAnalysisAI/InferenceX-app.git" \ + "$tmp/app" + cd "$tmp/app" + git pull --rebase origin collectivex + mkdir -p .github + { + echo "source_run_id=${{ github.run_id }}" + echo "source_sha=${{ github.sha }}" + echo "source_workflow=${{ github.workflow }}" + echo "source_run_url=https://github.com/SemiAnalysisAI/InferenceX/actions/runs/${{ github.run_id }}" + echo "triggered_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + } > .github/collectivex-source-run.env + + git config user.name "InferenceX Data Bot" + git config user.email "actions@users.noreply.github.com" + git add .github/collectivex-source-run.env + if git diff --cached --quiet; then + echo "CollectiveX source-run marker is already current." + exit 0 + fi + git commit -m "chore: trigger CollectiveX data update for ${{ github.run_id }}" + git push origin HEAD:collectivex diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore new file mode 100644 index 000000000..e30004ffc --- /dev/null +++ b/experimental/CollectiveX/.gitignore @@ -0,0 +1,22 @@ +# in-container nccl-tests build cache +.nccl-tests/ +# python +__pycache__/ +*.pyc +# generated run artifacts: captured env embeds hostnames / GPU UUIDs / NIC GUIDs, +# so keep results out of git (CI uploads them as workflow artifacts instead). +# Sanitized headline numbers live in CONTAINERS.md. +results/*.json +results/plots/ +results/raw_*.txt +results/raw_*.txt.stderr +# superseded SSH-provenance result JSONs moved aside so plot_ep's recursive glob +# won't double-load them; same hostname/UUID sensitivity as results/. 
+_ssh_v4_archive/ +# running local-only reflection log (not a committed artifact) +notes.md +goal.md +# superseded seeded-runtime GHA results (canonical counterpart exists); kept out of the plot glob +_seeded_archive/ +# newest-good-per-config kept in results/; superseded runs moved here (out of the plot glob) +_superseded/ diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md new file mode 100644 index 000000000..8a8bbf56e --- /dev/null +++ b/experimental/CollectiveX/CONTAINERS.md @@ -0,0 +1,75 @@ +# CollectiveX — container & library versions + +One **multi-arch, digest-pinned** container is used for all NVIDIA SKUs, so B200 +(x86_64) and GB200 (aarch64) share a single reference and the cross-vendor +comparison is truly same-image. Set in `runtime/common.sh` (`cx_default_image`). + +## Default container (all NVIDIA SKUs) + +- **Image:** import by tag **`lmsysorg/sglang:v0.5.11-cu130`** (multi-arch OCI index). Expected index digest, recorded for provenance/verification: `sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975`. +- **Multi-arch manifest list:** linux/amd64 + linux/arm64; `enroot import` on each host pulls the matching arch. +- **Import by TAG, not digest.** enroot builds its anonymous Docker Hub token scope from the *tag* and succeeds (no creds needed — same as the serving launchers). A bare `repo@sha256:` ref makes enroot prompt for a password and **hang** in non-interactive CI; a combined `tag@sha256:` ref 400s. `cx_ensure_squash` therefore imports by tag with `_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged). 
+- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `tests/ep_mori.py`: + - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here. + - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now. + - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `tests/ep_mori.py`'s `finalize()` hard-exits after writing results to avoid it. + + Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image. + +## Cluster access / QOS + +- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account. +- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there). 
+ +## First real results (Milestone-0 spike, on the DeepSeek-V4 images) + +nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw: + +| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) | +|---|---|---| +| all_reduce | 835 GB/s | 689 GB/s | +| all_gather | 653 | 658 | +| reduce_scatter | 667 | 661 | +| alltoall | 638 | 666 | + +(B200 vs GB200 carry distinct `comparison_key`s by topology-class, so they are labelled-distinct, not silently merged. Re-run on the multi-arch default to refresh under one image.) diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md new file mode 100644 index 000000000..580a0399c --- /dev/null +++ b/experimental/CollectiveX/README.md @@ -0,0 +1,128 @@ +# CollectiveX + +Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch +adapters** (InferenceX-style `launch_*.sh`) run **any benchmark** — selected +by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions +workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline +already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL). + +> Experimental: WIP, not an official InferenceMAX result. All logic stays under +> `experimental/CollectiveX/`; the only file outside is the orchestration-only +> workflow.
+ +## Files + +| File | Role | +|---|---| +| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) | +| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) | +| `tests/run_ep.py` | EP dispatch/combine entrypoint (torchrun): source-tokens-per-rank sweep, dispatch & combine timed **separately** | +| `tests/ep_harness.py` | shared EP harness: token ladder, separated timing, correctness gate, doc emission (stdlib top) | +| `tests/ep_deepep.py`, `tests/ep_mori.py` | per-backend adapters (DeepEP / MoRI) implementing the harness protocol | +| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) | +| `runtime/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build | +| `runtime/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) over `CX_PHASE` | +| `launchers/launch_.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) | +| `CONTAINERS.md` | the pinned multi-arch container + audited library versions | +| `results/` | flat JSON artifacts (+ `plots/`, raw captures) | +| `tests/fixtures/` | captured nccl-tests output for offline parser checks | + +## Run + +### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`) + +- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** EP dispatch/combine + sweep, **one job per phase** (decode + prefill) via a matrix (lands on free + `mi355x-amds` runners). +- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / + mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl` + on MI355X runs rccl-tests), `phase` (decode / prefill / **both** → a job each), + `tokens_ladder`, `dispatch_dtype`, ops, sizes, ngpus. 
Lands on that SKU's + self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. For EP results + across all SKUs, dispatch once per `sku` with `phase=both`. + +Each job renders a results table to the **GitHub Actions job summary** (via +`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs +as an artifact. (The workflow only fires once the branch is pushed to GitHub.) + +### Directly on a cluster login node + +```bash +# benchmark is selected by CX_BENCH (default nccl) +bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, NCCL primitives +CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild) +bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # B200 8× NVLink +bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh # B200 2-node, cross-IB +bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X 8× XGMI, MoRI EP (CX_BENCH=mori, default) +CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X primitives via rccl-tests +``` + +Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, +`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible +staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate +nothing). EP (deepep/mori) adds `CX_PHASE` (decode|prefill|both), `CX_TOKENS_LADDER` +(e.g. `"1 2 4 8 16 32 64 128"`), `CX_HIDDEN`/`CX_TOPK`/`CX_EXPERTS`, +`CX_DISPATCH_DTYPE`, `CX_NUM_EP_GROUPS`. Results land in `experimental/CollectiveX/results/`. 
+ +### Offline (no GPU) — verify the parser/JSON pipeline + +```bash +python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \ + --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json +python3 env_capture.py # prints a (degraded, off-GPU) env record +python3 plot.py --results-dir results --out-dir results/plots # needs matplotlib +``` + +## Container + +One **multi-arch** image for all NVIDIA SKUs, imported by tag +`lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` +recorded for provenance). Imported by tag, not digest — enroot's anonymous +Docker Hub auth needs a tag, and a bare digest ref hangs in CI. See +`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the bundled-DeepEP +DeepSeek-V4 fallback images. + +## How it runs (confirmed against the live clusters) + +- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if + missing) → `srun --container-image=… --container-mounts=:/ix` → in-container + `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account + `benchmark`. +- **AMD MI355X** (`launch_mi355x-amds.sh`, MoRI / `CX_BENCH=mori`) diverges: partition + `compute`, no account, pyxis `--container-writable --container-remap-root`, and a + **node-local** squash (`/var/lib/squash`) imported via `srun` on the allocated node + (not the login node). Workspace is bind-mounted directly (no `CX_STAGE_DIR`). +- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in + `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node + adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`). +- The sglang image installs editable under `/workspace`, so the repo is mounted at + **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR` + rsyncs the tree to Lustre first. 
+- Every result embeds an `env_capture` record and a `comparison_key`; topology + class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled + distinct, never silently overlaid. + +## Status & known risks + +- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed) + — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default; + validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9). +- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds + it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive; + `tests/ep_deepep.py` follows the documented normal-mode API — validate against + the built commit. B200 (x86_64) first; GB200 (aarch64) follows. +- **MoRI / MI355X** (`tests/ep_mori.py` + `launch_mi355x-amds.sh`) is **validated on + hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip). + It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer` + zero-copy path, `expected = input × #unique-destination-ranks`). Three + ionic_rdma-fabric constraints are baked in (see `CONTAINERS.md`): a 2 GiB heap + (the NICs cap RDMA MRs at ~4 GiB), a bounded `max_num_inp_token_per_rank`, and a + hard-exit past MoRI's buggy shmem teardown. The ROCm image isn't digest-pinned yet. +- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a + compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container + or srt-slurm. CX_BENCH=nccl only for now. +- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep + partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open. + +Once the multi-arch image is validated end-to-end, freeze the schema from the +artifacts (plan: "Freeze the contract"). 
diff --git a/experimental/CollectiveX/aggregate_results.py b/experimental/CollectiveX/aggregate_results.py new file mode 100644 index 000000000..3771d17c0 --- /dev/null +++ b/experimental/CollectiveX/aggregate_results.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +"""CollectiveX — result aggregator (the end-of-sweep collector). + +The sweep workflow (collectivex-sweep.yml) fans out one matrix CELL per SHARD +(platform × backend × mode × resource), each cell sweeping its cases in a single +allocation and emitting a handful of per-case result JSONs. Instead of leaving +thousands of individual files scattered across the repo, this aggregator COLLECTS +every shard's results into ONE compact line-delimited file: + + results/aggregate/collectivex_ep.ndjson # one result doc per line + +That single artifact is the deliverable the plotter + the app read; the per-case +JSONs stay inside the run as transient shard intermediates. Within a shard, a +config that was re-run keeps only its NEWEST usable doc (newest generated_at with +publication_status/status in official|comparable-experimental|valid), with +genuinely-failed configs preserved when they have no usable counterpart — the same +hygiene prune_results.py applies, folded into the merge so the aggregate is already +canonical. + + python3 aggregate_results.py --in-dir --out results/aggregate/collectivex_ep.ndjson + python3 aggregate_results.py --in-dir results --explode results # ndjson -> per-doc (for the plotter) + +Stdlib only. 
# publication_status / status values that count as a usable (keepable) result.
USABLE = {"official", "comparable-experimental", "valid"}


def _key(d: dict) -> str:
    """Return the config identity used to keep newest-per-config.

    Resolution order:
      1. the doc-level ``comparison_key`` when it is truthy;
      2. else the ``comparison_key`` of every entry in ``groups``, sorted and joined by ``|``;
      3. else ``family|runner|backend|phase|measurement_contract`` (missing fields become "").

    The original docstring notes this is meant to mirror ``prune_results._doc_key``.
    """
    if d.get("comparison_key"):
        return str(d["comparison_key"])
    # ``groups`` may be absent or null, and a non-mapping entry carries no key.
    group_keys = [g.get("comparison_key") for g in (d.get("groups") or [])
                  if isinstance(g, dict) and g.get("comparison_key")]
    if group_keys:
        return "|".join(sorted(str(k) for k in group_keys))
    return "|".join(str(d.get(k, "")) for k in ("family", "runner", "backend", "phase",
                                                "measurement_contract"))


def _usable(d: dict) -> bool:
    """Return True when ``publication_status`` (or, failing that, ``status``) is in ``USABLE``."""
    return (d.get("publication_status") or d.get("status")) in USABLE


def _generated_at(d: dict) -> str:
    """Sort key for recency: ``generated_at`` as a string, with missing/null mapped to ""."""
    # A null timestamp must not be compared against a string one (TypeError in sorted()).
    return str(d.get("generated_at") or "")


def _iter_docs(in_dir: str):
    """Yield ``(source_path, doc)`` for every result document under ``in_dir``.

    Reads both per-file ``*.json`` and line-delimited ``*.ndjson`` (so an aggregate can be
    re-merged). Files named ``env_*`` and ``analysis.json`` are skipped. Reading is
    best-effort: unreadable files, undecodable text, malformed JSON (whole file or a single
    ndjson line) and documents whose top level is not a JSON object are skipped rather than
    aborting the walk.
    """
    for root, _dirs, files in os.walk(in_dir):
        for fname in files:
            if fname.startswith("env_") or fname == "analysis.json":
                continue
            path = os.path.join(root, fname)
            if fname.endswith(".ndjson"):
                try:
                    with open(path, encoding="utf-8") as fh:
                        for line in fh:
                            line = line.strip()
                            if not line:
                                continue
                            try:
                                doc = json.loads(line)
                            except ValueError:  # json.JSONDecodeError is a ValueError
                                continue
                            if isinstance(doc, dict):
                                yield path, doc
                except (OSError, UnicodeDecodeError):
                    continue
            elif fname.endswith(".json"):
                try:
                    with open(path, encoding="utf-8") as fh:
                        doc = json.load(fh)
                except (OSError, ValueError):
                    continue
                # Callers use dict methods on every doc; a list/scalar is not a result doc.
                if isinstance(doc, dict):
                    yield path, doc


def aggregate(in_dir: str, keep_per_key: int = 3) -> list:
    """Collect every result doc under ``in_dir`` and prune to the newest per config.

    Docs are grouped by ``_key``. Within a group the ``keep_per_key`` newest usable docs
    (by ``generated_at``, descending) are kept; a group with no usable doc keeps only its
    single newest record so that configs which only ever failed stay visible.

    Returns the kept docs as a flat list.
    """
    groups: dict = {}
    for _src, doc in _iter_docs(in_dir):
        groups.setdefault(_key(doc), []).append(doc)
    out = []
    for docs in groups.values():
        newest_first = sorted(docs, key=_generated_at, reverse=True)
        usable = [d for d in newest_first if _usable(d)]
        if usable:
            out.extend(usable[:keep_per_key])
        else:
            # a config that ONLY ever failed: keep its newest record (preserve failed cases)
            out.append(newest_first[0])
    return out
def main() -> int:
    """CLI entry point for the result aggregator.

    Default mode merges every doc under ``--in-dir`` (via ``aggregate``) into the
    line-delimited file ``--out`` and prints a one-line summary. With ``--explode DIR`` it
    instead writes each doc found under ``--in-dir`` to ``DIR/<name>.json``, where the name is
    the doc's ``artifact_name`` or, failing that, ``runner_backend_phase_generated_at`` with
    ``:`` replaced by ``-``. Always returns 0.
    """
    parser = argparse.ArgumentParser(description="CollectiveX result aggregator")
    parser.add_argument("--in-dir", default="results", help="root to walk for shard result files")
    parser.add_argument("--out", default="results/aggregate/collectivex_ep.ndjson")
    parser.add_argument("--keep-per-key", type=int, default=3)
    parser.add_argument(
        "--explode", metavar="DIR",
        help="instead of merging, write each ndjson doc in --in-dir back to a per-doc "
             "JSON under DIR (so the existing plotter glob can read an aggregate)")
    args = parser.parse_args()

    if not args.explode:
        # Merge mode: one compact JSON document per output line.
        merged = aggregate(args.in_dir, args.keep_per_key)
        os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
        with open(args.out, "w") as fh:
            fh.writelines(json.dumps(doc, separators=(",", ":")) + "\n" for doc in merged)
        skus = sorted({str(doc.get("runner", "?")).split("_")[0].split("-")[0] for doc in merged})
        backs = sorted({str(doc.get("backend") or doc.get("op") or "?") for doc in merged})
        print(f"aggregate: {len(merged)} docs -> {args.out} (SKUs={skus} backends={backs})")
        return 0

    # Explode mode: one JSON file per document.
    os.makedirs(args.explode, exist_ok=True)
    written = 0
    for _src, doc in _iter_docs(args.in_dir):
        stem = doc.get("artifact_name")
        if not stem:
            runner = doc.get("runner", "x")
            backend = doc.get("backend", doc.get("op", "x"))
            phase = doc.get("phase", "na")
            stamp = doc.get("generated_at", "")
            stem = f"{runner}_{backend}_{phase}_{stamp}".replace(":", "-")
        with open(os.path.join(args.explode, f"{stem}.json"), "w") as fh:
            json.dump(doc, fh)
        written += 1
    print(f"explode: wrote {written} per-doc JSON to {args.explode}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def _p(r, op, pct):
    """Return percentile ``pct`` of ``op`` from a row.

    Supports the v4 nested layout (``{op: {"p50": ...}}``) and the v3 flat layout
    (``{"<op>_us_<pct>": ...}``); returns None when the value is absent.
    """
    nested = r.get(op)
    if isinstance(nested, dict):
        return nested.get(pct)
    return r.get(f"{op}_us_{pct}")


def load(results_dir):
    """Load every MoE result JSON under ``results_dir`` (recursively) into flat series dicts.

    Skipped, not fatal: ``env_*`` files, unreadable or malformed JSON, documents whose top
    level is not an object, documents with ``family != "moe"`` or no ``rows``, and individual
    rows that are not objects or lack ``tokens_per_rank``. A document left with no keyed row
    is dropped.

    Each returned series carries the identity fields the analyses match on (sku, ep, phase,
    mode, dtype, contract, routing, ...) plus ``rows`` keyed by ``tokens_per_rank``.
    """
    series = []
    pattern = os.path.join(results_dir, "**", "*.json")
    for path in sorted(glob.glob(pattern, recursive=True)):
        if os.path.basename(path).startswith("env_"):
            continue
        try:
            with open(path, encoding="utf-8") as fh:
                d = json.load(fh)
        except (OSError, ValueError):  # JSONDecodeError / UnicodeDecodeError are ValueErrors
            continue
        if not isinstance(d, dict) or d.get("family") != "moe" or not d.get("rows"):
            continue
        # Any of these sub-objects may be present but null in a result doc.
        sh = d.get("shape") or {}
        v = d.get("validity") or {}
        profile = d.get("resource_profile") or {}
        rows = {r["tokens_per_rank"]: r for r in d["rows"]
                if isinstance(r, dict) and "tokens_per_rank" in r}
        if not rows:
            continue
        eplb_on = (d.get("eplb") or {}).get("enabled")
        series.append({
            "sku": (d.get("runner") or "?").split("_")[0].split("-")[0],
            "ep": d.get("ep_size"), "phase": d.get("phase"), "mode": d.get("mode", "normal"),
            "dtype": sh.get("dispatch_dtype"), "contract": d.get("measurement_contract"),
            "routing": (sh.get("routing") or "?") + ("+eplb" if eplb_on else ""),
            "topo": d.get("topology_class"), "resource": d.get("resource_mode", "tuned"),
            # placement + publication/anomaly state (goal P2 placement penalty / P2-o LL gating).
            "placement": (d.get("placement") or {}).get("kind", "packed"),
            "pub": d.get("publication_status") or "legacy",
            "anomaly_free": v.get("anomaly_free", True),
            "hidden": sh.get("hidden"), "topk": sh.get("topk"), "experts": sh.get("experts"),
            # resource-Pareto axis: achieved comm-fraction + class; fixed-kernel runs are
            # excluded from the Pareto by resource_pareto().
            "resource_class": profile.get("resource_class"),
            "achieved_fraction": profile.get("achieved_fraction"),
            "pareto_eligible": profile.get("pareto_eligible"),
            "fixed_kernel": profile.get("fixed_kernel", False),
            "rows": rows,
        })
    return series
def resource_pareto(series):
    """Build latency-vs-achieved-comm-fraction curves per (sku, phase, dtype, T).

    Only normal-mode, uniform-routing, ``layout-and-dispatch-v1`` series that are not
    fixed-kernel and that report an ``achieved_fraction`` contribute. For each cell with at
    least two distinct fractions (rounded to 4 places) the result holds the sorted curve of
    dispatch p50/p99 and the marginal efficiency between adjacent points, expressed as
    microseconds of p50 change per +0.1 of comm fraction (negative = faster).
    """
    # (sku, phase, dtype, T) -> {achieved_fraction: (p50, p99, resource_class)}
    cells = defaultdict(dict)
    for s in series:
        comparable = (s["mode"] == "normal" and s["routing"] == "uniform"
                      and s["contract"] == "layout-and-dispatch-v1")
        if not comparable:
            continue
        if s.get("fixed_kernel"):
            continue  # fixed-kernel runs are not part of the Pareto
        frac = s.get("achieved_fraction")
        if frac is None:
            continue
        for T, row in s["rows"].items():
            p50 = _p(row, "dispatch", "p50")
            p99 = _p(row, "dispatch", "p99")
            if not p50:
                continue
            cells[(s["sku"], s["phase"], s["dtype"], T)][round(frac, 4)] = (
                round(p50, 1), round(p99 or 0, 1), s["resource_class"])

    report = []
    for (sku, phase, dtype, T), points in cells.items():
        if len(points) < 2:
            continue  # a curve needs at least two fractions
        fracs = sorted(points)
        curve = []
        for f in fracs:
            p50, p99, rclass = points[f]
            curve.append({"achieved_fraction": f, "dispatch_p50": p50, "dispatch_p99": p99,
                          "resource_class": rclass})
        marginal = []
        for lo, hi in zip(fracs, fracs[1:]):
            dlat = points[hi][0] - points[lo][0]
            dfr = hi - lo
            if dfr > 0:
                marginal.append({"from_frac": lo, "to_frac": hi,
                                 "us_per_0.1frac": round(dlat / dfr * 0.1, 2)})
        report.append({"sku": sku, "phase": phase, "dtype": dtype, "T": T,
                       "n_points": len(fracs), "curve": curve, "marginal": marginal})
    return report
def model_envelope(series, here):
    """Place each model-derived workload from ``<here>/configs/workloads.yaml`` on the measured envelope.

    A model whose ``(hidden, topk, routed_experts)`` equals the ``(hidden, topk, experts)`` of
    a measured normal-mode, uniform-routing series is reported as ``measured-via-proxy`` with
    the SKUs it was measured on; otherwise it is ``projected``.

    Returns one dict per model. If PyYAML is unavailable, the file cannot be read or parsed,
    or its top level is not a mapping, returns a single ``{"note": ...}`` entry instead of
    raising. An empty file yields an empty list.
    """
    path = os.path.join(here, "configs", "workloads.yaml")
    try:
        import yaml
        with open(path) as fh:
            wl = yaml.safe_load(fh)
    except Exception as exc:
        # Best-effort by design: the reason is surfaced in the report, not swallowed.
        return [{"note": f"workloads.yaml unreadable: {exc!r}"}]
    if wl is None:
        wl = {}  # safe_load returns None for an empty document
    if not isinstance(wl, dict):
        return [{"note": f"workloads.yaml unreadable: top-level {type(wl).__name__}, "
                         "expected a mapping"}]
    measured = {}
    for s in series:
        if s["hidden"] and s["routing"] == "uniform" and s["mode"] == "normal":
            measured.setdefault((s["hidden"], s["topk"], s["experts"]), []).append(s["sku"])
    out = []
    for name, m in (wl.get("model_derived") or {}).items():
        dims = (m.get("hidden"), m.get("topk"), m.get("routed_experts"))
        skus = measured.get(dims)
        out.append({"model": name, "hidden": dims[0], "topk": dims[1], "routed_experts": dims[2],
                    "dispatch_dtype": m.get("dispatch_dtype"), "combine_dtype": m.get("combine_dtype"),
                    "kind": m.get("kind"), "verify": m.get("verify"),
                    "envelope_placement": ("measured-via-proxy" if skus else "projected"),
                    "measured_on": sorted(set(skus)) if skus else [],
                    "note": ("dims match the measured synthetic envelope — read its curve directly"
                             if skus else "no run at these dims — projected onto the synthetic envelope")})
    return out
def _key(s, *fields):
    """Return ``(s[f] for f in fields)`` as a tuple, used as a matched-cell lookup key."""
    return tuple(s[f] for f in fields)


def skew_penalty(series):
    """Report dispatch p50/p99 amplification of zipf* routing over the matched uniform run.

    A zipf series is matched to the uniform series with the same (sku, ep, phase, mode,
    dtype, contract). For every tokens/rank value present in both, one entry is emitted with
    ``zipf / uniform`` ratios. A cell is skipped when any of the four percentiles is missing
    or zero, so every emitted entry has numeric ratios.
    """
    cell = ("sku", "ep", "phase", "mode", "dtype", "contract")
    uniform = {_key(s, *cell): s for s in series if s["routing"] == "uniform"}
    out = []
    for s in series:
        if not s["routing"].startswith("zipf"):
            continue
        base = uniform.get(_key(s, *cell))
        if not base:
            continue
        for T in sorted(set(s["rows"]) & set(base["rows"])):
            zp, up = _p(s["rows"][T], "dispatch", "p50"), _p(base["rows"][T], "dispatch", "p50")
            zq, uq = _p(s["rows"][T], "dispatch", "p99"), _p(base["rows"][T], "dispatch", "p99")
            # The zipf side must be checked too: a missing percentile is None and cannot be divided.
            if not (zp and up and zq and uq):
                continue
            out.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"], "routing": s["routing"],
                        "T": T, "p50_amplification": round(zp / up, 3),
                        "p99_amplification": round(zq / uq, 3)})
    return out


def ll_crossover(series):
    """Report the first tokens/rank at which normal mode is faster than LL, per (sku, ep, dtype).

    Two bases are evaluated for each uniform-routing LL series that has a matching normal,
    uniform, ``layout-and-dispatch-v1`` series:
      * ``dispatch``  -> basis ``isolated-kernel`` (always evaluated);
      * ``roundtrip`` -> basis ``measured-roundtrip``, reported as
        ``excluded-ll-roundtrip-anomaly`` when the LL series is not ``anomaly_free``.
    For each basis and each of p50/p99, ``normal_faster_at_T`` is the smallest shared T where
    the normal value is below the LL value, or ``never-in-range`` if there is none.
    """
    # The normal-mode lookup does not depend on the op, so build it once.
    norm = {_key(s, "sku", "ep", "dtype"): s for s in series
            if s["mode"] == "normal" and s["routing"] == "uniform"
            and s["contract"] == "layout-and-dispatch-v1"}
    out = []
    for op in ("dispatch", "roundtrip"):
        basis = "isolated-kernel" if op == "dispatch" else "measured-roundtrip"
        for s in series:
            if s["mode"] != "ll" or s["routing"] != "uniform":
                continue
            n = norm.get(_key(s, "sku", "ep", "dtype"))
            if not n:
                continue
            gated = (op == "roundtrip" and not s.get("anomaly_free", True))
            for stat in ("p50", "p99"):
                cross = None
                if not gated:
                    for T in sorted(set(s["rows"]) & set(n["rows"])):
                        ll, nm = _p(s["rows"][T], op, stat), _p(n["rows"][T], op, stat)
                        if ll and nm and nm < ll:
                            cross = T
                            break
                if gated:
                    verdict = "excluded-ll-roundtrip-anomaly"
                elif cross is not None:
                    verdict = cross
                else:
                    verdict = "never-in-range"
                out.append({"sku": s["sku"], "ep": s["ep"], "dtype": s["dtype"], "stat": stat,
                            "basis": basis, "normal_faster_at_T": verdict})
    return out
def placement_penalty(series):
    """Compare every non-packed placement against ``packed`` at matched cells.

    Cells are (sku, phase, dtype, ep, routing) over normal-mode ``layout-and-dispatch-v1``
    series. For each shared tokens/rank value where both dispatch p50 values are present,
    one entry reports the absolute and percentage p50 delta plus each side's
    ``locality.cross_domain_fraction``. Cells with no ``packed`` series, or with only one
    placement, produce nothing.
    """
    by_cell = defaultdict(dict)
    for s in series:
        if s["mode"] != "normal" or s["contract"] != "layout-and-dispatch-v1":
            continue
        by_cell[(s["sku"], s["phase"], s["dtype"], s["ep"], s["routing"])][s["placement"]] = s

    penalties = []
    for (sku, phase, dtype, ep, routing), places in by_cell.items():
        if "packed" not in places or len(places) < 2:
            continue
        packed = places["packed"]
        for kind, other in places.items():
            if kind == "packed":
                continue
            for T in sorted(set(other["rows"]) & set(packed["rows"])):
                packed_row = packed["rows"][T]
                other_row = other["rows"][T]
                a = _p(packed_row, "dispatch", "p50")
                b = _p(other_row, "dispatch", "p50")
                if not (a and b):
                    continue
                la = (packed_row.get("locality") or {}).get("cross_domain_fraction")
                lb = (other_row.get("locality") or {}).get("cross_domain_fraction")
                penalties.append({"sku": sku, "phase": phase, "dtype": dtype, "ep": ep,
                                  "routing": routing, "placement": kind, "T": T,
                                  "packed_p50": round(a, 1), f"{kind}_p50": round(b, 1),
                                  "abs_penalty_us": round(b - a, 1),
                                  "penalty_pct": round(100 * (b - a) / a, 1),
                                  "cross_domain_frac_packed": la,
                                  "cross_domain_frac_other": lb})
    return penalties


def topology_penalty(series):
    """Compare dispatch p50 between the smallest and largest EP size of the same cell.

    Cells are (sku, phase, dtype) over normal-mode, uniform-routing
    ``layout-and-dispatch-v1`` series. For every tokens/rank value both EP sizes share and
    where both p50 values are present, the entry gives each p50 and the percentage change
    from the smaller to the larger EP.
    """
    by_cell = defaultdict(dict)
    for s in series:
        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
            by_cell[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s

    penalties = []
    for (sku, phase, dtype), eps in by_cell.items():
        if len(eps) < 2:
            continue
        lo, hi = min(eps), max(eps)
        small, large = eps[lo], eps[hi]
        for T in sorted(set(small["rows"]) & set(large["rows"])):
            a = _p(small["rows"][T], "dispatch", "p50")
            b = _p(large["rows"][T], "dispatch", "p50")
            if not (a and b):
                continue
            penalties.append({"sku": sku, "phase": phase, "dtype": dtype, "T": T,
                              f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
                              "penalty_pct": round(100 * (b - a) / a, 1)})
    return penalties
def scaling(series):
    """List dispatch p50 per (sku, phase, ep, tokens/rank) for cells measured at >=2 EP sizes.

    Only normal-mode, uniform-routing ``layout-and-dispatch-v1`` series are considered.
    Each measured point is emitted twice with identical values: under ``weak`` (read by
    fixed tokens/rank) and under ``strong`` (read by fixed ``global_tokens = T * ep``).
    """
    out = {"strong": [], "weak": []}
    by_cell = defaultdict(dict)
    for s in series:
        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
            by_cell[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
    for (sku, phase, _dtype), eps in by_cell.items():
        if len(eps) < 2:
            continue
        for ep, s in eps.items():
            for T, row in s["rows"].items():
                d50 = _p(row, "dispatch", "p50")
                if not d50:
                    continue
                latency = round(d50, 1)
                out["weak"].append({"sku": sku, "phase": phase, "ep": ep, "tokens_per_rank": T,
                                    "global_tokens": T * ep, "dispatch_p50": latency})
                out["strong"].append({"sku": sku, "phase": phase, "ep": ep, "global_tokens": T * ep,
                                      "tokens_per_rank": T, "dispatch_p50": latency})
    return out


def scaling_efficiency(series):
    """Compute weak/strong scaling efficiency between the smallest and largest EP of a cell.

    Cells are (sku, phase, dtype) over normal-mode, uniform-routing
    ``layout-and-dispatch-v1`` series.

    * weak: same tokens/rank on both EP sizes; efficiency = p50(lo) / p50(hi), so 1.0 means
      flat latency and >1 means the larger EP was faster.
    * strong: same GLOBAL tokens; the ideal latency at the larger EP is ``p50(lo) * lo / hi``
      and efficiency = ideal / observed. Only token counts that map to a whole tokens/rank
      value at the larger EP are compared, so both sides really carry the same global tokens.
    """
    out = {"weak": [], "strong": []}
    by_cell = defaultdict(dict)
    for s in series:
        if s["routing"] == "uniform" and s["mode"] == "normal" and s["contract"] == "layout-and-dispatch-v1":
            by_cell[(s["sku"], s["phase"], s["dtype"])][s["ep"]] = s
    for (sku, phase, _dtype), eps in by_cell.items():
        if len(eps) < 2:
            continue
        lo, hi = min(eps), max(eps)
        lo_rows, hi_rows = eps[lo]["rows"], eps[hi]["rows"]
        # weak: same tokens/rank T on both EP -> latency should stay flat
        for T in sorted(set(lo_rows) & set(hi_rows)):
            a, b = _p(lo_rows[T], "dispatch", "p50"), _p(hi_rows[T], "dispatch", "p50")
            if a and b:
                out["weak"].append({"sku": sku, "phase": phase, "tokens_per_rank": T,
                                    f"ep{lo}": round(a, 1), f"ep{hi}": round(b, 1),
                                    "weak_efficiency": round(a / b, 3)})
        # strong: same GLOBAL tokens -> the larger EP has fewer tokens/rank
        for Tlo in lo_rows:
            gt = Tlo * lo
            if gt % hi:
                # Flooring would pair this cell with a different global token count.
                continue
            Thi = gt // hi
            if Thi not in hi_rows:
                continue
            a, b = _p(lo_rows[Tlo], "dispatch", "p50"), _p(hi_rows[Thi], "dispatch", "p50")
            if a and b:
                ideal = a * (lo / hi)
                out["strong"].append({"sku": sku, "phase": phase, "global_tokens": gt,
                                      f"ep{lo}_p50": round(a, 1), f"ep{hi}_p50": round(b, 1),
                                      "strong_efficiency": round(ideal / b, 3)})
    return out
def regressions(series, baseline_series, thresh=0.10):
    """Flag latency regressions of ``series`` against ``baseline_series``.

    A current series is compared only with the baseline series sharing its (sku, ep, phase,
    mode, dtype, contract, routing), and only at tokens/rank values present in both. For each
    of dispatch/combine/roundtrip and p50/p99, an entry is emitted when both values are
    present and ``current > baseline * (1 + thresh)``.
    """
    cell = ("sku", "ep", "phase", "mode", "dtype", "contract", "routing")
    baseline_by_cell = {_key(b, *cell): b for b in baseline_series}
    flagged = []
    for s in series:
        ref = baseline_by_cell.get(_key(s, *cell))
        if not ref:
            continue
        shared = sorted(set(s["rows"]) & set(ref["rows"]))
        for T in shared:
            for op in ("dispatch", "combine", "roundtrip"):
                for stat in ("p50", "p99"):
                    cur = _p(s["rows"][T], op, stat)
                    base = _p(ref["rows"][T], op, stat)
                    if not (cur and base):
                        continue
                    if cur > base * (1 + thresh):
                        flagged.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"],
                                        "routing": s["routing"], "T": T, "op": op, "stat": stat,
                                        "baseline": round(base, 1), "current": round(cur, 1),
                                        "regression_pct": round(100 * (cur - base) / base, 1)})
    return flagged
def distribution_summary(series, results_dir):
    """Assemble the distribution-sensitivity summary block.

    * ``sensitivity``: taken from ``tests/sensitivity.py`` (``analyze(results_dir)["groups"]``),
      keeping groups whose ``distribution_sensitivity_ratio`` is not None. If that module
      cannot be imported or run, the list is empty and ``sensitivity_error`` holds the repr
      of the exception.
    * ``balanced_high_fanout_penalty``: dispatch p99 of ``balanced`` routing divided by the
      matched uniform run's p99, per shared tokens/rank.
    * ``activation_profile_penalty`` / ``quant_combine_penalty``: fixed placeholder records
      marked ``blocked-on-quant-combine``.
    """
    summary = {"note": "ratios = p99(distribution) / p99(uniform) at matched tokens/rank"}

    # worst / zipf / EPLB recovery come straight from tests/sensitivity.py.
    try:
        import sys as _sys
        tests_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tests")
        _sys.path.insert(0, tests_dir)
        import sensitivity as _sens
        kept = []
        for g in _sens.analyze(results_dir)["groups"]:
            if g["distribution_sensitivity_ratio"] is None:
                continue
            kept.append({"sku": g["sku"], "backend": g["backend"], "phase": g["phase"],
                         "worst": g["worst_distribution"],
                         "worst_ratio": g["distribution_sensitivity_ratio"],
                         "best_case": g["best_case_ratio"], "eplb_recovery": g["eplb_recovery"],
                         "per_distribution": g["per_distribution"]})
        summary["sensitivity"] = kept
    except Exception as exc:
        summary["sensitivity"] = []
        summary["sensitivity_error"] = repr(exc)

    # balanced (high-fanout) penalty: balanced p99 / uniform p99 (a distinct stressor from zipf).
    cell = ("sku", "ep", "phase", "mode", "dtype", "contract")
    uniform = {_key(s, *cell): s for s in series if s["routing"] == "uniform"}
    balanced = []
    for s in series:
        if s["routing"] != "balanced":
            continue
        ref = uniform.get(_key(s, *cell))
        if not ref:
            continue
        for T in sorted(set(s["rows"]) & set(ref["rows"])):
            up = _p(ref["rows"][T], "dispatch", "p99")
            bp = _p(s["rows"][T], "dispatch", "p99")
            if up and bp:
                balanced.append({"sku": s["sku"], "ep": s["ep"], "phase": s["phase"], "T": T,
                                 "balanced_p99_penalty": round(bp / up, 3)})
    summary["balanced_high_fanout_penalty"] = balanced

    # Placeholders: recorded as blocked rather than reported with invented numbers.
    summary["activation_profile_penalty"] = {
        "status": "blocked-on-quant-combine",
        "note": "activation VALUE distribution is latency-neutral under bf16 combine; needs a "
                "quantized (value-sensitive) combine kernel (ROCm/MoRI PR311) to measure"}
    summary["quant_combine_penalty"] = {
        "status": "blocked-on-quant-combine",
        "note": "no quantized combine kernel wired (combine_quant_mode=none everywhere); the rig "
                "(combine_quant_mode field + capability gate + suite) is ready for when it lands"}
    return summary
def recommendations(series):
    """Pick, per (sku, phase), the config with the lowest dispatch p99 at the headline T.

    The headline tokens/rank is 64 for ``decode`` and 256 for any other phase. Series with
    no row (or no dispatch p99) at that T are ignored; a (sku, phase) with no candidate
    produces no entry. The config label is ``dtype/mode/contract/routing/resource``.
    """
    grouped = defaultdict(list)
    for s in series:
        grouped[(s["sku"], s["phase"])].append(s)

    picks = []
    for (sku, phase), members in grouped.items():
        T = 64 if phase == "decode" else 256
        cands = []
        for s in members:
            row = s["rows"].get(T)
            if not row:
                continue
            q = _p(row, "dispatch", "p99")
            if not q:
                continue
            label = f"{s['dtype']}/{s['mode']}/{s['contract']}/{s['routing']}/{s['resource']}"
            cands.append((q, label, s["ep"]))
        if not cands:
            continue
        best_q, best_label, best_ep = sorted(cands)[0]
        picks.append({"sku": sku, "phase": phase, "at_T": T,
                      "lowest_p99_dispatch_us": round(best_q, 1),
                      "config": best_label, "ep": best_ep})
    return picks
resource_pareto(s), + "recommendations": recommendations(s)} + if a.baseline: + regs = regressions(s, load(a.baseline)) + rep["regressions"] = regs + print(f"regressions vs baseline: {len(regs)} cell(s) > +10%") + print(f"loaded {len(s)} series") + sk = rep["skew_penalty"] + if sk: + worst = max(sk, key=lambda x: x["p99_amplification"]) + print(f"skew penalty: {len(sk)} cells; worst p99 amplification {worst['p99_amplification']}x " + f"({worst['sku']} {worst['routing']} T{worst['T']})") + tp = rep["topology_penalty"] + if tp: + print(f"topology penalty (EP4->EP8): {len(tp)} cells; e.g. " + + ", ".join(f"{x['sku']} T{x['T']} {x['penalty_pct']:+}%" for x in tp[:3])) + rpar = rep["resource_pareto"] + print(f"resource-Pareto cells (>=2 fractions, fixed-kernel excluded): {len(rpar)}" + + (f"; e.g. {rpar[0]['sku']} T{rpar[0]['T']} {rpar[0]['n_points']} pts" if rpar else " (need an sm_fraction ladder)")) + print(f"LL crossover cells: {len(rep['ll_crossover'])}; recommendations: {len(rep['recommendations'])}") + for r in rep["recommendations"]: + print(f" rec {r['sku']}/{r['phase']} @T{r['at_T']}: {r['lowest_p99_dispatch_us']}us via {r['config']}") + if a.out: + json.dump(rep, open(a.out, "w"), indent=2) + print(f"wrote {a.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/cohort.py b/experimental/CollectiveX/cohort.py new file mode 100644 index 000000000..96f31f322 --- /dev/null +++ b/experimental/CollectiveX/cohort.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 +"""CollectiveX publication-cohort builder + validator (goal Part 1: publication cohort manifests, +official-cohort validation, source-SHA pinning; goal Part 2: EPLB mapping identity). + +A *publication cohort* is the set of result artifacts that are meant to be compared on ONE chart — +e.g. the same workload + measurement contract + config across SKUs/backends. 
Unlike `comparison_key` +(which gates a single curve and so INCLUDES topology/sku), a cohort deliberately lets sku / backend / +topology VARY (those are the independent variable) while requiring everything that must be identical +for the comparison to be fair to actually match: + + cohort_key = (mode, phase, ep_size, resource_mode, comparison_class, measurement_contract, + dispatch_dtype, kernel_gen, activation_profile, combine_quant_mode, trace_signature) + +For each cohort this tool emits a MANIFEST listing every member with its identity fingerprint +(source SHA, workload id, image digest, backend version, schema version) and decides whether the +cohort is OFFICIAL-eligible. A cohort is official only when every member is itself measurement-sound +and the dimensions that MUST match across hardware do: + + * one benchmark source SHA (goal P1 "same benchmark source SHA"; --pin-sha enforces) + * non-null + identical workload_id (goal P1 "non-null workload identity") + * identical trace_signature (same realized routing bytes — by cohort_key construction) + * identical EPLB mapping_hash (goal P2 "matching EPLB mapping identity") when EPLB is on + * no unresolved timing anomalies (goal P1 anomaly gate) + * complete provenance per member (image digest + git run) + +Rejected members are recorded WITH machine-readable reasons (goal P1 "store rejected artifacts with +explicit rejection reasons") rather than silently dropped. 
# Minimum pooled samples every row of a member must have for the member to be official.
MIN_SAMPLES_OFFICIAL = 100


def _backend_version(doc: dict) -> str:
    """Return the backend version recorded in doc["backend_provenance"], or "unknown".

    The first non-empty value among deepep_commit, deepep_version and mori_commit wins.
    """
    p = doc.get("backend_provenance", {}) or {}
    return (p.get("deepep_commit") or p.get("deepep_version")
            or p.get("mori_commit") or "unknown")


def fingerprint(doc: dict, path: str) -> dict:
    """Per-artifact identity used to detect cohort mismatches + build the cohort id.

    Args:
        doc: one parsed result artifact (a JSON object).
        path: the file it was read from; only the basename is kept.

    Returns:
        A flat dict of identity fields (see the keys below). Missing or null sections are
        treated as empty, so this never raises on a sparse artifact.
    """
    sh = doc.get("shape", {}) or {}
    q = sh.get("quant", {}) or {}
    wl = doc.get("workload", {}) or {}
    repro = doc.get("reproduction", {}) or {}
    gr = repro.get("git_run") or {}
    eplb = doc.get("eplb") or {}
    v = doc.get("validity", {}) or {}
    # An explicit "rows": null is treated like a missing list instead of raising on iteration.
    rows = doc.get("rows") or []
    return {
        "file": os.path.basename(path),
        # SKU = runner name up to the first "_" and then the first "-" (e.g. "h100-dgxc_0" -> "h100").
        "sku": (doc.get("runner") or "?").split("_")[0].split("-")[0],
        "backend": doc.get("backend"), "mode": doc.get("mode"), "phase": doc.get("phase"),
        "ep_size": doc.get("ep_size"), "resource_mode": doc.get("resource_mode"),
        "comparison_class": doc.get("comparison_class"),
        "measurement_contract": doc.get("measurement_contract"),
        "dispatch_dtype": sh.get("dispatch_dtype"),
        "kernel_gen": sh.get("kernel_gen") or ("v1" if doc.get("backend") == "deepep" else "n-a"),
        "activation_profile": sh.get("activation_profile", "normal"),
        "combine_quant_mode": q.get("combine_quant_mode", "none"),
        "trace_signature": wl.get("trace_signature") or (doc.get("routing_identity") or {}).get("trace_signature"),
        "workload_id": wl.get("workload_id"),
        "workload_source": wl.get("source"),
        "source_sha": (gr.get("source_sha") or ""),
        "image_digest": (repro.get("image_digest") or ""),
        "backend_version": _backend_version(doc),
        "schema_version": doc.get("schema_version"),
        "publication_status": doc.get("publication_status") or "legacy",
        "anomaly_free": v.get("anomaly_free", True),
        "provenance_complete": v.get("provenance_complete", False),
        "eplb_enabled": bool(eplb.get("enabled")),
        "eplb_mapping_hash": eplb.get("mapping_hash"),
        # A null samples_pooled counts as 0 samples (it then fails the official gate) rather
        # than raising TypeError when compared against an int.
        "min_samples": min((r.get("samples_pooled") or 0 for r in rows), default=0),
        "correct": all(r.get("correct") for r in rows) if rows else False,
    }


def cohort_key(fp: dict) -> tuple:
    """Identity a cohort's members must share. sku/backend/topology deliberately EXCLUDED — those
    are what a cross-hardware chart compares."""
    return (fp["mode"], fp["phase"], fp["ep_size"], fp["resource_mode"], fp["comparison_class"],
            fp["measurement_contract"], fp["dispatch_dtype"], fp["kernel_gen"],
            fp["activation_profile"], fp["combine_quant_mode"], fp["trace_signature"])


def cohort_id(members: list) -> str:
    """Stable content hash of the cohort: encodes every member's (source SHA, workload id, image
    digest, backend version, schema version) — goal P1 'cohort IDs that encode ...'.

    Member strings are sorted before hashing, so the id does not depend on member order.
    Returns the first 16 hex characters of the SHA-256 digest.
    """
    parts = sorted(f"{m['sku']}|{m['backend']}|{m['source_sha']}|{m['workload_id']}|"
                   f"{m['image_digest']}|{m['backend_version']}|{m['schema_version']}" for m in members)
    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]


def evaluate_cohort(members: list, pin_sha: bool) -> dict:
    """Split members into the OFFICIAL subset (accepted) + the rest (rejected, with reasons).

    A non-canonical (wid=null / seeded-runtime) member is REJECTED from the official cohort but
    does NOT block it — that is the point of recording rejections. official_eligible then depends
    on the ACCEPTED subset being mutually consistent (one source SHA under --pin-sha, one workload_id,
    one EPLB mapping), NOT on there being zero rejected members. A seeded run of the same config
    shares the deterministic trace_signature, so it lands in the same cohort and is simply excluded.

    Args:
        members: fingerprint dicts (see fingerprint()) that share one cohort_key.
        pin_sha: when true, accepted members must all carry the same source SHA.

    Returns:
        The cohort manifest: id, member/SKU/backend summaries, the accepted and rejected member
        lists (each member gains a "rejection_reasons" list), the "incoherent" reasons and the
        resulting "official_eligible" flag.
    """
    rejected, accepted = [], []
    for m in members:
        reasons = []  # PER-MEMBER gates only
        # publication_status is machine-derived from ALL validity dims (correctness, workload
        # identity, measurement + RESOURCE conformance, provenance, anomalies). Only an 'official'
        # member belongs in an official cohort — this is the authoritative gate; the granular
        # checks below just enrich the rejection reason (e.g. a resource-nonconforming MoRI run is
        # 'diagnostic' and excluded here even though it is correct + canonical + provenance-complete).
        if m["publication_status"] != "official":
            reasons.append(f"publication_status={m['publication_status']} (official cohort needs 'official')")
        if not m["correct"]:
            reasons.append("a point failed correctness")
        if not m["anomaly_free"]:
            reasons.append("unresolved timing anomaly (not waived)")
        if not m["workload_id"]:
            reasons.append("workload_id is null (not canonical-serialized) — comparable-experimental, not official")
        if m["workload_source"] != "canonical-serialized":
            reasons.append(f"workload_source={m['workload_source']} (official needs canonical-serialized)")
        if not m["provenance_complete"]:
            reasons.append("provenance incomplete (image digest / git run missing)")
        if m["min_samples"] < MIN_SAMPLES_OFFICIAL:
            reasons.append(f"a point has <{MIN_SAMPLES_OFFICIAL} pooled samples")
        (rejected if reasons else accepted).append({**m, "rejection_reasons": reasons})
    # cross-member consistency over the ACCEPTED (would-be-official) subset.
    a_shas = {m["source_sha"] for m in accepted if m["source_sha"]}
    a_wids = {m["workload_id"] for m in accepted if m["workload_id"]}
    a_maps = {m["eplb_mapping_hash"] for m in accepted if m["eplb_enabled"]}
    a_eplb = any(m["eplb_enabled"] for m in accepted)
    incoherent = []
    if pin_sha and len(a_shas) > 1:
        incoherent.append(f"accepted members span {len(a_shas)} source SHAs (--pin-sha requires one)")
    if len(a_wids) > 1:
        incoherent.append(f"accepted members span {len(a_wids)} workload_ids")
    if a_eplb and len(a_maps) > 1:
        incoherent.append(f"accepted members span {len(a_maps)} EPLB mapping_hashes")
    official_eligible = len(accepted) >= 1 and not incoherent
    return {
        "cohort_id": cohort_id(members), "n_members": len(members),
        "skus": sorted({m["sku"] for m in members}),
        "official_skus": sorted({m["sku"] for m in accepted}),
        "backends": sorted({m["backend"] for m in members if m["backend"]}),
        "source_shas": sorted({m["source_sha"] for m in members if m["source_sha"]}),
        "workload_ids": sorted({m["workload_id"] for m in members if m["workload_id"]}),
        "official_source_shas": sorted(a_shas), "official_workload_ids": sorted(a_wids),
        # An EPLB-enabled member may carry a null mapping_hash; a plain sorted() would raise
        # TypeError on None vs str, so order nulls first and compare the rest as strings.
        "eplb_mapping_hashes": sorted(a_maps, key=lambda h: (h is not None, str(h))),
        "any_eplb": a_eplb,
        "official_eligible": official_eligible, "incoherent": incoherent,
        "accepted": accepted, "rejected": rejected,
    }


def build(results_dir: str, pin_sha: bool) -> dict:
    """Scan results_dir recursively for MoE result artifacts and evaluate every cohort.

    Skipped without error: files whose name starts with "env_", files that cannot be read or
    parsed, JSON whose top level is not an object, artifacts that are not family "moe" or have
    no rows, and legacy artifacts without a "publication_status" field.

    Returns:
        A report dict with "results_dir", "pin_sha", "n_cohorts", "n_official_eligible" and
        "cohorts" (official-eligible cohorts first, then by descending member count).
    """
    cohorts = {}
    for f in sorted(glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)):
        if os.path.basename(f).startswith("env_"):
            continue
        try:
            # Context manager closes each handle promptly; a large results tree would
            # otherwise keep one descriptor open per file until garbage collection.
            with open(f, encoding="utf-8") as fh:
                doc = json.load(fh)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(doc, dict):
            continue  # valid JSON but not a result object (e.g. a bare list)
        if doc.get("family") != "moe" or not doc.get("rows"):
            continue
        if "publication_status" not in doc:
            continue  # legacy v3 — not cohort-eligible
        fp = fingerprint(doc, f)
        cohorts.setdefault(cohort_key(fp), []).append(fp)
    out = []
    for ck, members in cohorts.items():
        ev = evaluate_cohort(members, pin_sha)
        # Positions mirror the tuple returned by cohort_key().
        ev["key"] = {"mode": ck[0], "phase": ck[1], "ep_size": ck[2], "resource_mode": ck[3],
                     "comparison_class": ck[4], "measurement_contract": ck[5],
                     "dispatch_dtype": ck[6], "kernel_gen": ck[7], "activation_profile": ck[8],
                     "combine_quant_mode": ck[9], "trace_signature": ck[10]}
        out.append(ev)
    out.sort(key=lambda c: (not c["official_eligible"], -c["n_members"]))
    return {"results_dir": results_dir, "pin_sha": pin_sha, "n_cohorts": len(out),
            "n_official_eligible": sum(1 for c in out if c["official_eligible"]),
            "cohorts": out}


def to_markdown(report: dict) -> str:
    """Render the report from build() as a markdown heading plus one table row per cohort.

    The last column shows the first rejection reason of the first rejected member, truncated
    to 48 characters (empty when nothing was rejected).
    """
    lines = [f"### Publication cohorts ({report['n_cohorts']} cohorts, "
             f"{report['n_official_eligible']} official-eligible; pin_sha={report['pin_sha']})\n\n"
             "| cohort | contract | dtype·act·cq | EP | SKUs | backends | members | official | top rejection |\n"
             "|---|---|---|---|---|---|---|---|---|\n"]
    for c in report["cohorts"]:
        k = c["key"]
        cfg = f"{k['dispatch_dtype']}·{k['activation_profile']}·{k['combine_quant_mode']}"
        rej = ""
        if c["rejected"]:
            rs = c["rejected"][0]["rejection_reasons"]
            rej = (rs[0] if rs else "")[:48]
        lines.append(f"| `{c['cohort_id']}` | {(k['measurement_contract'] or '').replace('-v1','')} | {cfg} | "
                     f"{k['ep_size']} | {','.join(c['skus'])} | {','.join(c['backends'])} | "
                     f"{len(c['accepted'])}✓/{len(c['rejected'])}✗ | {'YES' if c['official_eligible'] else '—'} | {rej} |\n")
    return "".join(lines)


def main() -> int:
    """CLI entry point: build the cohort report, optionally write it as JSON, print the table.

    Returns:
        3 when --require-official is set and no cohort is official-eligible, otherwise 0.
    """
    ap = argparse.ArgumentParser(description="CollectiveX publication-cohort builder/validator")
    ap.add_argument("--results-dir", default="results")
    ap.add_argument("--pin-sha", action="store_true",
                    help="require all members of an official cohort to share one source SHA")
    ap.add_argument("--require-official", action="store_true",
                    help="exit 3 unless at least one cohort is official-eligible")
    ap.add_argument("--out", help="write the full cohort manifest JSON here")
    a = ap.parse_args()
    report = build(a.results_dir, a.pin_sha)
    if a.out:
        os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
        # Context manager so the manifest is flushed and closed before "wrote" is printed.
        with open(a.out, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2, sort_keys=True)
        print(f"wrote {a.out}")
    print(to_markdown(report))
    if a.require_official and report["n_official_eligible"] == 0:
        print("FAIL: no official-eligible cohort (see rejection reasons above)")
        return 3
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+ combine_dtypes: [bf16] + quant_modes: [none] + routings: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, zipf-heavy, + hotspot-single, hotspot-moving, alternating-groups] + eplb: true + activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation] + phase_constraints: + ll: {phases: [decode], max_tokens_per_rank: 128} # LL is a fixed-num_max decode path + required_image: "lmsysorg/sglang:v0.5.11-cu130" + cap_token_per_rank: 4096 # 4 GiB NVL buffer holds ~4096 tok/rank at hidden=7168 + uccl: + vendor: nvidia + modes: [normal, ll] # uccl.ep.Buffer is a DeepEP-API clone + dtypes: [bf16, fp8] # DISPATCH-side precision + contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1] + transports: [nvlink, rdma] + ep_max_intranode: 8 + ep_min: 2 + combine_dtypes: [bf16] + quant_modes: [none] + routings: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, zipf-heavy, + hotspot-single, hotspot-moving, alternating-groups] + eplb: true + activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation] + phase_constraints: + ll: {phases: [decode], max_tokens_per_rank: 128} + required_image: "lmsysorg/sglang:v0.5.11-cu130" + install: "pip install uccl nvidia-cuda-runtime-cu12 (cu12 runtime on LD_LIBRARY_PATH); see cx_build_uccl" + cap_token_per_rank: 4096 + mori: + vendor: amd + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + transports: [xgmi, rdma] + ep_max_intranode: 8 + ep_min: 2 + combine_dtypes: [bf16] # + fp8 when ROCm/MoRI PR311 quant_type combine lands + quant_modes: [none] # + the PR311 mode id once validated + routings: [uniform, balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, zipf-heavy, + hotspot-single, hotspot-moving, alternating-groups] + eplb: true + activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation] + phase_constraints: + normal: {max_tokens_per_rank: 512} # 2 
GiB registerable heap cap at hidden=7168 + required_image: "rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2" + cap_token_per_rank: 512 + fragility: "wedges (D-state) on sustained iters>=200 at T>=32; needs gradual ramp, low iters" + aiter: + vendor: amd + modes: [normal] + dtypes: [bf16, fp8] + contracts: [layout-and-dispatch-v1] + transports: [xgmi, rdma] + ep_max_intranode: 8 + ep_min: 2 + status: "scaffolded — adapter ep_aiter.py not yet implemented (capability declared, not validated)" + required_image: "rocm/sgl-dev (AITER CK MoE EP)" + +# 'all' resolves to a DEFINED per-vendor backend set (NOT the same across vendors). +vendor_backends: + nvidia: [nccl, deepep, uccl] + amd: [rccl, mori] +# Collective primitives (not EP dispatch/combine — phase/dtype/mode/contract N/A). +collective_backends: + nccl: [nvidia] + rccl: [amd] diff --git a/experimental/CollectiveX/configs/platforms.yaml b/experimental/CollectiveX/configs/platforms.yaml new file mode 100644 index 000000000..a25fd97a8 --- /dev/null +++ b/experimental/CollectiveX/configs/platforms.yaml @@ -0,0 +1,118 @@ +# CollectiveX platform registry (goal Part 2). One entry per SKU: hardware capability is +# separated from VALIDATED software capability (what we've actually run green on real HW). +# scale_up_domain = #GPUs reachable over the intra-domain fabric before crossing a tier +# (NVLink island / NVL72 MNNVL tray-group / XGMI). gpus_per_node bounds single-node EP. 
+schema_version: 1 +platforms: + h100: + vendor: nvidia + arch: sm90 + gpu: "H100 80GB HBM3" + gpus_per_node: 8 + scale_up_domain: 8 # single 8-GPU NVLink island + transport_tiers: [nvlink, ib] + runner: h100-8x + launcher: launch_h100-dgxc-slurm.sh + ssh: "sa-shared@100.118.57.65" # partition hpc-gpu-1, /mnt/nfs, exclude hpc-gpu-1-7 + validated: + ep_degrees: [8] + backends: [deepep] + max_intranode_gpus: 8 + internode: false # not yet exercised for EP + h200: + vendor: nvidia + arch: sm90 + gpu: "H200 143GB HBM3e" + gpus_per_node: 8 + scale_up_domain: 8 + transport_tiers: [nvlink, ib] + runner: h200-8x + launcher: launch_h200.sh + ssh: "sa-shared@100.78.55.80" # partition main, /home NFS + validated: + ep_degrees: [8] + backends: [deepep] + max_intranode_gpus: 8 + internode: false + b300: + vendor: nvidia + arch: sm100 + gpu: "B300 SXM6 268GB" + gpus_per_node: 8 + scale_up_domain: 8 + transport_tiers: [nvlink, ib] + runner: b300-nv + launcher: launch_b300.sh + ssh: "sa-shared@100.101.13.83" # partition batch_1, acct benchmark, /data, exclude b300-018 + notes: "Blackwell drops clocks on tiny T -> per-point warm burst (warmup>=30). LL aborts." + validated: + ep_degrees: [8] + backends: [deepep] + modes: [normal] # Blackwell LL aborts on this fabric -> normal-only + max_intranode_gpus: 8 + internode: false + gb300: + vendor: nvidia + arch: sm100 + gpu: "GB300 Grace-Blackwell (aarch64)" + gpus_per_node: 4 # NVL72 compute tray = 4 GPU/node + scale_up_domain: 72 # NVL72 MNNVL: one NVLink P2P domain spans the rack + transport_tiers: [mnnvl, ib] + runner: gb300-8x + launcher: _gb300_ep8.sh + ssh: "2-hop: sa-shared@100.92.114.46 -> im-gb300-login-02" # batch_1, acct benchmark, /data + notes: "EP8 = 2 trays but INTRANODE NVLink path (MNNVL is one domain for <=8 ranks). deep_ep 1.1.0." 
+ validated: + ep_degrees: [4, 8] + backends: [deepep] + max_intranode_gpus: 8 # <=8 ranks use the intranode NVL kernel even across 2 trays + internode: false # internode-normal asserts out until >8 ranks (EP16+) + b200: + vendor: nvidia + arch: sm100 + gpu: "B200 SXM 180GB" + gpus_per_node: 8 + scale_up_domain: 8 + transport_tiers: [nvlink, ib] + runner: b200-dgxc + launcher: launch_b200-dgxc.sh + ssh: "" # GHA self-hosted pool (sku=b200-dgxc); dispatch uses the runner label + notes: "B200 8x NVLink (sibling of B300, sm100). Single-node; normal-only (Blackwell LL aborts)." + validated: + ep_degrees: [8] + backends: [deepep] + modes: [normal] + max_intranode_gpus: 8 + internode: false + gb200: + vendor: nvidia + arch: sm100 + gpu: "GB200 Grace-Blackwell (aarch64)" + gpus_per_node: 4 # NVL72 compute tray = 4 GPU/node + scale_up_domain: 72 # NVL72 MNNVL one NVLink domain + transport_tiers: [mnnvl, ib] + runner: gb200-nv + launcher: launch_gb200-nv.sh + ssh: "" # GHA self-hosted pool (sku=gb200) + notes: "NVL72 sibling of GB300. EP4/EP8 intranode-NVL (<=8 ranks, MNNVL one domain); EP16/32/64 via the multi-tray nodes sweep." + validated: + ep_degrees: [4, 8] + backends: [deepep] + max_intranode_gpus: 8 + internode: false + mi355x: + vendor: amd + arch: gfx950 + gpu: "MI355X CDNA4 256 CU" + gpus_per_node: 8 + scale_up_domain: 8 # single 8-GPU XGMI island + transport_tiers: [xgmi, rdma] + runner: mi355x-8x + launcher: launch_mi355x-amds.sh + ssh: "2-hop bastion -> mia1-vm-amd-prj3-slurm-001" # partition compute, cpus-per-task=128 + notes: "MoRI wedges (D-state) on sustained iters>=200 at T>=32; cap iters. 512-tok buffer cap. No LL/fp8." 
+ validated: + ep_degrees: [8] + backends: [mori] + max_intranode_gpus: 8 + internode: false diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml new file mode 100644 index 000000000..194f5f40c --- /dev/null +++ b/experimental/CollectiveX/configs/suites.yaml @@ -0,0 +1,218 @@ +# CollectiveX named benchmark suites (goal Part 2). A suite binds workloads x platforms x +# backends x modes x contracts x resource regimes x repetitions x required publication level. +# generate_matrix.py resolves a suite against platforms.yaml/backends.yaml capabilities BEFORE +# any GPU is allocated, omitting unsupported combinations with recorded reasons. +schema_version: 1 + +# HEADLINE DISTRIBUTION CONTRACT (goal Part 2 "define one headline distribution"). ONE routing +# profile is the cross-hardware headline; every other distribution is a SENSITIVITY view, never a +# peer headline dimension. plot_ep.py defaults to this (HEADLINE_DISTRIBUTION) and labels the +# sensitivity section as "not the headline". +headline_distribution: + routing: uniform + basis: synthetic # synthetic | fitted | replayed — uniform is the controlled synthetic ref + rationale: >- + uniform is deterministic, controlled, and present on every SKU/backend, so it is the + apples-to-apples cross-hardware reference. balanced / zipf / zipf+eplb / hotspot* are + sensitivity views. Interim load-realism reference = zipf+eplb (skew + the production remedy); + long-term headline will be InferenceX TRACE-REPLAY (captured per-step serving routing) once a + replay loader lands — then `basis` becomes `replayed`. 
+ sensitivity_distributions: [balanced, balanced-rank-local, zipf, zipf-mild, zipf-moderate, + zipf-heavy, hotspot-single, hotspot-moving, alternating-groups] + +suites: + ep-smoke-v1: + description: "fast canary: one small point per platform/backend/mode/contract" + workloads: [ds-like-ref] + platforms: [h100, h200, gb300, gb200, mi355x] + backends: [deepep, mori] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform] + resource_modes: [tuned] + token_points: [8, 64] + trials: 1 + required_publication: comparable-experimental + + ep-nightly-v1: + description: "headline matrix: both contracts, bf16+fp8, normal+LL, decode+prefill" + workloads: [ds-like-ref] + platforms: [h100, h200, b300, b200, gb300, gb200, mi355x] + backends: [deepep, mori] + modes: [normal, ll] + dtypes: [bf16, fp8] + contracts: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1] + routings: [uniform] + resource_modes: [tuned] + phases: [decode, prefill] + trials: 3 + required_publication: official + + ep-models-v1: + description: "model-shape envelope: real MoE dimensions, controlled routing" + workloads: [deepseek-v4, kimi-k2.x, qwen3.5, glm-5, minimax-m3] + platforms: [h100, h200, b300, b200, gb300, gb200, mi355x] + backends: [deepep, mori] + modes: [normal] + dtypes: [fp8, bf16] + contracts: [runtime-visible-v1] + routings: [uniform] + resource_modes: [tuned] + phases: [decode, prefill] + trials: 3 + required_publication: comparable-experimental + + ep-scaling-v1: + description: "strong (fixed global tokens) + weak (fixed tokens/rank) scaling across EP degrees" + workloads: [ds-like-ref] + platforms: [gb300, gb200] # the only SKU with >1 validated EP degree (EP4 + EP8) + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform] + resource_modes: [tuned] + scaling: [strong, weak] + ep_degrees: [4, 8] + trials: 3 + required_publication: comparable-experimental + + ep-topology-v1: 
+ description: "placement sensitivity: packed vs striped vs adversarial on multi-domain SKUs" + workloads: [ds-like-ref] + platforms: [gb300, gb200] # NVL72 tray boundary is the scale-up domain edge + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform, zipf] + placements: [packed, striped, adversarial] + resource_modes: [tuned] + ep_degrees: [8] + trials: 3 + required_publication: comparable-experimental + + ep-distribution-sensitivity-v1: + description: "distribution robustness: ratio p99_worst / p99_headline(uniform) at ANCHOR tokens + only. NOT a chart dimension — collapses to one sensitivity number per (sku,backend,phase) via + tests/sensitivity.py. BF16/normal today; the value (activation) axis is added when the rig lands." + workloads: [ds-like-ref] + platforms: [h100, h200, b300, b200, gb300, gb200, mi355x] + backends: [deepep, mori] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + # headline = uniform; balanced-rank-local = min-comm best case; zipf-heavy/hotspot-single = worst. + routings: [uniform, balanced, balanced-rank-local, zipf, zipf-heavy, hotspot-single] + resource_modes: [tuned] + phases: [decode, prefill] + # ANCHOR points only (not the full ladder) — the suite answers "how fragile", not "the curve". 
+ token_points_decode: [1, 8, 32, 128] + token_points_prefill: [128, 512, 2048] + trials: 3 + required_publication: comparable-experimental + + ep-routing-v1: + description: "routing-skew sensitivity + EPLB remedy" + workloads: [ds-like-ref] + platforms: [h100, h200, b300, b200, gb300, gb200] + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform, balanced, zipf, zipf-mild, zipf-moderate, zipf-heavy, hotspot-single] + eplb: [false, true] + resource_modes: [tuned] + phases: [decode, prefill] + trials: 3 + required_publication: comparable-experimental + + ep-activation-sensitivity-v1: + description: "activation-VALUE sensitivity: same trace under each value profile. Under bf16 + combine the ratio is ~1.0 (value-independent) — the EXPECTED null result that also baselines + the rig for when a quantized (value-sensitive) combine lands. Diagnostic, never headline." + workloads: [ds-like-ref] + platforms: [h100, h200, b300, b200, mi355x] + backends: [deepep, mori] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform] + # the activation value distributions (routing.ACTIVATION_PROFILES). normal = headline. + activation_profiles: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation] + resource_modes: [tuned] + phases: [decode] + token_points: [1, 8, 32, 128] + trials: 3 + required_publication: diagnostic + + ep-quant-combine-sensitivity-v1: + description: "BLOCKED ON PR311 — quantized-combine distribution sensitivity (none/fp8/mxfp8). + The rig is ready (combine_quant_mode field + capability gate + comparison_key fold), but no + quantized combine kernel is wired, so this suite resolves to ZERO valid cases today (capability + rejects combine_quant_mode != none). Kept so the matrix lights up the moment the kernel lands." 
+ workloads: [ds-like-ref] + platforms: [mi355x] + backends: [mori] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform, zipf, hotspot-single] + combine_quant_modes: [none, fp8, mxfp8] # only 'none' resolves valid until PR311 + resource_modes: [tuned] + phases: [decode] + trials: 3 + required_publication: diagnostic + + ep-placement-v1: + description: "placement matrix: packed vs striped vs adversarial. Single-node SKUs make these + identical (all same-node); meaningful once a multi-node EP cohort exists. analyze_ep computes + the packed-vs-striped topology penalty + locality attribution." + workloads: [ds-like-ref] + platforms: [gb300, gb200] # NVL72 tray boundary = the only multi-domain SKU here + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform, zipf] + placements: [packed, striped, adversarial] + resource_modes: [tuned] + ep_degrees: [8] + phases: [decode, prefill] + trials: 3 + required_publication: comparable-experimental + + ep-temporal-v1: + description: "temporal routing: a hot expert that MOVES across decode steps + expert groups that + ALTERNATE. One run per step (--routing-step); analyze across steps. Diagnostic sensitivity view." + workloads: [ds-like-ref] + platforms: [h100, h200] + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [hotspot-moving, alternating-groups] + routing_steps: [0, 1, 2, 3] + resource_modes: [tuned] + phases: [decode] + token_points: [8, 32, 128] + trials: 3 + required_publication: diagnostic + + ep-uneven-tokens-v1: + description: "uneven source-token allocation: per-rank token counts vary (global may not divide + EP); includes the empty-source-rank case. Records source_token_stats (min/mean/max/CV)." 
+ workloads: [ds-like-ref] + platforms: [h100, h200] + backends: [deepep] + modes: [normal] + dtypes: [bf16] + contracts: [layout-and-dispatch-v1] + routings: [uniform] + uneven_tokens: [none, linear, empty-rank] + resource_modes: [tuned] + phases: [decode] + token_points: [8, 32, 128] + trials: 3 + required_publication: diagnostic diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml new file mode 100644 index 000000000..1612c773d --- /dev/null +++ b/experimental/CollectiveX/configs/workloads.yaml @@ -0,0 +1,146 @@ +# CollectiveX workload registry (goal Part 2). Each workload references an IMMUTABLE canonical +# manifest (tests/workload.py -> .npz + .manifest.json). Three kinds: +# synthetic — controlled DeepSeek-like baseline (dims real, routing controlled) +# model-derived — REAL model MoE dimensions with controlled routing (shape != routing behavior) +# trace-replay — captured routing behavior (future; needs a captured trace) +# Model dims marked verify=true must be confirmed against a checked-in model config before any +# result built on them is promoted past 'comparable-experimental'. +schema_version: 1 + +synthetic: + ds-like-ref: + kind: synthetic + hidden: 7168 + topk: 8 + experts: 256 + dispatch_dtype: bf16 + combine_dtype: bf16 + routings: [uniform, balanced, zipf] + note: "Controlled baseline used through v3/v4 (DeepSeek-V3-shaped)." + +model_derived: + # --- PINNED, NAMED model manifests (goal P1 "Add workload manifests"). The "-v1" suffix freezes + # the (hidden, topk, routed_experts) shape behind an immutable name so a published result can cite + # `kimi-k2-v1` and have it mean exactly these dims forever; if a future model rev changes a dim it + # gets a "-v2" manifest, never a silent edit here. These are the names referenced for model-shape + # coverage. The legacy unsuffixed entries below are kept for back-compat with existing suites. 
+ # canonical workload_id folds (hidden, topk, routed_experts) -> identical bytes on every SKU. + deepseek-v3-v1: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 256 + shared_experts: 1 + expert_alignment: 128 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: false # DeepSeek-V3 EP serving shape; identical dims to the ds-like-ref baseline + deepseek-v4-v1: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 256 + shared_experts: 1 + expert_alignment: 128 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: false # matches the validated DSV3/V4 serving shape used on these clusters + minimax-m3-v1: + kind: model-derived + hidden: 6144 + topk: 8 + routed_experts: 256 + shared_experts: 1 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: true + kimi-k2-v1: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 384 + shared_experts: 1 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: true + qwen3.5-v1: + kind: model-derived + hidden: 4096 + topk: 8 + routed_experts: 128 + shared_experts: 0 + dispatch_dtype: bf16 + combine_dtype: bf16 + verify: true + + # --- LEGACY unsuffixed entries (kept for back-compat with ep-models-v1 and analyze_ep envelope + # matching). Prefer the "-v1" names above for new work. deepseek-v4/minimax-m3/qwen3.5 mirror their + # "-v1" shapes exactly; kimi-k2.x == kimi-k2-v1; glm-5 has no "-v1" (not in the goal's manifest set). 
+ deepseek-v4: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 256 + shared_experts: 1 + expert_alignment: 128 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: false # matches the validated DSV3/V4 serving shape used on these clusters + minimax-m3: + kind: model-derived + hidden: 6144 + topk: 8 + routed_experts: 256 + shared_experts: 1 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: true + kimi-k2.x: + kind: model-derived + hidden: 7168 + topk: 8 + routed_experts: 384 + shared_experts: 1 + dispatch_dtype: fp8 + combine_dtype: bf16 + verify: true + glm-5: + kind: model-derived + hidden: 5120 + topk: 8 + routed_experts: 160 + shared_experts: 1 + dispatch_dtype: bf16 + combine_dtype: bf16 + verify: true + qwen3.5: + kind: model-derived + hidden: 4096 + topk: 8 + routed_experts: 128 + shared_experts: 0 + dispatch_dtype: bf16 + combine_dtype: bf16 + verify: true + +# decode vs prefill are workload METADATA, not just token-ladder aliases (goal Part 2). Each point +# is ONE MoE layer · ONE step · a SINGLE dispatch+combine collective pair (NOT a whole model or +# several concurrent layers). The harness emits this as `phase_profile` so a T=128 point launched +# under "prefill" is never silently read as a decode point. 
+phase_profiles: + decode: + token_ladder: [1, 2, 4, 8, 16, 32, 64, 128] + description: "one (or few) tokens per active sequence per step; routing varies step-to-step" + active_sequences: "one batch of active sequences" + tokens_per_iter: "1 (or few) per active sequence" + microbatch_distribution: "one decode step across the active sequences" + routing_variability: "varies step-to-step (use the temporal routing modes to model this)" + represents: "one MoE layer · one decode step · one dispatch+combine collective" + prefill: + token_ladder: [128, 256, 512, 1024, 2048, 4096] + description: "chunked-prefill: many tokens per sequence enter each MoE layer at once" + chunk_size: "the tokens/rank point IS the prefill chunk size entering the MoE layer" + tokens_entering_moe: "chunk_size * ep_size tokens enter one MoE layer at once" + request_mixture: "a single chunked-prefill chunk (no request-mix modelled yet)" + chunked_prefill_behavior: "one chunk per measured point" + represents: "one MoE layer · one prefill chunk · one dispatch+combine collective" diff --git a/experimental/CollectiveX/docs/gated.md b/experimental/CollectiveX/docs/gated.md new file mode 100644 index 000000000..5f69783b9 --- /dev/null +++ b/experimental/CollectiveX/docs/gated.md @@ -0,0 +1,332 @@ +# CollectiveX — gated items: implemented-where-possible, honest blockers otherwise + +This records goal.md items that are **not** completable as real GHA results on the available +NVIDIA fleet today, with the *specific* blocker for each (empirically established, not assumed), +plus what WAS done toward each. Scope: NVIDIA chips (H100, H200, B300; GB300 capacity-limited). + +The container all NVIDIA results run in is `lmsysorg/sglang:v0.5.11-cu130` (CUDA 13.0, NCCL 2.28.9, +torch 2.11; pre-installed: deep_ep 1.2.1, flashinfer 0.6.8, nixl 1.0.1, nvshmem 3.4.5). Established +by an in-container probe on the H200 cluster. 
+ +## EP backends + +### NVIDIA NCCL EP — NOT represented by DeepEP V2; needs its own adapter +Upstream `NVIDIA/nccl` now has a real `contrib/nccl_ep` implementation. It is an NCCL API extension for +MoE dispatch/combine built on NCCL Device API LSA/GIN, and should be treated as its own backend surface, +not as a synonym for DeepEP V2. + +CollectiveX currently keeps these surfaces separate: +- **DeepEP V2**: `backend=deepep`, `shape.kernel_gen=v2`, `deepep_version=2.0.0+...`; this is DeepEP's + ElasticBuffer/dispatch/combine implementation using the NCCL Gin backend. +- **`nccl-ep` baseline in this harness**: a portable token-shuffle implementation using + `torch.distributed.all_to_all_single` over NCCL/RCCL. This is useful as a host-orchestrated baseline, + especially cross-node, but it is **not** upstream `contrib/nccl_ep`. +- **Upstream NCCL EP**: still needs a dedicated adapter/provenance label before CollectiveX can claim + native NCCL EP results. When wired, it must not overwrite either DeepEP V2 or the current + all-to-all baseline identity. + +So the correct comparison is not "NCCL EP = DeepEP V2". DeepEP V2 remains a relevant NCCL-Gin-backed +comparison point, but native NCCL EP needs its own line in the backend/version matrix. + +### UCCL EP — DONE via vendored deep_ep_wrapper (was deferred; the bootstrap is now wired) +`pip install uccl` (prebuilt cp312 wheel) + a cu12 CUDA runtime on `LD_LIBRARY_PATH` (the wheel is +cu12 on a cu13 image) **builds and imports** — the C++ runtime `uccl.ep` loads (pkg-0.1.1), confirmed +on H100 via GHA. BUT the DeepEP-compatible surface is **not** the low-level `uccl.ep.Buffer`: that +constructor is `Buffer(rank, num_ranks, num_nvl_bytes, num_rdma_bytes, low_latency_mode, …)` — it does +NOT take a torch ProcessGroup, and a no-bootstrap construction raises `TypeError: incompatible +function arguments`. 
The DeepEP-identical `Buffer(group, …)` lives in UCCL's separate ~1900-line +`deep_ep_wrapper` package (packaged AS `deep_ep`, so it collides with the container's real DeepEP). +That wrapper's `__init__` runs a non-trivial bootstrap — `get_local_ipc_handle` / `get_local_device_id` +exchanged via `dist.all_gather_object`, `runtime.sync(...)`, CPU `UcclProxy` setup +(`get_cpu_proxies_meta`), and `connect_atomic_buffer` — entangled with UCCL's bench harness `init_dist`. +The wrapper is cleanly vendorable (relative imports + only depends on `uccl.ep`), and that is now +DONE: `cx_build_uccl` git-clones `uccl-project/uccl` at the wheel-matched tag and vendors +`deep_ep_wrapper` under the non-colliding name `uccl_deepep`; `ep_uccl.py` imports its +`Buffer(group, …)` and runs genuine UCCL dispatch/combine. **Validated: `correct=True`, +`uccl_version=0.1.1`, intranode NVLink on h100/h200/b300/b200** (normal bf16+fp8 + LL). If the wrapper +is ever absent the import falls back to the low-level `uccl.ep.Buffer`, which fails loudly (preserved +failed-case) — never faked. Fresh full-sweep re-validation (post idempotent-build fix, which cured the +old per-case-rebuild SIGABRT/timeout): **h200 = 426/426 correct incl LL-mode 32/32** (run 28535235520); +**h100 = 394/394 correct in NORMAL mode** (run 28535226475) **but all 4 LL-mode cases HANG (rc=124, 900s +timeout — 0/32)**. Since the identical UCCL LL code is 32/32 on h200 (same Hopper arch, same wheel), the +h100 LL hang is an **h100-dgxc cluster limitation** (LL uses IBGDA-style low-latency proxies; the +h100-dgxc fabric deadlocks them — consistent with the documented h100-dgxc cross-node IB wall below), +NOT an arch or UCCL-code wall. Both SKUs also fail ONLY the `empty-rank` diagnostic (see empty-rank note +below). Remaining gap: aarch64 GB200/GB300 (the from-source/proxy bootstrap doesn't come up — see the +aarch64 wall below); uccl is x86-single-node so far. 
+ +### NIXL — transfer DONE (container switch); device-EP blocked on UCX GPU Device API +Two distinct things. **(1) NIXL host RDMA transfer** (`nixl_agent.register_memory / get_xfer_descs / +initialize_xfer / transfer`) — the fabric dynamo uses for KV movement — is **WIRED + valid** +(`tests/nixl_transfer.py`, `CX_BENCH=nixl`). It needed a **container switch** (the sglang multiarch +image has no NIXL build deps): `cx_default_image` selects `nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime: +1.3.0-dev.1-cuda13` for `CX_BENCH=nixl`. B300 run 28314858649: NIXL 0.10.1, UCX backend, 2 in-process +agents — dtod-local **94 GB/s**, dtod-remote **24 GB/s** (dtoh/htod hit a NIC dmabuf `ibv_reg_mr Bad +address` limit; GPU↔GPU is the KV-handoff path that matters). + +**(2) NIXL device-EP** (`examples/device/ep`, a DeepEP fork) — the from-source **meson** build. The +container switch was the directive's exact ask ("switch containers and see if it fixes"), and it +**CLEARED the documented Abseil 20220623 blocker**: the dynamo image ships **Abseil 20250814** (meson +subproject) + meson/ninja/pybind11 3.0.2/cmake, and `meson setup` now SUCCEEDS (build-probe +`cx_probe_nixl_ep`, run 28314858649 log). The next blocker is `UCX GPU Device API: NO` (the device-EP +needs UCX's device-initiated GPU put/get API, exposed via UCX's device-API header). **Build attempt +made:** `cx_probe_nixl_ep` now BUILDS UCX from source with `--with-cuda` and points pkg-config at it — +but `meson setup` STILL reports `UCX GPU Device API : NO` (run 28320702204). So it is NOT a missing +build flag: UCX's device API compiles in only with GPUDirect-Async / device-initiated-comm **driver + +hardware** support (IBGDA/GDAKI), a base-platform capability absent here — not a container/build fix. +`nixl_ep_cpp` therefore does not build; the adapter (mirroring `ep_deepep.py`) waits on a platform with +that device-comm support. Evidenced terminal wall.
+ +### FlashInfer EP / TensorRT-LLM NVLink one-sided AllToAll — DONE on H100 + B300 (H200 runner gated) +`flashinfer.comm.MoeAlltoAll` (which LIVES IN `flashinfer.comm.trtllm_moe_alltoall` — it IS the +TRT-LLM "throughput backend" one-sided all-to-all, calling the same `moe_a2a_dispatch`/`moe_a2a_combine` +kernels) builds its MNNVL symmetric workspace over the torch.distributed NCCL group via FlashInfer's +`TorchDistBackend` (no MPI/mpi4py). The cross-rank symmetric buffer uses +`CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR` → `pidfd_getfd` → **CAP_SYS_PTRACE** on x86_64. Empirically: +- **H100 (`h100-dgxc`) + B300 (`b300`):** their enroot/pyxis runner containers **grant** the cap → + FlashInfer EP runs and is **official** (bf16 + the quant dispatch matrix below), decode + prefill. + This is the TRT-LLM NVLink one-sided AllToAll EP — the existing FlashInfer EP results ARE that path + (provenance `backend_lineage = flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll`). + - **H100 intermittent crash (open):** the MoeAlltoAll **construction** succeeds (cap granted), but + ~half of h100 flashinfer cases hit `torch.AcceleratorError: CUDA error: unspecified launch failure` + during dispatch/combine execution (run 28500524185: 21/38 cases; scattered across T/routing, the SAME + config both crashes AND passes → a genuine intermittent, NOT config/pidfd). NOT a per-case IPC reclaim + race either: a between-case `/dev/shm` drop + settle was tested (run 28522872429) and made it WORSE + (in-flight IPC corruption, 21→27 fails). So it's flashinfer MoE-kernel flakiness on Hopper — needs + compute-sanitizer on a live run to root-cause. 
Mitigations shipped: (1) each flashinfer case is + RETRIED up to `CX_FLASHINFER_RETRIES` (default 3) times in the shard loop, dropping the intermediate + failed-case record on a retry-success so the shard isn't polluted; (2) flashinfer is sweep-chunked + (`SLOW_MAX_CASES=12`, smaller than others so the retry budget stays within `--time`), bounded + + PARALLEL so a crash can't take a large shard down. **Retry MEASURED (run 28534841204, retry engaged + — 17 retries in the p3 shard alone): coverage 30/46 configs, 173/173 correct — up from the ~19-24 + baseline but NOT the ~94% a clean-independent-50% model predicts.** The deadlock is severe (1470 + completion-flag-timeout events in that run) and, crucially, CORRELATED within a container: once the + MNNVL barrier state degrades, retries in the same allocation keep timing out, so retry has + diminishing returns (one whole chunk, p1, passed cleanly while p0/p2/p3 degraded). Fuller coverage + would need a fresh container per retry (re-import cost) or much smaller chunks (more GHA jobs) — both + rejected for marginal gain; the real fix is live compute-sanitizer root-cause. Upgrade to 0.6.14 was + also tested (run 28530579787) and did NOT fix it (it was a vLLM-side fix), so bundled wheel + retry + is the shipped path. B300 + GB300 flashinfer are 100% clean (Blackwell), confirming Hopper-kernel. +- **H200 (`h200-dgxc`) runner:** its container **denies** CAP_SYS_PTRACE, so `pidfd_getfd` fails at + MoeAlltoAll **construction** on every rank (`pidfd_getfd(...) errno 1: Operation not permitted`, + deterministic — NOT the h100 intermittent, so retry cannot help). This is a per-runner environment + limitation, NOT a code/hardware gap — the identical adapter is official on H100+B300.
Not + harness-fixable: our launchers pass no `--container-cap-add`/cap flags (caps are the cluster's enroot + default — h100-dgxc grants it, h200-dgxc doesn't), enroot runs unprivileged so the cap isn't grantable + per-job, and `MoeAlltoAll` has **no non-MNNVL transport** to route around it (it IS the MNNVL one-sided + A2A). Documented rather than forcing a security-sensitive `--cap-add SYS_PTRACE` on that shared runner. +- **aarch64 (GB200/GB300):** would use `CU_MEM_HANDLE_TYPE_FABRIC` (no pidfd); GB300 capacity-limited. + +## Precision matrix + +### MXFP8 / NVFP4 dispatch — DONE on FlashInfer EP; MXFP4 dispatch — gated (tile-padded SF) +DeepEP (V1/V2) dispatch accepts **e4m3 fp8 only**. But FlashInfer's A2A is a **dtype-agnostic byte +mover** taking `input_payloads` as a LIST, so a quantized dispatch moves `[q, scale_factor]` and +dequants in `stage()` (UNTIMED preprocessing, cached so the roundtrip measures comm). Using FlashInfer's +own quantize/dequantize kernels, `ep_flashinfer.py` now does **MXFP8** (`mxfp8_quantize`, e4m3 + e8m0 +block-32 — device dequant verified == `mxfp8_dequantize_host`) and **NVFP4** (`fp4_quantize` + +`e2m1_and_ufp8sf_scale_to_float`, e2m1 + e4m3 block-16) dispatch, plus the three e4m3 fp8 scale-layouts. +Coverage by arch (all `correct=True` end-to-end): +- **e4m3 fp8 (×3) + mxfp8:** H100 **and** B300 (e4m3/e8m0 are Hopper-supported). +- **nvfp4:** **B300 (Blackwell) only.** FP4 (e2m1) is a Blackwell-native tensor format; FlashInfer's + fp4 quantize/dequantize does NOT round-trip on Hopper sm90 (validated: nvfp4 `correct=True` on B300, + `correct=False` on H100). `capability.resolve` now gates nvfp4 to Blackwell (`ARCH_ONLY_DTYPES`), so a + Hopper nvfp4 dispatch is cleanly rejected rather than run-and-marked-invalid. 
+- **MXFP4 dispatch — gated:** FlashInfer's `mxfp4_quantize` emits its scale factor in a **tile-padded + `[pad(T,128), H/32]` swizzled layout** with no `is_sf_swizzled_layout=False` option — it does NOT + factor as a per-token `[T, k]` tensor, so it can't be moved through the per-token A2A. (mxfp8 + nvfp4 + both expose a linear per-token SF; mxfp4 alone does not.) The 4-bit MX format is covered in spirit by + nvfp4 (also 4-bit e2m1); mxfp4 specifically stays gated on the quantizer's SF layout. + +### Quantized combine OUTPUT (MXFP8 / NVFP4 combine) — DONE on B300 via flashinfer-main (container switch) +Distinct from quantized *dispatch*: a quantized **combine** emits a non-bf16 reduced output. The bundled +`flashinfer 0.6.8.post1` `moe_a2a_combine` had **no `output_dtype`**, and neither did 0.6.13 (latest +PyPI) nor the cu130 nightly wheel (0.6.13.dev20260612) — `output_dtype`/`output_scales` landed on +flashinfer **main** after those. So `cx_build_flashinfer_latest` BUILDS flashinfer main from source +in-container (after a 7-layer version-coupling peel: cubin↔python↔jit-cache version checks, then +`nvidia-cutlass-dsl` 4.5.2 for the CuTe `OperandMajorMode`, then **uninstalling** the stale precompiled +cubin/jit-cache so `get_moe_alltoall_module()` JIT-compiles the 14-arg kernel fresh from main's csrc). +- **MXFP8 combine — DONE on B300:** `combine(output_dtype=float8_e4m3fn, output_scales=uint8[T,H/32])` = + e4m3 + UE8M0 block-32 (the source-spec'd layout); dequant `e4m3 * 2^(e8m0-127)`. Valid, `correct=True` + ×8 (`backend_provenance.combine_quant=True`, `flashinfer_stack` captured). FP32-accum is the kernel's + internal reduce; scale-transport (e8m0) + tolerance-class (1.6e-1 vs bf16 5e-2) are exercised. +- **NVFP4 combine — DONE on B300:** `output_dtype=uint8 (packed e2m1) + e4m3 vec-16 scales + + output_scalar_scale`; dequant via `e2m1_and_ufp8sf_scale_to_float` (the e4m3 scales viewed as uint8 + ufp8). 
Valid, `correct=True` ×8 (Blackwell-native fp4, like nvfp4 dispatch). +- **H100 combine — build-time-limited (NOT arch):** the ~70-min in-container flashinfer-main source + build exceeds the H100 runner's job budget (SIGTERM). B300's longer budget lets it land. A pre-staged + flashinfer-main wheel (one-time build) would remove the per-run rebuild; deferred. +- **Direct-cast FP8 combine — kernel limit (evidenced, B300 run 28315037266):** ATTEMPTED via + `CX_QC_SCALE=scalar` (`output_dtype=float8_e4m3fn` + `output_scalar_scale`, NO per-block + `output_scales`). The kernel ASSERTS `Check failed: (output.dtype()==payload.dtype()) is false: + output_dtype without output_scales must match payload dtype` — i.e. an fp8 output REQUIRES per-block + `output_scales`; a scalar-only/unscaled direct-cast fp8 combine is **not a supported moe_a2a_combine + mode**. The SCALED mxfp8/nvfp4 outputs are the only fp8/fp4 combine paths. (Also confirmed the nightly + `flashinfer 0.6.13` wheel now carries `output_dtype` — the ~70-min main-source build is no longer + needed for combine-quant.) MoRI fp8_blockwise combine (AMD, PR311) remains a separate AMD path. + +## Topology and rack-scale + +### NVL72 rack-scale EP — DONE up to EP64 via FlashInfer-MNNVL; cross-node-over-IB DONE via nccl-ep +**Within an NVL72 NVLink domain, EP8/16/32/64 are DONE.** The key: DeepEP's NVLink `Buffer(group,nvl,0)` +is intranode-only (≤8 ranks, incl. MNNVL trays → GB300/GB200 EP8 over 2 trays via deepep), BUT +**FlashInfer's MoeAlltoAll MNNVL symmetric workspace SPANS the whole NVL72 NVLink domain** — so +`benchmark=flashinfer nodes=4/8/16` runs EP16/32/64 across 4/8/16 trays. Validated correct=True: +GB300 EP8 (28319504164) + EP16 (28319809968); GB200 EP8 (28319793439, after porting the GB300 EP +multi-srun path into launch_gb200-nv.sh — was nccl-only) + EP16 (28319971335) + EP64 (28319975631, +ep_size=64/world=64). 
EP32 (both SKUs) re-dispatched after a workflow concurrency-group collision +(the group omitted inputs.nodes — fixed). Bounded only by NVL72 tray CAPACITY, not the method. +- **Cross-node over InfiniBand (H200 DONE via nccl-ep; H100 cluster WALLED).** Two layers had to fall: + (1) **Rendezvous:** torch's `env://` TCPStore *and* torchrun's elastic-agent store advertise the + rank-0 management-subnet NodeAddr, which is NOT reachable from a peer rank's enroot container net + namespace (900s connect timeout; runs 28325250919 / 28326334616). Solved with a shared-mount + **FileStore** (`CX_RDZV_FILE`) + a **local NGPUS-process spawn** (no torchrun elastic agent) — the PG + bootstraps through the shared file and NCCL then connects peers over IB. (2) **Data path:** the custom + one-sided RDMA backends do NOT survive cross-node — UCCL's `ibv_reg_mr` fails EINVAL → `free(): + corrupted unsorted chunks` → SIGSEGV (run 28326528672, *after* the rendezvous now forms), DeepEP + normal-internode asserts out — because they need GPUDirect-RDMA peer-memory registration the cluster's + IB HCAs / container don't expose. The portable fix is a transport that host-stages gracefully: + **nccl-ep** (`tests/ep_nccl.py`), the NCCL `all_to_all_single` token-shuffle EP baseline. H200 + nodes=2 / **world=16 over IB**, run 28327088942: **correct=True at every T(1→128)**, disp_p50 + 547–808µs, status=comparable-experimental (single-node world=8 validated first, run 28327013318). + (IBGDA/internode-DeepEP would be a faster one-sided path but needs the driver capability — gated; + nccl-ep is the validated, portable cross-node EP.) + **H100 cross-node — WALLED (correcting an earlier "same path covers H100" overclaim).** The h100 + launcher gained the same `CX_NODES>1` FileStore-rendezvous block (ported from h200; committed), and the + 2-node allocation + per-node container DO come up (run 28446105759: nodes hpc-gpu-1-0/1). 
But the + nccl-ep run reproducibly HANGS to the 900s timeout on BOTH decode and prefill, with no captured evidence + (the `timeout -k` kill pre-empts stderr) — the gloo+NCCL FileStore bringup that auto-detects the right + interface on the h200 fabric does not converge on the hpc-gpu-1 cluster (different inter-node + networking; no SSH to introspect the correct `GLOO/NCCL_SOCKET_IFNAME`). Not a systematic-matrix data + point either: `sweep_matrix` places h100 at `nodes=''` (single-node) only — cross-node ws16 was a + separate goal-182 demo. So h100 single-node EP (all backends @ ws8) is complete; cross-node ws16 stays a + cluster-bringup wall pending interface-level access to that cluster. +- **Cross-node MI355X (goal 183, "if available") — via nccl-ep on RCCL.** MoRI's RDMA registration also + aborts cross-node (SIGABRT, run 28325251742, *after* the rendezvous master is correctly resolved) — + the AMD analogue of UCCL's GPUDirect-RDMA wall. nccl-ep runs on RCCL (identical `all_to_all_single` + API) over a 2-node MI355X allocation with the same FileStore rendezvous (the MI355X multi-srun gained + `CX_RDZV_FILE`; nccl-ep uses a pure rccl PG, sidestepping the gloo `connectFullMesh` 127.0.1.1 alias + too — and `nccl-ep` had to be added to the MI355X launcher's AMD-bench allowlist, else it silently + fell back to MoRI). **DONE:** MI355X nodes=2 / **world=16 over RoCE/IB**, run 28328718973, + **correct=True** T=1→8, disp_p50 345–431µs, status=comparable-experimental. 
+- **DeepEP-hybrid on gb300 WORKS at EP4 AND EP8 (corrected twice); only UCCL aarch64 remains a wall.** + Per-backend re-validation (informed by upstream docs: NVIDIA HybridEP = the Megatron + `moe_flex_dispatcher_backend="hybridep"`, TMA-NVLink + IBGDA, **built for NVL72 rack-scale GB200/GB300**) + overturned the earlier blanket "uccl + deepep-hybrid fail at EP4 and EP8 on Grace-Blackwell" claim: + - **DeepEP-hybrid gb300 EP4 (single-tray) — WORKS.** EP4 sweep (run 28452161275): 30 valid docs, + **169/169 correct**, `max_rel_error=0.0`, `branch=hybrid-ep`. + - **DeepEP-hybrid gb300 EP8 (2-tray, MNNVL) — WORKS.** Run 28480519588: decode **8/8** + prefill **6/6**, + `ws=8 nodes=2 transport=mnnvl`, full T-ladder 128→4096 all `correct=True` (RT p50 374µs@T128 → + 1404µs@T4096). NOT intranode-only (an earlier wrong claim): the only blocker was build PERSISTENCE — + `cx_build_deepep_hybrid` did `build_ext --inplace` under `/tmp/DeepEP_hybrid` + PYTHONPATH, but `/tmp` + does NOT survive across the EP8 multi-srun's separate srun steps (only the pyxis container rootfs does), + so the case-srun saw the bundled mainline `deep_ep` → `no attribute HybridEPBuffer`. Fixed by installing + into site-packages (`pip install`, persists — mirrors deepep-v2), build_ext fallback for EP4. + - **DeepEP-hybrid h100 + h200 (Hopper, EP8 single-node) — WORKS, 212/212 correct each** (runs + 28535221873 / 28535231056, post idempotent-build fix): 43/44 cases valid across the `none` + + `linear` uneven-token distributions, decode+prefill ladders T=8→4096, all `correct=True`. The ONE + failing case (c043) is the `empty-rank` diagnostic (`ep-uneven-tokens-v1`, `required_publication: + diagnostic` — one rank gets ZERO tokens): HybridEP's `set_intra_node_buffers` → `hybrid_ep.cu:81 + cudaDeviceSynchronize` raises `cudaErrorIllegalAddress` on Hopper (identical index c043 on BOTH + SKUs = deterministic-by-config, NOT the flashinfer intermittent nor accumulation). 
Not + retried/chunked: deterministic kernel limit, and the backend already has 212 correct points/SKU. + - **`empty-rank` is a CROSS-BACKEND Hopper diagnostic differentiator (not HybridEP-only).** The same + zero-token-rank case ALSO crashes **UCCL** on Hopper (h100 c073 rc=1, h200 c073) — so of the Hopper + EP backends, deepep-hybrid + uccl fail it while **mainline DeepEP HANDLES it** (verified control: + h100 mainline deepep empty-rank case c073 = valid doc, **3/3 correct**, zero failed records in the + shard). So the empty-rank diagnostic cleanly separates zero-token-rank-robust (mainline DeepEP) from + non-robust (HybridEP, UCCL) EP kernels. It's `required_publication: diagnostic`, one case per + backend, and flips those backends' GHA jobs to "failure" despite full data — judge by the failed-case + record + the 200+ correct points, not the job conclusion. Untested on Blackwell (b300/gb300 hybrid + + uccl suites are `uneven_tokens=none` only, so no Blackwell control exists for empty-rank). + - **UCCL aarch64 (gb300) — WALL (confirmed fresh, the one genuine aarch64 EP wall).** Run 28457032490: + `ModuleNotFoundError: No module named 'uccl.ep'` — the uccl EP extension does not import on aarch64 + Grace-Blackwell (consistent with UCCL-EP docs: NVIDIA/AMD + EFA/IB/Broadcom, no aarch64/Grace). EP4+EP8. + LESSON: a failing run is not proof of a capability wall — both deepep-hybrid claims were wrong; the EP8 + one was a build-env bug, not a hardware limit. Always check the library's actual support before walling. + Both backends work on x86 single-node (uccl b300=126/b200=124; deepep-hybrid h100=212/h200=212/b300=36, + 43/44 cases on Hopper — only the empty-rank diagnostic crashes, see above). deepep + (bundled V1), deepep-v2 (from-source), flashinfer, nccl-ep, AND deepep-hybrid@EP4+EP8 all run on gb300, so + the only unfillable gb300 cells are uccl (any EP).
+- **DeepEP V2 (from-source `kernel_gen=v2`): DONE on x86 + aarch64, EP4 AND rack EP8.** Genuine V2 + (`deepep_version=2.0.0+af9a040`) builds on h100/h200/b300/b200 AND on aarch64 Grace-Blackwell — gb300 + EP4 (run 28429220764) produced `kernel_gen=v2`/`2.0.0`, log "built deep_ep 2.0.0 … V2 ready". So aarch64 + V2 is NOT a wall: wherever the EP4/single-node path runs (it calls `cx_build_deepep_v2` once in + `run_in_container`), V2 builds and runs. **Rack EP8 (gb200/gb300, 2 trays) — now DONE too**, after two + fixes the earlier "deferred" note anticipated only the first of: (1) the EP8 multi-srun launcher ran + `run_ep.py` over 8 ephemeral per-rank containers, BYPASSING `cx_build_deepep_v2` (so `deepep_v2=true` + silently ran bundled V1 and the doc `kernel_gen` was honestly `v1`). Fixed with `CX_BUILD_ONLY` + + a setup-srun that builds V2 ONCE PER NODE into a persistent `--container-name` every case-srun reuses. + (2) With V2 actually installed, EP8 then crashed `cudaErrorIllegalAddress` at `csrc/legacy/buffer.hpp` + across trays — NOT a hardware wall (bundled V1 runs 180 correct cross-tray EP8 docs, `ws8/nodes2/mnnvl`). + Upstream V2's `Buffer` ADDED `allow_mnnvl` (default **False**); when off, DeepEP itself sets + `NVSHMEM_DISABLE_MNNVL=1` and the legacy buffer falls onto the intranode-only CUDA-IPC peer path, which + faults across NVL72 trays. `tests/ep_deepep.py` now passes `allow_mnnvl=True` on both Buffer ctors when + `CX_ALLOW_MNNVL=1` (gated on `inspect` finding the param, so bundled-V1 + x86 single-node are unchanged); + the gb300 launcher exports it for the deepep EP8 case. **Validated:** gb300 EP8 run 28434764062 → + `kernel_gen=v2 / ws8 / nodes2 / transport=mnnvl / allow_mnnvl=True / mode=normal / correct=8/8`, roundtrip + p50 158→227µs (T=8→1024). `sweep_matrix` re-enables v2 at gb200/gb300 EP8. (gb200 launcher inherits the + same build-once + `CX_ALLOW_MNNVL` fix; pending a gb200 allocation to re-confirm.) 
+ +## Other inference collectives (NVIDIA scope) + +- **All-reduce / all-gather (standardized NCCL):** DONE — real `family=nccl` results on H100/H200/B300, + rendered in the All-reduce/All-gather tabs. +- **CPU↔GPU offload, copy-engine/SDMA, KV-cache transfer:** DONE — single-process memcpy-family benches + (`tests/offload_bench.py`, `copy_engine_bench.py`, `kv_cache_transfer.py`). +- **Framework all-reduce — FlashInfer one-shot/two-shot DONE:** `allreduce_fw_bench.py` wires the real + `trtllm_allreduce_fusion` (pattern `kAllReduce`, `use_oneshot` True/False) over the TRT-LLM IPC + workspace — nccl baseline + flashinfer-oneshot + flashinfer-twoshot, all `correct=True` (one-shot + beats the NCCL ring in the small-message latency regime). **SGLang/vLLM/AITER custom-AR — now DONE** + by REPLICATING the framework's serving distributed-init (init_distributed_environment + + initialize_model_parallel) on the torchrun group and using the TP GroupCoordinator's + ca_comm.custom_all_reduce (the wrapper builds ca_comm only inside that init — a bare ctor skipped): + sglang H200 175 GB/s correct=True (run 28320404895); AITER MI355X 367.8 GB/s correct=True (run + 28320579741, aiter.dist.parallel_state, ca_comm under device_communicator); vLLM via the + allreduce-fw-vllm CONTAINER SWITCH to vllm/vllm-openai + entering set_current_vllm_config(VllmConfig()) + (its CustomAllreduce is a CustomOp asserting an active config), H200 correct=True (run 28320699661). + RL mesh-to-mesh + all-gather DP-attention→TP-MoE shapes: covered by the standardized sweeps. +- **KV-cache backends:** raw memcpy + CPU-pinned WIRED; **NIXL WIRED** (`tests/nixl_transfer.py`, B300 + via the dynamo-container switch — see the NIXL section above); **MoRI-IO WIRED** (`tests/ + mori_io_transfer.py`, MI355X, `mori.io` IOEngine RDMA p2p). **MoonCake WIRED on NVIDIA** (`tests/ + mooncake_transfer.py`, run_mooncake_suite pip-installs the engine; B300 35.4 GB/s via + `transfer_write_on_cuda`). 
**MoonCake on MI355X = ROCm wall (evidenced):** the engine initializes on + ROCm (`MOONCAKE_INIT … on rdma device rdma0`) but the pip wheel exposes NO `transfer_write_on_hip` + method (only the CUDA one) — `0 groups, status=invalid`, run 28342781762. A HIP transfer path would + need an upstream Mooncake ROCm build, not a container/flag fix. + +- **MI355X primitives (rccl-tests) tab:** the All-reduce/All-gather tabs render `family=nccl`; the AMD + equivalent is `rccl` (`CX_BENCH=nccl` → rccl-tests on the MI355X launcher). Repeated dispatches + (28340951946, 28342780904) failed in the runner *checkout/setup* step (exit 2/3, `EACCES` on a shared + `LOGS/agentic` dir + missing workspace) — the MI355X GHA runners are shared with the agentic + benchmark fleet, so the CollectiveX checkout collides intermittently. This is a runner-contention + infra flake, NOT an rccl-tests limitation; it lands when it gets a clean runner. + +## AMD / MI355X items — now ATTEMPTED via GHA (no longer "out of scope") +The directive's container-switch + AMD-lift asks. All run via GHA on the MI355X MoRI image: +- **FNUZ fp8 dispatch (MoRI) — VALIDATED (e4m3fnuz):** `dispatch_dtype=fp8` on the mori backend routes + MoRI's `quant_type=fp8_direct_cast` — the ROCm-native e4m3fnuz format (the self-introspecting adapter + found the valid set is `['none','fp8_direct_cast']`; the guessed `fp8_blockwise` is rejected by this + build). Required `use_external_inp_buf=True` (Fp8DirectCast asserts in zero-copy mode) + gating against + the e4m3fnuz consistency reference. MI355X run 28318788729: T=2/4/8 `correct=True`, max_rel **3e-4**, + disp_p99 ~45-70µs. The run's status=invalid is solely MoRI's forced-T=1 ramp point (a single-token + relErr-metric instability, rank-0 max_rel=3e-4 — not a comm error). Full 5-run resolution chain (each + peeling one layer via the GHA log alone — no SSH) in notes.md. 
+- **AMD SDMA copy path:** `copy_engine_bench.py` no longer refuses on ROCm — the off-SM DMA path IS the + SDMA engine; labeled `copy_engine_kind=sdma` / `accelerator=rocm` (vs NVIDIA `copy-engine`). The + non-interference probe characterizes SDMA-vs-CU interference (pynvml absent → graceful fallback). +- **MoRI-IO KV backend:** `tests/mori_io_transfer.py` (above). +- **MI355X cross-node EP (goal 183):** the custom-RDMA MoRI path aborts cross-node (SIGABRT, GPUDirect- + RDMA wall) — same class as UCCL on NVIDIA — so cross-node MI355X EP runs via **nccl-ep on RCCL** + (NCCL/RCCL `all_to_all_single`, host-staged over IB) with the shared-mount FileStore rendezvous. See + the rack-scale section above; single-node MI355X EP is covered by the MoRI sweep. + +## Operational note — do not delete ALL runs of a non-`main` workflow +`collectivex-experimental.yml` lives ONLY on the `collectivex` branch (unlike `collectivex-sweep.yml`, +which is also on `main`). GitHub keeps a workflow in the Actions registry only if it is on the default +branch OR has at least one run. Deleting EVERY run of `collectivex-experimental.yml` therefore +DE-REGISTERS it — `gh workflow run collectivex-experimental.yml --ref collectivex` then fails with +"workflow not found on the default branch," and `gh` even reports the failed dispatch as success if the +caller greps stdout for `github.com` (the 404 URL matches). Re-register by pushing any change under +`experimental/CollectiveX/**` (the `on: push` trigger creates a run). Robust fix: also add the workflow +to `main` (as the sweep already is), so run-deletion can never de-register it. diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md new file mode 100644 index 000000000..41a246991 --- /dev/null +++ b/experimental/CollectiveX/docs/methodology.md @@ -0,0 +1,384 @@ +# CollectiveX EP benchmark — methodology mapping + +> Status: experimental (goal P2, "Methodology/reference docs"). 
This document explains +> what the CollectiveX EP dispatch/combine harness reused from upstream test code, what it +> deliberately changed, and the exact contracts a result must satisfy to be published. It is +> grounded in the code as it stands: `tests/ep_harness.py`, `tests/ep_deepep.py`, +> `tests/ep_mori.py`, `tests/reference_ep.py`, `tests/run_ep.py`, `validate_results.py`, and +> `schemas/ep-result-v4.schema.json`. Where a claim cannot be verified from the repo it is +> flagged inline rather than asserted. + +The shared design constraint behind everything below is the *fair-comparison contract* stated at +the top of `ep_harness.py`: a single deterministic routing trace is generated once from a fixed +seed over the **global** batch and is identical on every SKU; each rank materializes only its +slice (`routing.rank_slice` / the `my_off:my_off+my_cnt` slice in `run_sweep`). Adapters never +roll their own RNG. So "what was reused vs changed" always means: *reused the library's API call, +changed the workload and the timing boundary so every backend runs the same problem under a named, +machine-checkable measurement contract.* + +--- + +## DeepEP tests/legacy: what was reused + +The DeepEP adapter (`tests/ep_deepep.py`) reuses DeepEP's **documented normal-mode and +low-latency Python API directly**, the same surface its own intranode/internode test code drives: + +- **The buffer + dispatch/combine call sequence.** Normal mode constructs a single + `deep_ep.Buffer(group, num_nvl_bytes, 0)`, calls `buffer.get_dispatch_layout(topk_idx, experts)`, + then `buffer.dispatch(...)` and `buffer.combine(...)`. Low-latency mode uses + `Buffer(..., low_latency_mode=True, num_qps_per_rank=…)`, `low_latency_dispatch`, and + `low_latency_combine`. These are DeepEP's own entrypoints, not reimplementations. 
+- **The correctness identity from DeepEP's intranode test.** A pure dispatch→combine round trip + with *no expert compute* reconstructs `x` scaled by the number of destination ranks each token + was sent to. The adapter's `expected()` encodes exactly this: `ref * ranks_per_token`, where + `ranks_per_token = is_token_in_rank.sum(dim=1)` (see the module docstring and `expected()`). + This is the same invariant DeepEP's `test_intranode` relies on. +- **DeepEP's own comm-only timing boundary** is preserved as one of the offered contracts: + `cached-layout-comm-only-v1` hoists `get_dispatch_layout` out of the timed region (computed once + in `make_problem`, stored on `p.layout`), so the timed `dispatch()` is pure communication — + matching the boundary DeepEP's own benchmark uses. +- **The fp8 per-token block-128 cast convention.** `deep_ep` 1.2.x ships no helper for this (its + `utils` is empty), so `_per_token_cast_to_fp8` / `_per_block_dequant` implement the exact + convention DeepEP's kernels expect (scales `[T, H//128]` float32, e4m3, `448.0` as e4m3 max). + This is faithful reuse of the kernel's data contract, not a new scheme. +- **The LL QP convention** (one QP per local expert: `num_qps = experts // world_size`) and the + fixed `num_max_dispatch_tokens_per_rank` decode shape follow DeepEP's LL usage. + +## DeepEP tests/legacy: what was changed + +- **Workload: synthetic per-rank uniform random routing → one deterministic global trace.** + DeepEP's tests generate routing per rank locally. CollectiveX generates the routing **once over + the global batch** from a fixed seed (`routing.build_global_routing`) and hands each rank its + slice via `make_problem`, so DeepEP and MoRI provably run the *same* routed problem + (`make_problem` does no RNG — see the docstring: "materializes the harness-provided rank slice"). 
+- **Workload axes DeepEP's test does not sweep.** The harness drives a tokens-per-rank ladder + (decode `1..128`, prefill `128..4096`), and adds routing-distribution control (`uniform`, + `zipf*`, `hotspot-*`, `alternating-groups`, `balanced*`), temporal snapshots (`--routing-step`), + uneven per-rank source-token allocation (`--uneven-tokens`), EPLB replication + (`tests/eplb.py`), and structured placement metadata. None of these exist in the upstream test. +- **Timing boundary made explicit and named.** DeepEP's bench implicitly measures comm-only; + CollectiveX requires the adapter to *declare* `SUPPORTED_CONTRACTS` and conform to whichever the + run requests — `layout-and-dispatch-v1` (layout timed *inside* dispatch), + `cached-layout-comm-only-v1` (DeepEP's own boundary), or `runtime-visible-v1` (fp8 cast + + recv-dequant moved *inside* the timed window). `run_ep.py` rejects an unsupported contract + rather than letting the backend silently pick one. +- **Statistics.** Instead of a single timed loop, the harness pools `iters × trials` + (default `200 × 3 = 600`) samples with per-trial token-order shuffling, reduces **cross-rank MAX + per iteration before percentiling** (`median_i(max_r)`, not `max_r(median_i)`), and reports + p50/p90/p95/p99 with p99 as the headline. It also adds a separately *measured* round trip + (dispatch→stage→combine in one timed region) distinct from the `isolated_sum` of the two medians. +- **Correctness oracle is independent.** DeepEP's test validates DeepEP against DeepEP's own + expected formula; CollectiveX additionally carries a backend-free oracle (`reference_ep.py`, + see below) so correctness is not "backend vs itself." +- **Resource normalization.** The adapter can be restricted to a device-SM *fraction* + (`set_num_sms(round(sm_fraction · device_sms))`) so DeepEP and MoRI run at a comparable comm-unit + budget — an axis the upstream test does not model. 
+ +> Note on "DeepEP `tests/legacy`": the plan references upstream DeepEP `tests/legacy` and a +> "DeepEP legacy test parity" item (goal P1, still open). The current adapter follows DeepEP's +> *documented normal/LL API*; a dedicated `tests/legacy` parity adapter is not yet implemented in +> this repo, so claims here describe the API surface reuse, not a line-for-line legacy port. + +--- + +## MoRI tests/python/ops: what was reused + +The MoRI adapter (`tests/ep_mori.py`) follows the upstream `ROCm/mori` `tests`/`examples` +dispatch+combine path: + +- **The op construction and call sequence.** It builds `mori.ops.EpDispatchCombineConfig(...)` and + `mori.ops.EpDispatchCombineOp(config)`, then calls `op.dispatch(x, weights, scales, indices, …)` + and `op.combine(...)` — MoRI's own ops, with `block_num` / `warp_per_block` launch parameters as + in its examples. +- **The shmem bring-up.** It registers the torch process group as `"default"` and calls + `mori.shmem.shmem_torch_process_group_init("default")`, mirroring MoRI's reference test setup + (`cpu:gloo,cuda:nccl` group with an explicit `device_id`, set up in `run_ep.py`). +- **The zero-copy registered-combine-input buffer path.** + `op.get_registered_combine_input_buffer(...)` is filled in `stage()` — the same zero-copy path + the upstream example uses to place "expert outputs" before combine. +- **The combine correctness identity.** MoRI's combine sums one copy per destination **rank**, so + with no expert compute `combined[i] ≈ x[i] × (#unique destination ranks among the token's topk + experts)`. `expected()` computes exactly this (`unique_pes` per token). This is the upstream + example's `expected = input × #unique-destination-ranks` reused verbatim in intent. +- **int32 expert ids / the scale-tensor shape.** MoRI expects int32 indices and a real `(T, 0)` + fp8 scale tensor (because `scale_dim == 0`); the adapter honors both. 
+ +## MoRI tests/python/ops: what was changed + +- **Workload: always-uniform → the shared global trace.** The reference test routes uniformly. + The adapter's `make_problem` now materializes the **harness-provided** rank slice, so MoRI honors + the requested routing distribution and runs the identical workload to the NVIDIA SKUs (docstring: + "it no longer always-uniform"). +- **Heap held at 2 GiB instead of the reference's hardcoded 6 GiB.** MoRI registers the *entire* + symmetric heap as one RDMA MR at init. On the MI355X ionic_rdma NICs a 6 GiB MR fails + (`RegisterRdmaMemoryRegion … EINVAL`); 2 GiB registers. The adapter sets + `MORI_SHMEM_HEAP_SIZE` (default `2G`) **before** `import mori`. The reference's 6 GiB is "exactly + why it can't run as-is here" (CONTAINERS.md). +- **Bounded `max_num_inp_token_per_rank` → a real `buffer_cap`.** Capped at 512 tokens/rank at + hidden 7168 so dispatch/combine buffers fit the 2 GiB heap. The harness clamps the ladder to this + cap and **reports dropped points** rather than silently truncating (`token_ladder` returns + `dropped`). +- **`combine_needs_redispatch = True`.** MoRI's `combine()` resets `recv_num`, so `total_recv` + must be read **before** combine, and the harness re-dispatches (untimed) before *each* timed + combine sample (`time_us(..., pre=prep)`). DeepEP reuses its handle, so it sets this `False`. +- **Gradual cold-start ramp.** MoRI wedges on a cold dispatch that jumps straight to a large T, so + `needs_gradual_ramp = True` makes the harness approach max-T via a geometric ramp from 1 and + *not* shuffle token order. It also opts out of the Blackwell warm-burst (`wants_warm_burst = + False`) because a sustained burst wedges it. +- **Hard-exit teardown.** MoRI's post-`shmem_finalize()` teardown asserts (`CheckStatusValid` → + SIGABRT). The adapter's `finalize()` flushes results and `os._exit()`s past it instead of + returning cleanly the way DeepEP does. 
+- **Contract restriction.** MoRI computes its routing layout **inside** the dispatch kernel and it + cannot be hoisted, so it declares only `layout-and-dispatch-v1`. This is *why* cross-vendor + comparisons must use `layout-and-dispatch-v1` — it is the one contract both backends can honor. +- **Resource budget floored, not normalized down.** MoRI deadlocks at T≥32 when `block_num` is + reduced to the normalized target (validated: 46 wedges, 80 completes), so the adapter floors + `block_num` at a functional minimum and **records that the target fraction was not reached** + (`block_num_floored = True`, `tuned_source = "normalized-floored"`). The harness reads this and + marks the result resource-nonconforming → demoted to `diagnostic` (see publication contract). + +> Note on the exact upstream path name: CONTAINERS.md and the plan refer to `ROCm/mori` +> `tests`/`examples` and `tests/python/ops`. The adapter reproduces that dispatch+combine path's +> API and expected-value formula; the precise upstream file/commit is captured at runtime via +> `MORI_COMMIT` (else the image tag) into provenance rather than pinned in this doc. + +--- + +## FlashInfer PR 3000 benchmark inspiration + +The project plan lists, under "Reference benchmark scripts to draw from": *"flashinfer PR #3000; +ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`."* (`plan.md`). FlashInfer PR #3000 is named +there as **methodological inspiration for the EP dispatch/combine benchmark shape** — i.e. one of +the reference benchmark scripts whose structure informed how CollectiveX measures a single MoE +dispatch+combine pair — alongside the MoRI and DeepEP test code described above. + +**What is verifiable from this repo:** PR #3000 is cited only as a reference script in `plan.md`. 
+There is no FlashInfer adapter, import, or copied benchmark code in the tree today (a "FlashInfer +EP paths" item remains open in goal.md P1, and FlashInfer is otherwise referenced only for combine +precision via PRs #3643 / #3376). + +**What this doc does not assert:** I have **not** independently verified the contents of FlashInfer +PR #3000 (its exact title, the kernel it benchmarks, or which specific measurement choices were +borrowed) against the FlashInfer repository — that verification is outside what the CollectiveX +codebase contains, and the PR number is recorded here as-cited. Treat the specific influence as +"named as inspiration in the plan," not as a line-level provenance claim. If precise attribution is +needed, confirm against `flashinfer-ai/flashinfer` PR #3000 directly before publishing. + +What CollectiveX's EP methodology demonstrably shares with a good EP micro-benchmark (whatever its +origin): dispatch and combine are timed **separately**, each point is **one MoE layer / one step / +one dispatch+combine collective pair** (not a whole model), the token-count is the swept x-axis, +and percentiles come from many pooled iterations rather than a single timed loop. + +--- + +## Why CollectiveX timing boundaries differ + +DeepEP's and MoRI's own benchmarks each measure *their* natural boundary, which makes their numbers +non-comparable: DeepEP can hoist layout computation out of the timed region; MoRI computes layout +*inside* its kernel and cannot. If each backend simply reported "dispatch latency" under its own +convention, a DeepEP comm-only number would be compared against a MoRI layout-and-dispatch number +as if they measured the same thing. CollectiveX therefore makes the boundary an **explicit, named, +machine-checked contract** (review #3 in `ep_harness.py`): adapters declare `SUPPORTED_CONTRACTS` +and `run_ep.py` rejects an unsupported request. There are three contracts. 
+ +### `layout-and-dispatch-v1` — the cross-vendor common boundary +Dispatch timing **includes** routing-layout generation. For DeepEP, `get_dispatch_layout` runs +*inside* the timed `dispatch()` (`p.layout is None`). For MoRI, layout is computed inside the +kernel and **cannot** be hoisted — so this is *the only contract MoRI can honor*, and hence the one +both vendors share. The fp8 cast/dequant stays **outside** the timed window (cast in +`make_problem`, dequant in `stage`), modelling a producer that hands the dispatcher already-quantized +activations. **Use this for any DeepEP-vs-MoRI comparison.** + +### `cached-layout-comm-only-v1` — DeepEP's own boundary (DeepEP only, normal mode) +Layout is computed **once, untimed** (in `make_problem`, stored on `p.layout`) so the timed +`dispatch()` is **pure communication**. This reproduces DeepEP's own benchmark boundary and is +useful for "how fast is the comm kernel alone," but it is **not** comparable to MoRI (which can't +hoist layout) and is rejected for LL mode (low-latency dispatch computes layout internally — +nothing to hoist; `run_ep.py` rejects this combo). + +### `runtime-visible-v1` — the serving-realistic boundary (DeepEP only today) +Dispatch starts from **what the runtime has right after routing** and **includes everything needed +to make expert input consumable**: the per-token block-128 **fp8 cast moves inside** the timed +window, plus layout, comm, and the recv-side **dequant to bf16** (`_per_block_dequant` inside +`dispatch()`, after which `stage()` no-ops). Combine starts from bf16 expert outputs and ends when +token outputs are consumable. This answers "what does the serving path actually pay," and the +adapter records the boundary honestly via `fp8_in_timing` (true only under this contract for fp8). +LL is runtime-visible *by construction* (its single kernel already performs cast+layout+comm inside the timed call), so the +flag only changes normal mode.
+ +### Boundaries shared across all three +- **Combine excludes staging in every contract.** Placement of expert outputs (`stage()`) is + untimed for every backend — it stands in for the expert FFN write, which is not part of the + collective being measured. +- **`isolated_sum` is a diagnostic, not a measurement.** It is the arithmetic SUM of the isolated + dispatch and combine percentiles. It **cannot** reveal shared sync, launch amortization, or + dispatch/combine overlap, so it must not be used for throughput or SLO capacity. The **measured + round trip** (`roundtrip`, one timed region over dispatch→stage→combine) is the real chained + latency, and it is the only basis for `roundtrip_tokens_per_second`. +- **Cross-rank reduction order.** A collective finishes with its slowest rank, so each iteration's + latency is reduced **MAX across ranks first**, then percentiled. + +The contract name is part of the `comparison_key` and the schema enum, so two rows under different +contracts are labelled distinct and never silently overlaid. + +--- + +## Correctness contract definition + +"Correct" in CollectiveX has two layers: the **independent oracle** that defines the semantics, and +the **runtime gate** that every sweep point must pass. + +### The independent oracle (`tests/reference_ep.py`) +A from-scratch numpy model of MoE dispatch + combine, written **without** DeepEP or MoRI, used only +for untimed validation — so the benchmark is never "validated against itself." Its model: + +- **Layout:** expert `e` lives on rank `e // experts_per_rank`. +- **Dispatch:** token `t` selected for expert `e` contributes one copy of `x[t]` to + `(rank e//epr, expert e)`. `dispatch_plan()` enumerates every routed copy exactly once and + `validate_dispatch()` asserts each `(token, selected-expert)` maps to the **correct rank and + expert, exactly once** (duplicate `(token,expert)` pairs and out-of-range ranks are errors). 
+- **Expert transform:** a deterministic per-expert factor `f_e = 1 + e/E`, **distinct per expert**, + so a copy routed to the *wrong* expert produces a wrong value (identity would hide mis-routing — + the self-test corrupts one expert id and asserts the oracle output changes). +- **Combine:** `y[t] = Σ_k weights[t,k] · f_{e_k} · x[t]` (where `e_k` is the token's k-th selected expert), reduced over the token's selected experts, + output in **source-token order**. `validate_combine()` recomputes this in two independent ways + (vectorizable reduction vs explicit per-copy accumulation) and asserts they agree — exercising + the reduction, the **gate-weighting**, the **source ordering**, and the + **multiple-experts-on-one-rank** case. +- **Edge cases** (goal P3): empty rank, repeated destination rank, single-rank hotspot (all topk on + rank 0) are covered in the self-test; non-divisible global token counts are handled by callers. + +So the oracle's definition of correct is **exact destination rank/expert/token mapping (each routed +copy once), plus the combine reduction with correct gate weights in correct source order.** + +### The runtime gate (in `ep_harness.run_sweep`) +Per ladder point, each backend's `combine` output is compared to its `expected()` reference +(DeepEP: `x · #destination-ranks`; MoRI: `x · #unique-destination-ranks`). The gate computes +`max_rel = max_abs_error / max|expected|` and passes the point when `max_rel < tolerance` +(bf16 `5e-2`; fp8 `1.25e-1`, looser because e4m3's 3 mantissa bits limit round-trip precision — the +tolerance is **recorded in the artifact** so the looser fp8 gate is explicit). A point is `correct` +only if the local gate passes on **every** rank (MIN-reduced `local_ok`) **and** non-zero tokens +were actually received (`recv_total > 0`) — so a silent no-op cannot pass.
+ +The artifact is honest about scope: `correctness.scope = "roundtrip-reconstruction-smoke-v1"` — it +is a round-trip reconstruction plus non-silent-recv check at runtime, **not** a full per-token +routing/ordering/padding proof at runtime (that exhaustive proof is what `reference_ep.py` provides +off the hot path). + +### Workload identity (part of "did everyone run the same correct thing") +Beyond per-point correctness, the sweep proves all ranks built the **same** global routing: each +rank hashes its per-T routing hashes into a `trace_signature` and the harness MIN/MAX-reduces it; +`workload_identity = "consistent-across-ranks"` only if all ranks agree. A mismatch means NVIDIA and +AMD did **not** run identical routing, which (see below) makes the result `invalid`. + +--- + +## Publication contract definition + +`publication_status` is **machine-derived** from a multi-dimensional `validity` record — no caller +may hand-label a result `official`. The derivation lives in `ep_harness._derive_publication_status` +and is **mirrored** in `validate_results.py:derive_publication_status`; the validator's core job is +to confirm the recorded status equals this re-derivation (a mismatch = "validity tampered or +stale", a hard error). The five tiers and their gates: + +### `failed` +`execution_status != "complete"` — the sweep produced no rows. Nothing else is evaluated. + +### `invalid` +Execution completed but a **fundamental soundness gate failed**: `semantic_correctness != "pass"` +(a point failed the correctness gate), **or** `measurement_conformance != "conformant"`, **or** +`workload_identity == "inconsistent"` (ranks did not run the same routing). An invalid result is +not a usable measurement of anything. + +### `diagnostic` +Measurement is **sound** (correct + consistent workload + conformant contract) but it is **not a +fair cross-platform point**, for one of: +- **Resource-nonconforming** — `resource_conformance` ends in `"nonconforming"` (e.g. 
MoRI's + floored `block_num`: it needed *more* comm units than the normalized target, so it isn't an + apples-to-apples resource point). Fixed-kernel paths (DeepEP LL: `low_latency_mode`) are + classified `not-applicable`, **not** a conformance failure, and are simply excluded from the + resource-Pareto comparison. +- **A flagged timing anomaly** — `anomaly_free == false`. The harness flags + `roundtrip_gt_isolated_sum` (measured RT p99 > `threshold ×` isolated-sum p99, default 3×; the + open LL-FP8 case) and `roundtrip_lt_component_floor` (RT p50 < 0.95 × max(dispatch, combine) p50, + which violates chained-op sync semantics). Either demotes to `diagnostic` **unless explicitly + waived** via `--waive-anomaly` (which sets `anomaly_free = true`) *after* the cause is understood + and documented. +- It is also the fallback for an otherwise-sound result that does not meet the higher bars. + +### `comparable-experimental` +Measurement is sound (`semantic_correctness == pass`, `workload_identity` starts with +`"consistent"`, `measurement_conformance == conformant`), resource-conforming, and anomaly-free — +but it is **missing a publication requirement** (e.g. incomplete provenance, or a seeded-runtime +workload rather than a canonical serialized one). This is the normal tier for a clean development or +cross-vendor run that hasn't cleared the full official bar. It is comparable, just not "official." + +### `official` +Everything `comparable-experimental` requires **plus both**: +- `provenance_complete == true` — no `"unknown"` backend provenance, **and** a non-empty image + digest, **and** a GitHub run record with `run_id` + `source_sha` (assembled in `run_ep.py` from + `GITHUB_*` / `COLLECTIVEX_*` env). A bare local run can never be official. 
+- `workload_source == "canonical-serialized"` — the run consumed pre-generated, checksum-verified + trace bytes (`--workload-dir`, `tests/workload.py`), so it is **provably** the same workload as + any other run consuming the same files (not just a same-seed regeneration). + +`validate_results.py` enforces additional **official-grade** gates on top of the derivation: a +non-null `workload_id` and `trace_signature`, no unwaived anomalies, every point `correct`, and a +minimum of `100` pooled samples per point (`MIN_SAMPLES_OFFICIAL`). It exits non-zero if any doc +claims `official` but fails a gate, and (with `--require-official`) if any non-legacy doc is not +official. + +### Cross-run identity (validator-only) +Within a `comparison_key` (further grouped by `routing_step` and `uneven_tokens`, which change the +realized workload but live in `reproduction`, not the key), the validator checks **per-T +`routing_hash` agreement**: two runs at the same config and same T but **different routing bytes** +are flagged as "not the same workload." It deliberately keys on per-T hashes (not the whole +`trace_signature`) so a capped cross-vendor sweep (e.g. `1..16`) and a full headline sweep +(`1..128`) of the same config are **not** falsely flagged — only a genuine same-T conflict is. + +### Other record types the validator preserves +- **Legacy (v3, no `publication_status`)** docs load as `legacy-experimental` and are reported, not + failed. +- **Preserved failed-case** records (`record_type == "failed-case"`, emitted by the runner on a + wedge/timeout/crash) are reported as preserved cases, **not** validation errors — the project + rule is "do not silently discard failed or incorrect results." 
+ +## Collective suites: all-reduce / all-gather / framework AR — serving-use mapping + +The non-EP collective families map to specific inference-serving communication patterns: + +### All-reduce (`family=nccl` op=all_reduce + `family=allreduce-fw`) +TP all-reduce of activations — the per-layer reduction across a tensor-parallel group after the +attention/MLP matmuls. Two tiers are measured in the SAME All-reduce tab so they are directly comparable: +- **NCCL ring** (`run_nccl.py`, nccl-tests): the bandwidth-optimal baseline; wins at large messages. +- **Framework custom AR** (`allreduce_fw_bench.py`): FlashInfer one-shot + two-shot via + `trtllm_allreduce_fusion` (pattern `kAllReduce`). One-shot is a single NVLink round that beats the + ring in the small-message latency-bound regime (the few-KiB..few-MiB activations a decode step + all-reduces); two-shot trades a second round for higher bandwidth as the message grows (and needs + `token_num > tp_size`). The crossover is exactly the decision this tab visualizes. + +### All-gather (`family=nccl` op=all_gather) — DP-attention → TP-MoE handoff +In SGLang/DeepSeek-style serving, **data-parallel attention** runs each DP rank over its own token +shard, then the hidden states are **all-gathered** before the **tensor-parallel MoE** so every TP +rank sees the full token set for expert routing. The collected payload is `[total_tokens, hidden]` +bf16. The standardized all-gather sweep is a geometric byte ladder that **spans the payload-size +range of this handoff** (a few KiB per-rank shard up to the tens-of-MiB full-batch gather), so the +latency/bandwidth curves in the All-gather tab cover the DP-attention→TP-MoE handoff sizes directly. + +**Named per-model handoff shapes.** The gathered payload is `total_tokens × hidden × 2` bytes (bf16).
+The table names the exact points for each model's EP shape (`hidden` from the `-v1` workload manifests), +at a representative decode batch (256 tokens) and prefill chunk (4096 tokens), and the nearest covering +point on the geometric all-gather byte ladder — so the named shapes are explicit, not just read off the +sweep: + +| Model | hidden | decode (256 tok) | prefill (4096 tok) | covered by all-gather sweep | +|------------------|-------:|-----------------:|-------------------:|-----------------------------| +| DeepSeek-V3/V4 | 7168 | 3.67 MB | 58.7 MB | yes (1 MiB–64 MiB band) | +| Kimi-K2 | 7168 | 3.67 MB | 58.7 MB | yes (1 MiB–64 MiB band) | +| MiniMax-M3 | 6144 | 3.15 MB | 50.3 MB | yes (1 MiB–64 MiB band) | +| Qwen3.5 | 4096 | 2.10 MB | 33.6 MB | yes (1 MiB–64 MiB band) | + +All four models' decode and prefill handoffs land inside the standardized sweep's 1–64 MiB span, so the +All-gather tab's measured latency/bandwidth at those byte points IS the per-model DP-attention→TP-MoE +handoff cost (read the curve at the model's column value). The shapes are model-derived (hidden) × +serving-regime (token count); the byte ladder is dtype-agnostic so an fp8 handoff halves each figure. diff --git a/experimental/CollectiveX/docs/references.md b/experimental/CollectiveX/docs/references.md new file mode 100644 index 000000000..91f3a0918 --- /dev/null +++ b/experimental/CollectiveX/docs/references.md @@ -0,0 +1,154 @@ +# CollectiveX — learning / resource notes + +> Status: experimental (goal P2, "Add learning/resource notes"). These four arXiv papers are the +> learning resources listed in `plan.md`. Each summary below was fetched from `arxiv.org/abs/` +> (titles/authors/dates taken from the live abstract page) and is then **mapped to the specific +> CollectiveX benchmark dimensions it informs** — the metric, contract, capability axis, or +> comparison the paper bears on. + +**Retrieval status (fetched 2026-06):** + +| arXiv ID | Title | Retrieved? 
| Note | +|---|---|---|---| +| 2511.15076 | GPU-Initiated Networking for NCCL | yes | clean fetch | +| 2603.13606 | NCCL EP: Towards a Unified Expert Parallel Communication API for NCCL | yes | **ID looked future-dated (year "26"); verify.** The page resolved to real content (submitted 13 Mar 2026 per the page), not a not-found error — recorded as retrieved, flagged for a sanity check of the ID/date before citing. | +| 2512.19849 | UCCL-EP: Portable Expert-Parallel Communication | yes | clean fetch | +| 2412.19437 | DeepSeek-V3 Technical Report | yes | clean fetch | + +All four resolved to genuine abstract pages. 2603.13606 is the only one flagged: its identifier +(and the page's stated 13 March 2026 submission date) is forward-dated relative to when it was +assigned in the plan, so although the fetch returned coherent NCCL-EP content, the ID should be +double-checked against arXiv directly before it is used as a hard citation. Nothing below is +fabricated; the one uncertainty is called out here. + +--- + +## Summarize arXiv 2511.15076 + +**GPU-Initiated Networking for NCCL** — Hamidouche, Bachan, Markthub, Gootzen, Agostini, Jeaugey, +Shafi, Theodorakis, Gorentla Venkata (NVIDIA). Submitted 19 Nov 2025 (v2 24 Nov 2025). + +Describes NCCL 2.28's new **Device API**, focused on the **GPU-Initiated Networking (GIN)** +component for network RDMA. The motivation is fine-grained, low-latency GPU-to-GPU communication +for tightly coupled compute-communication workloads — explicitly Mixture-of-Experts — where the +traditional host-initiated model's CPU coordination is overhead. GIN is a three-layer architecture: +host-side setup APIs, device-side remote-memory operations callable from inside CUDA kernels, and a +network plugin with dual semantics (GPUDirect Async Kernel-Initiated and a Proxy backend). 
The paper +demonstrates GIN by integrating it with **DeepEP** and reports benchmark results, positioning GIN as +combining low-latency device-initiated ops with NCCL's collective algorithms and production +infrastructure. + +## Summarize arXiv 2603.13606 + +> **Flagged ID — see retrieval table.** The arXiv identifier is forward-dated; the fetch returned +> the content below (an NCCL-EP paper), but verify the ID/date before citing as authoritative. + +**NCCL EP: Towards a Unified Expert Parallel Communication API for NCCL** — Goldman, Boker, +Sheraizin, Admoni, Polyakov, Bhattacharya, Yu, Sun, Theodorakis, Yin, Gootzen, Shafi, Ravid, +Di Girolamo, Dinan, Li, Gorentla Venkata, Bloch (NVIDIA). Page states submitted 13 Mar 2026 +(v3 2 Apr 2026); 13 pages, 8 figures, 7 tables; cs.DC. + +Introduces **NCCL EP**, an MoE communication library built on NCCL's Device API (the GIN work +above), offering unified `ncclEpDispatch` / `ncclEpCombine` primitives with **C and Python** +interfaces. It has two modes: a **Low-Latency (LL)** mode for inference decode targeting small +batches (the page quotes "1–128 tokens") over all-to-all RDMA+NVLink, and a **High-Throughput (HT)** +mode for training and inference prefill targeting large batches ("4096+ tokens") using hierarchical +communication that aggregates within NVLink domains before inter-node RDMA. It situates itself +alongside DeepEP and Hybrid-EP, evaluates on an H100 cluster across multi-node configs (LL kernel +results + end-to-end with vLLM), and aims to be a supported EP path on current and emerging NVIDIA +platforms. + +## Summarize arXiv 2512.19849 + +**UCCL-EP: Portable Expert-Parallel Communication** — Mao, Zhang, Cui, Huang, You, Chen, Xu, Gu, +Shenker, Raiciu, Zhou, Stoica. Submitted 22 Dec 2025 (v2 22 Jan 2026). + +Targets the **portability** problem in EP: systems like DeepEP perform well but require tight +GPU↔NIC coupling for GPU-initiated RDMA, so they don't run everywhere. 
**UCCL-EP** instead routes +compact token commands through a **GPU–CPU control channel** where multithreaded CPU proxies issue +the RDMA operations, and it **emulates ordering semantics using RDMA immediate data** for NICs that +lack native support (e.g. AWS EFA). Implemented on **both NVIDIA and AMD** GPUs with EFA and +Broadcom NICs, it reports up to **2.1× dispatch/combine throughput on EFA**, up to **40% higher +SGLang token throughput**, and up to **45% higher DeepSeek-V3 training throughput on a 16-node +AMD+Broadcom platform**. + +## Summarize arXiv 2412.19437 + +**DeepSeek-V3 Technical Report** — DeepSeek-AI et al. (~200 authors). Submitted 27 Dec 2024 +(v2 18 Feb 2025). + +Describes **DeepSeek-V3**, a **Mixture-of-Experts** LLM with **671B total / 37B activated per +token**, using **Multi-head Latent Attention (MLA)** and **DeepSeekMoE**, an **auxiliary-loss-free +load-balancing** strategy, and a **multi-token-prediction** objective. Pre-trained on 14.8T tokens +then SFT + RL; reported comparable to leading closed-source models at **2.788M H800 GPU-hours**, with +stable training (no irrecoverable loss spikes / rollbacks) and public checkpoints. For CollectiveX +the load-bearing details are the **MoE shape and the load-balancing approach**, not the end-to-end +quality numbers. + +--- + +## Map each paper to CollectiveX benchmark dimensions + +Each paper informs specific, concrete axes of the harness (`tests/ep_harness.py`, +`tests/ep_deepep.py`, `configs/backends.yaml`, `schemas/ep-result-v4.schema.json`). The mapping: + +### 2511.15076 (GIN / NCCL Device API) → the DeepEP **kernel-generation axis** and the **runtime-visible** boundary +- **`shape.kernel_gen` (v1 NVSHMEM vs v2 NCCL-GIN).** The harness already records DeepEP's kernel + generation as part of line identity (`kernel_gen` derived from `deepep_version`, folded into + `comparison_key`) precisely because DeepEP V2 moved its transport from NVSHMEM to the NCCL Device + API. 
This paper *is* the NCCL device-side RDMA (GIN) that the V2 path builds on — it is the + primary-source explanation for why a "DeepEP V2" run must never be conflated with a "DeepEP V1" run + (goal P1, "DeepEP version matrix"). Informs the `kernel_gen` field and the + version-as-first-class-axis requirement. +- **`runtime-visible-v1` measurement contract.** GIN's thesis is removing CPU coordination so comm + is launched/issued from inside the kernel. That is exactly the cost-surface `runtime-visible-v1` + tries to capture (cast + layout + comm + recv-dequant inside the timed window). The paper + motivates why a serving-realistic boundary, not just comm-only, is worth measuring. +- **`transport` axis** (`nvlink`/`mnnvl`/`rdma` in `backends.yaml`) — GIN is the RDMA device-path + whose latency the EP transports record. + +### 2603.13606 (NCCL EP) → the planned **NVIDIA NCCL EP adapter**, the **dispatch/combine API contract**, and **phase = decode/prefill** +- **The open "NVIDIA NCCL EP" backend** (goal P1: *"Add adapter for `NVIDIA/nccl/contrib/nccl_ep`"*) + — this paper is the design of that very library (`ncclEpDispatch` / `ncclEpCombine`). It is the + reference for adding an `nccl-ep` entry to `configs/backends.yaml` and a third adapter beside + DeepEP and MoRI, to be compared against DeepEP normal/LL under `layout-and-dispatch-v1`. +- **`mode` axis (normal vs ll) and `phase` (decode vs prefill).** NCCL EP's split into **LL + (1–128 tokens, decode)** and **HT (4096+ tokens, prefill/training)** lines up directly with the + harness's `DECODE_LADDER = [1..128]` / `PREFILL_LADDER = [128..4096]` and the `mode = ll|normal` + axis. It corroborates the decode/prefill token-regime modelling and the LL decode cap. +- **`comparison_key` design.** NCCL EP, DeepEP, and Hybrid-EP being distinct libraries with the same + `dispatch`/`combine` surface is exactly the situation the `backend` field + provenance + (`backend name, fork, commit, API generation`) exist to disambiguate. 
+ +### 2512.19849 (UCCL-EP) → **cross-vendor portability**, the planned **UCCL adapter**, and the **transport / resource axes** +- **The open "UCCL EP" backend** (goal P1: *"Add UCCL backend adapter … Add cross-platform result + class"*) — this paper is that backend. It is the reference for a UCCL `backends.yaml` entry and a + capability declaration spanning **both NVIDIA and AMD** (the only paper here that is natively + cross-vendor, like CollectiveX itself). +- **The whole cross-vendor comparison thesis.** UCCL-EP exists because DeepEP's GPU↔NIC coupling + isn't portable. CollectiveX's reason for being is comparing such EP libraries fairly *across + vendors* — and its mechanism (one deterministic shared routing trace, `layout-and-dispatch-v1` as + the common contract, topology-class in the `comparison_key` so NVIDIA and AMD are never silently + overlaid) is the apparatus needed to evaluate exactly this paper's portability-vs-performance + trade-off. +- **`transport` axis + the CPU-proxy resource story.** UCCL-EP's CPU-proxy / RDMA-immediate-data + design adds transports (EFA, Broadcom) beyond `nvlink/xgmi`, and its CPU-side issue model is a + data point for the `resource_profile` vocabulary (comm units / where the work runs), which today + models SM/CU fractions. + +### 2412.19437 (DeepSeek-V3) → the **default benchmark shape**, **EPLB / routing-skew axis**, and **fp8 dispatch** +- **The headline shape itself.** The harness defaults — `hidden = 7168`, `topk = 8`, + `experts = 256` (`add_common_args`), and the goal's "Default to DeepSeek V3 shape / EP8 / uniform + / BF16" — *are* DeepSeek-V3's MoE configuration. This paper is the source of the canonical shape + every official curve is reported at, and of the `deepseek-v3-v1` / `deepseek-v4-v1` workload + manifests (goal P1). 
+- **EPLB and the routing-distribution axis.** DeepSeek-V3's **auxiliary-loss-free load balancing** + is the real-world counterpart to (a) the `--routing` skew distributions (`zipf*`, `hotspot-*`) the + harness stresses and (b) the **EPLB** expert-replication transform (`tests/eplb.py`, + `--eplb`/`--num-redundant-experts`) offered as the remedy for skew. The paper motivates *why* + load imbalance and its mitigation are first-class benchmark dimensions (`expert_load_cv`, + `rank_load_cv`, `hotspot_ratio`, the EPLB `imbalance_before/after` + `mapping_hash`). +- **fp8 throughout.** DeepSeek-V3's fp8 training/inference underpins the `dispatch_dtype = fp8` + axis and the per-token block-128 fp8 scale convention in `ep_deepep.py`. +- **Per-token activation rate.** "37B activated per token" is the MoE sparsity that makes + tokens-per-rank (not model size) the meaningful x-axis for a dispatch/combine micro-benchmark. diff --git a/experimental/CollectiveX/docs/upstream_precision.md b/experimental/CollectiveX/docs/upstream_precision.md new file mode 100644 index 000000000..62f96d66f --- /dev/null +++ b/experimental/CollectiveX/docs/upstream_precision.md @@ -0,0 +1,54 @@ +# Upstream precision work — review + mapping to CollectiveX (goal P1 "Integrate precision-related upstream work") + +Reviews the three precision PRs named in goal.md and maps each onto CollectiveX's precision axes +(`shape.dispatch_dtype`, `shape.quant.combine_input_dtype/combine_quant_mode`, the +`combine_quant_in_timing` reproduction flag, and the `capability.py` / `backends.yaml` `combine_dtypes` ++ `quant_modes` sets). All three are MERGED upstream. CollectiveX already carries the *scaffold* for +them (the combine-path axes default to bf16/none and are validated by `capability.resolve`), so each PR +maps to a concrete, reserved mode id that slots in when the kernel is wired + hardware-available. 
+ +## MoRI PR 311 — `feat(EP): FP8 blockwise quantization for IntraNode combine` (ROCm/mori, MERGED) +- **What:** adds `QuantType::Fp8BlockwiseQuant` (Python `fp8_blockwise`) — a quant-aware FP8 combine for + the IntraNode EP path, replacing MoRI's old direct-cast (which truncated activations above the e4m3 + range and degraded SGLang DeepSeek-R1 accuracy at high concurrency). Per-token per-block max-abs scale + on the quant side; per-block FMA dequant on recv. Block size = `hidden_dim / scale_dim`. +- **Maps to:** the `combine_quant_mode` axis. CollectiveX's `ep_mori.py` / `capability.py` / + `backends.yaml` already reserve this ("`+ fp8 when the MoRI quant_type combine path (PR311) lands`"). + The reserved mode id is now concrete: **`fp8_blockwise`** with `combine_input_dtype=fp8`, + per-block scale layout — exactly the CollectiveX `combine_quant_mode` + `scale_layout` fields. +- **Scope:** AMD/MI355X (MoRI is the AMD backend). Out of scope for *NVIDIA chips*, but it is the + reference design for the quant-combine contract that the NVIDIA backends will mirror. + +## FlashInfer PR 3376 — `feat: add mxfp8 quant to moe a2a combine` (flashinfer-ai/flashinfer, MERGED) +- **What:** `moe_a2a_combine` can directly output **MXFP8** — adds `output_dtype`, `output_scales`, + `sf_layout`; bumps `kMaxPayloads` for per-token quantization dispatch. +- **Maps to:** `combine_quant_mode=mxfp8`, `combine_output_dtype=mxfp8`, `scale_layout=sf_layout`, and + `combine_quant_in_timing=true` (the quant is inside the combine kernel). This is the NVIDIA + quantized-combine path. + +## FlashInfer PR 3643 — `feat: add mxfp4/nvfp4 quant to moe a2a combine` (flashinfer-ai/flashinfer, MERGED) +- **What:** follow-up to 3376; adds **MXFP4 / NVFP4** quant to `moe_a2a_combine`, plus + `output_scalar_scale: float = 1.0`. +- **Maps to:** `combine_quant_mode ∈ {mxfp4, nvfp4}`, `combine_output_dtype ∈ {mxfp4, nvfp4}`. 
These are + the goal's "NVFP4 combine" / "MXFP8 combine" precision-matrix rows, and (via the dispatch side of the + same kernel family) the "NVFP4/MXFP4/MXFP8 dispatch" rows. + +## Why these are not yet RUN on NVIDIA (see docs/gated.md) +The FlashInfer combine quant (3376/3643) lives in `flashinfer.comm.moe_a2a_*` — the same MoE all-to-all +that needs a **symmetric multi-process MNNVL workspace**. On x86_64 (H100/H200/B200) that needs +`CAP_SYS_PTRACE`/pidfd (not granted in the enroot/pyxis container); on aarch64 (GB200/GB300) it uses +CUDA FABRIC handles (would work; GB300 capacity-limited). So MXFP8/MXFP4/NVFP4 *combine* (and the fp4 +*dispatch* in the same family) are reachable on NVIDIA only once that container-capability/hardware +blocker is resolved — they are not silently faked. DeepEP's own dispatch remains e4m3-fp8-only. + +## What CollectiveX did with this review +- **Capability table:** the reserved mode ids are now named in `capability.py` / `backends.yaml` + comments (`fp8_blockwise` for mori; `mxfp8`/`mxfp4`/`nvfp4` for the flashinfer combine path) so a + future wiring is a one-line capability widening, not a redesign. They remain **rejected** by + `capability.resolve` today (not runnable → not claimed). +- **Schema/labels:** `shape.quant.{combine_input_dtype,combine_quant_mode,combine_output_dtype, + scale_layout}` + `reproduction.combine_quant_in_timing` already exist (v4 schema), so a quantized- + combine result is a distinct, correctly-labelled comparison point the moment one is produced. +- **Correctness tests:** deferred with the kernels — when a quant-combine path is wired, the + `reference_ep.py` oracle gains a tolerance class per `combine_quant_mode` (looser e4m3/fp4 bound), + mirroring the existing fp8-dispatch tolerance (1.25e-1 vs bf16 5e-3). 
SCHEMA_VERSION = 1

# Env vars worth recording — transport/tuning knobs that change what a
# collective actually does (esp. the GB200 MNNVL flags vs B200).
ENV_PREFIXES = ("NCCL_", "NVSHMEM_", "MC_", "UCX_", "SGLANG_DEEPEP", "DEEPEP_")
ENV_EXACT = (
    "CUDA_VISIBLE_DEVICES",
    "CUDA_DEVICE_ORDER",
    "SLURM_JOB_ID",
    "SLURM_NNODES",
    "SLURM_NTASKS",
    "SLURM_JOB_PARTITION",
    # Image identity — set by the launcher so the bundle records what ran.
    "COLLECTIVEX_IMAGE",
    "COLLECTIVEX_IMAGE_DIGEST",
)


def _run(cmd: list[str], timeout: int = 20) -> str | None:
    """Run *cmd* and return its stripped stdout, or None when unavailable.

    None is returned when the command list is empty, the executable is not on
    PATH, the process cannot be started, it exceeds *timeout* seconds, or it
    exits non-zero — callers treat all of these as "tool not available".
    """
    # Guard the empty list too: cmd[0] would otherwise raise IndexError.
    if not cmd or shutil.which(cmd[0]) is None:
        return None
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout, check=False
        )
    except (subprocess.TimeoutExpired, OSError):
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout.strip()


def _redact(value: str | None) -> str | None:
    """Return a stable short hash of *value* so artifacts can be shared
    without leaking hostnames / IPs / GPU UUIDs / IB GUIDs while staying
    joinable. Falsy input (None or "") is returned unchanged."""
    if not value:
        return value
    return "redacted-" + hashlib.sha256(value.encode()).hexdigest()[:12]


def _gpus(redact: bool) -> dict:
    """GPU inventory via nvidia-smi (None fields off-GPU).

    Returns ``{"source", "count", "devices"}``; when *redact* is true the
    per-device ``uuid`` and ``pci_bus_id`` are replaced by their hashes.
    """
    info: dict = {"source": None, "count": None, "devices": []}
    q = _run(
        [
            "nvidia-smi",
            "--query-gpu=name,uuid,memory.total,compute_cap,pci.bus_id",
            "--format=csv,noheader,nounits",
        ]
    )
    if q is None:
        return info
    info["source"] = "nvidia-smi"
    devices = []
    for line in q.splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 5:
            # Not a full 5-column CSV row; skip rather than guess.
            continue
        name, uuid, mem_mib, cc, bus = parts[:5]
        devices.append(
            {
                "name": name,
                "uuid": _redact(uuid) if redact else uuid,
                # Keep the raw text when the field is not a plain integer.
                "memory_total_mib": int(mem_mib) if mem_mib.isdigit() else mem_mib,
                "compute_capability": cc,
                "pci_bus_id": _redact(bus) if redact else bus,
            }
        )
    info["count"] = len(devices)
    info["devices"] = devices
    return info


def _driver_cuda() -> dict:
    """Return ``{"driver_version", "cuda_version"}`` parsed from nvidia-smi.

    Either value is None when nvidia-smi is unavailable or the field cannot
    be found in its output.
    """
    out = _run(
        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
    )
    # One line per GPU; the first line is used.
    driver = out.splitlines()[0].strip() if out else None
    # `nvidia-smi` (no args) prints the CUDA driver-API version in its header.
    cuda = None
    header = _run(["nvidia-smi"])
    if header:
        m = re.search(r"CUDA Version:\s*([0-9.]+)", header)
        if m:
            cuda = m.group(1)
    return {"driver_version": driver, "cuda_version": cuda}


def _torch_info() -> dict:
    """NCCL / torch build info — only if torch is importable in this env.

    Returns ``{"available": False}`` when the import fails. Errors raised
    while querying CUDA/NCCL are recorded under ``"error"`` instead of
    propagating.
    """
    info: dict = {"available": False}
    try:
        import torch  # type: ignore
    except Exception:
        return info
    info["available"] = True
    info["torch_version"] = torch.__version__
    try:
        info["cuda_runtime"] = torch.version.cuda
    except Exception:
        info["cuda_runtime"] = None
    try:
        if torch.cuda.is_available():
            nccl = torch.cuda.nccl.version()
            # version() returns an int (e.g. 22304) or a tuple, depending on build.
            info["nccl_version"] = (
                ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl
            )
            info["device_count"] = torch.cuda.device_count()
            info["device_name"] = torch.cuda.get_device_name(0)
            cc = torch.cuda.get_device_capability(0)
            info["compute_capability"] = f"{cc[0]}.{cc[1]}"
    except Exception as exc:  # pragma: no cover - hardware dependent
        info["error"] = repr(exc)
    return info


def _topology(redact: bool) -> dict:
    """GPU/NIC topology matrix + a fingerprint to gate comparability.

    The fingerprint is a truncated SHA-256 over every link-type token
    (NV#, NODE, SYS, PIX, PXB, PHB, X) found anywhere in the
    `nvidia-smi topo -m` output, in order of appearance, so two nodes with
    the same wiring share a key even if absolute device IDs differ.
    """
    topo = _run(["nvidia-smi", "topo", "-m"])
    if topo is None:
        return {"source": None, "matrix": None, "fingerprint": None}
    # Fingerprint the link-type tokens (NV#, NODE, SYS, PIX, PXB, ...) only —
    # ignore GPU/NIC labels and whitespace so it's placement-stable.
    tokens = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", topo)
    fingerprint = hashlib.sha256(" ".join(tokens).encode()).hexdigest()[:16]
    return {
        "source": "nvidia-smi topo -m",
        # The matrix can contain hostnames in some setups; redact wholesale.
        # NOTE(review): a redacted matrix is emitted as the empty string —
        # confirm no placeholder text was intended here.
        "matrix": ("" if redact else topo),
        "fingerprint": fingerprint,
    }


def _rdma(redact: bool) -> dict:
    """RDMA/IB device presence — names only, hashed when *redact* is true.

    Tries `ibv_devinfo -l` first (its first output line is a count and is
    skipped) and falls back to `ibstat -l` when that yields nothing.
    """
    listing = _run(["ibv_devinfo", "-l"])
    if listing:
        candidates = listing.splitlines()[1:]  # first line is a count
    else:
        # Probe ibstat exactly once: re-running it to read the output both
        # doubles the subprocess cost and can crash if the second run fails.
        fallback = _run(["ibstat", "-l"])
        candidates = fallback.splitlines() if fallback else []
    devices = [name.strip() for name in candidates if name.strip()]
    return {
        "available": bool(devices),
        "devices": [_redact(d) if redact else d for d in devices],
    }


def _env_vars() -> dict:
    """Return the recorded subset of os.environ (ENV_EXACT names plus any
    name starting with an ENV_PREFIXES entry), sorted by variable name."""
    picked = {
        k: v
        for k, v in os.environ.items()
        if k in ENV_EXACT or k.startswith(ENV_PREFIXES)
    }
    return dict(sorted(picked.items()))


def capture_environment(redact: bool = False, timestamp: str | None = None) -> dict:
    """Return a JSON-serializable environment/provenance record.

    *redact* hashes the hostname, GPU UUIDs / PCI bus ids and RDMA device
    names and blanks the topology matrix. *timestamp* overrides the
    ``captured_at`` stamp (default: local time now, ISO 8601 with offset).
    """
    host = socket.gethostname()
    return {
        "schema_version": SCHEMA_VERSION,
        "captured_at": timestamp or _dt.datetime.now().astimezone().isoformat(),
        "redacted": redact,
        "host": _redact(host) if redact else host,
        "platform": {
            "system": platform.system(),
            "release": platform.release(),
            "machine": platform.machine(),  # x86_64 vs aarch64 (B200 vs GB200)
            "python": sys.version.split()[0],
        },
        "gpus": _gpus(redact),
        "driver": _driver_cuda(),
        "torch": _torch_info(),
        "topology": _topology(redact),
        "rdma": _rdma(redact),
        "env": _env_vars(),
    }


def main() -> int:
    """CLI entry point: capture the environment and emit it as JSON.

    With ``--out`` the JSON is written to that path (parent directories are
    created) and a one-line summary goes to stdout; otherwise the JSON itself
    is printed. Returns the process exit code (always 0).
    """
    ap = argparse.ArgumentParser(description="CollectiveX Layer-0 environment capture")
    ap.add_argument("--out", help="write JSON here (default: stdout)")
    ap.add_argument(
        "--redact",
        action="store_true",
        help="hash hostnames / IPs / GPU UUIDs / IB GUIDs for shareable artifacts",
    )
    ap.add_argument(
        "--timestamp",
        help="ISO timestamp to stamp (default: now); pass one for reproducible bundles",
    )
    args = ap.parse_args()

    env = capture_environment(redact=args.redact, timestamp=args.timestamp)
    blob = json.dumps(env, indent=2)
    if args.out:
        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
        # Explicit encoding keeps the artifact identical across locales.
        with open(args.out, "w", encoding="utf-8") as fh:
            fh.write(blob + "\n")
        # A one-line human summary to stdout (the JSON is the artifact).
        g = env["gpus"]
        print(
            f"env -> {args.out} | machine={env['platform']['machine']} "
            f"gpus={g['count']} topo_fp={env['topology']['fingerprint']}"
        )
    else:
        print(blob)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
HERE = os.path.dirname(os.path.abspath(__file__))


def _load(name):
    """Parse ``configs/<name>`` (relative to this file) with yaml.safe_load."""
    # Explicit UTF-8: the configs are read as text and must not depend on the
    # runner's locale encoding.
    with open(os.path.join(HERE, "configs", name), encoding="utf-8") as fh:
        return yaml.safe_load(fh)


def resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase, platforms, backends,
                 combine_quant_mode="none", placement="packed", activation_profile="normal", eplb=False):
    """Return (ok, reason). Mirrors adapter SUPPORTED_* + platform/backend registry limits, including
    the combine-quant / routing / EPLB / activation distribution constraints (goal P2-m).

    ``platforms`` / ``backends`` are the parsed platforms.yaml / backends.yaml documents. The first
    failing check decides ``reason``; ``(True, "ok")`` means every check passed. ``placement`` is
    accepted for signature symmetry with the case record but no check here consults it.
    """
    p = platforms["platforms"].get(plat)
    b = backends["backends"].get(beng)
    if p is None:
        return False, f"unknown platform {plat}"
    if b is None:
        return False, f"unknown backend {beng}"
    if b["vendor"] != p["vendor"]:
        return False, f"{beng} is {b['vendor']}, {plat} is {p['vendor']}"
    if mode not in b["modes"]:
        return False, f"{beng} has no mode {mode}"
    # Optional per-platform mode allow-list; absent/empty means "no restriction".
    pm = (p.get("validated") or {}).get("modes")
    if pm and mode not in pm:
        return False, f"{plat} validated modes={pm} (got {mode})"  # e.g. B300 LL aborts -> normal-only
    if dtype not in b["dtypes"]:
        return False, f"{beng} has no dtype {dtype}"
    if contract not in b["contracts"]:
        return False, f"{beng} has no contract {contract}"
    validated = p["validated"]
    if ep not in validated["ep_degrees"]:
        return False, f"{plat} EP{ep} not validated (have {validated['ep_degrees']})"
    if ep > validated["max_intranode_gpus"] and not validated.get("internode"):
        return False, f"{plat} EP{ep} needs internode (not validated)"
    pc = (b.get("phase_constraints") or {}).get(mode)
    if pc and pc.get("phases") and phase not in pc["phases"]:
        return False, f"{beng} mode={mode} is {pc['phases']}-only (got {phase})"
    if contract == "cached-layout-comm-only-v1" and mode == "ll":
        return False, "cached-layout meaningless for LL"
    # combine-quant / distribution constraints (goal P2-m). Default none/packed/normal reproduce
    # today; the quant-combine suite's fp8/mxfp8 modes are REJECTED here (no kernel wired) so it
    # resolves to zero valid cases until PR311 lands.
    if combine_quant_mode not in b.get("quant_modes", ["none"]):
        return False, f"{beng} quant_modes={b.get('quant_modes', ['none'])} (got {combine_quant_mode}) — not wired"
    # A backend that declares no "routings" list accepts every routing.
    if routing not in b.get("routings", [routing]):
        return False, f"{beng} does not support routing {routing}"
    if eplb and not b.get("eplb", False):
        return False, f"{beng} does not support EPLB"
    if activation_profile not in b.get("activation_profiles", ["normal"]):
        return False, f"{beng} does not support activation_profile {activation_profile}"
    return True, "ok"


def expand_backends(spec, plat, platforms, backends):
    """Resolve 'all' to the platform vendor's EP backend set (goal: do NOT skip capability).

    Any other ``spec`` is returned as a list (wrapped if it was a single name). For 'all', vendor
    backends that have no entry under ``backends["backends"]`` are dropped.
    """
    if spec != "all":
        return spec if isinstance(spec, list) else [spec]
    vendor = platforms["platforms"][plat]["vendor"]
    return [b for b in backends["vendor_backends"][vendor] if b in backends["backends"]]


def generate(suite_name):
    """Expand suite ``suite_name`` into validated cases, omissions, shards and canaries.

    Returns a JSON-serializable dict with keys suite, required_publication, pin_source_sha,
    headline_distribution, n_cases, n_omitted, cases, omitted, shards, canaries. Raises
    SystemExit when the suite name is not defined in suites.yaml.
    """
    # Parse suites.yaml once; both the suite table and the top-level headline_distribution
    # come from this one document.
    suites_doc = _load("suites.yaml")
    suites = suites_doc["suites"]
    platforms = _load("platforms.yaml")
    backends = _load("backends.yaml")
    # The parsed workloads are not consulted below; the load is kept so a missing or
    # unparsable workloads.yaml still fails here rather than at run time.
    _load("workloads.yaml")
    if suite_name not in suites:
        raise SystemExit(f"unknown suite {suite_name}; have {sorted(suites)}")
    s = suites[suite_name]
    phases = s.get("phases", ["decode"])
    routings = s.get("routings", ["uniform"])
    resource_modes = s.get("resource_modes", ["tuned"])
    # optional distribution axes (default to today's single value when the suite omits them).
    cqms = s.get("combine_quant_modes", ["none"])
    placements = s.get("placements", ["packed"])
    activations = s.get("activation_profiles", ["normal"])
    eplbs = s.get("eplb", [False])               # ep-routing-v1 sweeps [false, true]
    steps = s.get("routing_steps", [0])          # ep-temporal-v1 sweeps the snapshot index
    unevens = s.get("uneven_tokens", ["none"])   # ep-uneven-tokens-v1 sweeps the allocation
    cases, omitted = [], []
    for plat in s["platforms"]:
        bset = []
        for bspec in s["backends"]:
            bset += expand_backends(bspec, plat, platforms, backends)
        for beng in sorted(set(bset)):
            # Suite-level EP degrees win; otherwise sweep everything the platform validated.
            eps = s.get("ep_degrees") or platforms["platforms"][plat]["validated"]["ep_degrees"]
            for (wl, mode, dtype, contract, routing, ep, phase, rmode, cqm, placement, act,
                 eplb, step, uneven) in itertools.product(
                    s["workloads"], s["modes"], s.get("dtypes", ["bf16"]), s["contracts"],
                    routings, eps, phases, resource_modes, cqms, placements, activations,
                    eplbs, steps, unevens):
                ok, reason = resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase,
                                          platforms, backends, combine_quant_mode=cqm,
                                          placement=placement, activation_profile=act, eplb=eplb)
                rec = {"workload": wl, "platform": plat, "backend": beng, "mode": mode,
                       "dtype": dtype, "contract": contract, "routing": routing, "ep": ep,
                       "phase": phase, "resource_mode": rmode, "combine_quant_mode": cqm,
                       "placement": placement, "activation_profile": act,
                       "eplb": eplb, "routing_step": step, "uneven_tokens": uneven}
                if ok:
                    cases.append(rec)
                else:
                    omitted.append({**rec, "reason": reason})
    # SHARDS: one allocation per (platform, backend, mode, resource, image) runs many points.
    shards = {}
    for c in cases:
        img = backends["backends"][c["backend"]].get("required_image", "?")
        key = (c["platform"], c["backend"], c["mode"], c["resource_mode"], img)
        shards.setdefault(key, []).append(c)
    shard_list = [{"platform": k[0], "backend": k[1], "mode": k[2], "resource_mode": k[3],
                   "image": k[4], "cases": v} for k, v in shards.items()]
    # CANARY: the FIRST enumerated case per (platform, backend, mode, contract). That is the
    # smallest point only when the suite lists its axes in ascending order.
    canary = {}
    for c in cases:
        ck = (c["platform"], c["backend"], c["mode"], c["contract"])
        canary.setdefault(ck, c)
    # cohort-level source-SHA pinning (goal P2-n): record whether this suite REQUIRES all SKUs to
    # use one benchmark source SHA (official runs) — cohort.py --pin-sha enforces it at validation.
    # official suites pin by default; diagnostic/bring-up may mix.
    pin = s.get("pin_source_sha", s.get("required_publication") == "official")
    return {"suite": suite_name, "required_publication": s.get("required_publication"),
            "pin_source_sha": pin,
            "headline_distribution": (suites_doc.get("headline_distribution") or {}).get("routing"),
            "n_cases": len(cases), "n_omitted": len(omitted),
            "cases": cases, "omitted": omitted, "shards": shard_list,
            "canaries": list(canary.values())}


def main() -> int:
    """CLI entry point: print a suite summary + de-duplicated omissions; optionally write JSON.

    Returns the process exit code (always 0; an unknown suite exits via SystemExit).
    """
    ap = argparse.ArgumentParser(description="CollectiveX matrix generator")
    ap.add_argument("--suite", required=True)
    ap.add_argument("--out")
    a = ap.parse_args()
    m = generate(a.suite)
    print(f"suite={m['suite']} required={m['required_publication']}: "
          f"{m['n_cases']} valid cases, {m['n_omitted']} omitted, "
          f"{len(m['shards'])} shards, {len(m['canaries'])} canaries")
    # Print each distinct (platform, backend, mode, dtype, contract, reason) only once.
    seen = set()
    for o in m["omitted"]:
        k = (o["platform"], o["backend"], o["mode"], o["dtype"], o["contract"], o["reason"])
        if k not in seen:
            seen.add(k)
            print(f"  OMIT {o['platform']}/{o['backend']}/{o['mode']}/{o['dtype']}/{o['contract']}: {o['reason']}")
    if a.out:
        with open(a.out, "w", encoding="utf-8") as fh:
            json.dump(m, fh, indent=2)
        print(f"wrote {a.out}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
#!/usr/bin/env bash
# CollectiveX — 2-node B200 SKU adapter (cross CX-7 InfiniBand spine), x86_64.
#
# The other half of the headline: the same primitives as single-node B200, but
# spanning two nodes so the transport is InfiniBand rather than NVLink. Contrast
# with GB200, where the 2-node-equivalent stays on NVL72 NVLink (MNNVL).
#
# Multi-node orchestration differs from single-node, so this adapter does NOT
# use run_in_container.sh: it builds nccl-tests (MPI=1), runs each op across all
# ranks (raw capture), then parses on the login node. Currently CX_BENCH=nccl
# only (multi-node DeepEP/MNNVL is the srt-slurm follow-up).
#
# SPIKE CAVEATS: needs `srun --mpi=pmix` wired for pyxis and a compute-visible
# checkout — set CX_STAGE_DIR to a shared FS (e.g. /home/sa-shared/cx-stage) if
# the runner workspace is not cross-mounted to compute.
#
# Run: bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CX_DIR="$(cd "$HERE/.." && pwd)"
REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
# shellcheck source=../runtime/common.sh
source "$HERE/../runtime/common.sh"

CX_BENCH="${CX_BENCH:-nccl}"
[ "$CX_BENCH" = "nccl" ] || cx_die "launch_b200-dgxc-slurm.sh supports CX_BENCH=nccl only (got '$CX_BENCH'); multi-node DeepEP is a follow-up"

RUNNER_NAME="${RUNNER_NAME:-b200-dgxc-slurm}"
PARTITION="${CX_PARTITION:-gpu-2}"
ACCOUNT="${CX_ACCOUNT:-benchmark}"
GPUS_PER_NODE="${CX_GPUS_PER_NODE:-8}"
NODES="${CX_NODES:-2}"
TIME_MIN="${CX_TIME:-30}"
IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
MOUNT_DIR=/ix
TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
TOPO="b200-nvlink-island+cx7-ib"
WORLD=$((NODES * GPUS_PER_NODE))
MPI_FLAG="${CX_SRUN_MPI:-pmix}"
export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
# Record container identity in env_capture provenance (propagated via --export=ALL).
export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"

# op name -> nccl-tests binary that measures it.
declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )

OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
# Reject unknown ops BEFORE allocating anything: under `set -u` an op with no BIN
# entry would otherwise abort the script at ${BIN[$op]} mid-loop, after the nodes
# were allocated and nccl-tests was built.
for op in $OPS; do
  [ -n "${BIN[$op]:-}" ] || cx_die "unknown op '$op' in CX_OPS (supported: ${!BIN[*]})"
done

cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD image=$IMAGE"
SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR"

if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"

JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
  --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
cx_log "JOB_ID=$JOB_ID"
# Always release the allocation, including on early exit under `set -e`.
trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT

COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
              --no-container-entrypoint)
ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"

# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" \
  bash -c '
    set -euo pipefail
    cd /ix/experimental/CollectiveX
    source runtime/common.sh
    mkdir -p results
    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
  '

BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"

# The parsed JSON is written under the login-side checkout, which is a different
# tree from $MOUNT_SRC when the repo was staged; make sure the directory exists.
mkdir -p "$CX_DIR/results"

# 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS.
for op in $OPS; do
  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG) -> $raw"
  # A failing op is logged and skipped over so the remaining ops still run.
  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
    --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
    --export=ALL,NCCL_CUMEM_ENABLE=1 \
    "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
    > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"

  # 3) Parse on the login node (pure stdlib python; no container needed).
  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
    --world-size "$WORLD" --nodes "$NODES" \
    --runner "$RUNNER_NAME" --topology-class "$TOPO" --transport ib \
    --env-json "$ENVJSON" \
    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
    --timestamp "$TS" || cx_log "WARN: parse $op failed"
done

cx_log "done — JSON artifacts under $CX_DIR/results/"
#!/usr/bin/env bash
# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64).
#
# A thin wrapper: it does the B200-specific Slurm allocation and container
# setup, then delegates to runtime/run_in_container.sh, which runs whatever
# CX_BENCH names (nccl | deepep | all). Same salloc + enroot squash +
# `srun --container` flow as runners/launch_b200-dgxc.sh, minus everything
# related to model serving.
#
# Invoke from inside the InferenceX checkout on the B200 login node:
#   bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh   # nccl (default)
#   CX_BENCH=deepep bash .../launch_b200-dgxc.sh                  # DeepEP (rebuild)
#
# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(30)
#   CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
#   CX_DRYRUN(0)
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CX_DIR="$(cd "$HERE/.." && pwd)"
REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
# shellcheck source=../runtime/common.sh
source "$HERE/../runtime/common.sh"

# --- settings (each overridable through its CX_* knob) -----------------------
RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}"
PARTITION="${CX_PARTITION:-gpu-2}"
ACCOUNT="${CX_ACCOUNT:-benchmark}"
NGPUS="${CX_NGPUS:-8}"
TIME_MIN="${CX_TIME:-30}"
IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
MOUNT_DIR=/ix
TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"

# --- environment handed to the in-container runner (srun --export=ALL) -------
export CX_RUNNER="$RUNNER_NAME"
export CX_NGPUS="$NGPUS"
export CX_TS="$TS"
export CX_TOPO="b200-nvlink-island"
export CX_TRANSPORT="nvlink"
export CX_BENCH="${CX_BENCH:-nccl}"
export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
# Container identity, recorded by env_capture as provenance.
export COLLECTIVEX_IMAGE="$IMAGE"
export COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
export NCCL_CUMEM_ENABLE=1

cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH"
cx_log "image=$IMAGE"
SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR"

# Dry run: stop after image/staging resolution, before touching the scheduler.
if [[ "${CX_DRYRUN:-0}" == "1" ]]; then
  cx_log "CX_DRYRUN=1 — not allocating"
  exit 0
fi
if ! command -v salloc >/dev/null; then
  cx_die "salloc not found — run on the Slurm login node"
fi

SALLOC_ARGS=(
  --partition="$PARTITION"
  --account="$ACCOUNT"
  --gres=gpu:"$NGPUS"
  --exclusive
  --time="$TIME_MIN"
  --job-name="$RUNNER_NAME"
)
JOB_ID="$(cx_salloc_jobid "${SALLOC_ARGS[@]}")"
if [[ -z "$JOB_ID" ]]; then
  cx_die "could not resolve allocated JOB_ID from salloc"
fi
cx_log "JOB_ID=$JOB_ID"
# Release the allocation however the script ends.
trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT

CX_IN_CTR="$MOUNT_DIR/experimental/CollectiveX"
SRUN_ARGS=(
  --jobid="$JOB_ID"
  --container-image="$SQUASH_FILE"
  --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
  --no-container-mount-home
  --container-workdir="$CX_IN_CTR"
  --no-container-entrypoint
  --export=ALL
)
srun "${SRUN_ARGS[@]}" bash "$CX_IN_CTR/runtime/run_in_container.sh"

cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
$MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_b300-nv.sh b/experimental/CollectiveX/launchers/launch_b300-nv.sh new file mode 100644 index 000000000..7f485480a --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_b300-nv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# CollectiveX — B300 (b300-nv GH runner) adapter. The self-hosted runner is named +# `b300-nv_NN`, so runner.name's prefix resolves to this file via +# launch_${RUNNER_NAME%%_*}.sh. Identical B300 settings to launch_b300.sh (the +# canonical/manual entry point) — delegate so there is a single source of truth. +set -euo pipefail +exec bash "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/launch_b300.sh" "$@" diff --git a/experimental/CollectiveX/launchers/launch_b300.sh b/experimental/CollectiveX/launchers/launch_b300.sh new file mode 100644 index 000000000..422d045c2 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_b300.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# CollectiveX — B300 single-node SKU adapter (8x B300 SXM6, NVLink island, x86_64, SM100). +# +# Thin adapter: B300-specific allocation/container, then hands off to +# runtime/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors +# launch_h200.sh; B300 differs in: partition `batch_1` with a REQUIRED account +# (`benchmark`), and the compute-visible share is /data (10.3.26.100:/data) — NOT +# /home and NOT the node-local /scratch, both invisible to compute nodes here. Both +# the squash AND the staged repo MUST live on /data or pyxis fails "No such file". 
+# +# Run from inside the InferenceX checkout on the B300 login node: +# bash experimental/CollectiveX/launchers/launch_b300.sh # nccl (default) +# CX_BENCH=deepep CX_PHASE=both bash .../launch_b300.sh # DeepEP, decode+prefill +# +# Env knobs: CX_PARTITION(batch_1) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(45) +# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-b300}" +PARTITION="${CX_PARTITION:-batch_1}" +ACCOUNT="${CX_ACCOUNT:-benchmark}" # B300 scheduler REQUIRES a valid account/partition combo +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-b300-018}" # known-bad node (per the serving launcher) +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-45}" +IMAGE="${CX_IMAGE:-$(cx_default_image b300)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/data/sa-shared/containers}" +export CX_STAGE_DIR="${CX_STAGE_DIR:-/data/sa-shared/cx_stage}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="b300-nvlink-island" CX_TRANSPORT="nvlink" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +export NCCL_CUMEM_ENABLE=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +JOB_ID="$(cx_salloc_jobid 
--partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \ + --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" + +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh new file mode 100644 index 000000000..6a754f5bf --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh @@ -0,0 +1,226 @@ +#!/usr/bin/env bash +# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray. +# +# Two paths, selected by CX_NODES: +# * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to +# run_in_container.sh (CX_BENCH = nccl | deepep | all), -g 4. +# * CX_NODES>1: multi-node over the NVL72 NVLink fabric (MNNVL), e.g. CX_NODES=2 +# = 8 GPU. nccl only — builds nccl-tests (MPI=1), runs each op across all ranks +# via `srun --mpi=pmix` (1 GPU/rank), parses on the login node. Same shape that +# runs single-node B200 (NVLink island) and multi-node B200 (CX-7 IB) — here it +# stays entirely on NVL72 NVLink. Validated 8-GPU (2 trays) on-node. 
+# +# Run from inside the InferenceX checkout on the GB200 login node: +# bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # 4 GPU, nccl +# CX_NODES=2 bash .../launch_gb200-nv.sh # 8 GPU MNNVL +# CX_BENCH=deepep bash .../launch_gb200-nv.sh # 4 GPU, DeepEP +# +# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NODES(1) +# CX_GPUS_PER_NODE(4) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH +# CX_OPS CX_MIN_BYTES CX_MAX_BYTES CX_SRUN_MPI(pmix) CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-gb200-nv}" +PARTITION="${CX_PARTITION:-batch}" +ACCOUNT="${CX_ACCOUNT:-benchmark}" +GPUS_PER_NODE="${CX_GPUS_PER_NODE:-4}" # NVL72 compute tray = 4 GPU/node +NODES="${CX_NODES:-1}" +TIME_MIN="${CX_TIME:-30}" +IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +WORLD=$((NODES * GPUS_PER_NODE)) + +export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS" +export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +# Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded. 
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +# ---------------------------------------------------------------------------- +if [ "$NODES" -le 1 ]; then + # Single tray (4 GPU): generic dispatcher, -g N single process. + export CX_NGPUS="$GPUS_PER_NODE" + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \ + --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" + [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" + cx_log "JOB_ID=$JOB_ID" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" + cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" + exit 0 +fi + +# ---------------------------------------------------------------------------- +# Multi-node MNNVL over the NVL72 NVLink fabric. CX_BENCH=nccl -> nccl-tests across WORLD ranks +# (build MPI=1, srun --mpi=pmix, parse on login). 
Any EP backend (deepep/uccl/flashinfer) -> the +# EP multi-srun path ported from launch_gb300-nv.sh: run_ep.py across WORLD srun tasks (1 GPU/rank, +# per-rank RANK/LOCAL_RANK from SLURM_*), intranode NVLink across <=8 MNNVL ranks. One config/dispatch. +MPI_FLAG="${CX_SRUN_MPI:-pmix}" +declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf + [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf ) + +JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \ + --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" + --no-container-entrypoint) +ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json" + +# EP backends (deepep/uccl/flashinfer): run run_ep.py across WORLD srun tasks over MNNVL, then exit +# (the nccl-tests path below is nccl-only). Mirrors launch_gb300-nv.sh's shard-aware EP8 path. +if [ "$CX_BENCH" != "nccl" ]; then + MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N)" | head -1)"; MP=29553 + mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results" + # Source the hybrid-ep build env if the build-once wrote it (build_ext --inplace PYTHONPATH/LD_LIBRARY_PATH + # are process-local and don't cross srun steps; the file persists in the named container). No-op otherwise. + WRAP='[ -f /tmp/.cx_hybrid_env ] && . 
/tmp/.cx_hybrid_env; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+  # Build from-source kernels (DeepEP V2 / flashinfer-quant-combine) ONCE PER NODE into a persistent
+  # named container, then every case-srun reuses it (build visible to all WORLD ranks). Mirrors the
+  # proven launch_gb300-nv.sh EP8 path: without this, the multi-srun ran ephemeral per-rank containers
+  # that bypassed the build hooks (deepep_v2 silently ran bundled V1, quant-combine ran cq=none).
+  CNAME="cxep_${JOB_ID}"
+  CMOUNT=(--container-mounts="$MOUNT_SRC:$MOUNT_DIR" --no-container-mount-home
+          --container-workdir="$MOUNT_DIR/experimental/CollectiveX" --no-container-entrypoint)
+  cx_log "EP setup: build into named container $CNAME per node (deepep_v2=${CX_DEEPEP_V2:-} combine=${CX_COMBINE_DTYPE:-bf16})"
+  srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --container-name="$CNAME" \
+    --container-image="$SQUASH_FILE" "${CMOUNT[@]}" --export=ALL,CX_BUILD_ONLY=1 \
+    bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" </dev/null 2>&1 | tail -15 \
+    || cx_log "WARN: EP build-only step returned nonzero (see above)"
+
+  # Per-rank env. deepep V2 spans NVL72 trays only with allow_mnnvl=True (else DeepEP sets
+  # NVSHMEM_DISABLE_MNNVL=1 -> intranode-IPC path -> illegal address cross-tray); CX_ALLOW_MNNVL=1 makes
+  # tests/ep_deepep.py pass it (gated on the param existing, so bundled V1 is unchanged). flashinfer rides
+  # NCCL's MNNVL transport. (gb200 validation pending an allocation; identical to gb300 run 28434764062.)
+  EP_EXPORTS="ALL,MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1,MC_FORCE_MNNVL=1"
+  [ "$CX_BENCH" = "deepep" ] && EP_EXPORTS="$EP_EXPORTS,CX_ALLOW_MNNVL=1"
+
+  # SWEEP (CX_SHARD_FILE set): one '|'-separated line per shard case so the rack-scale EP path sweeps
+  # EVERY case (parity with single-node). MANUAL: one line per phase from the :-defaulted CX_* env.
+  cx_ep_cases() {
+    # CX_SHARD_FILE is workflow-relative (results/.shard_*.json, written under
+    # working-directory=experimental/CollectiveX). This path runs on the SUBMIT HOST (cwd=repo root),
+    # so resolve against $CX_DIR when not found as-is — else the SHARD branch is skipped and only ONE
+    # default case runs instead of the shard's N.
+    local sf="${CX_SHARD_FILE:-}"
+    [ -n "$sf" ] && [ ! -f "$sf" ] && [ -f "$CX_DIR/$sf" ] && sf="$CX_DIR/$sf"
+    if [ -n "$sf" ] && [ -f "$sf" ]; then
+      # '|'-separated (NOT tab: tab is IFS-whitespace, so `read` collapses consecutive tabs and
+      # swallows empty fields like a false eplb, shifting columns. No case field contains '|'.)
+      python3 - "$sf" <<'PY'
+import json, sys
+d = json.load(open(sys.argv[1]))
+for c in d.get("cases", []):
+    g = lambda k, dv: (str(c[k]) if c.get(k) not in (None, "") else dv)
+    print("|".join([g("phase","decode"), g("dtype","bf16"), g("mode","normal"),
+        g("contract","layout-and-dispatch-v1"), g("routing","uniform"),
+        ("1" if c.get("eplb") else ""), g("resource_mode","tuned"),
+        g("activation_profile","normal"), g("placement","packed"), g("routing_step","0"),
+        g("uneven_tokens","none"), g("hidden","7168"), g("topk","8"), g("experts","256"),
+        g("ladder","")]))
+PY
+    else
+      local phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill"
+      local ph
+      for ph in $phases; do
+        printf '%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
+          "$ph" "${CX_DISPATCH_DTYPE:-bf16}" "${CX_MODE:-normal}" \
+          "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" "${CX_ROUTING:-uniform}" \
+          "${CX_EPLB:+1}" "${CX_RESOURCE_MODE:-tuned}" "${CX_ACTIVATION_PROFILE:-normal}" \
+          "${CX_PLACEMENT:-packed}" "${CX_ROUTING_STEP:-0}" "${CX_UNEVEN_TOKENS:-none}" \
+          "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+      done
+    fi
+  }
+
+  ci=0
+  while IFS='|' read -r ph dtype mode contract routing eplb rmode act placement rstep uneven hidden topk experts lad; do
+    [ -n "$ph" ] || continue
+    ci=$((ci+1))
+    out="results/${RUNNER_NAME}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")_${dtype}_${mode}.json"
+    cx_log "EP${WORLD}[$ci] $ph $CX_BENCH $dtype/$mode/$contract routing=$routing eplb=${eplb:-} rmode=$rmode act=$act plc=$placement"
+    # shellcheck disable=SC2086
+    timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$WORLD" \
+      --ntasks-per-node="$GPUS_PER_NODE" --container-name="$CNAME" "${CMOUNT[@]}" \
+      --export="$EP_EXPORTS" \
+      bash -c "$WRAP" _ --backend "$CX_BENCH" --phase "$ph" --dispatch-dtype "$dtype" \
+      --mode "$mode" --measurement-contract "$contract" \
+      --routing "$routing" ${eplb:+--eplb} --resource-mode "$rmode" \
+      --activation-profile "$act" --placement "$placement" --routing-step "$rstep" --uneven-tokens "$uneven" \
+      --tokens-ladder "$lad" --hidden "$hidden" --topk "$topk" \
+      --experts "$experts" --warmup "${CX_WARMUP:-32}" --iters "${CX_ITERS:-200}" \
+      --trials "${CX_TRIALS:-3}" --seed "${CX_SEED:-67}" --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" \
+      --transport "$CX_TRANSPORT" \
+      ${CX_COMBINE_DTYPE:+--combine-dtype "$CX_COMBINE_DTYPE"} ${CX_COMBINE_QUANT_MODE:+--combine-quant-mode "$CX_COMBINE_QUANT_MODE"} \
+      --out "$out" </dev/null 2>&1 | tail -8 && rc=0 || rc=${PIPESTATUS[0]}  # capture rc: under set -e + pipefail a bare failing/timed-out case would abort the sweep before the rc log and cx_collect_results
+    cx_log "EP${WORLD}[$ci] $ph rc=$rc"
+  done < <(cx_ep_cases)
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — EP artifacts under $CX_DIR/results/"
+  exit 0
+fi
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node). NOTE(review): the stdin redirects, the bash -c body below and the nccl-tests command line in step 2 were restored to match the sibling multi-node nccl launcher — confirm against the committed script.
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \
+  --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" </dev/null \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source runtime/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS.
+for op in $OPS; do
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG, MNNVL) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+    --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+    --export=ALL,NCCL_CUMEM_ENABLE=1,NCCL_MNNVL_ENABLE=1,MC_FORCE_MNNVL=1 </dev/null \
+    "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+    > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+
+  # 3) Parse on the login node (pure stdlib; no container needed).
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb300-nv.sh b/experimental/CollectiveX/launchers/launch_gb300-nv.sh
new file mode 100644
index 000000000..41d08bbb9
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb300-nv.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# CollectiveX — GB300 (NVL72 Grace-Blackwell, aarch64) GHA launcher. Lands on the gb300-nv
+# self-hosted runner (on the im-gb300-login-02 slurm login) and runs the chosen EP config.
+#
+# Two paths by CX_NODES:
+# CX_NODES<=1 (EP4): single NVL72 tray, 4 GPU. Hands off to run_in_container.sh (torchrun -g 4).
+# CX_NODES==2 (EP8): 2 trays, 8 GPU over the MNNVL NVLink domain. run_in_container's single-node
+# torchrun can't span nodes, so this path runs run_ep.py DIRECTLY across 8 srun tasks (1 rank
+# each), per-rank RANK/LOCAL_RANK from SLURM_*, MASTER_ADDR=first node — the intranode NVLink
+# path works across <=8 ranks on MNNVL (no internode/NVSHMEM). One srun per case: every case of CX_SHARD_FILE when set, else one per phase from the CX_* env.
+# +# Env: CX_NODES(2) CX_PARTITION(batch_1) CX_ACCOUNT(benchmark) CX_BENCH(deepep) CX_PHASE + the +# CX_DISPATCH_DTYPE/CX_MODE/CX_MEASUREMENT_CONTRACT/CX_ROUTING/CX_EPLB/CX_TOKENS_LADDER knobs. +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)"; REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +PARTITION="${CX_PARTITION:-batch_1}"; ACCOUNT="${CX_ACCOUNT:-benchmark}" +NODES="${CX_NODES:-2}"; GPN="${CX_GPUS_PER_NODE:-4}" +NGPUS="${CX_NGPUS:-$((NODES*GPN))}"; TIME_MIN="${CX_TIME:-90}" +# CX_IMAGE is a docker TAG, not a squash path: cx_ensure_squash mangles the tag to +# _.sqsh and finds the pre-staged squash by THAT name (the same convention +# H200/B300 use). Passing a .sqsh PATH here made it try `enroot import docker://` +# -> "Invalid image reference", then pyxis "No such file or directory" on the mangled +# target. The pre-staged file is /data/sa-shared/containers/lmsysorg_sglang_v0.5.11-cu130.sqsh, +# which is exactly the mangled name of this tag, so it resolves with no re-import. 
+IMAGE="${CX_IMAGE:-$(cx_default_image gb300)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/data/sa-shared/containers}" +export CX_STAGE_DIR="${CX_STAGE_DIR:-/data/sa-shared/cx_stage}" +export ENROOT_CACHE_PATH="${ENROOT_CACHE_PATH:-/data/sa-shared/.enroot_cache}" +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +RUNNER="gb300-${NGPUS}x" +export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="gb300-nvl72-mnnvl" CX_TRANSPORT="mnnvl" +export CX_BENCH="${CX_BENCH:-deepep}" CX_NGPUS="$NGPUS" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" + +cx_log "GB300 runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH phase=${CX_PHASE:-decode}" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "$CX_STAGE_DIR")" +[ "${CX_DRYRUN:-0}" = "1" ] && { cx_log "DRYRUN"; exit 0; } +command -v salloc >/dev/null || cx_die "salloc not found" + +if [ "$NODES" -le 1 ]; then # ---- EP4: single tray, run_in_container (torchrun -g 4) ---- + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPN" --exclusive \ + --time="$TIME_MIN" --job-name="$RUNNER")" + [ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:/ix" \ + --no-container-mount-home --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \ + --export=ALL bash /ix/experimental/CollectiveX/runtime/run_in_container.sh + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"; exit 0 +fi + +# ---- EP8: 2 trays, run_ep.py directly across 8 ranks (no torchrun; MNNVL intranode path) ---- +JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" --gres=gpu:"$GPN" \ + --ntasks-per-node="$GPN" --exclusive --time="$TIME_MIN" --job-name="$RUNNER")" +[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT +MA="$(scontrol show 
hostnames "$(squeue -j "$JOB_ID" -h -o %N)" | head -1)"; MP=29551
+mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+# Source the hybrid-ep build env if the build-once wrote it (deepep-hybrid: build_ext --inplace +
+# PYTHONPATH/LD_LIBRARY_PATH are process-local and don't cross srun steps; the file persists in the
+# named container). No-op for other backends (file absent).
+WRAP='[ -f /tmp/.cx_hybrid_env ] && . /tmp/.cx_hybrid_env; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+# From-source kernels (DeepEP V2 / flashinfer quant-combine) cannot be built in the per-rank multi-srun
+# (8 separate ephemeral containers). Build them ONCE PER NODE into a PERSISTENT named container, then
+# every case-srun REUSES it (--container-name, no re-import) so the build is visible to all 8 ranks.
+# Brings the EP8 rack path to parity with EP4 (run_in_container builds once + reuses). Mounts re-apply
+# per srun-step (not persisted in the container fs), so each srun still passes "${CMOUNT[@]}".
+CNAME="cxep8_${JOB_ID}"
+CMOUNT=(--container-mounts="$MOUNT_SRC:/ix" --no-container-mount-home
+        --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint)
+cx_log "EP8 setup: build into named container $CNAME per node (deepep_v2=${CX_DEEPEP_V2:-} combine=${CX_COMBINE_DTYPE:-bf16})"
+srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 \
+  --container-name="$CNAME" --container-image="$SQUASH_FILE" "${CMOUNT[@]}" --export=ALL,CX_BUILD_ONLY=1 \
+  bash /ix/experimental/CollectiveX/runtime/run_in_container.sh </dev/null 2>&1 | tail -15 \
+  || cx_log "WARN: EP8 build-only step returned nonzero (see above)"
+
+# The EP8 case list as '|'-separated arg-lines. SWEEP (CX_SHARD_FILE set): one line per shard case,
+# so the rack-scale EP8 path sweeps EVERY case of its shard (parity with run_in_container's single-
+# node SHARD loop) instead of the old single CX_* config. MANUAL (no shard file): one line per phase
+# from the CX_* env — every field is :-defaulted so set -u never trips on an unset knob (the old bug:
+# bare $CX_DISPATCH_DTYPE here was unbound under sweep, crashing the whole job on its first line).
+cx_ep8_cases() {
+  # CX_SHARD_FILE is workflow-relative (results/.shard_*.json, written by the Extract step with
+  # working-directory=experimental/CollectiveX). This EP8 path runs on the SUBMIT HOST where cwd is
+  # the repo root, so resolve it against $CX_DIR (=experimental/CollectiveX) when not found as-is —
+  # else the SHARD branch is skipped and only ONE default case runs instead of the shard's N.
+  local sf="${CX_SHARD_FILE:-}"
+  [ -n "$sf" ] && [ ! -f "$sf" ] && [ -f "$CX_DIR/$sf" ] && sf="$CX_DIR/$sf"
+  if [ -n "$sf" ] && [ -f "$sf" ]; then
+    # '|'-separated (NOT tab: tab is IFS-whitespace, so `read` would collapse consecutive tabs and
+    # swallow empty fields like a false eplb, shifting every column. No case field contains '|'.)
+    python3 - "$sf" <<'PY'
+import json, sys
+d = json.load(open(sys.argv[1]))
+for c in d.get("cases", []):
+    g = lambda k, dv: (str(c[k]) if c.get(k) not in (None, "") else dv)
+    print("|".join([g("phase","decode"), g("dtype","bf16"), g("mode","normal"),
+        g("contract","layout-and-dispatch-v1"), g("routing","uniform"),
+        ("1" if c.get("eplb") else ""), g("resource_mode","tuned"),
+        g("activation_profile","normal"), g("placement","packed"), g("routing_step","0"),
+        g("uneven_tokens","none"), g("hidden","7168"), g("topk","8"), g("experts","256"),
+        g("ladder","")]))
+PY
+  else
+    local phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill"
+    local ph
+    for ph in $phases; do
+      printf '%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
+        "$ph" "${CX_DISPATCH_DTYPE:-bf16}" "${CX_MODE:-normal}" \
+        "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" "${CX_ROUTING:-uniform}" \
+        "${CX_EPLB:+1}" "${CX_RESOURCE_MODE:-tuned}" "${CX_ACTIVATION_PROFILE:-normal}" \
+        "${CX_PLACEMENT:-packed}" "${CX_ROUTING_STEP:-0}" "${CX_UNEVEN_TOKENS:-none}" \
+        "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+    done
+  fi
+}
+
+# Per-rank env for the EP8 case sruns. flashinfer-combine rides NCCL's MNNVL transport (validated:
+# cq=fp8/nvfp4 @ ws8). DeepEP V2's Buffer gates multi-tray NVLink behind allow_mnnvl, which defaults
+# False -> DeepEP then sets NVSHMEM_DISABLE_MNNVL=1 and the legacy buffer takes the intranode-only CUDA-IPC
+# peer path, faulting across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). CX_ALLOW_MNNVL=1
+# makes tests/ep_deepep.py pass allow_mnnvl=True so the NVL buffer spans both trays over the fabric API.
+# Bundled V1's Buffer predates the param (its NVL buffer already spans MNNVL) -> the harness drops the kwarg.
+EP8_EXPORTS="ALL,MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1"
+[ "$CX_BENCH" = "deepep" ] && EP8_EXPORTS="$EP8_EXPORTS,CX_ALLOW_MNNVL=1"
+
+ci=0
+while IFS='|' read -r ph dtype mode contract routing eplb rmode act placement rstep uneven hidden topk experts lad; do
+  [ -n "$ph" ] || continue
+  ci=$((ci+1))
+  out="results/${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")_${dtype}_${mode}.json"
+  cx_log "EP8[$ci] $ph $CX_BENCH $dtype/$mode/$contract rt=$routing eplb=${eplb:-} combine=${CX_COMBINE_DTYPE:-bf16}/${CX_COMBINE_QUANT_MODE:-none}"
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$NGPUS" \
+    --ntasks-per-node="$GPN" --container-name="$CNAME" "${CMOUNT[@]}" \
+    --export="$EP8_EXPORTS" \
+    bash -c "$WRAP" _ --backend "$CX_BENCH" --phase "$ph" --dispatch-dtype "$dtype" \
+    --mode "$mode" --measurement-contract "$contract" \
+    --routing "$routing" ${eplb:+--eplb} --resource-mode "$rmode" \
+    --activation-profile "$act" --placement "$placement" --routing-step "$rstep" --uneven-tokens "$uneven" \
+    --tokens-ladder "$lad" --hidden "$hidden" --topk "$topk" \
+    --experts "$experts" --warmup "${CX_WARMUP:-32}" --iters "${CX_ITERS:-200}" \
+    --trials "${CX_TRIALS:-3}" --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$CX_TOPO" \
+    --transport "$CX_TRANSPORT" \
+    ${CX_COMBINE_DTYPE:+--combine-dtype "$CX_COMBINE_DTYPE"} ${CX_COMBINE_QUANT_MODE:+--combine-quant-mode "$CX_COMBINE_QUANT_MODE"} \
+    --out "$out" </dev/null 2>&1 | tail -8 && rc=0 || rc=${PIPESTATUS[0]}  # capture rc: under set -e + pipefail a bare failing/timed-out case would abort the sweep before the rc log and cx_collect_results
+  cx_log "EP8[$ci] $ph rc=$rc"
+done < <(cx_ep8_cases)
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
diff --git a/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
new file mode 100644
index 000000000..2a35340a8
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# CollectiveX — H100 (DGX Cloud Slurm) single-node SKU adapter (8x H100, NVLink
+# island, x86_64, SM90). Matches the GH self-hosted runner name `h100-dgxc-slurm_NN`
+# (runner.name prefix -> this script via launch_${RUNNER_NAME%%_*}.sh).
+#
+# Thin adapter shaped like launch_b200-dgxc.sh, but with this cluster's own defaults:
+# partition hpc-gpu-1, account customer, squash dir /mnt/nfs/sa-shared/containers;
+# allocates, then hands off to run_in_container.sh (CX_BENCH = nccl | deepep | all).
+# The DeepEP path runs the full FP8 + low-latency matrix (validated on 8x H100).
+#
+# !!! First on-runner run = validation (no direct SSH to this cluster at authoring).
+# If pyxis fails "No such file" the share is not compute-visible — set CX_SQUASH_DIR
+# + CX_STAGE_DIR to a compute-visible FS (cf. hpc-gpu-1 needing /mnt/nfs).
+#
+# Env knobs: CX_PARTITION(hpc-gpu-1) CX_ACCOUNT(customer) CX_EXCLUDE_NODES(hpc-gpu-1-7) CX_NODES(1) CX_NGPUS(8) CX_TIME(45)
+# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.."
&& pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +# Cluster identity from runners/launch_h100-dgxc-slurm.sh (the serving launcher): +# partition hpc-gpu-1, account customer, known-bad node hpc-gpu-1-7 excluded. This +# is the SAME cluster validated over SSH. CRITICAL: /home is login-local (not +# compute-visible) — the squash MUST live on /mnt/nfs; the GH runner workspace is +# already on /mnt/nfs (compute-visible) so the checkout mounts directly (no staging). +RUNNER_NAME="${RUNNER_NAME:-h100-dgxc-slurm}" +PARTITION="${CX_PARTITION:-hpc-gpu-1}" +ACCOUNT="${CX_ACCOUNT:-customer}" +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-hpc-gpu-1-7}" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-45}" +IMAGE="${CX_IMAGE:-$(cx_default_image h100)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/nfs/sa-shared/containers}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="h100-nvlink-island" CX_TRANSPORT="nvlink" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +export NCCL_CUMEM_ENABLE=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION account=$ACCOUNT ngpus=$NGPUS bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +# ---- Cross-node H100 EP (goal 182): mirrors launch_h200.sh. 
Allocate N nodes, ONE container task per +# node; run_in_container builds the backend per node then spawns NGPUS local ranks rendezvousing via a +# FileStore on the shared mount (CX_RDZV_FILE) — deliberately AVOIDS torchrun (its elastic-agent TCPStore +# at the management-subnet NodeAddr is unreachable from a peer's enroot container net namespace). nccl-ep +# is the validated portable cross-node EP (all_to_all_single, host-stages); custom-RDMA backends hit the +# GPUDirect-RDMA wall. /mnt/nfs is compute-visible so the FileStore is shared across nodes. +if [ "${CX_NODES:-1}" -gt 1 ]; then + NODES="${CX_NODES}" + cx_log "H100 CROSS-NODE EP: nodes=$NODES world=$((NODES*NGPUS)) bench=$CX_BENCH (IB; FileStore rdzv)" + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \ + --nodes="$NODES" --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" + [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID (multi-node) from salloc" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N)]" + export CX_TOPO="h100-multinode-ib" CX_TRANSPORT="rdma" + # FileStore rendezvous file on the shared mount (same underlying file on every node); fresh per job. + RDZV="$MOUNT_DIR/experimental/CollectiveX/.rdzv_${JOB_ID}" + rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true + srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 \ + --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint \ + --export=ALL,CX_NNODES="$NODES",CX_RDZV_FILE="$RDZV" \ + bash -c 'export CX_NODE_RANK=${SLURM_NODEID:-0}; exec bash "$0"' \ + "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" || cx_log "WARN: cross-node H100 EP rc=$?" 
+ rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" + cx_log "done — cross-node H100 EP artifacts under results/" + exit 0 +fi + +JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --exclude="$EXCLUDE_NODES" \ + --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" + +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_h200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h200-dgxc-slurm.sh new file mode 100755 index 000000000..9dd862987 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_h200-dgxc-slurm.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# The H200 GHA self-hosted runner is named h200-dgxc-slurm_NN, so the workflow's +# launch_${RUNNER_NAME%%_*}.sh convention resolves to THIS name. Thin alias to the real +# H200 adapter (launch_h200.sh) — no logic here, just the name the runner expects. +exec bash "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/launch_h200.sh" "$@" diff --git a/experimental/CollectiveX/launchers/launch_h200.sh b/experimental/CollectiveX/launchers/launch_h200.sh new file mode 100644 index 000000000..72f34b69a --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_h200.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# CollectiveX — H200 single-node SKU adapter (8x H200, NVLink island, x86_64, SM90). 
+# +# Thin adapter: H200-specific allocation/container, then hands off to +# runtime/run_in_container.sh (CX_BENCH = nccl | deepep | all). Mirrors +# launch_b200-dgxc.sh; H200 differs in: partition `main` (14x 8-GPU H200 nodes), +# NO account (open scheduler), home is shared NFS (compute-visible, so no +# CX_STAGE_DIR), and the sglang image is imported on first use (not pre-staged). +# +# Run from inside the InferenceX checkout on the H200 login node: +# bash experimental/CollectiveX/launchers/launch_h200.sh # nccl (default) +# CX_BENCH=deepep CX_PHASE=both bash .../launch_h200.sh # DeepEP, decode+prefill +# +# Env knobs: CX_PARTITION(main) CX_ACCOUNT() CX_NGPUS(8) CX_TIME(45) CX_IMAGE +# CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_PHASE CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-h200}" +PARTITION="${CX_PARTITION:-main}" # H200 cluster's only partition (sinfo: main*) +ACCOUNT="${CX_ACCOUNT:-}" # H200 scheduler is open; no account needed +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-45}" # generous: first-use enroot import of the image +IMAGE="${CX_IMAGE:-$(cx_default_image h200)}" +# This cluster's /home is shared NFS and IS compute-visible (confirmed on login-0: +# the GHA runners live under /home/sa-shared/gharunners and the sglang image is +# pre-staged at /home/sa-shared/containers). The h100-dgxc sibling is the opposite +# (/home login-local, /mnt/nfs is the share) — /mnt/nfs does NOT exist here, so the +# old /mnt/nfs default failed the GHA runner at "mkdir /mnt/nfs: Permission denied". +# The checkout already lives on the compute-visible NFS, so mount it directly: no +# staging (CX_STAGE_DIR empty). Override CX_STAGE_DIR only from a login-local checkout. 
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}" +export CX_STAGE_DIR="${CX_STAGE_DIR:-}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="h200-nvlink-island" CX_TRANSPORT="nvlink" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" +export NCCL_CUMEM_ENABLE=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ${ACCOUNT:+account=$ACCOUNT }ngpus=$NGPUS bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +# ---- Cross-node H100/H200 EP (goal 182): allocate N nodes, run ONE container task per node, and let +# run_in_container build uccl (per node) then spawn its NGPUS local ranks rendezvousing via a FileStore +# on the shared mount (CX_RDZV_FILE). This deliberately AVOIDS torchrun: torchrun's elastic agent runs +# its OWN cross-node TCPStore at --master-addr, unreachable from a peer's enroot container net namespace +# (the management-subnet NodeAddr is not in the container's net view — the prior torchrun attempt timed +# out 900s at exactly that bootstrap, while the FileStore path got past it). The build MUST be in- +# container per node (uccl is pip-installed, not in the image), so one-container-per-node — NOT multi- +# srun-per-rank — is required: separate per-rank containers are ephemeral and would each lack uccl. +# UCCL EP is internode-native (RDMA/IB); DeepEP normal-internode asserts out. Repo on compute-vis NFS. 
+if [ "${CX_NODES:-1}" -gt 1 ]; then + NODES="${CX_NODES}" + cx_log "H200 CROSS-NODE EP: nodes=$NODES world=$((NODES*NGPUS)) bench=$CX_BENCH (IB; UCCL internode-native; FileStore rdzv)" + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --nodes="$NODES" --gres=gpu:"$NGPUS" \ + --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" + [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID (multi-node) from salloc" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N)]" + export CX_TOPO="h200-multinode-ib" CX_TRANSPORT="rdma" + # FileStore rendezvous file on the shared mount (same underlying file on every node); fresh per job. + RDZV="$MOUNT_DIR/experimental/CollectiveX/.rdzv_${JOB_ID}" + rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true + # one task/node; CX_NODE_RANK is the per-node SLURM_NODEID (set inside the task, not via --export). + srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 \ + --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint \ + --export=ALL,CX_NNODES="$NODES",CX_RDZV_FILE="$RDZV" \ + bash -c 'export CX_NODE_RANK=${SLURM_NODEID:-0}; exec bash "$0"' \ + "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" || cx_log "WARN: cross-node H200 EP rc=$?" 
+ rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" + cx_log "done — cross-node H200 EP artifacts under results/" + exit 0 +fi + +JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --gres=gpu:"$NGPUS" \ + --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" + +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh new file mode 100644 index 000000000..7be963cfb --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -0,0 +1,190 @@ +#!/usr/bin/env bash +# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine. +# +# AMD counterpart to the NVIDIA adapters. Differs from them in ways taken from +# the real runners/launch_mi355x-amds.sh: +# * partition `compute`, no --account (cluster default), --cpus-per-task=128, +# and known-bad nodes excluded; +# * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on +# the allocated node (not on the login node like the shared-FS NVIDIA path); +# * pyxis flags --container-writable --container-remap-root for the ROCm image. +# AMD backends: CX_BENCH=mori (MoRI EP dispatch/combine, default) or nccl +# (collective primitives via rccl-tests, the ROCm nccl-tests fork). +# +# !!! 
NOT yet validated on hardware (no MI355X cluster access at authoring time). +# Treat the first on-runner run as validation — like the DeepEP path was on GB200. +# +# Run from inside the InferenceX checkout on the MI355X login node: +# bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh +# +# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(60) CX_IMAGE CX_BENCH(mori) CX_NODES(1) +# CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_NODELIST CX_LOCK_DIR(/tmp) CX_RUN_TIMEOUT(1800) CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}" +PARTITION="${CX_PARTITION:-compute}" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-60}" # generous: a cold enroot import of the large ROCm image +IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}" # node-local on MI355X +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" +# Optional node pin. The node-local squash is only staged on some nodes, and on +# others /var/lib/squash isn't writable (cold-import fails). Pin CI to nodes that +# already hold the squash via CX_NODELIST (overrides the exclude list). 
+NODELIST="${CX_NODELIST:-}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +# AMD backends/benches wired on MI355X (ROCm/CDNA4): +# mori — MoRI EP dispatch/combine (the AMD EP backend) +# nccl — collective primitives via rccl-tests (the ROCm nccl-tests fork) +# kv-cache — KV block transfer (HIP memcpy family; capability allows amd) +# rl-mesh — RL trainer<->generator mesh (torch.distributed -> RCCL on ROCm) +# allreduce-fw— framework all-reduce (RCCL baseline; the flashinfer one/two-shot impls are +# NVIDIA-only and self-skip on the ROCm image, leaving a valid RCCL-baseline curve) +# copy-engine — off-SM DMA copy vs CU-kernel copy; on ROCm the DMA path IS the SDMA engine +# (the AMD SDMA path), labeled copy_engine_kind=sdma in the result +# mori-io — MoRI-IO RDMA p2p transfer engine (mori.io; AMD analog of NIXL) GPU0<->GPU1 +# Default mori; honor an explicit CX_BENCH within this set. NVIDIA-only EP backends +# (deepep/uccl/flashinfer/deepep-hybrid/offload/nixl) fall back to mori (capability also +# rejects them on amd, so a dispatch of those to mi355x is a no-op the validator catches first). +# nccl-ep IS supported on AMD: it is pure torch.distributed all_to_all_single over RCCL (the +# cross-node EP path that host-stages where MoRI's custom RDMA aborts — goal 183). +export CX_BENCH="${CX_BENCH:-mori}" +case "$CX_BENCH" in + mori|nccl-ep|nccl|kv-cache|rl-mesh|allreduce-fw|copy-engine|mori-io|nccl-kv|mooncake) ;; + *) cx_log "mi355x: CX_BENCH='$CX_BENCH' is NVIDIA-only / unsupported on AMD; using mori"; export CX_BENCH=mori ;; +esac +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi" +# MI355X is a shared cluster with slow cold enroot imports + node contention; the default 900s +# per-phase wall-clock guard is too tight here (MoRI prefill at large T + a busy node times out). +# Raise to 1800s (fits inside the 60-min salloc). Override with CX_RUN_TIMEOUT. 
+export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH image=$IMAGE" +# AMD workspace is compute-visible (the serving launcher bind-mounts it directly), +# so no staging; the node-local squash is handled via srun below. +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +SQUASH_KEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')" +SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh" +# Lock in a guaranteed-writable per-node dir, NOT next to the squash: on some +# nodes /var/lib/squash is root/admin-owned, so even a world-readable squash +# can't get a sibling .lock created (flock -> "Bad file descriptor"). CX_LOCK_DIR +# overrides. The lock only serializes concurrent imports on the same node. +LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock" +cx_log "squash(node-local)=$SQUASH_FILE lock=$LOCK_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +# ---- Cross-node MI355X EP (goal 183): MoRI is RDMA-native (ionic_rdma) — it registers a symmetric +# heap per rank and dispatches/combines over RDMA, so it spans nodes natively. CX_NODES>1 allocates +# N nodes (pinned to the warm-squash nodes via CX_NODELIST so no cold import), imports the squash on +# each, then multi-sruns run_ep across NODES*8 ranks (1 GPU/rank, RANK/LOCAL_RANK from SLURM_*) — the +# same multi-srun shape the GB300 EP8 path uses. Reduced timing (MoRI wedges under sustained load). 
+if [ "${CX_NODES:-1}" -gt 1 ]; then + NODES="${CX_NODES}"; WORLD=$((NODES * NGPUS)) + cx_log "MI355X CROSS-NODE EP: nodes=$NODES world=$WORLD bench=$CX_BENCH (MoRI RDMA internode)" + if [ -n "$NODELIST" ]; then + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --nodelist="$NODELIST" --nodes="$NODES" --gres=gpu:"$NGPUS" \ + --ntasks-per-node="$NGPUS" --exclusive --cpus-per-task=16 --time="$TIME_MIN" --job-name="$RUNNER_NAME")" + else + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --nodes="$NODES" --gres=gpu:"$NGPUS" \ + --ntasks-per-node="$NGPUS" --exclusive --cpus-per-task=16 --time="$TIME_MIN" --job-name="$RUNNER_NAME")" + fi + [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID (multi-node) from salloc" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]" + # import the squash on EVERY allocated node (1 task/node). + srun --jobid="$JOB_ID" --ntasks-per-node=1 bash -c " + mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true + exec 9>\"$LOCK_FILE\" 2>/dev/null; flock -w 600 9 2>/dev/null || true + unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1 && echo \"squash present: $SQUASH_FILE\" \ + || { rm -f \"$SQUASH_FILE\"; enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" /dev/null | grep -oE 'NodeAddr=[^ ]+' | head -1 | cut -d= -f2)"; [ -z "$MA" ] && MA="$_mn"; MP=29557 + cx_log "rendezvous master node=$_mn addr=$MA:$MP" + # FileStore rendezvous on the shared mount: nccl-ep (pure rccl PG, no gloo) inits via file:// and + # sidesteps BOTH the TCPStore master-addr reach AND the gloo connectFullMesh 127.0.1.1 alias. MoRI + # (gloo+nccl) still consumes MASTER_ADDR; run_ep.py prefers CX_RDZV_FILE when set (harmless for mori). 
+ RDZV="$MOUNT_DIR/experimental/CollectiveX/.rdzv_${JOB_ID}"; rm -f "$MOUNT_SRC/experimental/CollectiveX/.rdzv_${JOB_ID}" 2>/dev/null || true + phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill" + # source _xnode_net.sh inside each rank: pins GLOO/NCCL_SOCKET_IFNAME to the routable 10.x NIC so + # gloo's per-rank connectFullMesh advertises the reachable iface (not the 127.0.1.1 hostname alias). + WRAP='export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; cd /ix/experimental/CollectiveX; source runtime/_xnode_net.sh 2>/dev/null || true; exec python3 tests/run_ep.py "$@"' + rc=0 + for ph in $phases; do + out="results/${RUNNER_NAME}_${CX_BENCH}_${ph}_${TS}.json" + # shellcheck disable=SC2086 + timeout -k 30 "${CX_RUN_TIMEOUT:-1800}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$WORLD" \ + --ntasks-per-node="$NGPUS" --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --container-writable --container-remap-root --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" --no-container-entrypoint \ + --export=ALL,MASTER_ADDR="$MA",MASTER_PORT="$MP",CX_RDZV_FILE="$RDZV" \ + bash -c "$WRAP" _ --backend "$CX_BENCH" --phase "$ph" --tokens-ladder "${CX_TOKENS_LADDER:-1 2 4 8}" \ + --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ + --measurement-contract layout-and-dispatch-v1 --routing "${CX_ROUTING:-uniform}" \ + --iters "${CX_ITERS:-8}" --trials "${CX_TRIALS:-1}" --warmup "${CX_WARMUP:-4}" --seed 67 \ + --runner "$RUNNER_NAME" --topology-class mi355x-multinode-rdma --transport rdma --out "$out" &1 | tail -12 + cx_log "cross-node $ph rc=${PIPESTATUS[0]}" + done + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" + rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true + cx_log "done — cross-node MI355X EP artifacts under results/" + exit 0 +fi + +# Pin to specific nodes (CX_NODELIST) when set, else exclude the 
known-bad ones. +if [ -n "$NODELIST" ]; then + cx_log "node pin: --nodelist=$NODELIST" + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --nodelist="$NODELIST" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --job-name="$RUNNER_NAME")" +else + JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --job-name="$RUNNER_NAME")" +fi +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +# Clear stray containers, then enroot-import to the node-local squash (flock, +# /dev/null || true' || true +srun --jobid="$JOB_ID" bash -c " + mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true + exec 9>\"$LOCK_FILE\" || { echo 'cannot open lock $LOCK_FILE' >&2; exit 1; } + flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then + echo 'squash present: $SQUASH_FILE' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" /dev/null || true +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md new file mode 100644 index 000000000..d62bb7746 --- /dev/null +++ b/experimental/CollectiveX/plan.md @@ -0,0 +1,940 @@ +# CollectiveX — Plan + +> **How to read this.** This is the single canonical plan. It is **spike-first** and **scoped to `experimental/CollectiveX/`** on a branch — nothing in the production serving path changes until a promotion decision is made later. Part 1 is background (what CollectiveX is, reconstructed from team discussion). Part 2 is the implementation plan. Where this plan says "now," it means the Milestone 0 spike; "later" items (GitHub workflow, database, app frontend) are deliberately deferred. 
All repository references (runners, launchers, workflows, matrix logic, the `experimental/` charter) were verified against the live InferenceX repo — see References. + +--- + +# Part 1 — Background + +## What it is + +CollectiveX is a benchmarking workstream under the InferenceX umbrella. It measures **collective communication** and **MoE dispatch/combine**, and performs **apples-to-apples, cross-vendor comparison of expert-parallel (EP) libraries** across NVIDIA and AMD (TPU later). The intended deliverables are an **OSS benchmark project** and a **public explainer article** — a credible cross-vendor collective benchmark plus the story around it. + +## Why + +Existing public benchmarks don't offer trustworthy, like-for-like collective/EP comparison across vendors. CollectiveX fills that gap by reusing InferenceX's runner and cluster infrastructure to produce reproducible, provenance-tagged results. + +## Current state + +- An initial MVP exists: it collected collective and kernel shapes and produced MoE dispatch/combine results on NVIDIA. +- **Normal mode works; low-latency (LL) mode is blocked** on IBGDA enablement — a direct GPU↔NIC data-and-control path over PCIe that removes CPU coordination and simplifies MoE dispatch/combine collectives — which depends on cluster-networking work outside this project. +- The main near-term enabler is NVIDIA networking / IBGDA; the AMD EP stack and AMD networking (Ultra Ethernet) are the cross-vendor counterpart. + +--- + +# Part 2 — Implementation plan + +## Implementation status (built) + +The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). 
Built on top of that: + +- **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference for both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. +- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `runtime/run_in_container.sh`. +- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. +- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `tests/ep_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `tests/ep_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`). + +This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental). 
+ +## Scope and placement + +CollectiveX starts as an **experimental project on its own branch**, fully contained under `experimental/CollectiveX/`: + +```bash +git switch main +git pull --ff-only +git switch -c collectivex +mkdir -p experimental/CollectiveX +``` + +This matches the repository's intent: `experimental/` is explicitly non-core ("experimental WIP code that is mostly Claude Code generated… not intended for production use or as part of the official InferenceMAX results"). + +For the experimental phase, **everything stays inside `experimental/CollectiveX/**`**. Do **not** modify: + +```text +benchmarks/ +runners/ +utils/ +.github/configs/ +perf-changelog.yaml +InferenceX-app +``` + +The only eventual exception is a minimal workflow dispatcher under `.github/workflows/` (because executable workflows must live there); all real CollectiveX logic, schemas, launchers, and processing stay under `experimental/CollectiveX/`. + +**This supersedes any notion of CollectiveX becoming a top-level InferenceX subsystem or extending the production serving matrix up front.** Promotion — into core InferenceX, into a dedicated repo, or into InferenceX-app's database/frontend — is an explicit *later* decision (Milestone 4), made only after the benchmark contract has stabilized on real hardware. + +### What InferenceX already gives us + +InferenceX's existing execution model is almost exactly the control plane CollectiveX needs: + +1. Generate and strictly validate a matrix on a GitHub-hosted runner. +2. Fan jobs out to named or labelled self-hosted runners. +3. Those listeners submit work to Slurm (or launch Docker locally). +4. Normalize outputs. +5. Upload artifacts. +6. Aggregate and dispatch ingestion to the dashboard. 
+ +`e2e-tests.yml` already divides generated configs into job families and invokes reusable single-node and multi-node workflows; `benchmark-tmpl.yml` cleans up resources, checks out the selected ref, **derives the launcher from the runner name**, launches the job, validates outputs, and uploads normalized results. Runner listeners live on cluster login/controller nodes while jobs run on compute nodes via Slurm; runner names/labels are load-bearing — the name prefix selects the launcher and exact names/SKU labels control scheduling. + +CollectiveX reuses all of this, but enters through **CollectiveX-specific launchers** rather than threading fake models through the serving launchers (see Cluster reuse). + +## Architecture + +Four planes, cleanly separated: + +- **Control plane:** scheduling, runners, cleanup, artifact movement, workflow metadata (reused from InferenceX). +- **Benchmark plane:** collective semantics, backend invocation, correctness, timing. +- **Data plane:** canonical result records, raw per-rank samples, topology and provenance. +- **Presentation plane:** comparable subsets, charts, history, diagnostics. + +Data flow within the experimental directory: + +```text +Portable shape definitions + + +Backend definitions + + +Target/cluster definitions + ↓ +CollectiveX matrix resolver + ↓ +Resolved shards + ↓ +Existing InferenceX self-hosted runner + ↓ +experimental/CollectiveX/launchers/* + ↓ +Backend adapter (NCCL / RCCL / DeepEP / AITER / MoRI / …) + ↓ +Versioned result bundle + ↓ +Aggregator + regression checker + ↓ +Static experimental report → (later) InferenceX-app ingestion → Postgres → /collectives +``` + +### Target structure at promotion (Milestone 4) + +This packaged layout is the **promotion target**, not the spike. 
Milestone 0 uses the light layout in the rollout section below (`run_nccl.py` / `run_deepep.py` / `env_capture.py` / `plot.py` + flat `results/`); the structure here is what CollectiveX grows into *if* it is promoted out of `experimental/`. + +```text +InferenceX/ +├── experimental/ +│ ├── README.md +│ └── CollectiveX/ +│ ├── README.md +│ ├── DESIGN.md +│ ├── ROADMAP.md +│ ├── pyproject.toml +│ ├── Makefile +│ │ +│ ├── src/ +│ │ └── collectivex/ +│ │ ├── __init__.py +│ │ ├── cli.py +│ │ ├── config/ +│ │ │ ├── models.py +│ │ │ ├── loader.py +│ │ │ ├── resolver.py +│ │ │ └── matrix.py +│ │ ├── benchmark/ +│ │ │ ├── harness.py +│ │ │ ├── timing.py +│ │ │ ├── correctness.py +│ │ │ ├── routing.py +│ │ │ └── metrics.py +│ │ ├── backends/ +│ │ │ ├── base.py +│ │ │ ├── fake.py +│ │ │ ├── nccl_tests.py +│ │ │ ├── rccl_tests.py +│ │ │ ├── deepep.py +│ │ │ └── framework_ep.py +│ │ ├── cluster/ +│ │ │ ├── inventory.py +│ │ │ ├── capabilities.py +│ │ │ ├── environment.py +│ │ │ └── launcher.py +│ │ ├── results/ +│ │ │ ├── models.py +│ │ │ ├── writer.py +│ │ │ ├── aggregate.py +│ │ │ ├── compare.py +│ │ │ └── redact.py +│ │ └── report/ +│ │ ├── build.py +│ │ └── templates/ +│ │ +│ ├── configs/ +│ │ ├── suites/ +│ │ │ ├── smoke.yaml +│ │ │ ├── primitives.yaml +│ │ │ ├── moe-decode.yaml +│ │ │ ├── moe-prefill.yaml +│ │ │ └── full.yaml +│ │ ├── shapes/ +│ │ │ ├── synthetic/ +│ │ │ └── traced/ +│ │ ├── backends/ +│ │ ├── targets/ +│ │ └── clusters.yaml +│ │ +│ ├── launchers/ +│ │ ├── common.sh +│ │ ├── launch_b200-dgxc.sh # B200 single node +│ │ ├── launch_b200-dgxc-slurm.sh # B200 multinode +│ │ └── launch_gb200-nv.sh # GB200 NVL72 +│ │ +│ ├── schemas/ +│ │ ├── case-v1.schema.json +│ │ ├── result-v1.schema.json +│ │ ├── manifest-v1.schema.json +│ │ └── environment-v1.schema.json +│ │ +│ ├── scripts/ +│ │ ├── bootstrap.sh +│ │ ├── run_suite.sh +│ │ ├── run_shard.sh +│ │ └── build_report.sh +│ │ +│ ├── tests/ +│ │ ├── fixtures/ +│ │ ├── test_config.py +│ │ ├── test_matrix.py +│ │ ├── 
test_parsers.py +│ │ ├── test_correctness.py +│ │ └── test_comparability.py +│ │ +│ └── docs/ +│ ├── BENCHMARK_CONTRACT.md +│ ├── BACKEND_ADAPTER.md +│ ├── SHAPE_REGISTRY.md +│ ├── RESULT_FORMAT.md +│ ├── FRONTEND.md +│ └── PROMOTION_CRITERIA.md +│ +└── .github/workflows/ + └── collectivex-experimental.yml # Added only when cluster CI begins (Milestone 2) +``` + +> Note: launcher names mirror the real runner-name prefixes. The spike adds the three NVIDIA launchers above; AMD (`launch_mi355x-amds.sh`) and others follow. + +## Benchmark model — keep four concepts separate + +CollectiveX needs its **own** schema. Do **not** reuse or extend the serving matrix, which is built around model / ISL / OSL / framework / TP / EP / concurrency and lives in `utils/matrix_logic/generate_sweep_configs.py`. Representing collectives with fake model names, `ISL=0`, or overloaded concurrency fields would create permanent technical debt. CollectiveX gets its own matrix logic (in the packaged layout, `src/collectivex/config/matrix.py`) — introduced with the workflow at Milestone 2, not the spike — rather than touching `utils/matrix_logic/generate_sweep_configs.py`. + +The model keeps four concepts independent: + +**Shape** — the logical communication workload: + +```text +operation, message size, tokens per rank, hidden size, top-k, +expert count, routing distribution, dtype, phase +``` + +**Backend** — the implementation under test: + +```text +NCCL, RCCL, DeepEP, AITER, MoRI, framework-native EP, reference implementation +``` + +**Target** — where and how it runs: + +```text +runner type, cluster, nodes, GPUs per node, rank placement, +fabric, container image, transport capabilities +``` + +**Suite** — a curated selection of shape × backend × target combinations. Keeping these separate prevents copying the same DeepSeek/MiniMax shape into every NVIDIA and AMD configuration. 
+ +### Portable definitions + +Shape: + +```yaml +schema-version: 1 +shape-id: moe.decode.h7168.top8.e256.t64.uniform.v1 + +kind: moe +phase: decode +operation: dispatch-combine + +shape: + tokens-per-rank: 64 + hidden-size: 7168 + top-k: 8 + num-experts: 256 + dispatch-dtype: fp8 + combine-dtype: bf16 + routing: + distribution: uniform + seed: 67 + expert-alignment: 16 +``` + +Backend: + +```yaml +backend-id: deepep-normal +backend: deepep +mode: normal + +source: + repository: deepseek-ai/DeepEP + ref: pinned-commit + +settings: + async-overlap: false + num-comm-sms: standardized + qp-count: auto +``` + +Target: + +```yaml +target-id: b200-dgxc-4n +runner-type: b200-multinode +cluster-id: b200-dgxc + +resources: + nodes: 4 + gpus-per-node: 8 + exclusive: true + +placement: + ranks-per-node: 8 + rank-order: contiguous + +capabilities: + rdma: true + ibgda: experimental + nvshmem: true +``` + +Suite: + +```yaml +suite-id: moe-decode-smoke + +shapes: + - moe.decode.h7168.top8.e256.t64.uniform.v1 + +backends: + - deepep-normal + - deepep-low-latency + +targets: + - b200-dgxc-2n + +measurement: + warmup-iterations: 20 + measured-iterations: 200 + trials: 3 + correctness: full +``` + +### Case identity + +A **case** is one immutable, versioned point: the natural key composes the three concepts — + +```text +case-id = <backend-id>__<shape-id>__<target-id> +e.g. deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n + nccl__allreduce.fp16.logsweep.v1__b200-dgxc-2n +``` + +A shape must never silently change; a newly extracted distribution gets a new versioned `shape-id`. + +**Required shape fields — primitives:** operation; logical element count; datatype; input/output bytes; in-place vs out-of-place; reduction op (where applicable); world size; rank placement; host-driven vs device-driven launch; blocking/synchronization semantics. 
+ +**Required shape fields — MoE (additional):** tokens per rank; hidden size; top-k; number of experts; EP size; dispatch and combine dtypes; routing distribution; expert alignment/padding; capacity constraints; quantization scale representation; cached vs recomputed routing layout; communication-SM count; async-overlap mode. DeepEP shows why these must be first-class — its interface takes tokens/rank, hidden size, top-k, expert count, FP8 mode and comm-SM settings, and exposes async dispatch/combine. + +### Shape registry + +Two independent shape sources: + +**Synthetic** — for continuous curves and hardware characterization (logarithmic byte sweep for primitives; token-count sweep for MoE; EP-scaling sweep; uniform and controlled-skew routing; intranode and internode placements; decode-oriented and prefill-oriented regimes). Don't build every Cartesian combination; define named suites (`primitive-latency-v1`, `primitive-bandwidth-v1`, `moe-decode-v1`, `moe-prefill-v1`, `moe-skew-v1`, `scaleout-v1`). + +**Trace-derived** — extracted from real InferenceX runs/profiles: + +```text +models/deepseek-v4/decode/ +models/minimax-m3/decode/ +models/kimi-k2.7/prefill/ +``` + +Each traced shape retains: source workflow run; model/config; phase; layer/layer-group; observed token histogram; routing skew; concurrent collective count; framework version; extraction-tool version. InferenceX already has a targeted profiling workflow (`profile.yml`) with optional MoE debug output and a separate trace-storage path — a natural source for real shapes rather than only guessed synthetic inputs. 
+ +## Benchmark layers and comparison classes + +| Layer | Purpose | Examples | +|---|---|---| +| **L0 Environment** | Prove the cluster is benchmarkable | topology, NIC/GPU state, peer access, RDMA, IBGDA capability, version capture | +| **L1 Primitive collectives** | Characterize the raw communication substrate | send/recv, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv | +| **L2 MoE communication** | Compare real EP libraries | dispatch, combine, dispatch+combine round trip, normal and low-latency modes | +| **L3 Integrated pipelines** | Communication in realistic operator sequences | route → permute → dispatch → grouped GEMM → combine → unpermute | +| **L4 E2E correlation** | Explain InferenceX serving performance | isolated CollectiveX result linked to the corresponding InferenceX run/profile | + +The MVP concentrates on **L1 and L2**. L3 overlaps OperatorX and comes after the contracts are stable; L4 is the eventual tie-back to serving. + +**L0 — Environment validation** (before measuring anything): GPU count/identity; GPU/NIC topology; CUDA/ROCm version; driver version; NCCL/RCCL version; RDMA device visibility; peer-access matrix; IBGDA/SHMEM capability; container digest; clock/power state; selected network interfaces. A failed probe yields one clear `environment-invalid` result, not dozens of misleading backend failures. + +**L1 — Primitives:** send/receive, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv. Use vendor test programs where possible rather than rewriting primitives. Measure two regions separately: latency (bytes→low KiB) and bandwidth (MiB→GiB). + +**L2 — MoE collectives:** dispatch, combine, dispatch+combine. Dimensions: tokens/rank, hidden size, top-k, expert count, EP size, dispatch dtype, combine dtype, routing skew, normal vs low-latency, comm-SM count, node count. 
+ +### Three comparison classes + +Every result is tagged with exactly one, and they must never be silently mixed on one chart: + +| Class | Meaning | +|---|---| +| `standardized` | Matched logical shape **and** fixed resource budget — same shape, topology, dtype, correctness contract, allowed comm-SMs, and timing boundaries. The main apples-to-apples comparison. | +| `backend-optimized` | Same logical output, but each library uses its recommended comm-SMs / protocols / QP count / buffer sizing / graph capture / tuning. Answers "what is the best each stack can do?" | +| `framework-integrated` | The actual path used by SGLang / vLLM / TensorRT-LLM / Dynamo. Connects to InferenceX; not a pure microbenchmark. | + +### Comparability key + +Every result gets a machine-generated comparison key; rows with different keys are not connected on the same curve by default: + +```text +operation, shape ID, dtype, world size, node count, rank placement, +routing distribution, comparison class, measurement contract version, topology class +``` + +## Measurement and correctness + +### Timing boundaries + +Record separately — never report one latency that sometimes includes JIT and sometimes doesn't: + +```text +1. communicator creation +2. buffer allocation and registration +3. first invocation / JIT +4. warmed steady-state invocation +5. host launch time +6. GPU completion time +7. optional end-to-end framework-visible time +``` + +Per measured iteration: synchronize before starting (unless explicitly testing queued execution); use GPU events for device duration and host monotonic time for API/launch duration; retain per-rank measurements; aggregate only after rank-level data is stored; report the **slowest rank** as well as the average. + +### Correctness as a hard gate + +A result is `valid` only after correctness passes. A fast result that fails correctness stays visible as `invalid` — never silently dropped. 
+ +Primitive checks: deterministic input; expected reduction result; guard regions around buffers; in-place and out-of-place checks; dtype-specific tolerances. + +MoE checks: token conservation; correct expert assignment; correct routing weights; valid permutation metadata; dispatch output vs reference; combine output vs reference; no padded-token leakage; deterministic routing hash. + +Failed results remain in artifacts, e.g.: + +```json +{ + "status": "invalid", + "correctness_passed": false, + "error": "combine result exceeded bf16 tolerance" +} +``` + +### Routing distributions + +At minimum: uniform; single-hot/worst-case concentration; Zipf-like skew; bounded imbalance; replayed real histogram. Store the routing seed and the generated assignment hash. + +### Metrics + +| Category | Metrics | +|---|---| +| Latency | p50, p90, p95, p99, min, max | +| Rank behavior | slowest-rank latency, rank spread, coefficient of variation | +| Primitive throughput | algorithm bandwidth, bus bandwidth, effective bytes/s | +| MoE throughput | tokens/s, logical payload GB/s, dispatch and combine separately | +| Efficiency | bandwidth relative to declared topology bottleneck | +| Host overhead | API launch time, CPU utilization where available | +| GPU overhead | communication SM count, GPU active time, optional power | +| Memory | persistent buffer bytes, peak temporary bytes | +| Overlap | standalone comm, standalone compute, overlapped duration, overlap efficiency | +| Reliability | initialization failures, hangs, retries, correctness failures | +| Provenance | all software, image, driver, firmware and topology identifiers | + +### Bandwidth definitions + +NCCL `algbw`/`busbw` are stored but not treated as universal (NCCL applies operation-specific correction factors). MoE libraries often report **logical bottleneck bandwidth** (may include local-rank traffic or exclude metadata/padding; DeepEP explicitly publishes logical bandwidth). 
Store separate fields, and use `null` rather than a deceptive inference when a backend can't expose physical bytes: + +```text +logical_payload_bytes +allocated_payload_bytes +estimated_link_bytes +metadata_bytes +padding_bytes +``` + +## Result and artifact format + +Each shard emits a versioned bundle: + +```text +output/ +├── manifest.json +├── cases.json +├── results.jsonl +├── rank-samples.jsonl.gz +├── summary.json +├── environment/ +│ ├── gpu.json +│ ├── network.json +│ ├── topology.json +│ └── software.json +├── raw/ +│ ├── stdout.log +│ ├── stderr.log +│ └── backend-output/ +├── commands/ +│ └── reproduce.sh +└── profiles/ +``` + +**Manifest** (invariant run-level metadata): schema version; workflow run + attempt; source SHA/ref; cluster ID; runner; Slurm job ID; node count; topology fingerprint; image digest; backend commit/build; start/end timestamps; redaction version. + +**Result row:** + +```json +{ + "schema_version": 1, + "case_id": "deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n", + "status": "valid", + "trial": 1, + "backend": "deepep", + "mode": "normal", + "comparison_class": "standardized", + "metrics": { + "latency_us_p50": 0, + "latency_us_p99": 0, + "slowest_rank_us_p50": 0, + "logical_bandwidth_gbps": 0, + "tokens_per_second": 0, + "rank_spread_pct": 0, + "persistent_buffer_bytes": 0 + }, + "correctness": { "passed": true, "max_abs_error": 0, "max_rel_error": 0 } +} +``` + +Use an explicit `schema_version` from the beginning — do not repeat the app's historical need to infer schema version from whether a field happens to exist. + +## Backend adapters + +Each adapter implements a small contract: + +```python +class CollectiveBackend: + def probe(self, environment) -> CapabilityReport: ... + def prepare(self, case, workdir) -> PreparedCommand: ... + def run(self, prepared, launcher) -> RawRun: ... + def parse(self, raw_run) -> list[RankSample]: ... + def validate(self, case, raw_run) -> CorrectnessReport: ... 
+ def describe(self) -> BackendProvenance: ... +``` + +**Tier 0 — communication baselines:** NVIDIA `nccl-tests`, ROCm `rccl-tests`, optionally PyTorch distributed as a common-API baseline. Don't rewrite primitives from scratch — `nccl-tests` already supports multi-node, warmups, correctness checking (`-c 1`), per-rank aggregation, device-driven implementations, and separate CPU-time reporting. *(Confirm whether the installed build emits JSON; if not, parse the text table.)* + +**Tier 1 — MoE dispatch/combine:** upstream DeepEP, ROCm DeepEP, and the NVIDIA/AMD EP paths already used by the InferenceX serving stacks. **Version pins are first-class.** Upstream DeepEP V2 changed NVSHMEM→NCCL, unified high-throughput and low-latency APIs, changed buffer behavior, and removed a previous zero-SM LL mode; ROCm's port has different maturity, NIC variants, rocSHMEM dependencies. DeepEP is **built at job setup** (via `rebuild-deepep.sh`, resolved by srt-slurm), not shipped in the image — its build time and `aarch64` (GB200) feasibility are tracked spike risks. A chart labelled only "DeepEP" is therefore ambiguous — store: + +```text +backend name, upstream/fork, git commit, API generation, +transport backend, build flags, runtime library versions, container digest +``` + +**Tier 2 — additional optimized stacks (later):** MSCCL++, AITER comm/fusion paths, MoRI/Pollara, NVSHMEM/rocSHMEM microbenchmarks, framework-native fused collectives. + +## Rollout — spike-first + +**Spike-first.** No schema, Pydantic model, or comparison contract is frozen until one real, correctness-gated number exists on real hardware. The first milestone is a single end-to-end spike on **two NVIDIA topologies, B200 and GB200**, chosen because they exercise the two transport regimes that matter: B200 is an 8-GPU NVLink island with CX-7 InfiniBand between nodes; GB200 is an NVL72 multi-node-NVLink (MNNVL) domain. 
Running the same collective across both is itself the first headline result, and it forces the provenance and comparison-class machinery to be real from line one. The schema is the spike's *output*, extracted from the artifacts it produces — not its input. AMD and all platform work (workflow, DB, frontend) follow. + +### Milestone 0 — NVIDIA B200 + GB200 spike + +One milestone, NVIDIA-only, end to end. This collapses the former "design contract," "CPU framework," "primitive NVIDIA baseline," and the NVIDIA half of "MoE MVP" into a single vertical slice that produces real numbers on real fabric. + +Scaffolding — deliberately light, matching `experimental/` convention (bare scripts + flat JSON + a plot; no package / Pydantic / JSON-schemas yet — those arrive at the contract freeze): + +```text +experimental/CollectiveX/ + README.md + run_nccl.py # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON) + tests/run_ep.py # EP dispatch/combine sweep (DeepEP/MoRI); dispatch & combine timed separately + env_capture.py # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json + plot.py # matplotlib, like token_position_decode_slo/*/plot_*.py + launchers/ + common.sh + launch_b200-dgxc.sh # B200 single node (b200-dgxc runner → 8-GPU NVLink island, x86_64) + launch_b200-dgxc-slurm.sh # B200 multinode (b200-multinode runner → CX-7 IB spine) + launch_gb200-nv.sh # GB200 (gb200 runner → NVL72 MNNVL, aarch64, 4 GPU/node) + results/*.json # flat, hand-verifiable +``` + +Reuse existing patterns rather than reinventing: `experimental/dsv32/bench.py` for `torch.cuda.Event` timing and stdout environment capture, and `experimental/token_position_decode_slo/glm-5/{bmk_*_sbatch.sh,plot_sla_frontier.py}` for Slurm orchestration + plotting. Mirror the runner→launcher routing convention (`bash ./launchers/launch_${RUNNER_NAME%%_*}.sh`) so the runner name selects the CollectiveX launcher as the serving path does. 
+ +**DeepEP is not prebuilt in any image.** The serving recipes build it at job setup via `setup_script: rebuild-deepep.sh` (resolved by srt-slurm; see `benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`). The spike reuses that same rebuild path — on B200 (x86_64) first. Pin images by digest from `.github/configs/nvidia-master.yaml`: B200 `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b`; GB200 `lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc` (an unpinned nightly today — capture its digest before relying on it). + +What it measures: + +```text +Primitives (stock nccl-tests, -c 1 for correctness) — on BOTH B200 and GB200: + all-reduce, all-gather, reduce-scatter, all-to-all + latency regime (bytes→KiB) and bandwidth regime (MiB→GiB) + B200 : 8 GPU/node (x86_64); 1 node (NVLink island) and 2 nodes (cross CX-7 IB) + GB200 : 4 GPU/node (aarch64); 1 node and 2+ nodes — all still inside the NVL72 NVLink (MNNVL) domain + +MoE (DeepEP, normal mode only — LL mode is the known-broken/blocked path, out of scope): + one decode-shaped dispatch+combine: tokens-per-rank=64, hidden=7168, + top-k=8, experts=256, dispatch fp8 + correctness: token conservation + combine vs a reference implementation + B200 (x86_64) first; GB200 DeepEP is a fast-follow once the aarch64 rebuild-deepep path is proven +``` + +The headline is the **same NCCL primitive shape on both topologies**: B200's 2-node path crosses CX-7 InfiniBand, while GB200's stays on NVL72 NVLink (MNNVL). That IB-vs-MNNVL contrast at a matched logical shape is the result worth publishing. (nccl-tests and DeepEP must be built for `aarch64` on GB200 — the reason DeepEP is B200-first.) 
+ +Provenance captured on every row from the first run — non-negotiable even in a spike, because it is what makes the B200-vs-GB200 number defensible: + +```text +topology-class b200-nvlink-island(+cx7-ib) | gb200-nvl72-mnnvl +transport actually used (NVLink / IB / NVSHMEM-IBGDA), derived from flags + measured behavior +transport env set/recorded: + B200 : NCCL_CUMEM_ENABLE=1 + GB200 : NCCL_CUMEM_ENABLE=1, NCCL_MNNVL_ENABLE=1, MC_FORCE_MNNVL=1 + (also seen in serving: NCCL_P2P_LEVEL=NVL, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK) +comm-SM count, QP count where applicable +backend commit + API generation + build flags +container digest, CUDA / driver / NCCL versions +comparison-class tag (standardized where shape, dtype and SM budget match) +``` + +These flags come from validated GB200 serving recipes (`…/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`); MNNVL is GB200/GB300-only, which is exactly what makes the transport differ from B200. + +Output: a result bundle on disk (`manifest.json`, `results.jsonl`, `environment/`, `raw/`, `commands/reproduce.sh`). Hand-verify the first rows; do not build a generated Pydantic contract yet. + +Exit criteria: + +* real NCCL latency + bandwidth curves on **both** B200 and GB200, correctness-passed (the headline) +* one DeepEP dispatch+combine number (normal mode) on **B200**, correctness-passed; GB200 DeepEP as the immediate fast-follow +* every row carries topology-class, transport, comparison-class and full provenance +* a B200-vs-GB200 side-by-side that the comparison key permits **and labels as topology-class-differing** — that labeled comparison is the intended result, not an accident +* **only now** freeze the schema (`CollectiveCase` / `CollectiveResult` / manifest), extracted from these artifacts + +Explicitly out of scope for the spike: AMD, IBGDA low-latency mode, GitHub Actions, database, frontend, trace-derived shapes, and the fake backend as a deliverable (keep a trivial one only if it speeds offline tests). 
+ +### Milestone 1 — AMD parity + +Bring the AMD side up against the schema the spike froze — not in parallel with it: + +```text +RCCL-tests adapter (mirror the nccl-tests text-table parser) +one AMD launcher (launch_mi355x-amds.sh) +one AMD MoE dispatch/combine backend (DeepEP ROCm / AITER / MoRI) +equivalent shapes + identical result contract +first cross-vendor (NVIDIA vs AMD) comparison +``` + +Record the AMD transport stack (rocSHMEM, MoRI-IO / Pollara, NIC variant) with the same provenance rigor the spike established. An unlabeled "DeepEP" row compared across vendors is meaningless. + +### Milestone 2 — GitHub workflow + +Add (orchestration only; see GitHub workflow design below): + +```text +collectivex-experimental.yml +preflight +canary +matrix sharding +artifact collection +regression comparison +static report artifact +``` + +Do not connect it to `perf-changelog.yaml`. + +### Milestone 3 — Trace-derived shapes + +Extract representative shapes from InferenceX profiles (DeepSeek V4, MiniMax M3, Kimi). Every traced shape must retain: source workflow run; source configuration; framework version; model phase; extraction-tool version; routing-histogram hash. + +### Milestone 4 — Promotion decision + +Only then decide whether to: keep CollectiveX permanently experimental; move it into core InferenceX; extract it into a dedicated repository; or integrate its data into InferenceX-app (database + `/collectives` frontend). + +### First PRs (the spike) + +The spike lands as a few small PRs, each producing something runnable — not a docs-and-schema PR: + +```text +1. Scaffold + NCCL on B200 single node + run_nccl.py (text-table parser), env_capture.py, plot.py, + launchers/launch_b200-dgxc.sh, results/*.json + → lands when it emits a real all-reduce curve with provenance from an 8-GPU B200 + +2. 
B200 multinode + GB200 + launchers/launch_b200-dgxc-slurm.sh, launchers/launch_gb200-nv.sh + → lands when the same primitive runs on 2-node B200 (cross-IB) and on GB200 NVL72 (MNNVL), + each tagged with topology-class and transport (aarch64 build for GB200) + +3. DeepEP dispatch+combine — B200 first + tests/ep_deepep.py, routing generator + reference combine for correctness, + reusing rebuild-deepep at job setup + → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow + +4. Freeze the contract + extract the case / result / manifest schema from the bundles produced in 1–3; + add fixtures captured from real output — this is where the packaged structure begins +``` + +The first objective is a real, provenance-tagged, correctness-gated number on two NVIDIA topologies — the contract is the spike's output, not its foundation. + +## Cluster reuse and capability inventory + +### What to reuse + +Existing self-hosted runner registrations; exact runner labels; Slurm access from runner hosts; checkout and artifact patterns; resource-cleanup strategy; repository secrets; container caches where appropriate. The runner inventory (`.github/configs/runners.yaml`) already enumerates H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X fleets and groups such as `h200-multinode`, `b200-multinode`, individual nodes, etc. CollectiveX **reads** this file rather than duplicating runner names. + +### What not to reuse directly + +Do not call the serving launchers (`runners/launch_${RUNNER_NAME%%_*}.sh`) — they carry model-serving assumptions (model paths, framework setup, result naming). Mirror the **selection convention** with CollectiveX launchers instead: + +```bash +bash experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh +``` + +Each CollectiveX launcher handles only: Slurm allocation; container image; mounts; network environment; rank launch; result copy-back; cleanup. 
There are **two launch paths**, mirroring the serving side: **single-node** B200 mirrors the `salloc … --gres=gpu:N --exclusive … && srun --container-image=<image>` pattern in `runners/launch_b200-dgxc.sh`; **multi-node** B200/GB200 drives **srt-slurm** (`srtctl apply -f <recipe>`), which already knows how to rebuild DeepEP and set the MNNVL env — so the CollectiveX GB200 launcher is a thin wrapper handing srt-slurm a CollectiveX recipe, not a from-scratch sbatch. (Later, common Slurm/container functions can be factored into a shared lib used by both systems.) + +> Runner-name subtlety to handle in `inventory.py`: one physical cluster can appear under multiple prefixes — `b200-dgxc_NN` routes to `launch_b200-dgxc.sh` (single-node) while `b200-dgxc-slurm_N` (label `b200-multinode`) routes to `launch_b200-dgxc-slurm.sh`. One fabric domain can therefore span several runner labels. + +### Capability overlay + +`inventory.py` loads `../../../.github/configs/runners.yaml` and combines it with a CollectiveX capability overlay — one source of truth for runner names, CollectiveX metadata kept isolated: + +```yaml +b200-multinode: + launcher: b200-dgxc-slurm + vendor: nvidia + hardware: b200 + topology-class: b200-nvlink-cx7 + fabric-domain: b200-dgxc-main + gpus-per-node: 8 + arch: x86_64 + max-nodes: 16 + scheduler: slurm + container-runtime: enroot-pyxis + capabilities: + nccl: true + deepep: true # built at job setup via rebuild-deepep, not prebuilt + rdma: true + nvshmem: true + ibgda: experimental # capability present ≠ currently validated + scheduling: + exclusive-nodes: true + max-parallel-shards: 1 + +gb200: + launcher: gb200-nv + vendor: nvidia + hardware: gb200 + topology-class: gb200-nvl72-mnnvl + gpus-per-node: 4 # NVL72 compute tray + arch: aarch64 # nccl-tests + DeepEP must build for aarch64 + scheduler: srt-slurm + transport-env: { NCCL_CUMEM_ENABLE: 1, NCCL_MNNVL_ENABLE: 1, MC_FORCE_MNNVL: 1 } + capabilities: + nccl: true + deepep: true # rebuilt at setup; aarch64 path is 
a tracked risk + mnnvl: true # GB200/GB300 only + ibgda: experimental +``` + +`fabric-domain` is essential: two jobs on separate compute nodes may still contend for the same leaf/spine network, so **GitHub concurrency is keyed by fabric domain, not GPU SKU**. The inventory distinguishes hardware capability, software currently installed, and feature state (known-good vs experimental vs temporarily broken) — IBGDA support and "IBGDA low-latency currently validated" are different properties. + +**Operational coexistence with the serving sweep.** `b200-multinode` is only three runners (`b200-dgxc-slurm_7/8/9`), **shared with the production serving sweeps**, and srt-slurm allocations are long. Exclusive nodes + `max-parallel-shards: 1` + fabric-domain serialization means CollectiveX and the serving sweep contend for the same scarce runners. Decide the scheduling/coexistence policy (off-hours windows? a dedicated runner?) before enabling any recurring CollectiveX suite, rather than discovering the contention in CI. + +## GitHub workflow design (Milestone 2) + +When cluster CI begins, add one small orchestration-only file — `.github/workflows/collectivex-experimental.yml` — with no benchmarking logic: + +```text +validate → resolve matrix → preflight canaries → benchmark shards +→ aggregate → compare against baseline → build static report → upload artifacts +``` + +Triggers while on the branch: + +```yaml +on: + push: + branches: [ collectivex ] + paths: + - experimental/CollectiveX/** + - .github/workflows/collectivex-experimental.yml + pull_request: + paths: + - experimental/CollectiveX/** + - .github/workflows/collectivex-experimental.yml +``` + +Later, after a minimal dispatcher exists on `main`, add `workflow_dispatch` with inputs: `ref, suite, target, backend, shape, profile` (and comparison class / normal-LL-both / dry-run). + +Jobs: + +1. 
**Validate** — install the package; validate all suite/shape/backend/cluster YAML; confirm runner references exist in `runners.yaml`; reject unknown fields; emit the resolved run plan as an artifact. (Match InferenceX's strict Pydantic practice — models reject extra fields.) +2. **Compile and shard** — **do not** generate one job per benchmark point. Group cases by `cluster, node count, GPU placement, container image, backend build, transport mode, fabric domain, profiler requirement`. A shard runs many compatible points under one Slurm allocation (avoids thousands of matrix jobs, repeated communicator init, queue latency, repeated container import). Bounded runtime; record per-case failures unless the cluster itself is unhealthy. +3. **Preflight** — confirm GPU count; validate peer access; enumerate NICs; test RDMA/device visibility; verify backend libraries; run a tiny correctness case; capture topology/software. A failed preflight marks the whole shard `environment-invalid` rather than manufacturing dozens of backend failures. +4. **Canary** — for each `(cluster, backend, mode)` group, run one small representative case; launch the larger matrix only after it passes (mirrors InferenceX's canary-before-full-sweep). +5. **Benchmark** (`collectivex-benchmark-tmpl.yml`) — run on the resolved runner label; unique Slurm job name from workflow/attempt/shard; exclusive nodes; serialize/limit by `fabric-domain`; call the CollectiveX launcher; upload results even on partial failure; always upload environment+logs; fail the job only after artifact creation. +6. **Aggregate and regress** — validate every result against JSON schema; reject duplicate natural keys; merge rank samples and summaries; compute trial aggregates; compare against the most recent compatible baseline; publish a step summary; upload one `results_collectivex` bundle. +7. 
**Dispatch ingestion** (only once promoted to feed the app) — repository-dispatch the InferenceX-app repo with `{ "benchmark-family": "collectivex", "run-id": "...", "run-attempt": "..." }`. + +Use a separate `collectivex-changelog.yaml`: a CollectiveX backend change must not trigger the expensive serving sweep through `perf-changelog.yaml`, and a serving change must not launch every collective suite. + +## Regression policy (Milestone 2+) + +A compatible baseline requires exact matches on: case ID; cluster ID; topology fingerprint (or approved topology class); backend; comparison class; normal/LL mode; node and rank placement; dtype and shape; measurement-contract version. **Do not compare "same GPU SKU" across materially different fabrics.** + +```text +regression if: + correctness changed pass → fail + OR median latency degradation exceeds max(fixed floor, cluster noise threshold) + OR bandwidth degradation exceeds max(fixed floor, cluster noise threshold) +``` + +Derive each cluster's noise threshold from repeated baseline measurements via median absolute deviation — don't hard-code a universal 3% before knowing each fabric's noise. Retain failed, timed-out, and invalid results; reliability is part of the benchmark. + +## Reporting, database, and frontend + +**Now (spike / Milestone 2): a static, artifact-driven report.** Do not begin by changing InferenceX-app. 
+ +```bash +python -m collectivex.report --results output/aggregate.json --output output/report/ +``` + +```text +report/ +├── index.html +├── data.json +├── assets/ +└── runs/ + └── <run-id>.html +``` + +Report views: **Overview** (supported clusters/backends, latest run, correctness failures, recent regressions, coverage matrix); **Primitive explorer** (latency / algbw / busbw / rank-spread vs payload size; single-node vs multinode); **MoE explorer** (dispatch & combine latency vs tokens/rank; tokens/s vs EP size; uniform vs skewed; normal vs LL; comm-SMs vs performance); **Case details** (exact shape, backend commit, container digest, topology fingerprint, environment, command, correctness report, rank-level distribution, raw logs). A **comparison warning** must visibly reject invalid comparisons: + +```text +Not directly comparable: +- different routing distribution +- different topology class +- different communication-SM budget +- standardized versus backend-optimized mode +``` + +**Later (Milestone 4 / promotion into InferenceX-app):** add `/collectives` to the app (Next.js, React Query, raw API rows, client-side transforms, D3 charts; tab metadata/routing are centralized). Avoid a single global "CollectiveX score" at launch. Port the report views, plus Library Comparison, Scale-and-topology, and Historical-regression views, and a run-detail drawer. The frontend computes the `comparison-key` and refuses to connect rows with differing keys by default — **this guard matters more than any individual chart.** + +API routes (app): + +```text +/api/v1/collectives +/api/v1/collectives/availability +/api/v1/collectives/history +/api/v1/collectives/runs/:id +/api/v1/collectives/artifacts/:id +``` + +Continue the app convention: API returns raw DB rows; the frontend does chart-specific transforms. + +**Database (app, later).** Do not put CollectiveX rows in `benchmark_results` (its identity is serving configs + ISL/OSL/concurrency). 
Reuse `workflow_runs`, then add: + +```sql +collective_workloads(id, case_id, schema_version, family, operation, shape jsonb) +collective_environments(id, cluster_id, hardware, topology_class, topology_hash, software jsonb, capabilities jsonb) +collective_configs(id, workload_id, environment_id, backend, backend_version, comparison_class, mode, nodes, gpus_per_node, world_size, settings jsonb) +collective_results(id, workflow_run_id, config_id, trial, date, status, metrics jsonb, + latency_p50_us, latency_p99_us, logical_bandwidth_gbps, bus_bandwidth_gbps, + tokens_per_second, rank_skew_pct, error) +collective_artifacts(result_id, artifact_type, storage_url, metadata jsonb) +collective_availability(date, hardware, cluster_id, backend, family, operation, mode) +``` + +Follow the app's hybrid design (JSONB for evolving metrics; indexed "hot" columns for common filters; idempotent ingestion; natural unique keys; denormalized date; latest-results materialized view). Keep raw per-rank samples in artifacts/object storage, not in Postgres. + +## Future expansions + +The spike de-risks the path to the actual deliverable — a public OSS collective benchmark and an explainer article. Expansion axes, roughly near → far, with dependencies: + +**Hardware breadth.** B300 / GB300 next (GB300 is also MNNVL, with known disagg KV-transfer wins) → H100 / H200 as a cheaper, more-available **InfiniBand baseline** ideal for characterizing per-fabric noise → AMD MI300X / MI325X / MI355X (this is Milestone 1) → TPU (far; a separate stack and toolchain). + +**Backend breadth.** Framework-native EP (the `framework-integrated` class — ties numbers back to the SGLang/vLLM serving paths) → MSCCL++, NVSHMEM / rocSHMEM microbenchmarks, AITER comm/fusion, MoRI / Pollara (AMD). + +**IBGDA low-latency mode.** The recurring strategic blocker and the original "LL is broken" story; gated on the NVIDIA SRE maintenance window for B200/B300. 
Highest narrative value — add as an experimental suite the moment it unblocks. + +**Scale-out.** 2 → 4 → 8 → 16 nodes; on GB200, intra-NVL72 vs cross-rack scaling-efficiency curves (where MNNVL ends and the inter-rack fabric begins). + +**L3 integrated operator path.** route → permute → dispatch → grouped-GEMM → combine → unpermute — the bridge to OperatorX. + +**L4 e2e correlation.** Link an isolated dispatch/combine number to the same shape's cost inside a real serving run via `profile.yml` traces — the "explain serving performance" payoff and the tie-back to the core product. + +**Trace-derived shapes (Milestone 3).** DeepSeek V4 / MiniMax M3 / Kimi token-histogram and routing-skew extraction, so the synthetic shapes are anchored to real workloads. + +**AMD Ultra Ethernet (UEC).** The AMD networking path; pairs with the MoRI / Pollara backends. + +**Productization (north star).** Static report → public OSS benchmark site + the explainer article; promotion into InferenceX-app (`/collectives` + Postgres + nightly suite + regression alerts) at Milestone 2 / 4. + +## Continuous benchmark — vision & scope + +Goal: a continuous benchmark that reproduces the spike automatically and grows into a credible cross-vendor EP/collective comparison. **Start with balanced DeepSeek shapes, intranode EP**, then venture to advanced cases. Target **≥1 EP library per platform** first — DeepEP on NVIDIA, MoRI on AMD. 
+ +### EP library landscape +- MoRI (AMD) — https://github.com/ROCm/mori +- DeepEP / DeepEPv2 / Hybrid-EP — https://github.com/deepseek-ai/DeepEP (hybrid: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) +- NVIDIA NCCL EP — https://github.com/NVIDIA/nccl/tree/master/contrib/nccl_ep +- UCCL — https://github.com/uccl-project/uccl +- NVLink One-Sided AllToAll EP (mainly NVL72) — TensorRT-LLM blog18 (Optimizing MoE Communication with One-Sided AllToAll over NVLink) +- NIXL EP — https://github.com/ai-dynamo/nixl/tree/main/examples/device/ep + +### Shapes & axes +- **Classic DeepSeek V3:** hidden 7168, top-8, 256 routable experts. +- **Prefill vs decode** (# tokens). +- **Normal EP vs low-latency (LL) EP.** +- **Dispatch precision:** NVFP4, MXFP4, MXFP8, BF16. +- **Combine precision:** MXFP8, direct-cast FP8, BF16, NVFP4 — see MoRI #311, flashinfer #3643 / #3376. +- **Balanced vs unbalanced vs EPLB.** +- **Realistic shapes from InferenceX models** — collect hidden sizes / routing (Qwen3.5 has an unusual top-k). + +### Other inference collectives (later) +- KV-cache transfer: MoRI-IO, NIXL, Mooncake; CPU↔GPU offload — `experimental/kvcache_transfer_DtoH_HtoD/benchmark.py`. +- Low-latency one-shot / two-shot all-reduce (SGLang & vLLM in-tree kernels + AITER / FlashInfer variants) — e.g. sglang `sgl-kernel/csrc/allreduce/quick_all_reduce.cuh`. + +### Reference benchmark scripts to draw from +- flashinfer PR #3000; ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`. + +### Learning resources +- arXiv 2511.15076, 2603.13606, 2512.19849, 2412.19437. + +## Things not to do + +* Do not add collective fields to the existing serving matrix. +* Do not make one GitHub Actions job per payload size. +* Do not call all logical-bandwidth figures "bus bandwidth." +* Do not compare different topology fingerprints as though GPU SKU were sufficient. +* Do not silently discard failed or incorrect results. 
+* Do not let a backend choose undocumented tuning parameters (in `standardized` mode). +* Do not make low-latency mode the only reported result. +* Do not publish one overall ranking before coverage and comparison contracts are stable. +* Do not start with every EP library, TPU, UEC, and every model shape. +* Do not store full raw rank samples indefinitely in Postgres. +* Do not expose internal hostnames, paths, NIC GUIDs, IP addresses, or private image references in public artifacts. +* Do not freeze the schema before the spike has produced a real artifact to freeze it from. + +## References (verified against the live InferenceX repo) + +- `experimental/README.md` — the non-core / "not official results" charter this project lives under. +- `.github/configs/runners.yaml` — runner labels and exact names (H100…GB300, AMD MI3xx). +- `.github/workflows/benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`, `profile.yml`, `speedbench-al.yml` — the `bash ./runners/launch_${RUNNER_NAME%%_*}.sh` selection convention. +- `runners/launch_*.sh` — existing per-cluster launchers (`launch_b200-dgxc.sh`, `launch_b200-dgxc-slurm.sh`, `launch_gb200-nv.sh`, `launch_mi355x-amds.sh`, …). +- `utils/matrix_logic/generate_sweep_configs.py`, `validation.py` — the serving matrix CollectiveX must **not** extend. +- `.github/workflows/e2e-tests.yml`, `collect-results.yml` — the validate → fan-out → collect control plane being reused. +- `perf-changelog.yaml` — the additions-only serving gate CollectiveX must **not** trigger. 
+- NVIDIA Magnum IO NVSHMEM + GPUDirect Async (IBGDA): `https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/` diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py new file mode 100644 index 000000000..c24136ebc --- /dev/null +++ b/experimental/CollectiveX/plot.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — plot NCCL primitive curves, B200 vs GB200. + +Loads run_nccl.py result JSONs from results/, and for each operation draws two +panels: latency-vs-size and bus-bandwidth-vs-size, overlaying one curve per +(runner, topology-class, world-size). The B200(IB)-vs-GB200(MNNVL) contrast at +a matched shape is the intended overlay and the spike's headline. + +Comparison guard (plan §Comparability): curves are only overlaid when they +share op + dtype + comparison-class + measurement-contract. Anything else is +reported as "not directly comparable" and skipped rather than silently mixed. + + python plot.py --results-dir results --out-dir results/plots + +matplotlib + (optional) numpy. Run on a workstation/laptop over the JSON +artifacts; no GPU needed. 
+""" +from __future__ import annotations + +import argparse +import glob +import json +import os +from collections import defaultdict + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + + +def _human(nbytes: int) -> str: + for unit in ("B", "KiB", "MiB", "GiB"): + if nbytes < 1024 or unit == "GiB": + return f"{nbytes:.0f}{unit}" if unit == "B" else f"{nbytes/1:.0f}{unit}" + nbytes /= 1024 + return str(nbytes) + + +def load_nccl_results(results_dir: str) -> list[dict]: + docs = [] + for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))): + try: + with open(path) as _f: + d = json.load(_f) + except (json.JSONDecodeError, OSError): + continue + if d.get("family") == "nccl" and d.get("rows"): + d["_path"] = path + docs.append(d) + return docs + + +def curve_label(d: dict) -> str: + return f"{d['runner']} · {d['topology_class']} · ws{d['world_size']}" + + +def overlay_signature(d: dict) -> tuple: + """Fields that must match for two curves to share a chart (topology and + world-size are deliberately NOT here — they are the comparison axis).""" + return (d["op"], d.get("dtype"), d.get("comparison_class"), d.get("measurement_contract")) + + +def plot_op(op: str, docs: list[dict], out_dir: str) -> str | None: + if not docs: + return None + # Comparison guard: keep the dominant signature, warn on the rest. 
+ sigs = defaultdict(list) + for d in docs: + sigs[overlay_signature(d)].append(d) + main_sig = max(sigs, key=lambda s: len(sigs[s])) + keep = sigs[main_sig] + for sig, ds in sigs.items(): + if sig == main_sig: + continue + for d in ds: + print(f" [guard] skipping {curve_label(d)} for op={op}: not directly " + f"comparable (dtype/class/contract differs: {sig} vs {main_sig})") + + fig, (ax_lat, ax_bw) = plt.subplots(1, 2, figsize=(14, 5)) + for d in sorted(keep, key=curve_label): + rows = sorted(d["rows"], key=lambda r: r["size_bytes"]) + sizes = [r["size_bytes"] for r in rows] + lat = [r["out_of_place"]["time_us"] for r in rows] + bw = [r["busbw_gbps"] for r in rows] + label = curve_label(d) + ax_lat.plot(sizes, lat, "o-", linewidth=2, markersize=4, label=label) + ax_bw.plot(sizes, bw, "o-", linewidth=2, markersize=4, label=label) + + for ax in (ax_lat, ax_bw): + ax.set_xscale("log", base=2) + ax.set_xlabel("Message size (bytes)") + ax.grid(True, alpha=0.3) + ax.legend(fontsize=9) + ax_lat.set_yscale("log") + ax_lat.set_ylabel("Latency (µs, out-of-place)") + ax_lat.set_title(f"{op}: latency vs size") + ax_bw.set_ylabel("Bus bandwidth (GB/s)") + ax_bw.set_title(f"{op}: bus bandwidth vs size") + fig.suptitle( + f"CollectiveX · {op} · dtype={main_sig[1]} · class={main_sig[2]} " + f"(topology is the comparison axis)", + fontsize=11, + ) + fig.tight_layout() + os.makedirs(out_dir, exist_ok=True) + out = os.path.join(out_dir, f"nccl_{op}.png") + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + return out + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX primitive plots") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--out-dir", default="results/plots") + ap.add_argument("--op", help="only plot this op") + args = ap.parse_args() + + docs = load_nccl_results(args.results_dir) + if not docs: + print(f"no nccl result JSONs found in {args.results_dir}/") + return 1 + + by_op = defaultdict(list) + for d 
in docs: + by_op[d["op"]].append(d) + + ops = [args.op] if args.op else sorted(by_op) + made = [] + for op in ops: + out = plot_op(op, by_op.get(op, []), args.out_dir) + if out: + made.append(out) + print(f"wrote {out} ({len(by_op[op])} curve(s))") + if not made: + print("nothing plotted") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/plot_ep.py b/experimental/CollectiveX/plot_ep.py new file mode 100644 index 000000000..e26b9d686 --- /dev/null +++ b/experimental/CollectiveX/plot_ep.py @@ -0,0 +1,1789 @@ +#!/usr/bin/env python3 +"""CollectiveX — render EP dispatch/combine sweeps to a self-contained HTML. + +Reads the family=moe result JSONs (tests/run_ep.py output) and emits ONE +dependency-free HTML file (inline SVG, no CDN — opens offline) with: + + * an interactive explorer: operation (dispatch | combine | round-trip) x + phase (decode | prefill) x x-axis (tokens/rank | global tokens) x y-axis + (latency | tokens/s | alg bandwidth), one colored line per SKU/backend/EP; + * a static small-multiples grid (phase x operation) of latency vs tokens/rank. + +Only source-tokens-per-rank varies along a line; everything else (backend, EP +degree, phase, precision, top-k/experts/hidden, routing) is fixed and identifies +the line — per the CollectiveX EP framework. + + python3 plot_ep.py --results-dir results --out results/plots/collectivex_ep.html +""" +from __future__ import annotations + +import argparse +import glob +import json +import os +import sys + +# SKU -> color (matches the matplotlib convention used for the NCCL plots). 
COLORS = {
    "b200": "#1f77b4",
    "gb200": "#2ca02c",
    "mi355x": "#d62728",
    "b300": "#9467bd",
    "gb300": "#8c564b",
    "h100": "#ff7f0e",
    "h200": "#e377c2",
}

# Hue family per SKU. Each config of a SKU is drawn in one shade of that SKU's
# family, so individual lines can be told apart while the SKU remains readable
# from the hue alone; the first shade of every family equals the SKU's COLORS entry.
SKU_FAMILY = {
    # oranges / golds
    "h100": ["#ff7f0e", "#d6a72b", "#ffbb78", "#8c6d1f", "#e8a33d"],
    # pinks
    "h200": ["#e377c2", "#b04a8f", "#f4b6df"],
    # blues
    "b200": ["#1f77b4", "#0d3d66", "#4a90d9", "#7fb2e0"],
    # purples
    "b300": ["#9467bd", "#6b3fa0", "#c5b0d5", "#7b4fa0"],
    # greens
    "gb200": ["#2ca02c", "#1a661a", "#7bc77b"],
    # browns
    "gb300": ["#8c564b", "#5e372f", "#c49c94"],
    # reds
    "mi355x": ["#d62728", "#a30000", "#ff9896", "#e34a4a"],
}

# Colors handed out to SKUs that have no entry in SKU_FAMILY.
PALETTE = ["#17becf", "#bcbd22", "#7f7f7f", "#393b79", "#637939"]

# MoE (hidden, top-k, routed-experts) -> human model name. Used to label the model-shape selector
# + coverage + tooltips. DeepSeek-V3/V4 (7168/8/256) is the cross-hardware headline shape; the
# others are official canonical results at additional model dims. An unlisted shape is labelled by
# its dims (see model_name) so a new model is still selectable the moment its data lands.
MODEL_NAMES = {
    (7168, 8, 256): "DeepSeek-V3/V4",
    (6144, 8, 256): "MiniMax-M3",
    (7168, 8, 384): "Kimi-K2",
    (4096, 8, 128): "Qwen3.5",
    (7168, 8, 288): "DeepSeek-V3 (EPLB physical)",
}


def model_name(shape: dict) -> str:
    """Map a result shape to a model name; fall back to the dims for an unregistered shape.

    The lookup key is ``(hidden, topk, experts)`` read from *shape*; missing
    keys read as ``None`` and therefore end up in the fallback label.
    """
    h, k, e = shape.get("hidden"), shape.get("topk"), shape.get("experts")
    return MODEL_NAMES.get((h, k, e)) or f"shape {h}/{k}/{e}"


def _iter_docs(results_dir: str):
    """Yield every result document found anywhere under *results_dir*.

    Two sources are read, each in sorted path order: first one document per
    ``*.json`` file, then one document per non-blank line of each ``*.ndjson``
    file. If the same document is present in both forms it is yielded twice, so
    callers relying on uniqueness must keep only one form on disk.

    Reading is best-effort: an unreadable or undecodable file is skipped, a
    malformed ndjson line is skipped without discarding the rest of its file,
    and anything that does not parse to a JSON object is ignored (callers
    immediately use ``.get`` on what is yielded).
    """
    json_glob = os.path.join(results_dir, "**", "*.json")
    for path in sorted(glob.glob(json_glob, recursive=True)):
        try:
            # Context manager so the handle is closed even when parsing fails.
            with open(path) as fh:
                doc = json.load(fh)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        if isinstance(doc, dict):
            yield doc

    ndjson_glob = os.path.join(results_dir, "**", "*.ndjson")
    for path in sorted(glob.glob(ndjson_glob, recursive=True)):
        try:
            with open(path) as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        doc = json.loads(line)
                    except ValueError:
                        # One bad line must not hide the remaining documents.
                        continue
                    if isinstance(doc, dict):
                        yield doc
        except (ValueError, OSError):
            # Open/read/decode failure of the file itself: move to the next file.
            continue
+ # Fall back to v3 flat *_us_p* (serial -> isolated_sum) so legacy docs still load. + def pcts(k, flat): + if isinstance(r.get(k), dict) and r[k].get("p50") is not None: + o = dict(r[k]); o.setdefault("p95", o.get("p90")) + return o + p50 = r.get(f"{flat}_us_p50") + return {"p50": p50, "p90": r.get(f"{flat}_us_p90") or p50, + "p95": r.get(f"{flat}_us_p95") or r.get(f"{flat}_us_p90") or p50, + "p99": r.get(f"{flat}_us_p99") or p50} + dop, cop = pcts("dispatch", "dispatch"), pcts("combine", "combine") + iso = pcts("isolated_sum", "serial") # renamed from "serial" + rtp = pcts("roundtrip", "roundtrip") # MEASURED round trip (v4) + if not (dop["p50"] and cop["p50"]): + continue + if rtp["p50"] is None: # legacy: no measured RT + rtp = iso + rows.append({ + "t": r["tokens_per_rank"], "gt": r.get("global_tokens"), + "dispatch": dop, "combine": cop, "roundtrip": rtp, "isolated_sum": iso, + "fanout": r.get("fanout_mean"), + "dbytes": r.get("dispatch_logical_bytes") or r.get("routed_bytes_total") or 0, + "cbytes": r.get("combine_logical_bytes") or 0, + "recv": r.get("recv_tokens_max") or r.get("recv_tokens") or 0, + "straggler": (r.get("per_rank_dispatch_us") or {}).get("slowest_rank"), + "correct": bool(r.get("correct")), + }) + if not rows: + continue + sh = d.get("shape", {}) + mode = d.get("mode", "normal") + dtype = sh.get("dispatch_dtype", "?") + rmode = d.get("resource_mode", "") + ll = " LL" if mode == "ll" else "" + # resource suffix: tuned is the default (omit); flag the others so a normalized + # or default-budget line is never confused with the tuned one. + rs = {"normalized": " (norm)", "default": " (def)"}.get(rmode, "") + contract = d.get("measurement_contract", "?") + cl = " [cl]" if contract == "cached-layout-comm-only-v1" else "" # cached-layout flag + backend = d.get("backend") + ep = d.get("ep_size") + # DeepEP kernel generation (v1 NVSHMEM / v2 NCCL-Gin); default v1 for legacy deepep docs + # without the field, n-a for non-deepep. 
Folds into the line key + label so V1/V2 are distinct. + kgen = sh.get("kernel_gen") or ("v1" if backend == "deepep" else "n-a") + kg = f" {kgen}" if kgen == "v2" else "" # only annotate v2 (keep v1 labels unchanged) + # Routing axis: base distribution + EPLB. "zipf+eplb" is the balanced-by-replication + # variant of zipf; uniform is the baseline (omitted from the label to keep it short). + eplb_doc = d.get("eplb") or {} + routing_disp = f'{sh.get("routing", "?")}+eplb' if eplb_doc.get("enabled") else sh.get("routing", "?") + # temporal step + uneven allocation are distinct workloads — fold into the routing label so + # moving-hotspot snapshots / uneven variants draw as separate lines, not overlaid. + _repro = d.get("reproduction") or {} + _step = _repro.get("routing_step", 0) + _uneven = _repro.get("uneven_tokens", "none") + if _step: + routing_disp += f"@s{_step}" + if _uneven != "none": + routing_disp += f"·{_uneven}" + rt = "" if routing_disp == "uniform" else f' ·{routing_disp}' + # FULL per-line label: SKU·EP·backend·dtype[·LL][·resource][·cached-layout][·routing]. + # EP is explicit because a SKU can span EP degrees (GB300 EP4 on one NVL72 tray, EP8 + # across two); routing is explicit so balanced/zipf/zipf+eplb don't collide with uniform. + label = f'{sku.upper()} EP{ep} · {backend}{kg} · {dtype}{ll}{rs}{cl}{rt}' + repro = d.get("reproduction", {}) + gr = repro.get("git_run") or {} + rid = d.get("routing_identity", {}) + wl = d.get("workload") or {} + # publication status (v4) gates the default view; legacy v3 docs -> "legacy". + pub = d.get("publication_status") or "legacy" + # workload signature: prefer the v4 workload block, fall back to routing_identity (v3). 
+ wsig = wl.get("trace_signature") or rid.get("trace_signature") + series.append({ + "sku": sku, "backend": backend, "ep": ep, + "pub": pub, "wsig": wsig, "wid": wl.get("workload_id"), + # combine-quant mode + activation (value) profile are part of workload identity + # (review: quant combine can be value-sensitive). Default none/normal for pre-scaffold + # results; used by the comparison guard + tooltip so a quantized-combine or + # different-value run is never read as the same point as a bf16/normal one. + "cqm": (sh.get("quant") or {}).get("combine_quant_mode", "none"), + "act": sh.get("activation_profile", "normal"), + "phase": d.get("phase", "decode"), "mode": mode, + "dtype": dtype, "resource": rmode or "tuned", "contract": contract, + # comparison class: best-stack (tuned/default) vs resource-constrained + # (normalized) — kept distinct so they're never read as one fair contest. + "suite": "resource-constrained" if rmode == "normalized" else "backend-default", + "routing": routing_disp, + # eplb per-rank load imbalance removed (the headline of zipf vs zipf+eplb). + "eplb_before": eplb_doc.get("imbalance_before"), "eplb_after": eplb_doc.get("imbalance_after"), + # ep + routing in the key so EP4/EP8 and uniform/balanced/zipf/zipf+eplb of one SKU + # get distinct colors/lines (sku stays ckey.split("|")[0] for the family lookup). 
+ "kgen": kgen, + "ckey": f"{sku}|{backend}|{dtype}|{mode}|{rmode}|{contract}|ep{ep}|{routing_disp}|{kgen}", # config identity (color); kgen so V1/V2 are distinct lines + "label": label, + "dash": "" if dtype == "bf16" else "6 4", # bf16 solid, fp8 dashed (2nd cue) + "color": COLORS.get(sku, "#555"), # provisional; reassigned below + "topo": d.get("topology_class"), "transport": d.get("transport"), + "fp8_in_timing": repro.get("fp8_quant_in_timing"), + "run_id": gr.get("run_id"), "source_sha": (gr.get("source_sha") or "")[:10], + "repo": gr.get("repo"), "image_digest": (repro.get("image_digest") or "")[:19], + "routing_consistent": rid.get("consistent_across_ranks"), + "trace_sig": rid.get("trace_signature"), + "samples": (rows and d["rows"][0].get("samples_pooled")) or None, + "prov": d.get("backend_provenance", {}), + # model name (from the MoE shape) so the model-shape selector / legend / coverage can + # name a series; the raw shape stays for the dims-based match in the chart filter. + "model": model_name(sh), + "shape": sh, "rows": rows, + }) + # NOTE (goal Part 1, "plot/artifact integrity"): raw series are IMMUTABLE after loading. + # An earlier version injected each config's decode-range points into its prefill series so + # prefill panels spanned the full token axis — that COPIED observations between series and + # is removed. Each phase now plots only its own measured points; the x-axis simply spans + # whatever a series measured. (A shaded decode/prefill regime is the cosmetic alternative.) + + # Assign a DISTINCT color per config key, grouped by SKU family (stable across the + # decode/prefill panels so a line keeps its color everywhere). 
def load_nccl_series(results_dir: str) -> list[dict]:
    """Load family=nccl result docs into plot-ready series, one series per doc.

    A doc is used when its ``family`` is ``"nccl"`` and it has rows. Each row
    needs a ``size_bytes`` and a time to be kept; rows are sorted by size and a
    doc with no usable row is dropped. The doc's ``status`` is carried along and
    ``valid`` is true only for status ``"valid"``, so non-valid docs stay in the
    output, flagged rather than removed.

    Row timing: the row-level ``busbw_gbps`` is paired with the in-place
    ``time_us`` / ``algbw_gbps`` only when the in-place bus bandwidth equals it
    and the out-of-place one does not; in every other case the out-of-place
    numbers are used.

    Colors are assigned per ``ckey`` (sku | topology | transport | world size)
    by ``_assign_coll_colors``, the same helper the other collective loaders
    use, instead of a private copy of that loop.
    """
    series = []
    for d in _iter_docs(results_dir):
        if d.get("family") != "nccl" or not d.get("rows"):
            continue
        runner = d.get("runner") or "?"
        sku = runner.split("_")[0].split("-")[0]
        topo = d.get("topology_class") or "?"
        transport = d.get("transport") or ""
        op = d.get("op") or "?"
        status = d.get("status") or "?"
        world_size = d.get("world_size")

        rows = []
        for r in d["rows"]:
            oop = r.get("out_of_place") or {}
            ip = r.get("in_place") or {}
            best_bw = r.get("busbw_gbps")
            ip_bw = ip.get("busbw_gbps")
            # Take the in-place timing only when that placement (and not the
            # out-of-place one) produced the reported best bus bandwidth, so
            # latency and bandwidth describe the same observation.
            in_place_is_best = (
                best_bw is not None
                and ip_bw is not None
                and ip_bw == best_bw
                and (oop.get("busbw_gbps") or -1) != best_bw
            )
            chosen = ip if in_place_is_best else oop
            t_us, algbw = chosen.get("time_us"), chosen.get("algbw_gbps")
            if r.get("size_bytes") is None or t_us is None:
                continue
            rows.append({
                "size": r["size_bytes"], "dtype": r.get("dtype"),
                "t_us": t_us, "algbw": algbw, "busbw": best_bw,
                "oop_us": oop.get("time_us"), "ip_us": ip.get("time_us"),
                "correct": r.get("correct"),
            })
        if not rows:
            continue
        rows.sort(key=lambda x: x["size"])

        tlab = f" · {transport}" if transport else ""
        # The label carries topology + transport; world size separates runs on
        # the same topology.
        label = f'{sku.upper()} · {topo}{tlab} (ws{d.get("world_size","?")})'
        series.append({
            "op": op, "sku": sku, "runner": runner,
            "topo": topo, "transport": transport,
            "world_size": world_size, "nodes": d.get("nodes"),
            "dtype": rows[0].get("dtype"),
            "comparison_class": d.get("comparison_class"),
            "comparison_key": d.get("comparison_key"),
            "contract": d.get("measurement_contract"),
            "avg_busbw": (d.get("summary") or {}).get("avg_busbw_gbps"),
            "status": status, "valid": status == "valid",
            # config identity used for color assignment
            "ckey": f"{sku}|{topo}|{transport}|ws{world_size}",
            "label": label,
            "color": COLORS.get(sku, "#555"),  # provisional; reassigned below
            "rows": rows,
        })
    return _assign_coll_colors(series)
+ if size is None or (t_us is None and busbw in (None, 0)): + continue + rows.append({ + "size": size, "dtype": dtype, + "t_us": t_us, "algbw": r.get("algbw_gbps"), "busbw": busbw, + "correct": r.get("correct"), + }) + if not rows: + continue + rows.sort(key=lambda x: x["size"]) + # label MUST carry the impl so nccl vs flashinfer-oneshot vs flashinfer-twoshot are distinct. + label = f'{sku.upper()} · {impl} (fw-AR · ws{world_size})' + series.append({ + "op": "all_reduce", "sku": sku, "runner": runner, + "topo": topo, "transport": transport, + "world_size": world_size, "nodes": d.get("nodes"), + "dtype": dtype, + "comparison_class": d.get("comparison_class"), + "comparison_key": g.get("comparison_key") or d.get("comparison_key"), + "contract": d.get("measurement_contract"), + "status": status, "valid": valid, + # config identity for color: each impl is its own line within the SKU family. + "ckey": f"{sku}|fwar|{impl}|ws{world_size}", + "label": label, "color": COLORS.get(sku, "#555"), # provisional; reassigned below + "rows": rows, + }) + # DISTINCT color per config key within the SKU family (same scheme as load_nccl_series), so each + # impl keeps a SKU-readable hue and the three impls stay distinguishable. 
def _assign_coll_colors(series: list[dict]) -> list[dict]:
    """Give every distinct ``ckey`` its own color, chosen within its SKU's hue family.

    The SKU is the part of ``ckey`` before the first ``|``. Keys of a SKU listed
    in ``SKU_FAMILY`` cycle through that family's shades in sorted-key order;
    keys of any other SKU draw from ``PALETTE`` using one counter shared by all
    such SKUs. Each series' ``color`` is overwritten in place and the same list
    is returned.
    """
    keys_by_sku: dict[str, list[str]] = {}
    for ck in sorted({s["ckey"] for s in series}):
        keys_by_sku.setdefault(ck.split("|")[0], []).append(ck)
    color_of: dict[str, str] = {}
    fallback_idx = 0
    for sku, cks in keys_by_sku.items():
        family = SKU_FAMILY.get(sku)
        for j, ck in enumerate(cks):
            if family:
                color_of[ck] = family[j % len(family)]
            else:
                color_of[ck] = PALETTE[fallback_idx % len(PALETTE)]
                fallback_idx += 1
    for s in series:
        s["color"] = color_of[s["ckey"]]
    return series


def _dedup_newest(docs: list) -> list:
    """Keep one payload per dedup key, preferring the greatest ``generated_at``.

    *docs* is a list of ``(dedup_key, generated_at, payload)`` tuples. A missing
    ``generated_at`` compares as the empty string, and on a tie the first entry
    seen is kept. Payloads are returned in the order their key first appeared.
    """
    best: dict = {}
    for key, gen, payload in docs:
        cur = best.get(key)
        if cur is None or (gen or "") > (cur[0] or ""):
            best[key] = (gen, payload)
    return [payload for _, payload in best.values()]


def _join_note(parts: list) -> str:
    """Join the non-empty note fragments with `` · `` (no leading or trailing separator)."""
    return " · ".join(p for p in parts if p)


def load_offload_series(results_dir: str) -> list[dict]:
    """Load family=offload docs: one series per (sku, op, host_memory).

    Docs are first reduced to the newest per (sku, topology_class, transport).
    Rows without a size or a bandwidth are ignored; the rest are grouped by
    ``(op, host_memory)`` and sorted by size. Each series carries a per-doc
    ``note`` built from the peak bandwidth, the copy/compute overlap percentage
    and the NUMA node count, whichever of those the doc provides.
    """
    docs = []
    for d in _iter_docs(results_dir):
        if d.get("family") != "offload" or not d.get("rows"):
            continue
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        docs.append(((sku, d.get("topology_class"), d.get("transport")), d.get("generated_at"), d))

    series = []
    for d in _dedup_newest(docs):
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        topo = d.get("topology_class") or "?"
        transport = d.get("transport") or ""
        status = d.get("status") or "?"
        diag = d.get("diagnostics") or {}
        ov = diag.get("overlap_with_compute") or {}
        numa = diag.get("numa") or {}
        peak = d.get("peak_bandwidth_gbps")

        # Collect fragments and join once, so a missing first fragment cannot
        # leave a dangling separator at the start of the note.
        parts = []
        if peak is not None:
            parts.append(f"peak {peak:.0f} GB/s")
        if ov.get("overlap_pct") is not None:
            parts.append(f"copy/compute overlap {ov['overlap_pct']:.0f}%")
        if numa.get("node_count") is not None:
            parts.append(f"{numa['node_count']} NUMA node(s)")
        note = _join_note(parts)

        lines: dict = {}  # (op, host_memory) -> rows
        for r in d["rows"]:
            if r.get("size_bytes") is None or r.get("bandwidth_gbps") is None:
                continue
            lines.setdefault((r.get("op"), r.get("host_memory")), []).append({
                "size": r["size_bytes"], "bw": r.get("bandwidth_gbps"), "lat": r.get("latency_us")})
        for (op, host), rows in lines.items():
            rows.sort(key=lambda x: x["size"])
            series.append({
                "family": "offload", "sku": sku, "topo": topo, "transport": transport,
                "op": op, "sub": host, "valid": status == "valid", "status": status,
                "note": note, "peak": peak,
                "label": f'{sku.upper()} · {op} · {host}',
                "ckey": f'{sku}|{op}|{host}', "color": COLORS.get(sku, "#555"),
                "rows": rows,
            })
    return _assign_coll_colors(series)


def load_copy_engine_series(results_dir: str) -> list[dict]:
    """Load family=copy-engine docs: one series per (sku, op, engine).

    Docs are first reduced to the newest per (sku, topology_class, transport).
    Rows without a size or a bandwidth are ignored; the rest are grouped by
    ``(op, engine)`` and sorted by size. The per-doc ``note`` reports the peak
    bandwidth and the ``copy_engine_uses_near_zero_sms`` flag when present.
    """
    docs = []
    for d in _iter_docs(results_dir):
        if d.get("family") != "copy-engine" or not d.get("rows"):
            continue
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        docs.append(((sku, d.get("topology_class"), d.get("transport")), d.get("generated_at"), d))

    series = []
    for d in _dedup_newest(docs):
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        topo = d.get("topology_class") or "?"
        transport = d.get("transport") or ""
        status = d.get("status") or "?"
        peak = d.get("peak_bandwidth_gbps")
        nz = d.get("copy_engine_uses_near_zero_sms")

        parts = []
        if peak is not None:
            parts.append(f"peak {peak:.0f} GB/s")
        if nz is not None:
            parts.append(f"copy-engine uses near-zero SMs: {'yes' if nz else 'no'}")
        note = _join_note(parts)

        lines: dict = {}  # (op, engine) -> rows
        for r in d["rows"]:
            if r.get("size_bytes") is None or r.get("bandwidth_gbps") is None:
                continue
            lines.setdefault((r.get("op"), r.get("engine")), []).append({
                "size": r["size_bytes"], "bw": r.get("bandwidth_gbps"), "lat": r.get("latency_us")})
        for (op, engine), rows in lines.items():
            rows.sort(key=lambda x: x["size"])
            series.append({
                "family": "copy-engine", "sku": sku, "topo": topo, "transport": transport,
                "op": op, "sub": engine, "valid": status == "valid", "status": status,
                "note": note, "peak": peak,
                "label": f'{sku.upper()} · {op} · {engine}',
                "ckey": f'{sku}|{op}|{engine}', "color": COLORS.get(sku, "#555"),
                "rows": rows,
            })
    return _assign_coll_colors(series)


def load_kvcache_series(results_dir: str) -> list[dict]:
    """Load family=kv-cache docs: one series per (sku, direction, layout, backend).

    Docs are first reduced to the newest per (sku, transport). Every entry of a
    doc's ``groups`` carries its own rows; rows without ``transfer_bytes`` or
    ``bandwidth_gb_s`` are ignored and a group with no usable row is dropped.
    The per-doc ``note`` lists the wired and the declared-unwired backends when
    the doc names any.
    """
    docs = []
    for d in _iter_docs(results_dir):
        if d.get("family") != "kv-cache" or not d.get("groups"):
            continue
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        docs.append(((sku, d.get("transport")), d.get("generated_at"), d))

    series = []
    for d in _dedup_newest(docs):
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        status = d.get("status") or "?"
        unwired = d.get("declared_unwired_backends") or []
        wired = d.get("wired_backends") or []

        parts = []
        if wired:
            parts.append(f"wired: {', '.join(wired)}")
        if unwired:
            parts.append(f"declared-unwired: {', '.join(unwired)}")
        note = _join_note(parts)

        for g in d["groups"]:
            direction, layout, backend = g.get("direction"), g.get("layout"), g.get("backend")
            # A group without its own topology class falls back to the doc's transport.
            topo = g.get("topology_class") or d.get("transport") or "?"
            rows = []
            for r in (g.get("rows") or []):
                if r.get("transfer_bytes") is None or r.get("bandwidth_gb_s") is None:
                    continue
                rows.append({"size": r["transfer_bytes"], "bw": r.get("bandwidth_gb_s"),
                             "lat": r.get("time_ms"), "size_class": r.get("size_class"),
                             "correct": r.get("correct")})
            if not rows:
                continue
            rows.sort(key=lambda x: x["size"])
            series.append({
                "family": "kv-cache", "sku": sku, "topo": topo, "transport": d.get("transport") or "",
                "op": direction, "sub": f'{layout}/{backend}', "valid": status == "valid", "status": status,
                "note": note,
                "label": f'{sku.upper()} · {direction} · {layout} · {backend}',
                "ckey": f'{sku}|{direction}|{layout}|{backend}', "color": COLORS.get(sku, "#555"),
                "rows": rows,
            })
    return _assign_coll_colors(series)
def load_rlmesh_series(results_dir: str) -> list[dict]:
    """Build plot series for family=rl-mesh (RL trainer<->generator weight-transfer mesh).

    One series is emitted per group of each retained document, i.e. per
    (sku, direction, pattern), so trainer->gen vs gen->trainer and paired (1:1 send/recv)
    vs redistribute (disjoint all-to-all reshard) are all visible. The documents are
    groups-nested like kv-cache: each group carries its own ``rows[]``, and a row is kept
    only when both ``transfer_bytes`` and ``bandwidth_gb_s`` are present (mapped to
    ``size`` / ``bw``; ``lat`` holds the row's ``time_ms`` value).

    Documents are keyed by (sku, transport) together with ``generated_at`` and handed to
    ``_dedup_newest`` (presumably keeps the newest document per key — confirm against
    that helper). The note records the peak bandwidth and the mesh split
    (trainer N <-> generator M) when the document provides them. ADDITIVE: documents of
    other families are ignored.

    Args:
        results_dir: directory passed through to ``_iter_docs``.

    Returns:
        Whatever ``_assign_coll_colors`` returns for the assembled series list.
    """
    docs = []
    for d in _iter_docs(results_dir):
        if d.get("family") != "rl-mesh" or not d.get("groups"):
            continue
        # sku = runner name up to the first "_" and then up to the first "-".
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        docs.append(((sku, d.get("transport")), d.get("generated_at"), d))

    # short direction labels keep the legend compact (raw direction stays in `op` for grouping).
    short = {"trainer_to_generator": "trn→gen", "generator_to_trainer": "gen→trn"}

    series = []
    for d in _dedup_newest(docs):
        sku = (d.get("runner") or "?").split("_")[0].split("-")[0]
        status = d.get("status") or "?"
        valid = status == "valid"
        transport = d.get("transport") or ""
        peak = d.get("peak_bandwidth_gb_s")
        ws, tr, gr = d.get("world_size"), d.get("trainer_ranks"), d.get("generator_ranks")

        # Join only the parts that exist so the note never starts with a dangling
        # " · " separator when the peak is missing but the mesh split is known.
        note_parts = []
        if peak is not None:
            note_parts.append(f"peak {peak:.0f} GB/s")
        if ws is not None:
            note_parts.append(f"world={ws}: trainer {tr} ↔ generator {gr}")
        note = " · ".join(note_parts)

        for g in d["groups"]:
            direction, pattern = g.get("direction"), g.get("pattern")
            # group-level topology wins; otherwise fall back to the document transport.
            topo = g.get("topology_class") or d.get("transport") or "?"
            rows = [
                {"size": r["transfer_bytes"], "bw": r.get("bandwidth_gb_s"),
                 "lat": r.get("time_ms"), "correct": r.get("correct")}
                for r in (g.get("rows") or [])
                if r.get("transfer_bytes") is not None and r.get("bandwidth_gb_s") is not None
            ]
            if not rows:
                continue  # a group without any usable point produces no line
            rows.sort(key=lambda x: x["size"])
            dlab = short.get(direction, direction)  # unknown directions keep their raw name
            series.append({
                "family": "rl-mesh", "sku": sku, "topo": topo, "transport": transport,
                "op": direction, "sub": pattern, "valid": valid, "status": status,
                "note": note,
                "label": f'{sku.upper()} · {dlab} · {pattern}',
                "ckey": f'{sku}|{direction}|{pattern}', "color": COLORS.get(sku, "#555"),
                "rows": rows,
            })
    return _assign_coll_colors(series)
+ rows = [] + for r in (g.get("rows") or []): + if r.get("transfer_bytes") is None or r.get("bandwidth_gb_s") is None: + continue + rows.append({"size": r["transfer_bytes"], "bw": r.get("bandwidth_gb_s"), + "lat": r.get("time_ms"), "correct": r.get("correct")}) + if not rows: + continue + rows.sort(key=lambda x: x["size"]) + dlab = short.get(direction, direction) + series.append({ + "family": "rl-mesh", "sku": sku, "topo": topo, "transport": d.get("transport") or "", + "op": direction, "sub": pattern, "valid": valid, "status": d.get("status") or "?", + "note": note, + "label": f'{sku.upper()} · {dlab} · {pattern}', + "ckey": f'{sku}|{direction}|{pattern}', "color": COLORS.get(sku, "#555"), + "rows": rows, + }) + return _assign_coll_colors(series) + + +# Budgets (µs) for the "max tokens / rank under a p99 round-trip budget" decision view (goal P3-D, +# the previously-missing metric). Picked to bracket a typical decode SLO band. +RT_BUDGETS_US = [100, 250, 500] + + +def _rt_p99(row): + """measured round-trip p99 for a plot_ep row (v4 nested dict, falls back to isolated_sum).""" + rt = row.get("roundtrip") or {} + return rt.get("p99") + + +def max_tokens_under_budget(series, budgets=RT_BUDGETS_US): + """For each (sku, backend, phase, dtype, ep) HEADLINE cell (official, DeepSeek-V3 shape, uniform + routing), the largest tokens/rank whose MEASURED round-trip p99 <= each budget. This is the + "how much load fits under an SLO" number the chart did not previously expose. 
Honest about + misses: a budget no measured point satisfies reports None (rendered as '—').""" + cells = {} + for s in series: + sh = s.get("shape") or {} + if not (s.get("pub") == "official" and s.get("wid") + and sh.get("hidden") == 7168 and sh.get("topk") == 8 and sh.get("experts") == 256 + and s.get("routing") == "uniform"): + continue + key = (s["sku"], s["backend"], s["phase"], s["dtype"], s["ep"], s.get("mode", "normal")) + pts = cells.setdefault(key, []) + for r in s["rows"]: + q = _rt_p99(r) + if q and r.get("t"): + pts.append((r["t"], q)) + out = [] + for (sku, backend, phase, dtype, ep, mode), pts in sorted(cells.items()): + pts.sort() + row = {"sku": sku, "backend": backend, "phase": phase, "dtype": dtype, "ep": ep, "mode": mode} + for b in budgets: + ok = [t for (t, q) in pts if q <= b] + row[f"b{b}"] = max(ok) if ok else None + # only emit a row if at least one budget is satisfiable (keeps the table to useful cells) + if any(row.get(f"b{b}") is not None for b in budgets): + out.append(row) + return out + + +def summary_cards(series, sens_rows, failed, ll_rows): + """Industry-summary headline cards (goal P3-F), computed from the loaded series. Each card is + {title, value, sub, [warn], [href]}. Comparisons use the MEASURED round-trip p99 on the official + DeepSeek-V3 headline cohort so the cards match the default chart view. 
ll_rows is analyze_ep's + ll_crossover() output (used for the LL→normal crossover card).""" + def headline(s): + sh = s.get("shape") or {} + return (s.get("pub") == "official" and s.get("wid") + and sh.get("hidden") == 7168 and sh.get("topk") == 8 and sh.get("experts") == 256 + and s.get("routing") == "uniform") + + def best_rt(pred, T_decode=64, T_prefill=256): + """lowest round-trip p99 over series matching pred, at the phase's headline token count.""" + best = None + for s in series: + if not (headline(s) and pred(s)): + continue + T = T_decode if s["phase"] == "decode" else T_prefill + for r in s["rows"]: + if r.get("t") == T: + q = _rt_p99(r) + if q and (best is None or q < best[0]): + best = (q, s, T) + return best + + cards = [] + + def fmt_best(b, label): + if not b: + cards.append({"title": label, "value": "no data", "sub": "no official headline cell at this phase/EP"}) + return + q, s, T = b + cards.append({"title": label, + "value": f"{s['backend']} · {s['sku'].upper()}", + "sub": f"{q:.0f} µs RT p99 · {s['dtype']} · T={T}"}) + + fmt_best(best_rt(lambda s: s["phase"] == "decode" and s["ep"] == 8), "Best backend · decode EP8") + fmt_best(best_rt(lambda s: s["phase"] == "prefill" and s["ep"] == 8), "Best backend · prefill EP8") + + # LL crossover (measured-roundtrip basis, p50): first cell with a real crossover token count. 
+ crosses = [r for r in (ll_rows or []) + if r.get("basis") == "measured-roundtrip" and r.get("stat") == "p50" + and isinstance(r.get("normal_faster_at_T"), int)] + if crosses: + c = min(crosses, key=lambda r: r["normal_faster_at_T"]) + cards.append({"title": "LL → normal crossover", + "value": f"T≈{c['normal_faster_at_T']} tok/rank", + "sub": f"{c['sku'].upper()} EP{c['ep']} {c['dtype']} · normal RT p50 wins above this (measured)"}) + else: + cards.append({"title": "LL → normal crossover", "value": "none in range", + "sub": "normal RT never beats LL within the measured token ladder"}) + + # Resource-normalized vs backend-default winners (decode EP8 headline). + rn = best_rt(lambda s: s["phase"] == "decode" and s["ep"] == 8 and s["suite"] == "resource-constrained") + bd = best_rt(lambda s: s["phase"] == "decode" and s["ep"] == 8 and s["suite"] == "backend-default") + fmt_best(rn, "Resource-normalized winner") + fmt_best(bd, "Backend-default winner") + + # Most unstable configuration: highest distribution-sensitivity ratio (p99 worst/uniform). + if sens_rows: + w = max(sens_rows, key=lambda g: g.get("distribution_sensitivity_ratio") or 0) + cards.append({"title": "Most unstable config", "warn": True, + "value": f"{w['sku'].upper()} · {w['backend']} {w['phase']}", + "sub": f"{w['distribution_sensitivity_ratio']:.2f}× p99 under {w.get('worst_distribution','?')} vs uniform"}) + else: + cards.append({"title": "Most unstable config", "value": "n/a", "sub": "no multi-distribution group yet"}) + + # Known invalid / diagnostic cases (count + link to the Evidence tab's failed table). + n = len(failed or []) + cards.append({"title": "Invalid / diagnostic cases", "warn": n > 0, + "value": str(n), "sub": ("see Evidence ▸ failed table" if n else "none — all runs publishable"), + "href": "#tab-evidence"}) + return cards + + +HEAD = """ + +CollectiveX — EP dispatch / combine +
+

CollectiveX — EP dispatch / combine

+

+""" + +TAIL = "
" + +JS = r""" +const SKUS = [...new Set(DATA.map(s=>s.sku))]; +// roundtrip = INDEPENDENTLY MEASURED chained latency (v4). isolated_sum = Σ of isolated +// dispatch+combine percentiles — NOT a measured op (no throughput/SLO use). serial(v3)->isolated_sum. +const OPS = {dispatch:"Dispatch", combine:"Combine", roundtrip:"Round trip (measured)", isolated_sum:"Isolated sum (Σp, not measured)"}; +// NOT algorithmic/bus bandwidth: logical routed payload (recv copies x hidden x dtype) +// over latency; dispatch & combine count their OWN bytes. Excludes scales/idx/meta/padding. +const YK = {lat:"Latency (µs)", tps:"Tokens / s", bw:"Logical routed payload rate (GB/s)"}; +const XK = {t:"Source tokens / rank", gt:"Global source tokens"}; +const PCT = {p50:"p50", p90:"p90", p99:"p99"}; +const SUITE = {all:"All", "backend-default":"Backend-default", "resource-constrained":"Resource-constrained"}; +// Routing distributions present in the data (+ "all"): uniform (baseline) / balanced / +// zipf (skewed) / zipf+eplb (skew rebalanced by EPLB replication). Default to uniform so the +// initial view matches the headline sweep; switch to compare zipf vs zipf+eplb. +const ROUTING = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.routing))].sort().forEach(r=>{o[r]=r;}); return o; })(); +// Prefill panels show only the real large-T prefill range. MoRI ramps its prefill sweep from 1 +// (cold-jump wedge) and records decode-scale points; the intended prefill floor is the DeepEP +// prefill ladder min. So every SKU's prefill panel starts there — the sub-floor MoRI points are +// ramp-warmup (same kernel as decode) and live in the decode panel, not fabricated/duplicated here. +const _dpf = DATA.filter(s=>s.phase==="prefill"&&s.backend==="deepep").flatMap(s=>s.rows.map(r=>r.t)); +const PREFILL_MIN = _dpf.length? 
Math.min(..._dpf) : 128; +// Publication-status filter (goal P1): default hides diagnostic/invalid/failed so the first +// view is publication-valid; "publishable" = official + comparable-experimental + legacy v3. +// The OFFICIAL view additionally drops wid=null lines (a non-canonical workload can never be +// official — goal P1) so an official chart can never show a wid=null or non-official cohort. +// "official-headline" (goal P0-1a, B6/B7) is the DEFAULT opening filter: official + canonical wid +// AND the single cross-hardware headline MoE shape (DeepSeek-V3 7168/8/256) — so the page opens on +// exactly the apples-to-apples headline cohort, never a mixed-shape official set. Every broader set +// (official / publishable / all) stays one click away. +// MODEL-SHAPE selector (follow-up): each result carries a MoE shape (hidden/topk/experts) named in +// Python (s.model). The headline shape is DeepSeek-V3/V4 (7168/8/256). The option list is built +// DYNAMICALLY from the shapes ACTUALLY present in DATA (a shape with no data is never offered); +// each option is keyed by "hidden/topk/experts" and labelled " (h/topk/e)". "all" = every +// shape. Default = the headline shape so the opening view is unchanged. +const HEADLINE_SHAPE = {hidden:7168, topk:8, experts:256}; +const SHAPE_KEY = sh => (sh? (sh.hidden+'/'+sh.topk+'/'+sh.experts) : '?'); +const HEADLINE_SHAPE_KEY = HEADLINE_SHAPE.hidden+'/'+HEADLINE_SHAPE.topk+'/'+HEADLINE_SHAPE.experts; +// {shapeKey -> "Model (h/topk/e)"} for every distinct shape in DATA, headline first then by size. 
+const MODELS = (()=>{ + const seen={}; DATA.forEach(s=>{ const k=SHAPE_KEY(s.shape); if(!(k in seen)) seen[k]=s.model||('shape '+k); }); + const keys=Object.keys(seen).sort((a,b)=>{ if(a===HEADLINE_SHAPE_KEY) return -1; if(b===HEADLINE_SHAPE_KEY) return 1; return a.localeCompare(b,undefined,{numeric:true}); }); + const o={all:"All shapes"}; keys.forEach(k=>{ o[k]=seen[k]+' ('+k+')'; }); return o; +})(); +const MODEL_DEFAULT = (HEADLINE_SHAPE_KEY in MODELS)? HEADLINE_SHAPE_KEY : Object.keys(MODELS).filter(k=>k!=="all")[0]; +function modelOk(s){ return ST.model==="all" || SHAPE_KEY(s.shape)===ST.model; } +// isHeadlineShape now means "matches the SELECTED model shape" (defaults to DeepSeek-V3/V4), so the +// official-headline filter follows the model selector instead of being pinned to one shape. +function isHeadlineShape(s){ return modelOk(s); } +const PUB = {"official-headline":"Official headline", official:"Official only", publishable:"Publishable", all:"All (incl. diagnostic)"}; +function pubOk(s){ + if(ST.pub==="all") return true; + if(ST.pub==="official-headline") return s.pub==="official" && !!s.wid && isHeadlineShape(s); // official + selected model shape + if(ST.pub==="official") return s.pub==="official" && !!s.wid; // official => canonical wid required + // publishable = official + comparable, but ONLY with a NON-NULL workload id (goal P0: every + // plotted official/comparable result carries non-null workload identity). A seeded-runtime + // (wid=null) line is shown only in the "All (incl. diagnostic)" view, never as publishable. + return !["diagnostic","invalid","failed"].includes(s.pub) && !!s.wid; +} +// dtype + EP-degree filters (goal P0-1a/B2): the headline opens on BF16 + EP8, but "All" keeps +// every dtype / EP degree selectable. Applied to the MAIN chart + legend only (the grid + heatmaps +// facet by EP themselves). Built from the data so a new dtype/EP shows up automatically. 
+const DTYPES = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.dtype))].sort().forEach(d=>{o[d]=d;}); return o; })(); +const EPS = (()=>{ const o={all:"All"}; [...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b).forEach(e=>{o[String(e)]="EP"+e;}); return o; })(); +function dtOk(s){ return ST.dtype==="all" || s.dtype===ST.dtype; } +function epOk(s){ return ST.ep==="all" || String(s.ep)===ST.ep; } +// HEADLINE DISTRIBUTION CONTRACT (goal P2 "define one headline distribution"): uniform is the +// single cross-hardware headline — controlled, deterministic, and present on every SKU, so it is +// the apples-to-apples reference. balanced / zipf / zipf+eplb / hotspot* are SENSITIVITY views +// (see the Distribution-sensitivity section), NOT peer headline dimensions. (Long-term headline +// will come from InferenceX trace replay; zipf+eplb is the interim load-realism reference.) +const HEADLINE_DISTRIBUTION = "uniform"; +// HEADLINE OPENING VIEW (goal P0-1a, B2/B6/B7): the page opens on the MEASURED round trip at p99, +// resource-constrained (normalized) suite, BF16, EP8, uniform routing, DeepSeek-V3 shape, official +// headline cohort. Every other value stays selectable via the toggles below — this only sets what +// the page OPENS with. resolveHeadlineDefaults() (called once at boot) falls the resource suite +// back to backend-default if no normalized data exists for the headline cell, so the chart is never +// empty on first paint while still defaulting to normalized whenever it is present. +const ST = {op:"roundtrip", phase:"decode", x:"t", y:"lat", xlog:true, ylog:true, pct:"p99", + suite:"resource-constrained", dtype:"bf16", ep:"8", model:MODEL_DEFAULT, + routing:HEADLINE_DISTRIBUTION, pub:"all"}; +// NOTE: pub defaults to "all" so the page opens showing the full sweep — the bulk of the data is +// SEEDED-RUNTIME (comparable-experimental, wid=null), which the "official"/"publishable" filters +// exclude by design (they require a canonical workload id). 
Toggle the publication filter to +// "Official headline"/"Official"/"Publishable" for the publication-grade cohort only. +// Count series visible under a candidate state (used only for graceful headline fallback). Model- +// aware: the candidate carries o.model, and the official-headline branch matches that shape. +function _visCount(o){ return DATA.filter(s=>s.phase===o.phase + && (o.suite==="all"||s.suite===o.suite) && (o.routing==="all"||s.routing===o.routing) + && (o.dtype==="all"||s.dtype===o.dtype) && (o.ep==="all"||String(s.ep)===o.ep) + && (o.model==="all"||SHAPE_KEY(s.shape)===o.model) + && _pubOkFor(s,o.pub,o.model)).length; } +function _pubOkFor(s,pub,model){ + if(pub==="all") return true; + const shapeOk = (model==null||model==="all"||SHAPE_KEY(s.shape)===model); + if(pub==="official-headline") return s.pub==="official" && !!s.wid && shapeOk; + if(pub==="official") return s.pub==="official" && !!s.wid; + return !["diagnostic","invalid","failed"].includes(s.pub) && !!s.wid; +} +// Resolve the opening view so the FIRST paint is never empty, while keeping normalized as the +// preferred default. Fallback order is least-surprising-first: relax the suite (normalized -> +// backend-default), then the dtype, then the EP degree, then the publication breadth. Each step +// only fires if the current candidate yields no visible series. +function resolveHeadlineDefaults(){ + if(_visCount(ST)>0) return; + const ladder=[["suite","all"],["dtype","all"],["ep","all"],["pub","publishable"],["pub","all"]]; + for(const [k,v] of ladder){ ST[k]=v; if(_visCount(ST)>0) return; } +} + +function xval(r,xk){ return xk==="t"? r.t : r.gt; } +function metric(r,op,yk,pct){ + const us=(r[op] && r[op][pct]!=null)? r[op][pct] : (r[op]? r[op].p50 : 0); + if(yk==="lat") return us; + if(yk==="tps") return r.gt/(us*1e-6); + const b = op==="dispatch"? r.dbytes : op==="combine"? r.cbytes : (r.dbytes + r.cbytes); + return us>0 ? 
b/(us*1e3) : 0; // logical routed payload rate (GB/s), per-op bytes +} +function fmt(v){ + if(v>=1e9) return (v/1e9).toFixed(v<1e10?2:0)+"G"; + if(v>=1e6) return (v/1e6).toFixed(v<1e7?2:0)+"M"; + if(v>=1e3) return (v/1e3).toFixed(v<1e4?1:0)+"k"; + if(v>=10) return v.toFixed(0); + if(v>=1) return v.toFixed(v<3?1:0); + return v.toFixed(2); +} +function logTicks(mn,mx){ + const t=[]; let e=Math.floor(Math.log10(mn)); + for(;Math.pow(10,e)<=mx*1.0001;e++) for(const m of [1,2,5]){const v=m*Math.pow(10,e); if(v>=mn*0.999&&v<=mx*1.001)t.push(v);} + return t.length?t:[mn,mx]; +} +function linTicks(mn,mx){ + const span=mx-mn||1, step=Math.pow(10,Math.floor(Math.log10(span))); const t=[]; + let s=step; if(span/step>6)s=step*2; if(span/step<3)s=step/2; + for(let v=Math.ceil(mn/s)*s; v<=mx*1.0001; v+=s) t.push(+v.toFixed(6)); + return t.length?t:[mn,mx]; +} +const mapLog=(v,a,b,p,q)=>p+(Math.log(v)-Math.log(a))/(Math.log(b)-Math.log(a))*(q-p); +const mapLin=(v,a,b,p,q)=>p+(v-a)/(b-a)*(q-p); + +// Build one SVG chart. opts: {op,phase,x,y,ylog,title,legend,w,h} +function chart(o){ + const W=o.w||900, H=o.h||520, m={l:64,r:16,t:34,b:46}; + const pct=o.pct||"p99", suite=o.suite||"all", routing=o.routing||"all"; + // o.dtype / o.epf / o.model are the MAIN-chart headline filters (default-off so the grid, which + // faces by EP via o.ep, is unaffected). epf "all"|"8"…; dtype "all"|"bf16"…; model "all"|"hidden/topk/experts". + const sl = DATA.filter(s=>s.phase===o.phase && (o.ep==null || s.ep===o.ep) + && (suite==="all" || s.suite===suite) + && (routing==="all" || s.routing===routing) + && (!o.dtype || o.dtype==="all" || s.dtype===o.dtype) + && (!o.epf || o.epf==="all" || String(s.ep)===o.epf) + && (!o.model || o.model==="all" || SHAPE_KEY(s.shape)===o.model) && pubOk(s)); + const pts = sl.map(s=>({s, P:s.rows.map(r=>({x:xval(r,o.x), y:metric(r,o.op,o.y,pct), r})) + .filter(p=>p.x>0 && (o.ylog? 
p.y>0 : p.y>=0) + && (o.phase!=="prefill" || p.r.t>=PREFILL_MIN))})); + let xs=[], ys=[]; pts.forEach(g=>g.P.forEach(p=>{xs.push(p.x);ys.push(p.y);})); + if(!xs.length) return 'no data'; + const xmn=Math.min(...xs), xmx=Math.max(...xs); + let ymn=Math.min(...ys), ymx=Math.max(...ys); + if(o.ylog){ ymn=Math.min(...ys.filter(v=>v>0)); } else { ymn=Math.min(0,ymn); } + if(ymx===ymn) ymx=ymn+1; + const X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t; + const xlog = o.xlog!==false; // x defaults to log (geometric sweep) + const xv=v=>xlog?mapLog(v,xmn,xmx,X0,X1):mapLin(v,xmn,xmx,X0,X1); + const yv=v=>o.ylog?mapLog(Math.max(v,ymn),ymn,ymx,Y0,Y1):mapLin(v,ymn,ymx,Y0,Y1); + let s=''; + s+=''+o.title+''; + // y grid + ticks + const yt=o.ylog?logTicks(ymn,ymx):linTicks(ymn,ymx); + yt.forEach(v=>{const y=yv(v); s+=''+ + ''+fmt(v)+'';}); + // x grid + ticks (label the actual sweep points) + const xt=[...new Set(xs)].sort((a,b)=>a-b); + xt.forEach(v=>{const x=xv(v); s+=''+ + ''+fmt(v)+'';}); + // axes + s+=''; + s+=''+XK[o.x]+(xlog?' (log)':'')+''; + s+=''+YK[o.y]+(o.ylog?' (log)':'')+''; + // lines + points + pts.forEach(g=>{ if(!g.P.length) return; + const d=g.P.map((p,i)=>(i?'L':'M')+xv(p.x).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' '); + const dash=g.s.dash?' stroke-dasharray="'+g.s.dash+'"':''; + s+=''; + g.P.forEach(p=>{ const D=p.r.dispatch, C=p.r.combine, R=p.r.roundtrip; + // artifact links (goal P1): the workflow run + source SHA + image digest + workload id + // that produced this point. (Result JSON / manifest / raw-samples live alongside by name.) + const run=g.s.run_id? ('\nrun '+g.s.run_id+(g.s.source_sha?' @'+g.s.source_sha:'')) : ''; + const art='\nworkload='+(g.s.wid||g.s.wsig||'?')+(g.s.image_digest?' · image '+g.s.image_digest:'') + +(g.s.repo?' 
· '+g.s.repo:''); + s+=''+ + ''+g.s.label+' ['+pct+'] ('+g.s.pub+')'+ + '\nmodel='+(g.s.model||'?')+' (hidden/topk/experts '+SHAPE_KEY(g.s.shape)+')'+ + '\nT/rank='+p.r.t+' · global='+p.r.gt+ + '\n'+YK[o.y]+' = '+fmt(p.y)+(o.y==='lat'?' µs':o.y==='bw'?' GB/s':'')+ + '\ndispatch µs p50/p90/p99 = '+D.p50.toFixed(1)+'/'+D.p90.toFixed(1)+'/'+D.p99.toFixed(1)+ + '\ncombine µs p50/p90/p99 = '+C.p50.toFixed(1)+'/'+C.p90.toFixed(1)+'/'+C.p99.toFixed(1)+ + '\nroundtrip µs p50/p90/p99 = '+R.p50.toFixed(1)+'/'+R.p90.toFixed(1)+'/'+R.p99.toFixed(1)+' (measured)'+ + '\nfan-out='+(p.r.fanout!=null?p.r.fanout.toFixed(2):'?')+' · recv(max)='+p.r.recv + +(p.r.straggler!=null?' · straggler=r'+p.r.straggler:'')+(p.r.correct?'':' ✗')+ + '\ncontract='+g.s.contract+' · suite='+g.s.suite+ + '\ndispatch='+g.s.dtype+' · combine='+(g.s.cqm||'none')+' · activation='+(g.s.act||'normal')+run+art+ + ''; }); + }); + s+=''; return s; +} +// Comparison guard (goal P1): flag when overlaid lines are NOT a direct comparison — +// differing topology at one EP, or differing realized workload signature within one routing. +function guardNote(vis){ + if(!vis.length) return ''; + const w=[]; + const topos=[...new Set(vis.map(s=>s.topo).filter(Boolean))]; + if(topos.length>1) w.push('mixed topology ('+topos.join(', ')+')'); + const byRt={}; vis.forEach(s=>{ (byRt[s.routing]=byRt[s.routing]||new Set()).add(s.wsig||'?'); }); + const split=Object.entries(byRt).filter(([k,v])=>v.size>1).map(([k])=>k); + if(split.length) w.push('different workload trace within routing ['+split.join(',')+'] — NOT identical workloads'); + // combine-quant / activation-value / workload-id are part of the workload contract: a quantized + // combine, a different value distribution, or a different canonical workload is NOT the same + // benchmark as the headline, even at matched routing/dims (review). 
+ const cqms=[...new Set(vis.map(s=>s.cqm||'none'))]; + if(cqms.length>1) w.push('mixed combine-quant ('+cqms.join(', ')+') — quantized combine is a different contract from dispatch'); + const acts=[...new Set(vis.map(s=>s.act||'normal'))]; + if(acts.length>1) w.push('mixed activation profile ('+acts.join(', ')+') — value distribution differs'); + const wids=[...new Set(vis.map(s=>s.wid).filter(Boolean))]; + if(wids.length>1) w.push('mixed workload_id ('+wids.join(' / ')+') — not the same canonical workload'); + // source SHA: a cross-SKU OFFICIAL cohort must come from ONE benchmark source SHA (goal P1). + const shas=[...new Set(vis.map(s=>s.source_sha).filter(Boolean))]; + if(shas.length>1) w.push('mixed source SHA ('+shas.join(' / ')+') — official cohorts need one benchmark SHA'); + // wid=null cohorts can never be official (goal P1) — flag if any non-canonical line is shown. + const nullwid=vis.filter(s=>!s.wid).length; + if(nullwid && ST.pub==='official') w.push(nullwid+' line(s) have wid=null — excluded from the official view'); + const eps=[...new Set(vis.map(s=>s.ep))]; + if(eps.length>1) w.push('mixed EP degree '+eps.join('/')+' — compare only on the global-tokens x-axis'); + return w.length? '
⚠ not a direct comparison: '+w.join('; ')+'
' : ''; +} +function legend(phase, ep, suite, routing, dtype, epf, model){ + return '
'+DATA.filter(s=>s.phase===phase && (ep==null||s.ep===ep) + && (!suite||suite==="all"||s.suite===suite) + && (!routing||routing==="all"||s.routing===routing) + && (!dtype||dtype==="all"||s.dtype===dtype) + && (!model||model==="all"||SHAPE_KEY(s.shape)===model) + && (!epf||epf==="all"||String(s.ep)===epf) && pubOk(s)).map(s=>{ + const sw = s.dash ? 'background:repeating-linear-gradient(90deg,'+s.color+' 0 5px,transparent 5px 9px)' + : 'background:'+s.color; // dashed swatch = fp8 (matches the line) + // when shapes are mixed ("All shapes"), prefix the model so same-config lines of different + // models are distinguishable; a single-model view keeps the original (uncluttered) label. + const lab = (model==="all"? '['+(s.model||'?')+'] ' : '')+s.label; + return ''+lab+''; + }).join('')+'
'; +} +function seg(name,opts,cur){ + return '
'+Object.entries(opts).map(([k,v])=> + '').join('')+'
'; +} +function renderControls(){ + document.getElementById('controls').innerHTML = + '
Model shape (headline=DeepSeek-V3/V4)'+seg('model',MODELS,ST.model)+'
'+ + '
Operation'+seg('op',OPS,ST.op)+'
'+ + '
Phase'+seg('phase',{decode:"Decode",prefill:"Prefill"},ST.phase)+'
'+ + '
Percentile'+seg('pct',PCT,ST.pct)+'
'+ + '
Suite'+seg('suite',SUITE,ST.suite)+'
'+ + '
Dispatch dtype'+seg('dtype',DTYPES,ST.dtype)+'
'+ + '
EP degree'+seg('ep',EPS,ST.ep)+'
'+ + '
Routing (headline='+HEADLINE_DISTRIBUTION+')'+seg('routing',ROUTING,ST.routing)+'
'+ + '
Publication'+seg('pub',PUB,ST.pub)+'
'+ + '
X-axis'+seg('x',XK,ST.x)+'
'+ + '
X scale'+seg('xlog',{true:"Log",false:"Linear"},String(ST.xlog))+'
'+ + '
Y-axis'+seg('y',YK,ST.y)+'
'+ + '
Y scale'+seg('ylog',{true:"Log",false:"Linear"},String(ST.ylog))+'
'; + document.querySelectorAll('#controls button').forEach(b=>b.onclick=()=>{ + const g=b.dataset.grp, v=b.dataset.val; ST[g]= (g==='ylog'||g==='xlog')? v==='true' : v; + // grid/heatmaps also reflect pct/suite/phase/scale toggles; scaling is headline-only (static). + renderControls(); renderMain(); renderGrid(); renderHeatmaps(); }); +} +function renderMain(){ + const mtag=(ST.model==='all'?' · all shapes':' · '+(MODELS[ST.model]||ST.model)); + const tags=mtag+(ST.dtype==='all'?'':' · '+ST.dtype)+(ST.ep==='all'?'':' · EP'+ST.ep); + document.getElementById('chart').innerHTML = chart({op:ST.op,phase:ST.phase,x:ST.x,y:ST.y,xlog:ST.xlog,ylog:ST.ylog, + pct:ST.pct, suite:ST.suite, routing:ST.routing, dtype:ST.dtype, epf:ST.ep, model:ST.model, + title:OPS[ST.op]+' — '+ST.phase+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+tags+' ('+YK[ST.y].toLowerCase()+' vs '+XK[ST.x].toLowerCase()+')'}); + const vis=DATA.filter(s=>s.phase===ST.phase && (ST.suite==="all"||s.suite===ST.suite) + && (ST.routing==="all"||s.routing===ST.routing) + && dtOk(s) && epOk(s) && modelOk(s) && pubOk(s)); + document.getElementById('mlegend').innerHTML = guardNote(vis)+legend(ST.phase, null, ST.suite, ST.routing, ST.dtype, ST.ep, ST.model); +} +function renderGrid(){ + // SEPARATE panels per (phase, EP degree); within a panel, the SUITE selector keeps + // backend-default and resource-constrained lines from being read as one fair contest. + const phases=[...new Set(DATA.map(s=>s.phase))].sort(); + const eps=[...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b); + let h=''; + phases.forEach(ph=>{ eps.forEach(ep=>{ + const panelVis=DATA.filter(s=>s.phase===ph && s.ep===ep && (ST.suite==="all"||s.suite===ST.suite) + && (ST.routing==="all"||s.routing===ST.routing) && pubOk(s)); + if(!panelVis.length) return; + const scale=(ST.xlog?'log':'lin')+'–'+(ST.ylog?'log':'lin'); + h+='

'+ph[0].toUpperCase()+ph.slice(1)+' · EP'+ep+' · '+ST.pct+(ST.routing==='all'?'':' · '+ST.routing)+' — latency vs source tokens/rank (µs, '+scale+')

'+ + guardNote(panelVis)+legend(ph,ep,ST.suite,ST.routing)+'
'; + ['dispatch','combine','roundtrip'].forEach(op=>{ h+='
'+OPS[op]+'
'+ + chart({op,phase:ph,ep,x:'t',y:'lat',xlog:ST.xlog,ylog:ST.ylog,pct:ST.pct,suite:ST.suite,routing:ST.routing,title:'',w:340,h:260})+'
'; }); + h+='
'; }); }); + document.getElementById('grid').innerHTML=h; +} +// Strong + weak SCALING views (goal P2 "separate views for strong and weak scaling" — do NOT rely +// on the x-axis toggle to reinterpret one experiment). weak = fixed tokens/RANK, latency vs EP +// (ideal: flat). strong = fixed GLOBAL tokens, latency vs EP (ideal: falls ~1/EP). Each labels its +// scaling contract. Renders only for SKUs measured at >=2 EP degrees (the headline distribution). +function scalingChart(kind){ + // map: sku -> {ep -> {key(T or GT) -> p50 dispatch}} + const sl=DATA.filter(s=>s.routing===HEADLINE_DISTRIBUTION && s.mode==="normal" + && s.contract==="layout-and-dispatch-v1" && pubOk(s)); + const bySku={}; sl.forEach(s=>{ (bySku[s.sku]=bySku[s.sku]||{})[s.ep]=s; }); + const skuColor={}; DATA.forEach(s=>{ skuColor[s.sku]=skuColor[s.sku]||s.color; }); + const skus=Object.keys(bySku).filter(k=>Object.keys(bySku[k]).length>=2).sort(); + if(!skus.length) return '

No SKU measured at ≥2 EP degrees yet (needs e.g. GB300 EP4 + EP8). Strong/weak scaling renders here once a multi-EP cohort exists.

'; + // build series: one line per sku; x=EP, y=latency at a fixed anchor (weak: tokens/rank=64; strong: global=512). + const anchorT=64, anchorGT=512; + const W=900,H=360,m={l:64,r:16,t:34,b:46},X0=m.l,X1=W-m.r,Y0=H-m.b,Y1=m.t; + const lines=[]; let xs=[],ys=[]; + skus.forEach(sku=>{ const pts=[]; + Object.keys(bySku[sku]).map(Number).sort((a,b)=>a-b).forEach(ep=>{ const s=bySku[sku][ep]; + let r=null; + if(kind==="weak"){ r=s.rows.find(rr=>rr.t===anchorT); } + else { r=s.rows.find(rr=>rr.gt===anchorGT) || s.rows.find(rr=>rr.t===Math.round(anchorGT/ep)); } + if(r){ const y=r.dispatch.p50; if(y>0){ pts.push({ep,y}); xs.push(ep); ys.push(y);} } + }); + if(pts.length) lines.push({sku,pts,color:(skuColor[sku]||"#888")}); + }); + if(!xs.length) return '

No matched anchor points for '+kind+' scaling.

'; + const xmn=Math.min(...xs),xmx=Math.max(...xs),ymn=Math.min(...ys),ymx=Math.max(...ys); + const xv=v=>mapLin(v,xmn,xmx||xmn+1,X0,X1), yv=v=>mapLin(v,Math.min(0,ymn),ymx||1,Y0,Y1); + let s=''; + s+=''+(kind==="weak"?"Weak scaling — fixed tokens/rank="+anchorT+" (ideal: flat)":"Strong scaling — fixed global tokens="+anchorGT+" (ideal: ↓ ~1/EP)")+''; + [...new Set(xs)].sort((a,b)=>a-b).forEach(v=>{const x=xv(v);s+='EP'+v+'';}); + linTicks(Math.min(0,ymn),ymx).forEach(v=>{const y=yv(v);s+=''+fmt(v)+'';}); + s+=''; + s+='EP degree'; + s+='dispatch p50 (µs)'; + lines.forEach(g=>{ const d=g.pts.map((p,i)=>(i?'L':'M')+xv(p.ep).toFixed(1)+' '+yv(p.y).toFixed(1)).join(' '); + s+=''; + g.pts.forEach(p=>{ s+=''+g.sku.toUpperCase()+' EP'+p.ep+' '+kind+'-scaling: '+fmt(p.y)+' µs'; }); }); + s+=''; return s; +} +function renderScaling(){ + const el=document.getElementById('scaling'); if(!el) return; + el.innerHTML='
'+scalingChart("weak")+'
'+scalingChart("strong")+'
' + +'

Strong vs weak are DISTINCT experiments with distinct scaling contracts (labelled in each title) — not one chart reinterpreted by an x-axis toggle. Headline distribution = '+HEADLINE_DISTRIBUTION+', layout-and-dispatch-v1, normal mode.

'; +} +// HEATMAPS (goal P2): EP×tokens/rank and routing-skew×token-load (latency), placement×node and +// resource×load where data exists. A cell is colored by dispatch p50 (log scale); empty cells are +// blank (no measured point). One grid per (metric pairing) for the current phase + publishable set. +function heatmap(rowKeyFn, rowLabel, rowVals, colVals, title){ + const sl=DATA.filter(s=>s.phase===ST.phase && (ST.suite==="all"||s.suite===ST.suite) && pubOk(s)); + // cell value = min dispatch p50 across series matching (rowVal) at colVal (tokens/rank) + const cell={}; + sl.forEach(s=>{ const rk=rowKeyFn(s); if(rk==null) return; + s.rows.forEach(r=>{ const k=rk+'|'+r.t; const y=r.dispatch&&r.dispatch.p50; if(y>0) cell[k]=Math.min(cell[k]||1e9,y); }); }); + const present=Object.keys(cell); if(!present.length) return ''; + const cols=colVals.filter(c=>present.some(k=>k.endsWith('|'+c))); + const rows=rowVals.filter(rv=>present.some(k=>k.startsWith(rv+'|'))); + if(!rows.length||!cols.length) return ''; + const allv=Object.values(cell), lo=Math.min(...allv), hi=Math.max(...allv); + const cw=46,ch=26,L=120,T=30,W=L+cols.length*cw+16,H=T+rows.length*ch+24; + const col=v=>{ const t=(Math.log(v)-Math.log(lo))/((Math.log(hi)-Math.log(lo))||1); // green->red + const r=Math.round(40+t*200),g=Math.round(190-t*150); return 'rgb('+r+','+g+',70)'; }; + let s=''+title+''; + cols.forEach((c,j)=>{ s+=''+c+''; }); + rows.forEach((rv,i)=>{ s+=''+rv+''; + cols.forEach((c,j)=>{ const v=cell[rv+'|'+c]; const x=L+j*cw,y=T+i*ch; + if(v) s+=''+rowLabel+'='+rv+' T='+c+': '+fmt(v)+' µs'+fmt(v)+''; + else s+=''; }); }); + s+=''; return s; +} +function renderHeatmaps(){ + const el=document.getElementById('heatmaps'); if(!el) return; + const Ts=[...new Set(DATA.filter(s=>s.phase===ST.phase).flatMap(s=>s.rows.map(r=>r.t)))].sort((a,b)=>a-b); + const eps=[...new Set(DATA.map(s=>s.ep))].sort((a,b)=>a-b); + const routs=[...new Set(DATA.map(s=>s.routing))].sort(); + const ress=[...new 
Set(DATA.map(s=>s.resource))].sort(); + const places=[...new Set(DATA.map(s=>s.placement||'packed'))].sort(); + const grids=[ + heatmap(s=>'EP'+s.ep, 'EP', eps.map(e=>'EP'+e), Ts, 'EP × tokens/rank — dispatch p50 (µs), '+ST.phase), + heatmap(s=>s.routing, 'routing', routs, Ts, 'Routing skew × token load — dispatch p50 (µs), '+ST.phase), + heatmap(s=>s.resource, 'resource', ress, Ts, 'Resource regime × token load — dispatch p50 (µs), '+ST.phase), + ]; + if(places.length>1) grids.push(heatmap(s=>s.placement||'packed','placement',places,Ts,'Placement × token load — dispatch p50 (µs), '+ST.phase)); + const shown=grids.filter(Boolean); + el.innerHTML=(shown.length? shown.map(g=>'
'+g+'
').join('') : '

No heatmap cells for this phase/suite.

') + +'

Cell = min dispatch p50 (µs) over matching publishable series; green→red = fast→slow (log). Blank = no measured point. Placement×node and a populated routing×load grid fill in as multi-node / skew runs land.

'; +} +// Coverage table (goal P2): publication status per measured config (validated=official, +// experimental=comparable/legacy, failed=invalid/failed). Supported/unsupported come from +// generate_matrix.py (capability), which records omissions with reasons. +function renderCoverage(){ + const cls={official:'#2ca02c','comparable-experimental':'#d6a72b',legacy:'#7f7f7f', + diagnostic:'#9467bd',invalid:'#d62728',failed:'#a30000'}; + const by={}; DATA.forEach(s=>{ (by[s.sku]=by[s.sku]||[]).push(s); }); + let h=''; + Object.keys(by).sort().forEach(sku=>{ + // sort by model then EP then label so the per-model coverage (which SKUs have which shape) groups. + by[sku].sort((a,b)=>(a.model||'').localeCompare(b.model||'')||(a.ep-b.ep)||a.label.localeCompare(b.label)).forEach(s=>{ + const ok=s.rows.filter(r=>r.correct).length; + // dispatch dtype / mode / contract, + combine-quant + activation profile ONLY when non-default + // (so today's bf16/none/normal rows stay uncluttered; a PR311 quant-combine run shows /cq:…). + const cfg=(s.dtype||'?')+'/'+s.mode+'/'+(s.contract||'?').replace('-v1','') + +((s.cqm&&s.cqm!=='none')?'/cq:'+s.cqm:'')+((s.act&&s.act!=='normal')?'/'+s.act:''); + // workload identity column (goal P1): canonical wid, else flag wid=null as an official blocker. + const wcell = s.wid? (''+s.wid.slice(0,10)+'') + : 'wid=null ⚠'; + h+='' + +'' + +'' + +'' + +''; + }); + }); + document.getElementById('coverage').innerHTML=h+'
SKUmodel (h/topk/e)EPconfigphaseroutingworkloadstatuscorrect pts
'+sku+''+(s.model||'?')+' '+SHAPE_KEY(s.shape)+''+s.ep+''+cfg+''+s.phase+''+s.routing+''+wcell+''+s.pub+''+ok+'/'+s.rows.length+'
' + +'

model column = the MoE shape (hidden/topk/experts) named per the model registry; this is the per-model coverage (which SKUs ran which model shape). workload=wid is the canonical workload id; wid=null marks a seeded-runtime (non-canonical) line that is capped at comparable-experimental and is hidden from the Official view. Status is machine-derived from validity (goal P1).

'; +} +// Failed / quarantined cases (goal immediate P2 "preserve failed cases in aggregation"): no-row +// failed-case records (classified wedge/timeout/crash) + diagnostic/invalid/failed docs, surfaced +// so a failure is never silently dropped. Diagnostic = quarantined (e.g. LL-FP8 roundtrip anomaly, +// MoRI resource-nonconforming) — kept, labelled, excluded from official/comparable. +function renderFailed(){ + const el=document.getElementById('failed'); if(!el) return; + if(typeof FAILED==='undefined' || !FAILED.length){ el.innerHTML='

No failed or quarantined cases — every run completed and is publishable.

'; return; } + const cls={failed:'#a30000',invalid:'#d62728',diagnostic:'#9467bd'}; + let h=''; + FAILED.slice().sort((a,b)=>(a.sku||'').localeCompare(b.sku||'')).forEach(r=>{ + h+='' + +'' + +''; + }); + el.innerHTML=h+'
SKUbackendphaseconfigstatusreason / failure moderc
'+r.sku+''+(r.backend||'?')+''+(r.phase||'?')+''+r.cfg+''+r.status+''+(r.reason||'?')+''+(r.rc==null?'—':r.rc)+'

Preserved, not dropped: failed-case records (run_in_container emits a tests/failure_taxonomy classification on a wedge/timeout/crash) + quarantined diagnostic/invalid docs (e.g. an LL-FP8 roundtrip anomaly, or a resource-nonconforming MoRI run). These are excluded from the official/comparable views above.

'; +} +// Distribution-sensitivity summary (review: don't add a 7th chart dimension — collapse it to one +// ratio per sku/backend/phase). p99(worst stressor distribution) / p99(uniform) at matched +// tokens/rank, computed by tests/sensitivity.py and injected as SENS. +function renderSensitivity(){ + const el=document.getElementById('sensitivity'); if(!el) return; + if(typeof SENS==='undefined' || !SENS.length){ el.innerHTML='

No multi-distribution groups in this view (need uniform + a stressor at matched tokens/rank).

'; return; } + let h=''; + SENS.slice().sort((a,b)=>(a.sku.localeCompare(b.sku))||a.backend.localeCompare(b.backend)||a.phase.localeCompare(b.phase)).forEach(r=>{ + const cfg=r.dispatch_dtype+'·'+r.mode+'·'+(r.contract||'').replace('-v1',''); + const rng=r.headline_p99_range_us, sr=r.distribution_sensitivity_ratio; + const sc = sr>=1.5?'#d62728':(sr>=1.2?'#d6a72b':'#2ca02c'); + const ev=r.eplb_recovery? (r.eplb_recovery.zipf.toFixed(2)+'→'+r.eplb_recovery['zipf+eplb'].toFixed(2)+'×') : '—'; + h+='' + +'' + +''; + }); + el.innerHTML=h+'
SKUbackendphaseconfigheadline p99 µsworst dist @TsensitivityEPLB zipf→+eplb
'+r.sku+''+r.backend+''+r.phase+''+cfg+''+rng[0]+'–'+rng[1]+''+r.worst_distribution+' @'+r.worst_at_T+''+sr.toFixed(2)+'×'+ev+'
' + +'

distribution_sensitivity_ratio = p99(worst stressor distribution) ÷ p99(uniform) at matched tokens/rank — how much routing skew/spread degrades this backend (>1 = fragile, ~1 = robust). Stressors exclude the min-comm best case + EPLB-remedied runs. A single number, NOT a chart dimension (tests/sensitivity.py).

'; +} +// Industry summary cards (goal P3-F): CARDS is precomputed in Python (main()) from the loaded +// series so the numbers match the analysis modules exactly. Rendered as a responsive grid. +function renderCards(){ + const el=document.getElementById('cards'); if(!el) return; + // bare reference (NOT window.CARDS): top-level const in a classic \n" + TAIL + with open(args.out, "w") as fh: + fh.write(html) + phases = sorted({s["phase"] for s in series}) + print(f"wrote {args.out} ({len(series)} series across SKUs={sorted({s['sku'] for s in series})}, phases={phases})") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/prune_results.py b/experimental/CollectiveX/prune_results.py new file mode 100644 index 000000000..e48cb8504 --- /dev/null +++ b/experimental/CollectiveX/prune_results.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +"""CollectiveX — prune results/ to the fresh canonical set. + +The results/ dir accumulates every GHA download across sessions (885+ files): many are SUPERSEDED +debug re-runs of the same config, stale runs from older code, or failed-case stubs that now have a +valid newer counterpart. This prunes to the FRESH canonical set: + + * group every result by its comparison_key (the config identity the plot/aggregator uses); + * within a group, keep the newest KEEP_PER_KEY runs whose publication_status/status is usable + (official | comparable-experimental | valid) — newest by generated_at; + * move everything else (older-than-KEEP valids, and failed/invalid runs that have >=1 usable run in + their group) to results/.superseded/ (NOT hard-deleted — recoverable; already out of the plot glob). + +Keeping KEEP_PER_KEY>1 preserves the repeat-run aggregation (median + error bands across runs, a +P0 deliverable) while removing the long tail of stale debug duplicates. A failed-case with NO usable +counterpart is KEPT (the "preserve genuinely-failed cases" deliverable). 
import argparse
import json
import os
import shutil

KEEP_PER_KEY = 3  # newest usable runs to keep per config (repeat-run aggregation)
USABLE = {"official", "comparable-experimental", "valid"}


def _doc_key(d: dict) -> str:
    """Return the config identity used to group result docs.

    Resolution order:
      1. the top-level ``comparison_key`` when it is truthy;
      2. else the sorted, ``|``-joined ``comparison_key`` values of the entries in ``groups``;
      3. else a ``|``-joined signature of family / runner / backend / phase /
         measurement_contract (missing fields contribute an empty string).
    """
    if d.get("comparison_key"):
        return str(d["comparison_key"])
    # `groups` may be absent or null; non-dict entries carry no key and are ignored.
    keys = [g.get("comparison_key") for g in (d.get("groups") or [])
            if isinstance(g, dict) and g.get("comparison_key")]
    if keys:
        return "|".join(sorted(str(k) for k in keys))
    return "|".join(str(d.get(k, "")) for k in
                    ("family", "runner", "backend", "phase", "measurement_contract"))


def _usable(d: dict) -> bool:
    """Return True when the doc's publication_status (or, failing that, status) is in USABLE."""
    ps = d.get("publication_status") or d.get("status")
    # isinstance guard: an unhashable status (list/dict) must read as "not usable", not raise.
    return isinstance(ps, str) and ps in USABLE


def main(argv=None) -> int:
    """Prune a results directory down to the newest usable runs per config.

    Args:
        argv: optional argument list (defaults to ``sys.argv[1:]`` when None), so the entry
            point can be driven programmatically.

    Returns:
        0 on completion (both for a real prune and for ``--dry-run``).

    Every top-level ``*.json`` in ``--results-dir`` (except ``env_*`` sidecars and
    ``analysis.json``) is grouped by ``_doc_key``. Within a group the newest
    ``--keep-per-key`` usable runs (by ``generated_at``) stay; every other file in that group
    is moved to ``<results-dir>/.superseded``. A group with no usable run at all is left
    untouched, so genuinely failed cases are preserved.
    """
    ap = argparse.ArgumentParser(description="Prune CollectiveX results/ to the fresh canonical set")
    ap.add_argument("--results-dir", default="results")
    ap.add_argument("--keep-per-key", type=int, default=KEEP_PER_KEY)
    ap.add_argument("--dry-run", action="store_true")
    a = ap.parse_args(argv)
    if a.keep_per_key < 0:
        # A negative count would slice from the wrong end (usable[:-1] keeps all but one).
        ap.error("--keep-per-key must be >= 0")

    rd = a.results_dir
    sup = os.path.join(rd, ".superseded")
    files = [f for f in os.listdir(rd) if f.endswith(".json")
             and not f.startswith("env_") and f != "analysis.json"]

    # key -> [(fname, generated_at, usable)]
    groups: dict = {}
    for f in files:
        try:
            with open(os.path.join(rd, f), encoding="utf-8") as fh:
                d = json.load(fh)
        except (OSError, ValueError):
            # unreadable / not JSON: leave the file where it is
            continue
        if not isinstance(d, dict):
            # a JSON list/scalar has no config identity; leave it alone rather than crash
            continue
        # Coerce to str so a null/missing timestamp sorts as oldest instead of raising on compare.
        stamp = str(d.get("generated_at") or "")
        groups.setdefault(_doc_key(d), []).append((f, stamp, _usable(d)))

    move = []
    for recs in groups.values():
        usable = sorted((r for r in recs if r[2]), key=lambda r: r[1], reverse=True)
        if not usable:
            # No usable run for this config: keep every record (failed cases are preserved).
            continue
        keep = {r[0] for r in usable[:a.keep_per_key]}
        move.extend(r[0] for r in recs if r[0] not in keep)

    print(f"prune: {len(files)} result files, {len(groups)} configs, keep<= {a.keep_per_key}/config -> "
          f"move {len(move)} superseded/stale to {sup}")
    if a.dry_run:
        preview = sorted(move)
        for f in preview[:20]:
            print("  would move:", f)
        if len(preview) > 20:
            print(f"  ... and {len(preview) - 20} more")
        return 0
    os.makedirs(sup, exist_ok=True)
    for f in move:
        try:
            shutil.move(os.path.join(rd, f), os.path.join(sup, f))
        except OSError as e:
            print(f"  WARN move {f}: {e!r}")
    print(f"pruned -> {len([x for x in os.listdir(rd) if x.endswith('.json')])} json kept in {rd}, "
          f"{len(os.listdir(sup))} in .superseded")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
The candidate is compared against that. + +A larger metric is slower (these are microsecond latencies). We flag: + + * REGRESSION candidate exceeds baseline by > --threshold (default 10%), AND the change is OUTSIDE + run-to-run noise. Noise is the historical variability of THIS (ck, T) point measured + by repeated runs (MAD / CV, computed exactly like repeated_runs.py). A "regression" + whose candidate value still sits inside the historical [median ± k·MAD] band — or + whose pct delta is within the historical CV — is reported as `regression-in-noise` + (noted, but NOT a CI-gating failure), because we cannot distinguish it from jitter. + * IMPROVEMENT candidate faster than baseline by > --threshold (and outside noise). + * OK |delta| within threshold. + +Configs with < 2 independent runs (and no explicit baseline) have no baseline -> `insufficient +history` (skipped, not failed). Missing rows / missing the chosen metric+percentile are skipped +gracefully. + +Exit code is non-zero iff at least one HARD regression (outside noise) is found, so CI can gate on +it. `--json` writes the full machine-readable report; a markdown table always goes to stdout. + + python3 regression.py results/ + python3 regression.py results/ --metric roundtrip --pct p99 --threshold 0.10 + python3 regression.py results/ --baseline published/headline/ --json regression.json + python3 regression.py results/ --metric dispatch --pct p95 --threshold 0.05 +""" +from __future__ import annotations + +import argparse +import glob +import json +import os +from collections import defaultdict + +# Operations / percentiles a row may carry. Mirrors the row schema used across the repo. +OPS = ("roundtrip", "dispatch", "combine") +PCTS = ("p50", "p90", "p95", "p99") + +# How many MADs around the historical median still count as "within run-to-run noise". 
# 3·MAD is a robust analogue of a 3-sigma band; a candidate inside it is statistically
# indistinguishable from the established jitter of this exact point, so we refuse to call it a
# hard regression.
NOISE_MAD_K = 3.0


def _is_num(v):
    """Return True for a real int/float measurement (bools and NaN are rejected)."""
    return isinstance(v, (int, float)) and not isinstance(v, bool) and v == v


def _p(r, op, pct):
    """Extract one percentile for one op from a row, tolerating both the nested-dict form
    (`r[op][pct]`) and the flat `r["{op}_us_{pct}"]` form. Returns None when absent."""
    if isinstance(r.get(op), dict):
        return r[op].get(pct)
    return r.get(f"{op}_us_{pct}")


def _median(xs):
    """Median of a sequence (mean of the two middle values for even length); NaN when empty."""
    s = sorted(xs)
    n = len(s)
    return (s[n // 2] if n % 2 else (s[n // 2 - 1] + s[n // 2]) / 2.0) if n else float("nan")


def _noise_stats(xs):
    """Run-to-run dispersion of a metric at one (ck, T): median, MAD and CV (population std /
    mean) over the independent-run values. Returns None for <2 points (no dispersion)."""
    n = len(xs)
    if n < 2:
        return None
    mean = sum(xs) / n
    std = (sum((x - mean) ** 2 for x in xs) / n) ** 0.5
    med = _median(xs)
    mad = _median([abs(x - med) for x in xs])
    return {"n": n, "median": round(med, 3), "mad": round(mad, 3),
            "cv": round(std / mean, 4) if mean > 0 else None}


def _parse_ts(doc):
    """Sort key for recency, derived from the doc's `generated_at`.

    An ISO-8601 string is normalized to UTC with fixed microsecond precision, so a plain string
    compare orders timestamps correctly even when they were written with different UTC offsets
    (or `Z` vs `+00:00`). A string that does not parse is returned unchanged; a missing or
    non-string value yields "" so the doc sorts as oldest.
    """
    ts = doc.get("generated_at")
    if not isinstance(ts, str) or not ts:
        return ""
    from datetime import datetime, timezone  # local import: the file's import block is untouched

    raw = ts.strip()
    if raw[-1:] in ("Z", "z"):
        # datetime.fromisoformat only accepts a trailing 'Z' from Python 3.11 on.
        raw = raw[:-1] + "+00:00"
    try:
        dt = datetime.fromisoformat(raw)
    except ValueError:
        return ts
    if dt.tzinfo is None:
        # NOTE(review): naive timestamps are assumed to already be UTC — confirm with the writer.
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc).isoformat(timespec="microseconds")


def load(paths):
    """Load moe result docs from files/dirs into per-run records.

    Skips `env_*` sidecars, unreadable / non-object JSON, docs whose family is not "moe" or that
    have no rows, failed-case records, and docs without a comparison_key. Files are collapsed to
    ONE record per run id (`reproduction.git_run.run_id`, falling back to the file name); when a
    run id appears more than once the record with the newest `generated_at` wins.

    Returns {comparison_key: {run_id: record}} where record["rows"] maps tokens_per_rank -> row.
    """
    files = []
    for p in paths:
        if os.path.isdir(p):
            files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True)
        elif os.path.isfile(p):
            files.append(p)
    files = sorted(f for f in files if not os.path.basename(f).startswith("env_"))

    by_ck = defaultdict(dict)  # ck -> {run_id: record}
    for f in files:
        try:
            with open(f, encoding="utf-8") as fh:
                doc = json.load(fh)
        except (OSError, ValueError):
            continue
        if not isinstance(doc, dict):
            continue
        rows = doc.get("rows")
        if doc.get("family") != "moe" or not rows or not isinstance(rows, list):
            continue
        if doc.get("record_type") == "failed-case":
            continue
        ck = doc.get("comparison_key")
        if not ck:
            continue
        gr = (doc.get("reproduction") or {}).get("git_run") or {}
        run_id = gr.get("run_id") or os.path.basename(f)
        rec = {
            "file": os.path.basename(f),
            "run_id": run_id,
            "generated_at": _parse_ts(doc),
            "runner": doc.get("runner") or "?",
            "publication_status": doc.get("publication_status"),
            # rows that are not objects, or lack tokens_per_rank, carry no comparable point
            "rows": {r["tokens_per_rank"]: r for r in rows
                     if isinstance(r, dict) and "tokens_per_rank" in r},
        }
        # Same run_id seen again (e.g. several files from one job): keep the newest so each
        # independent run contributes a single set of values.
        prev = by_ck[ck].get(run_id)
        if prev is None or rec["generated_at"] >= prev["generated_at"]:
            by_ck[ck][run_id] = rec
    return by_ck


def _baseline_index(paths, metric, pct):
    """Build an explicit-baseline lookup {(comparison_key, T): value} from baseline files/dirs.
    Each (ck, T) takes its value from the newest baseline doc that carries that point."""
    idx = {}  # (ck, T) -> (generated_at, value)
    for ck, runs in load(paths).items():
        for run in runs.values():
            for T, row in run["rows"].items():
                val = _p(row, metric, pct)
                if not _is_num(val):
                    continue
                cur = idx.get((ck, T))
                if cur is None or run["generated_at"] >= cur[0]:
                    idx[(ck, T)] = (run["generated_at"], val)
    return {k: v[1] for k, v in idx.items()}


def _verdict(baseline, candidate, threshold, noise):
    """Classify one (ck, T). Returns (verdict, pct_delta, within_noise).

    pct_delta > 0 means the candidate is SLOWER (worse) than baseline. within_noise is True when
    the change cannot be distinguished from this point's historical run-to-run jitter: either the
    candidate lies inside the historical [median ± k·MAD] band, or |pct_delta| is within the
    historical CV. A change inside noise is never a HARD regression/improvement. A missing,
    non-numeric or non-positive baseline (or a missing/non-numeric candidate) yields "skip".
    """
    if not _is_num(baseline) or not _is_num(candidate) or baseline <= 0:
        return "skip", None, False
    delta = (candidate - baseline) / baseline

    within_noise = False
    if noise:
        cv = noise.get("cv")
        med, mad = noise.get("median"), noise.get("mad")
        # band test: candidate within k·MAD of the historical median.
        if med is not None and mad is not None and mad > 0 and abs(candidate - med) <= NOISE_MAD_K * mad:
            within_noise = True
        # cv test: the observed move is no larger than typical run-to-run variation.
        if cv is not None and abs(delta) <= cv:
            within_noise = True

    if delta > threshold:
        return ("regression-in-noise" if within_noise else "regression"), delta, within_noise
    if delta < -threshold:
        return ("improvement-in-noise" if within_noise else "improvement"), delta, within_noise
    return "ok", delta, within_noise


def analyze(paths, metric="roundtrip", pct="p99", threshold=0.10, baseline_paths=None):
    """Core comparison. For each (comparison_key, T): establish the baseline (explicit if
    provided, else the median of all-but-newest runs), the candidate (newest run carrying the
    point), the historical noise (MAD/CV over all runs at that point, candidate included) and a
    verdict. Returns a structured report dict; `hard_regressions` counts verdict == "regression".
    """
    explicit = _baseline_index(baseline_paths, metric, pct) if baseline_paths else None
    by_ck = load(paths)

    points = []
    insufficient = []
    for ck in sorted(by_ck):
        runs = sorted(by_ck[ck].values(), key=lambda r: r["generated_at"])
        n_runs = len(runs)
        # All T measured across this config's runs.
        all_T = sorted({T for r in runs for T in r["rows"]})
        for T in all_T:
            # (run, value) in chronological order, one per independent run that has a usable
            # numeric value for this point.
            series = [(r, _p(r["rows"][T], metric, pct)) for r in runs if T in r["rows"]]
            series = [(r, v) for r, v in series if _is_num(v)]
            if not series:
                continue
            cand_run, cand_val = series[-1]          # newest run with this point
            noise = _noise_stats([v for _, v in series])

            if explicit is not None:
                # An explicit baseline is authoritative: compare ONLY points it covers. Points
                # it lacks are insufficient — never silently fall back to a historical median,
                # so a single report mixes only one baseline notion.
                if (ck, T) not in explicit:
                    insufficient.append({"comparison_key": ck, "tokens_per_rank": T,
                                         "runner": cand_run["runner"], "n_runs": n_runs,
                                         "reason": "not in explicit baseline"})
                    continue
                base_val = explicit[(ck, T)]
                base_kind = "explicit"
                base_n = 1
            else:
                older = [v for _, v in series[:-1]]   # all-but-newest
                if not older:
                    # <2 independent runs -> no historical baseline for this point.
                    insufficient.append({"comparison_key": ck, "tokens_per_rank": T,
                                         "runner": cand_run["runner"], "n_runs": n_runs,
                                         "reason": "<2 independent runs"})
                    continue
                base_val = _median(older)
                base_kind = "historical-median"
                base_n = len(older)

            verdict, delta, within_noise = _verdict(base_val, cand_val, threshold, noise)
            if verdict == "skip":
                continue
            points.append({
                "comparison_key": ck,
                "tokens_per_rank": T,
                "runner": cand_run["runner"],
                "publication_status": cand_run["publication_status"],
                "baseline_kind": base_kind,
                "baseline_runs": base_n,
                "n_independent_runs": n_runs,
                "baseline": round(base_val, 3),
                "candidate": round(cand_val, 3),
                "candidate_file": cand_run["file"],
                "pct_delta": round(delta, 4),
                "verdict": verdict,
                "within_noise": within_noise,
                "noise": noise,
            })

    n_reg = sum(1 for p in points if p["verdict"] == "regression")
    n_reg_noise = sum(1 for p in points if p["verdict"] == "regression-in-noise")
    # "improvement" deliberately counts both sides of the noise line so the four buckets
    # (regression / regression_in_noise / improvement / ok) partition the compared points.
    n_imp = sum(1 for p in points if p["verdict"].startswith("improvement"))
    n_ok = sum(1 for p in points if p["verdict"] == "ok")
    # rank worst-first: hard regressions, then by delta.
    points.sort(key=lambda p: (p["verdict"] != "regression", -p["pct_delta"]))
    return {
        "metric": metric, "percentile": pct, "threshold": threshold,
        "noise_mad_k": NOISE_MAD_K,
        "baseline_source": ("explicit:" + ",".join(baseline_paths)) if baseline_paths else "historical-median",
        "n_comparison_keys": len(by_ck),
        "n_points_compared": len(points),
        "n_insufficient_history": len(insufficient),
        "counts": {"regression": n_reg, "regression_in_noise": n_reg_noise,
                   "improvement": n_imp, "ok": n_ok},
        "hard_regressions": n_reg,
        "points": points,
        "insufficient_history": insufficient,
    }


_VERDICT_MARK = {
    "regression": "REGRESSION", "regression-in-noise": "regression (noise)",
    "improvement": "improvement", "improvement-in-noise": "improvement (noise)",
    "ok": "ok",
}


def to_markdown(report):
    """Render an `analyze` report as markdown: a summary header, then a table of every point
    whose verdict is not "ok" (or a one-line note when nothing moved)."""
    m, pct, thr = report["metric"], report["percentile"], report["threshold"]
    c = report["counts"]
    h = (f"### Performance regression — {m} {pct} (threshold ±{thr:.0%}, "
         f"noise band {report['noise_mad_k']:g}·MAD)\n\n"
         f"Baseline: {report['baseline_source']}. "
         f"{report['n_points_compared']} (config, T) point(s) compared across "
         f"{report['n_comparison_keys']} comparison_key(s); "
         f"{report['n_insufficient_history']} point(s) have insufficient history.\n\n"
         f"**{c['regression']} regression · {c['improvement']} improvement · {c['ok']} ok · "
         f"{c['regression_in_noise']} regression-in-noise.**\n\n")

    # Only surface points that moved (regression/improvement, either side of the noise line). A
    # wall of "ok" rows is noise; the counts line above already accounts for them.
    moved = [p for p in report["points"] if p["verdict"] != "ok"]
    if not moved:
        h += ("_No (config, T) point moved beyond the threshold — every compared point is within "
              f"±{thr:.0%} of its baseline (or inside run-to-run noise)._\n")
        return h
    h += ("| comparison_key | T | runner | baseline | candidate | Δ% | verdict | within noise |\n"
          "|---|--:|---|--:|--:|--:|---|---|\n")
    for p in moved:
        n = p["noise"]
        if not n:
            noise_txt = "n<2"
        elif n.get("cv") is not None:
            noise_txt = f"CV={n['cv']}, MAD={n['mad']} (n={n['n']})"
        else:
            noise_txt = "—"
        # The dispersion behind the yes/no is shown alongside it so a reader can judge the call.
        h += (f"| `{(p['comparison_key'] or '')[:12]}` | {p['tokens_per_rank']} | {p['runner']} | "
              f"{p['baseline']:.1f} | {p['candidate']:.1f} | {p['pct_delta']:+.1%} | "
              f"{_VERDICT_MARK.get(p['verdict'], p['verdict'])} | "
              f"{'yes' if p['within_noise'] else 'no'} ({noise_txt}) |\n")
    if report["hard_regressions"]:
        h += (f"\n**{report['hard_regressions']} hard regression(s) outside run-to-run noise — "
              f"CI gate fails (exit 1).**\n")
    return h


def main(argv=None) -> int:
    """CLI entry point: analyze, optionally write the JSON report, print the markdown table.

    Args:
        argv: optional argument list (defaults to ``sys.argv[1:]`` when None).

    Returns:
        1 when at least one hard regression (outside noise) was found, else 0.
    """
    ap = argparse.ArgumentParser(description="CollectiveX performance-regression thresholds")
    ap.add_argument("paths", nargs="*", default=["results"],
                    help="result JSON files or dirs (default: results)")
    ap.add_argument("--baseline", action="append", default=None,
                    help="explicit baseline file/dir (repeatable). Default: historical median of "
                         "all-but-newest runs per (config, T).")
    ap.add_argument("--metric", default="roundtrip", choices=list(OPS),
                    help="operation to compare (default roundtrip)")
    ap.add_argument("--pct", default="p99", choices=list(PCTS),
                    help="percentile to compare (default p99)")
    ap.add_argument("--threshold", type=float, default=0.10,
                    help="fractional change to flag, e.g. 0.10 = ±10%% (default 0.10)")
    ap.add_argument("--json", dest="json_out", help="also write the full report to this JSON file")
    a = ap.parse_args(argv)

    report = analyze(a.paths or ["results"], metric=a.metric, pct=a.pct,
                     threshold=a.threshold, baseline_paths=a.baseline)
    if a.json_out:
        os.makedirs(os.path.dirname(a.json_out) or ".", exist_ok=True)
        # `with` guarantees the report is flushed and closed before we announce it.
        with open(a.json_out, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2, sort_keys=True)
        print(f"wrote {a.json_out}")
    print(to_markdown(report))
    # Non-zero exit iff a hard regression (outside noise) exists, so CI can gate on it.
    return 1 if report["hard_regressions"] else 0


if __name__ == "__main__":
    raise SystemExit(main())
import argparse
import glob
import json
import os
from collections import defaultdict


def _p(r, op, pct):
    """Extract one percentile for one op from a row, accepting the nested form (`r[op][pct]`)
    or the flat `r["{op}_us_{pct}"]` form. Returns None when absent."""
    if isinstance(r.get(op), dict):
        return r[op].get(pct)
    return r.get(f"{op}_us_{pct}")


def _median(xs):
    """Median of a sequence (mean of the two middle values for even length); NaN when empty."""
    s = sorted(xs)
    n = len(s)
    return (s[n // 2] if n % 2 else (s[n // 2 - 1] + s[n // 2]) / 2.0) if n else float("nan")


def _stats(xs):
    """Summary of one metric across runs: n, median, min, max, mean, CV (population std / mean,
    None when mean <= 0) and MAD. Returns None for an empty input."""
    n = len(xs)
    if n == 0:
        return None
    mean = sum(xs) / n
    var = sum((x - mean) ** 2 for x in xs) / n
    std = var ** 0.5
    med = _median(xs)
    mad = _median([abs(x - med) for x in xs])
    return {"n": n, "median": round(med, 3), "min": round(min(xs), 3), "max": round(max(xs), 3),
            "mean": round(mean, 3), "cv": round(std / mean, 4) if mean > 0 else None,
            "mad": round(mad, 3)}


def load(results_dir):
    """Load every moe result doc under `results_dir` (recursively) as a list of run records.

    Skips `env_*` sidecars, unreadable / non-object JSON, and docs whose family is not "moe" or
    that have no rows. Each record carries the file name, comparison key (`ck`), run id
    (`reproduction.git_run.run_id`, falling back to the file name), the SKU prefix of `runner`,
    the first row's `samples_pooled`, and `rows` keyed by tokens_per_rank.
    """
    runs = []
    for f in sorted(glob.glob(os.path.join(results_dir, "**", "*.json"), recursive=True)):
        if os.path.basename(f).startswith("env_"):
            continue
        try:
            with open(f, encoding="utf-8") as fh:
                doc = json.load(fh)
        except (OSError, ValueError):
            continue
        if not isinstance(doc, dict):
            continue
        rows = doc.get("rows")
        if doc.get("family") != "moe" or not rows or not isinstance(rows, list):
            continue
        gr = (doc.get("reproduction") or {}).get("git_run") or {}
        first = rows[0]
        runs.append({
            "file": os.path.basename(f), "ck": doc.get("comparison_key"),
            "run_id": gr.get("run_id") or os.path.basename(f),
            "sku": (doc.get("runner") or "?").split("_")[0].split("-")[0],
            "samples_pooled": first.get("samples_pooled") if isinstance(first, dict) else None,
            # a row without tokens_per_rank has no T to key on; skip it rather than raise KeyError
            "rows": {r["tokens_per_rank"]: r for r in rows
                     if isinstance(r, dict) and "tokens_per_rank" in r},
        })
    return runs


def analyze(results_dir, metric="roundtrip", cv_threshold=0.15, min_runs=2):
    """Compute run-to-run statistics per (comparison_key, tokens/rank, op, percentile).

    Runs are grouped by comparison key; within a group each distinct run id contributes one
    record (the first file seen for that id). For every T the dispatch/combine/roundtrip p50 and
    p99 values across runs are summarized with `_stats`. A T point counts as stable when the
    chosen metric's p99 has >= `min_runs` values and CV <= `cv_threshold`; a cohort is flagged
    `<metric>_p99_stable` when it has at least one stable point and no unstable one.
    """
    by_ck = defaultdict(list)
    for r in load(results_dir):
        if r["ck"]:
            by_ck[r["ck"]].append(r)
    out = []
    for ck, group in by_ck.items():
        # independent job reps = distinct run ids within this comparison_key.
        run_ids = sorted({g["run_id"] for g in group})
        n_runs = len(run_ids)
        # one record per independent run (the first file for a run id).
        per_run = {}
        for g in group:
            per_run.setdefault(g["run_id"], g)
        Ts = sorted({t for g in per_run.values() for t in g["rows"]})
        points = []
        for T in Ts:
            rec = {"tokens_per_rank": T, "n_independent_runs": 0}
            for op in ("dispatch", "combine", "roundtrip"):
                for pct in ("p50", "p99"):
                    xs = [_p(g["rows"][T], op, pct) for g in per_run.values()
                          if T in g["rows"] and _p(g["rows"][T], op, pct) is not None]
                    st = _stats(xs)
                    if st:
                        rec[f"{op}_{pct}"] = st
                        rec["n_independent_runs"] = max(rec["n_independent_runs"], st["n"])
            points.append(rec)
        # stability verdict on the chosen metric's p99.
        stable_pts, unstable_pts = [], []
        for rec in points:
            st = rec.get(f"{metric}_p99")
            if st and st["n"] >= min_runs and st["cv"] is not None:
                (stable_pts if st["cv"] <= cv_threshold else unstable_pts).append(
                    {"T": rec["tokens_per_rank"], "cv": st["cv"], "n": st["n"]})
        out.append({
            "comparison_key": ck, "skus": sorted({g["sku"] for g in group}),
            "n_independent_runs": n_runs, "run_ids": run_ids,
            "in_process_samples_per_run": sorted({g["samples_pooled"] for g in group if g["samples_pooled"]}),
            f"{metric}_p99_stable": len(stable_pts) > 0 and not unstable_pts,
            "stable_points": stable_pts, "unstable_points": unstable_pts,
            "points": points,
        })
    out.sort(key=lambda c: -c["n_independent_runs"])
    return {"metric": metric, "cv_threshold": cv_threshold, "min_runs": min_runs,
            "n_comparison_keys": len(out),
            "n_with_repeats": sum(1 for c in out if c["n_independent_runs"] >= min_runs),
            "cohorts": out}


def to_markdown(report):
    """Render an `analyze` report as markdown: a header, then one table row per cohort that has
    at least `min_runs` independent runs (or a note when no cohort qualifies)."""
    rep = [c for c in report["cohorts"] if c["n_independent_runs"] >= report["min_runs"]]
    h = (f"### Repeated-run stability ({report['metric']} p99; CV ≤ {report['cv_threshold']} over "
         f"≥ {report['min_runs']} independent runs)\n\n"
         f"{report['n_with_repeats']}/{report['n_comparison_keys']} comparison_keys have ≥ "
         f"{report['min_runs']} independent runs.\n\n")
    if not rep:
        return h + ("_No config has been run as ≥2 independent jobs yet — every point is a single "
                    "job's pooled in-process trials. Re-dispatch a config to populate run-to-run "
                    "stability (an official p99 claim requires it)._\n")
    h += "| comparison_key | SKUs | runs | p99 stable | stable/unstable pts |\n|---|---|---|---|---|\n"
    for c in rep:
        h += (f"| `{(c['comparison_key'] or '')[:12]}` | {','.join(c['skus'])} | "
              f"{c['n_independent_runs']} | {'YES' if c[report['metric']+'_p99_stable'] else 'NO'} | "
              f"{len(c['stable_points'])}✓/{len(c['unstable_points'])}✗ |\n")
    return h


def main(argv=None) -> int:
    """CLI entry point: analyze `--results-dir`, optionally write the JSON report to `--out`,
    and print the markdown summary.

    Args:
        argv: optional argument list (defaults to ``sys.argv[1:]`` when None).

    Returns:
        Always 0.
    """
    ap = argparse.ArgumentParser(description="CollectiveX repeated independent-run statistics")
    ap.add_argument("--results-dir", default="results")
    ap.add_argument("--metric", default="roundtrip", choices=["roundtrip", "dispatch", "combine"])
    ap.add_argument("--cv-threshold", type=float, default=0.15)
    ap.add_argument("--min-runs", type=int, default=2)
    ap.add_argument("--out")
    a = ap.parse_args(argv)
    report = analyze(a.results_dir, a.metric, a.cv_threshold, a.min_runs)
    if a.out:
        os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
        # `with` guarantees the report is flushed and closed before we announce it.
        with open(a.out, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2, sort_keys=True)
        print(f"wrote {a.out}")
    print(to_markdown(report))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+# run_deepep.py : torch + deep_ep — provided by the benchmark +# container; DeepEP is built at job setup +# (rebuild-deepep), NOT pinned here. +# plot.py : the only thing worth a local venv: +matplotlib +numpy diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep new file mode 100644 index 000000000..8940934a2 --- /dev/null +++ b/experimental/CollectiveX/results/.gitkeep @@ -0,0 +1,3 @@ +# CollectiveX result bundles land here as flat *.json (one per runner×op), +# plus plots/ and raw_*.txt captures (gitignored). Keep this file so the dir +# exists before the first run. diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py new file mode 100644 index 000000000..c22654c59 --- /dev/null +++ b/experimental/CollectiveX/run_nccl.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — NCCL primitive benchmark wrapper. + +Runs stock `nccl-tests` binaries (built in-container at job time — the login +nodes have no nvcc), parses the text table (NOT JSON — we do not assume the +build emits JSON), and writes a flat, provenance-tagged JSON result the plot +script and the eventual schema-freeze can consume. + +Standard library only, so it runs in any minimal container. 
SCHEMA_VERSION = 1
MEASUREMENT_CONTRACT = "nccl-tests-v1"

# op -> nccl-tests binary name
OP_BINARY = {
    "all_reduce": "all_reduce_perf",
    "all_gather": "all_gather_perf",
    "reduce_scatter": "reduce_scatter_perf",
    "alltoall": "alltoall_perf",
    "all_to_all": "alltoall_perf",
    "broadcast": "broadcast_perf",
    "sendrecv": "sendrecv_perf",
}


def _f(tok: str):
    """Parse a numeric cell; nccl-tests prints 'N/A' for #wrong when -c 0.

    Returns a float, or ``None`` for a placeholder or non-numeric cell.
    """
    if tok in ("N/A", "n/a", "-"):
        return None
    try:
        return float(tok)
    except ValueError:
        return None


def parse_nccl_table(text: str) -> tuple[list[dict], dict]:
    """Parse nccl-tests stdout into per-size rows + a run summary.

    Robust across ops: the column count varies (all_reduce/reduce_scatter carry
    redop+root; all_gather/alltoall do not), but every op prints the same 8
    trailing numeric columns — out-of-place (time, algbw, busbw, #wrong) then
    in-place (time, algbw, busbw, #wrong). `size` is always the first token and
    `type` the third. So we key off the first token and the last 8 tokens.

    Returns ``(rows, summary)``. ``summary`` carries ``avg_busbw_gbps``,
    ``out_of_bounds`` and ``check_passed``, each ``None`` when the matching
    ``#`` line is missing from *text*.
    """
    rows: list[dict] = []
    summary: dict = {"avg_busbw_gbps": None, "out_of_bounds": None, "check_passed": None}
    for line in text.splitlines():
        s = line.strip()
        if not s:
            continue
        if s.startswith("#"):
            # Comment lines only matter for the two trailing summary lines.
            if "Avg bus bandwidth" in s:
                summary["avg_busbw_gbps"] = _f(s.split(":")[-1].strip())
            elif "Out of bounds values" in s:
                tail = s.split(":")[-1].strip()
                summary["out_of_bounds"] = tail
                summary["check_passed"] = tail.endswith("OK")
            continue
        toks = s.split()
        # Data line: first token is the byte size (all digits), and we need the
        # 8 trailing metric columns plus size+count+type up front (>=11 tokens).
        if len(toks) < 11 or not toks[0].isdigit():
            continue
        (oop_time, oop_algbw, oop_busbw, oop_wrong,
         ip_time, ip_algbw, ip_busbw, ip_wrong) = (_f(t) for t in toks[-8:])
        if oop_wrong is None and ip_wrong is None:
            correct = None  # correctness check disabled or not reported
        else:
            correct = (oop_wrong or 0) == 0 and (ip_wrong or 0) == 0
        rows.append(
            {
                "size_bytes": int(toks[0]),
                "dtype": toks[2],
                "out_of_place": {
                    "time_us": oop_time,
                    "algbw_gbps": oop_algbw,
                    "busbw_gbps": oop_busbw,
                    "wrong": oop_wrong,
                },
                "in_place": {
                    "time_us": ip_time,
                    "algbw_gbps": ip_algbw,
                    "busbw_gbps": ip_busbw,
                    "wrong": ip_wrong,
                },
                # convenience: best (max) busbw across the two placements
                "busbw_gbps": max(
                    (b for b in (oop_busbw, ip_busbw) if b is not None),
                    default=None,
                ),
                "correct": correct,
            }
        )
    return rows, summary


def comparison_key(meta: dict) -> str:
    """Machine key gating which rows may share a curve (see plan §Comparability).

    Topology-class is intentionally part of the key, so B200(IB) and
    GB200(MNNVL) are labelled distinct rather than silently overlaid.

    Every field is stringified before joining: ``meta["dtype"]`` is ``None``
    when no rows were parsed, and joining a ``None`` would raise ``TypeError``
    before the ``invalid`` record could be written. Keys for string-valued
    fields are unchanged by the conversion.
    """
    fields = (
        "op",
        "dtype",
        "world_size",
        "nodes",
        "topology_class",
        "comparison_class",
        "measurement_contract",
    )
    joined = "|".join(str(meta[name]) for name in fields)
    return hashlib.sha256(joined.encode()).hexdigest()[:16]


def build_command(args, binary_path: str) -> list[str]:
    """Assemble the nccl-tests argv: optional launch prefix, binary, knobs, extra args.

    ``args.launch_prefix`` and ``args.extra_args`` are split on whitespace.
    """
    cmd: list[str] = []
    if args.launch_prefix:
        cmd += args.launch_prefix.split()
    cmd += [
        binary_path,
        "-b", str(args.min_bytes),
        "-e", str(args.max_bytes),
        "-f", str(args.factor),
        "-g", str(args.gpus_per_proc),
        "-c", str(args.check),
        "-w", str(args.warmup),
        "-n", str(args.iters),
    ]
    if args.extra_args:
        cmd += args.extra_args.split()
    return cmd


def main() -> int:
    """Run (or replay with ``--parse-only``) one nccl-tests op and write a JSON result.

    Exit status: 0 when the record's status is ``valid``, 1 when it is
    ``invalid`` (the record is still written), 2 when the binary is missing.
    """
    ap = argparse.ArgumentParser(description="CollectiveX NCCL primitive runner")
    ap.add_argument("--op", required=True, choices=sorted(OP_BINARY))
    ap.add_argument("--nccl-tests-dir", help="dir containing _perf binaries (build/)")
    ap.add_argument("--parse-only", help="parse this captured stdout file instead of running")
    # nccl-tests knobs
    ap.add_argument("--min-bytes", default="8")
    ap.add_argument("--max-bytes", default="8G")
    ap.add_argument("--factor", type=int, default=2, help="size step factor")
    ap.add_argument("--gpus-per-proc", type=int, default=8,
                    help="-g: GPUs per process (single-node multi-GPU). Use 1 under MPI.")
    ap.add_argument("--check", type=int, default=1, help="-c: 1 enables correctness check")
    ap.add_argument("--warmup", type=int, default=5)
    ap.add_argument("--iters", type=int, default=20)
    ap.add_argument("--extra-args", default="", help="extra args appended to the binary")
    ap.add_argument("--launch-prefix", default="",
                    help="e.g. 'mpirun -np 16 --hostfile hf' for multi-node; empty for single-node -g mode")
    # provenance
    ap.add_argument("--runner", required=True, help="runner label, e.g. b200-dgxc")
    ap.add_argument("--world-size", type=int, required=True, help="total ranks/GPUs in the run")
    ap.add_argument("--nodes", type=int, default=1)
    ap.add_argument("--topology-class", required=True,
                    help="e.g. b200-nvlink-island, b200-nvlink-island+cx7-ib, gb200-nvl72-mnnvl")
    ap.add_argument("--transport", default="", help="observed transport label: nvlink | ib | mnnvl")
    ap.add_argument("--comparison-class", default="standardized",
                    choices=["standardized", "backend-optimized", "framework-integrated"])
    ap.add_argument("--env-json", help="path to env_capture.py output to embed")
    ap.add_argument("--timestamp", help="ISO timestamp (default now)")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    binary = OP_BINARY[args.op]
    command = None
    if args.parse_only:
        with open(args.parse_only) as fh:
            stdout = fh.read()
        ran_ok = True
    else:
        if not args.nccl_tests_dir:
            ap.error("--nccl-tests-dir is required unless --parse-only is given")
        binary_path = os.path.join(args.nccl_tests_dir, binary)
        if not os.path.exists(binary_path):
            print(f"ERROR: binary not found: {binary_path}", file=sys.stderr)
            return 2
        command = build_command(args, binary_path)
        print("running:", " ".join(command), file=sys.stderr)
        proc = subprocess.run(command, capture_output=True, text=True, check=False)
        stdout = proc.stdout
        ran_ok = proc.returncode == 0
        if not ran_ok:
            # Surface the captured output so the failure is diagnosable from the job log.
            print(stdout, file=sys.stderr)
            print(proc.stderr, file=sys.stderr)
            print(f"ERROR: {binary} exited {proc.returncode}", file=sys.stderr)

    rows, summary = parse_nccl_table(stdout)
    # None when nothing parsed; comparison_key() tolerates that.
    dtype = rows[0]["dtype"] if rows else None

    meta = {
        "op": args.op,
        "dtype": dtype,
        "world_size": args.world_size,
        "nodes": args.nodes,
        "topology_class": args.topology_class,
        "comparison_class": args.comparison_class,
        "measurement_contract": MEASUREMENT_CONTRACT,
    }

    env = None
    if args.env_json and os.path.exists(args.env_json):
        with open(args.env_json) as fh:
            env = json.load(fh)

    # All-zero busbw means the benchmark didn't actually communicate — e.g. an
    # MPI=0 binary launched under srun --mpi=pmix runs as N standalone world=1
    # procs (busbw formula -> 0). Don't let that pass the gate as "valid".
    peak_busbw = max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)

    # The check passes when nccl-tests reported OK, or when checking was
    # explicitly disabled (-c 0) and no verdict line was printed.
    check_passed = summary.get("check_passed")
    check_ok = check_passed is True or (args.check == 0 and check_passed is None)
    status = "valid" if (rows and ran_ok and peak_busbw > 0.0 and check_ok) else "invalid"

    doc = {
        "schema_version": SCHEMA_VERSION,
        "family": "nccl",
        "generated_by": "run_nccl.py",
        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
        "runner": args.runner,
        "binary": binary,
        # NOTE(review): the parse-only placeholder is an empty string in the text under
        # review (an f-string with no fields); confirm whether a label was intended.
        "command": " ".join(command) if command else "",
        "transport": args.transport,
        "status": status,
        "comparison_key": comparison_key(meta),
        **meta,
        "summary": summary,
        "num_rows": len(rows),
        "rows": rows,
        "environment": env,
    }

    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
    with open(args.out, "w") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")

    print(
        f"{args.op}: parsed {len(rows)} sizes -> {args.out} "
        f"(status={doc['status']}, avg_busbw={summary.get('avg_busbw_gbps')} GB/s, "
        f"key={doc['comparison_key']})"
    )
    return 0 if doc["status"] == "valid" else 1


if __name__ == "__main__":
    raise SystemExit(main())
+# +# torch.distributed's gloo/NCCL TCP bootstrap advertises each rank's address from its hostname. On +# clusters whose /etc/hosts aliases the hostname to loopback 127.0.1.1 (MI355X) the per-rank gloo +# connectFullMesh then tries to connect to 127.0.1.1 and fails ("Gloo connectFullMesh ... Connection +# refused, remote=[127.0.1.1]"). Pinning GLOO_SOCKET_IFNAME / NCCL_SOCKET_IFNAME to the NIC that holds +# the cluster's routable address (the 10.x management/ethernet subnet) makes the mesh advertise the +# reachable interface. RDMA EP transports (UCCL/MoRI/IBGDA) use their own RDMA NICs; this only fixes +# the TCP control-plane rendezvous. +# +# NOTE this does NOT change the TCPStore *connect target* (that is MASTER_ADDR, fixed by the launcher): +# if the rank-0 MASTER_ADDR is unreachable from inside a peer's container network namespace, no iface +# pin helps — that is a cluster topology / container-net property, surfaced by the diagnostic below. +# +# The diagnostic ALWAYS prints what the container can see (hostname + every IPv4), so a cross-node GHA +# log is self-documenting even when auto-detection or reachability fails. Robust to a missing iproute2 +# (`ip`) in minimal CUDA images: falls back to `hostname -I` / /proc parsing. + +# ---- diagnostic: what does this container's network namespace actually see? 
# ---- diagnostic: what does this container's network namespace actually see? ----
# Always printed (to stderr) so a cross-node log shows the hostname and every IPv4 address.
_cx_host="$(hostname 2>/dev/null || echo '?')"
if command -v ip >/dev/null 2>&1; then
  # "iface=addr/prefix" pairs, space separated.
  _cx_addrs="$(ip -o -4 addr show 2>/dev/null | awk '{print $2"="$4}' | tr '\n' ' ')"
else
  # Minimal images may lack iproute2; fall back to `hostname -I`.
  _cx_addrs="(no iproute2) hostname-I=[$(hostname -I 2>/dev/null)]"
fi
printf '[collectivex] xnode-net host=%s rank=%s addrs: %s\n' "$_cx_host" "${RANK:-?}" "$_cx_addrs" >&2

# ---- pin GLOO/NCCL bootstrap iface to the routable 10.x NIC (operator override respected) ----
# Auto-detection runs only when GLOO_SOCKET_IFNAME is unset. An NCCL_SOCKET_IFNAME that the
# operator (or the SKU adapter) already exported is kept as-is rather than overwritten.
if [ -z "${GLOO_SOCKET_IFNAME:-}" ]; then
  _cx_if=""
  if command -v ip >/dev/null 2>&1; then
    # First interface holding a 10.x IPv4 address.
    _cx_if="$(ip -o -4 addr show 2>/dev/null | awk '$4 ~ /^10\./ {print $2; exit}')"
  fi
  if [ -n "$_cx_if" ]; then
    export GLOO_SOCKET_IFNAME="$_cx_if"
    export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-$_cx_if}"
    printf '[collectivex] cross-node PG iface: GLOO_SOCKET_IFNAME=%s NCCL_SOCKET_IFNAME=%s\n' \
      "$GLOO_SOCKET_IFNAME" "$NCCL_SOCKET_IFNAME" >&2
  else
    printf '[collectivex] xnode-net: no routable 10.x iface auto-detected (ip present=%s); relying on MASTER_ADDR\n' \
      "$(command -v ip >/dev/null 2>&1 && echo yes || echo no)" >&2
  fi
fi

# ---- file boundary in the patch: experimental/CollectiveX/runtime/common.sh starts here ----
# shellcheck shell=bash
# CollectiveX — shared launcher helpers (sourced, not executed).
#
# Cluster-generic scaffolding only (Slurm/container/build/staging); no
# model-serving. Logging goes to stderr so functions can `echo` a single
# result on stdout.

# cx_log MESSAGE... : write one prefixed line to stderr.
cx_log() { printf '[collectivex] %s\n' "$*" >&2; }
# cx_die MESSAGE... : write a prefixed FATAL line to stderr, then exit 1.
cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; }

# Allocate via salloc (--no-shell is appended) and echo the GRANTED Slurm job id, parsed from
# salloc's OWN output.
# Prefer this over `salloc ...; JOB_ID=$(squeue --name= -h -o %A | head -1)`: a lookup by job
# name is not unique per allocation, so under GHA-matrix concurrency (several cells calling
# salloc with the same --job-name on one cluster) it can return a SIBLING cell's job id.
# Observed on gb300: salloc granted 11354 but the name lookup returned a still-pending 11356 ->
# srun "Expired or invalid job 11356" -> the cell failed even though its own allocation was
# fine. Reading salloc's own "Granted job allocation N" line is race-free. salloc progress
# still streams live to the job log via tee.
cx_salloc_jobid() {
  local capture
  capture="$(mktemp)"
  # Mirror salloc's combined output to stderr while keeping a copy to parse; a failed
  # allocation is not fatal here — the caller simply receives an empty job id.
  { salloc "$@" --no-shell 2>&1 | tee "$capture" >&2; } || true
  sed -n 's/.*Granted job allocation \([0-9][0-9]*\).*/\1/p' "$capture" | head -n1
  rm -f "$capture"
}

# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built
# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and
# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected
# multi-arch index digest is recorded for provenance/verification:
CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
# v0.5.11-cu130 imports cleanly and is pre-staged on GB200.)
# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep.
# (The arch-specific deepseek-v4-{blackwell,grace-blackwell} images DO bundle
# DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.)
CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"

# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and
# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the
# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest-
# pinned yet — pin once validated on the runner. See CONTAINERS.md.
CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"

# NIXL stack: the sglang multiarch image has neither the NIXL agent nor the device-EP build deps,
# and its Abseil (20220623) is what blocked the NIXL EP meson build (docs/gated.md). The dynamo
# tensorrtllm-runtime image (CUDA-13, 2026) ships NIXL + a modern Abseil/UCX — the container-switch
# the gated NIXL item calls for. Selected automatically for CX_BENCH=nixl on NVIDIA SKUs (override
# with CX_IMAGE). Listed in .github/configs/nvidia-master.yaml.
CX_IMAGE_NIXL="${CX_IMAGE_NIXL:-nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13}"

# cx_default_image RUNNER_PREFIX -> echoes the container image for that SKU family.
# AMD prefixes always get the MoRI image (no NIXL build there). NVIDIA prefixes get the
# NIXL image when CX_BENCH=nixl (CX_BENCH is read from the inherited environment) and the
# multi-arch sglang image otherwise. Any other prefix is fatal.
cx_default_image() {
  local sku="$1"
  case "$sku" in
    mi355x*|mi350x*|mi325x*|mi300x*)
      echo "$CX_IMAGE_AMD_MORI"
      ;;
    b200*|gb200*|b300*|gb300*|h100*|h200*)
      if [ "${CX_BENCH:-}" = "nixl" ]; then
        echo "$CX_IMAGE_NIXL"
      else
        echo "$CX_IMAGE_MULTIARCH"
      fi
      ;;
    *)
      cx_die "no default image for runner prefix: $1"
      ;;
  esac
}

# cx_ensure_squash -> echoes the squash file path.
# Imports via enroot only if a valid squash is not already present (flock-guarded,
# mirroring runners/launch_b200-dgxc.sh).
cx_ensure_squash() {
  # Args: SQUASH_DIR IMAGE. Echoes SQUASH_DIR/<sanitised image ref>.sqsh on success and
  # returns non-zero when the lock, the import or the post-import validation fails.
  local squash_dir="$1" image="$2"
  mkdir -p "$squash_dir" 2>/dev/null || true
  local key sq locks
  key="$(printf '%s' "$image" | sed 's#[/:@#]#_#g')"
  sq="$squash_dir/${key}.sqsh"
  locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true
  (
    flock -w 900 9 || cx_die "lock timeout for $sq"
    if unsquashfs -l "$sq" >/dev/null 2>&1; then
      cx_log "squash present: $sq"
    else
      cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)"
      rm -f "$sq"
      # NOTE(review): this command line was damaged in the text under review (only a trailing
      # "&2 \" survived); reconstructed as a stdin-detached import whose output goes to
      # stderr — confirm the exact flags against the repository before applying.
      enroot import -o "$sq" "docker://$image" </dev/null >&2 \
        || cx_die "enroot import failed for $image (anonymous auth needs a TAG ref, not a bare digest; or pre-stage the squash)"
      unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq"
    fi
  ) 9>"$locks/${key}.lock" || return 1
  # cx_die above only exits the flock subshell, so its status must be checked explicitly:
  # without the `|| return 1` a failed import would still echo the path and report success.
  echo "$sq"
}

# cx_stage_repo REPO_ROOT [STAGE_DIR] -> echoes the mount-source root.
# Some clusters (e.g. GB200/watchtower) do not cross-mount the runner workspace
# to compute nodes. If CX_STAGE_DIR is set, rsync the CollectiveX tree onto that
# compute-visible shared FS and mount from there. No-op (echo repo_root) when
# stage_dir is empty or equals repo_root.
cx_stage_repo() {
  local repo_root="$1" stage_dir="${2:-}"
  if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
    echo "$repo_root"; return 0
  fi
  # Concurrency isolation. Under GHA the per-config concurrency fan-out runs many
  # same-SKU dispatches at once, all staging into the SAME shared base dir; a
  # shared dir + `rsync --delete` lets one job unlink/replace a file a peer is
  # mid-read of -> "error reading input file: Stale file handle" on the next
  # `srun ... run_in_container.sh`. Give each EXECUTING job its own subdir keyed on
  # the runner name (a self-hosted runner runs one job at a time, so concurrent
  # jobs never share a dir); sequential reuse on one runner is safe (the jobs do
  # not overlap, and --delete refreshes the tree). Outside GHA (no RUNNER_NAME /
  # GITHUB_RUN_ID) keep the single shared dir — SSH use is single-tenant.
  local tag="${RUNNER_NAME:-${GITHUB_RUN_ID:-}}"
  if [ -n "$tag" ]; then
    # Replace anything outside [A-Za-z0-9._-] so the tag is a safe directory name.
    stage_dir="$stage_dir/job_$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
  fi
  mkdir -p "$stage_dir/experimental" || cx_die "cannot create stage dir $stage_dir"
  cx_log "staging experimental/CollectiveX -> $stage_dir (compute-visible)"
  rsync -a --delete \
    --exclude='.nccl-tests/' --exclude='__pycache__/' --exclude='results/plots/' \
    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" >&2 \
    || cx_die "rsync to stage dir failed"
  echo "$stage_dir"
}

# cx_collect_results MOUNT_SRC REPO_ROOT
# When the run used a staged (compute-visible) mount, copy result JSONs back to
# the original checkout's results/ so the workflow's upload-artifact (which reads
# the checkout, not the stage dir) finds them. No-op when no staging was used.
cx_collect_results() {
  local mount_src="$1" repo_root="$2" dst
  [ "$mount_src" = "$repo_root" ] && return 0
  dst="$repo_root/experimental/CollectiveX/results"
  mkdir -p "$dst"
  # Best effort: an empty results dir (nothing matching *.json) is not an error.
  cp "$mount_src/experimental/CollectiveX/results/"*.json "$dst/" 2>/dev/null || true
  cx_log "copied results from stage dir -> $dst (for artifact upload)"
}

# cx_build_nccl_tests PARENT_DIR [MPI] -> echoes the build/ dir.
# Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built.
# CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang
# cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed.
cx_build_nccl_tests() {
  local parent="$1" mpi="${2:-0}" dir bin sfx=""
  # Cache MPI=0 and MPI=1 builds in SEPARATE dirs. A single-node (MPI=0) binary
  # reused under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw=0);
  # keying the cache by flavor prevents that cross-contamination.
  [ "$mpi" = "1" ] && sfx="-mpi"
  dir="$parent/nccl-tests$sfx"
  bin="$dir/build/all_reduce_perf"
  if [ -x "$bin" ]; then
    cx_log "nccl-tests already built: $dir/build"
    echo "$dir/build"; return 0
  fi
  mkdir -p "$parent"
  if [ ! -d "$dir/.git" ]; then
    cx_log "cloning nccl-tests -> $dir"
    git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \
      || cx_die "git clone nccl-tests failed"
  fi
  # MPI=1 needs MPI_HOME. On Debian/Ubuntu OpenMPI the headers live under
  # /usr/lib/*/openmpi/include (NOT /usr/include), so MPI_HOME=/usr fails;
  # point it at that openmpi dir (libmpi resolves via the default linker path).
  # Works for both x86_64 (B200) and aarch64 (GB200). Override with CX_MPI_HOME.
  local mpi_home="${CX_MPI_HOME:-}"
  if [ "$mpi" = "1" ] && [ -z "$mpi_home" ]; then
    mpi_home="$(ls -d /usr/lib/*/openmpi 2>/dev/null | head -n1)"
  fi
  cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr}${mpi_home:+, MPI_HOME=$mpi_home})"
  make -C "$dir" -j MPI="$mpi" \
    CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \
    NCCL_HOME="${CX_NCCL_HOME:-/usr}" \
    ${mpi_home:+MPI_HOME="$mpi_home"} >&2 \
    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME/CX_MPI_HOME; need nccl.h + libnccl)"
  [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
  echo "$dir/build"
}

# cx_build_rccl_tests PARENT_DIR [MPI] -> echoes the build/ dir.
# AMD/ROCm counterpart of cx_build_nccl_tests: ROCm/rccl-tests is a fork of
# nccl-tests producing the SAME binary names and output format, so
# run_nccl.py parses it unchanged. `make` defaults to ROCm at /opt/rocm
# (amdclang++ + librccl); validated building in-container on MI355X. Override
# CX_ROCM_HOME / CX_RCCL_HOME / CX_MPI_HOME if the toolchain lives elsewhere.
cx_build_rccl_tests() {
  local parent="$1" mpi="${2:-0}" dir bin sfx=""
  # Key the cache by MPI flavor exactly as cx_build_nccl_tests does, so an MPI=0 binary is
  # never handed back to a caller that asked for MPI=1. The MPI=0 path keeps its existing
  # directory name, so already-cached single-node builds are still found.
  [ "$mpi" = "1" ] && sfx="-mpi"
  dir="$parent/rccl-tests$sfx"
  bin="$dir/build/all_reduce_perf"
  if [ -x "$bin" ]; then
    cx_log "rccl-tests already built: $dir/build"
    echo "$dir/build"; return 0
  fi
  mkdir -p "$parent"
  if [ ! -d "$dir/.git" ]; then
    cx_log "cloning rccl-tests -> $dir"
    git clone --depth 1 https://github.com/ROCm/rccl-tests.git "$dir" >&2 \
      || cx_die "git clone rccl-tests failed"
  fi
  cx_log "building rccl-tests (MPI=$mpi, ROCm ${CX_ROCM_HOME:-/opt/rocm})"
  make -C "$dir" -j MPI="$mpi" \
    ${CX_ROCM_HOME:+HIP_HOME="$CX_ROCM_HOME"} \
    ${CX_RCCL_HOME:+RCCL_HOME="$CX_RCCL_HOME"} \
    ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
    || cx_die "rccl-tests build failed (need ROCm + librccl; try CX_ROCM_HOME)"
  [ -x "$bin" ] || cx_die "rccl-tests build produced no binary at $bin"
  echo "$dir/build"
}
# NCCL knobs: CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME
# EP knobs (DeepEP/MoRI), all -> tests/run_ep.py:
#   CX_PHASE = decode | prefill | both (default decode)  <- picks the token sweep
#   CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point)
#   CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_MODE(normal|ll)
#   CX_NUM_SMS (DeepEP comm SMs) CX_SEED CX_ITERS
set -euo pipefail

cd /ix/experimental/CollectiveX
# shellcheck source=../runtime/common.sh
source runtime/common.sh
mkdir -p results

# Fail fast when the adapter did not export the required provenance variables.
: "${CX_RUNNER:?CX_RUNNER not set}"
: "${CX_NGPUS:?CX_NGPUS not set}"
: "${CX_TS:?CX_TS not set}"
: "${CX_TOPO:?CX_TOPO not set}"
CX_BENCH="${CX_BENCH:-nccl}"
CX_TRANSPORT="${CX_TRANSPORT:-}"
ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json"

# CX_TIMING="iters:trials:warmup" unpacks into the individual knobs (one workflow input feeds three,
# since GitHub caps workflow_dispatch at 25 inputs). Blank fields keep their defaults. Used for the
# MoRI/MI355X large-T probe (e.g. "8:1:4" — minimal sustained load to dodge the wedge).
# Fields are split positionally on ':' so a shortened value also works: "8" sets only iters and
# "8:1" sets iters and trials (the previous prefix/suffix stripping dropped the last field given
# whenever fewer than three were supplied).
if [ -n "${CX_TIMING:-}" ]; then
  IFS=: read -r _ti _tt _tw <<<"$CX_TIMING" || true
  if [ -n "$_ti" ]; then export CX_ITERS="$_ti"; fi
  if [ -n "$_tt" ]; then export CX_TRIALS="$_tt"; fi
  if [ -n "$_tw" ]; then export CX_WARMUP="$_tw"; fi
  cx_log "CX_TIMING=$CX_TIMING -> iters=${CX_ITERS:-200} trials=${CX_TRIALS:-3} warmup=${CX_WARMUP:-32}"
fi

cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS"

# run_nccl_suite: build the collective test binaries, then run every op in CX_OPS (default:
# all_reduce all_gather reduce_scatter alltoall) through run_nccl.py, one JSON per op.
# Returns 1 when the build fails or any op fails/is invalid; the remaining ops still run.
run_nccl_suite() {
  local build ops op sfail=0 impl=nccl
  # AMD/ROCm -> rccl-tests (fork; same binaries + output, parsed by run_nccl.py);
  # NVIDIA/CUDA -> nccl-tests. Both single-node: MPI=0, -g N.
  if [ -d /opt/rocm ] || command -v hipcc >/dev/null 2>&1; then
    impl=rccl
    build="$(cx_build_rccl_tests "$PWD/.nccl-tests" 0)" || return 1
  else
    build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1
  fi
  cx_log "collective impl=$impl build=$build"
  ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
  for op in $ops; do
    if ! python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \
        --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \
        --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
        --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \
        --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then
      cx_log "WARN: $impl $op failed or invalid"; sfail=1
    fi
  done
  return "$sfail"
}

# Resolve the source-tokens-per-rank sweep: explicit CX_TOKENS_LADDER wins; else
# the legacy single-point CX_TOKENS_PER_RANK becomes a one-point ladder; else
# blank => tests/run_ep.py picks the phase default (decode small / prefill large).
cx_ep_ladder() {
  if [ -n "${CX_TOKENS_LADDER:-}" ]; then printf '%s' "$CX_TOKENS_LADDER"
  elif [ -n "${CX_TOKENS_PER_RANK:-}" ]; then printf '%s' "$CX_TOKENS_PER_RANK"
  else printf ''; fi
}

# Canonical workload staging (goal P1 "official" cohort). make_workloads.py is DETERMINISTIC, so
# every SKU/backend generates byte-identical serialized traces in-container => identical workload_id
# + checksum => proven cross-hardware workload identity with NO shared filesystem. When CX_CANONICAL=1
# (and CX_WORKLOAD_DIR not already provided) we generate the routing's traces for the run's ladder
# into a NON-results dir (.cx_workloads/ — so the *.manifest.json never pollute the results glob) and
# point run_ep at it. A canonical-serialized run with full GHA provenance is publication 'official'.
cx_stage_canonical() {
  # Generate the canonical workload traces for this run and export CX_WORKLOAD_DIR.
  # Does nothing unless CX_CANONICAL=1, and nothing when CX_WORKLOAD_DIR is already set.
  # A generator failure is downgraded to a warning (the function still returns 0).
  if [ "${CX_CANONICAL:-0}" != "1" ]; then
    return 0
  fi
  if [ -n "${CX_WORKLOAD_DIR:-}" ]; then
    return 0
  fi
  local dir="$PWD/.cx_workloads"
  local ladder
  ladder="$(cx_ep_ladder)"
  # cover both phase ladders when none is given, so either phase finds its files.
  if [ -z "$ladder" ]; then
    ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
  fi
  cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')"
  if ! python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \
      --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \
      --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder"; then
    cx_log "WARN: canonical workload staging failed — falling back to seeded-runtime"
    return 0
  fi
  export CX_WORKLOAD_DIR="$dir"
  cx_log "canonical workloads staged at $dir"
}

# run_ep_suite
# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and
# combine are timed separately inside it. One JSON per (backend, phase).
# Preserve a FAILED case as a classified record (goal immediate P2 "preserve failed cases in
# aggregation") so a wedge/timeout/crash becomes a bounded artifact in results/ (uploaded + surfaced
# by the plot/validator) instead of vanishing. Uses tests/failure_taxonomy.py for the mode.
+emit_failed_case() { # backend phase rc + python3 - "$1" "$2" "$3" "$CX_RUNNER" "$CX_TOPO" \ + "results/failed_${CX_RUNNER}_${1}_${2}_${CX_TS}.json" <<'PY' || true +import sys, json, os +sys.path.insert(0, "tests") +import failure_taxonomy as ft +backend, phase, rc, runner, topo, out = sys.argv[1:7] +rec = {"family": "moe", "record_type": "failed-case", "schema_version": 3, + "generated_by": "run_in_container.sh", "runner": runner, "backend": backend, + "phase": phase, "topology_class": topo, "status": "failed", + "publication_status": "failed", "rows": [], + "failure": ft.record(rc=int(rc), case={"backend": backend, "phase": phase, + "dispatch_dtype": os.environ.get("CX_DISPATCH_DTYPE", "bf16"), + "mode": os.environ.get("CX_MODE", "normal"), + "contract": os.environ.get("CX_MEASUREMENT_CONTRACT", "layout-and-dispatch-v1"), + "routing": os.environ.get("CX_ROUTING", "uniform")})} +json.dump(rec, open(out, "w"), indent=2) +print(f"preserved failed-case record ({rec['failure']['failure_mode']}) -> {out}") +PY +} + +run_ep_suite() { + local backend="$1" phase phases ladder rc=0 rc_run + ladder="$(cx_ep_ladder)" + phases="${CX_PHASE:-decode}" + [ "$phases" = "both" ] && phases="decode prefill" + cx_stage_canonical || true # sets CX_WORKLOAD_DIR when CX_CANONICAL=1 (official cohort) + # CROSS-NODE EP (goal 182): when CX_NNODES>1 (set per-node by a multi-node launcher with + # CX_NODE_RANK + CX_RDZV_FILE) we span CX_NNODES*CX_NGPUS ranks over the inter-node fabric. We do + # NOT use torchrun: its elastic agent runs its OWN cross-node TCPStore at --master-addr, which is + # unreachable from a peer rank's enroot container net namespace (the management-subnet NodeAddr is + # not in the container's net view — torchrun timed out 900s at exactly that bootstrap). 
Instead each + # node spawns its NGPUS local ranks directly (global RANK = CX_NODE_RANK*NGPUS + local) and they + # rendezvous via a FileStore on the compute-visible shared mount (CX_RDZV_FILE, consumed by + # run_ep.py), so NCCL exchanges its unique-id through the shared file and connects peers over IB. + local xnode=0 + if [ -n "${CX_NNODES:-}" ] && [ "${CX_NNODES}" -gt 1 ]; then + xnode=1 + # shellcheck source=_xnode_net.sh + source runtime/_xnode_net.sh 2>/dev/null || true + : "${CX_RDZV_FILE:=$PWD/.rdzv_${CX_TS}}"; export CX_RDZV_FILE + cx_log "cross-node EP: nnodes=$CX_NNODES node_rank=${CX_NODE_RANK:-0} world=$((CX_NNODES*CX_NGPUS)) rdzv=file://$CX_RDZV_FILE (no torchrun agent)" + fi + for phase in $phases; do + cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'" + local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json" + # Common run_ep.py args (shared by single-node torchrun + cross-node local-spawn). + local -a EPARGS=(--backend "$backend" --phase "$phase" --tokens-ladder "$ladder" --mode "${CX_MODE:-normal}" + --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" + --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-uniform}" + --num-sms "${CX_NUM_SMS:-24}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-200}" + --trials "${CX_TRIALS:-3}" --warmup "${CX_WARMUP:-32}" + --measurement-contract "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" + --resource-mode "${CX_RESOURCE_MODE:-normalized}" --sm-fraction "${CX_SM_FRACTION:-0.18}" + --activation-profile "${CX_ACTIVATION_PROFILE:-normal}" --placement "${CX_PLACEMENT:-packed}" + --routing-step "${CX_ROUTING_STEP:-0}" --uneven-tokens "${CX_UNEVEN_TOKENS:-none}" + --combine-dtype "${CX_COMBINE_DTYPE:-bf16}" --combine-quant-mode "${CX_COMBINE_QUANT_MODE:-none}" + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" + --env-json "$ENVJSON" --out "$out") + [ -n "${CX_EPLB:-}" ] && EPARGS+=(--eplb) + [ 
-n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR") + [ -n "${CX_WAIVE_ANOMALY:-}" ] && EPARGS+=(--waive-anomaly) + # Hard wall-clock guard: a wedged collective must FAIL FAST (timeout -k SIGKILLs after grace). + if [ "$xnode" = 1 ]; then + # Cross-node: spawn NGPUS local ranks, FileStore rendezvous (no torchrun agent). Only the global + # rank 0 writes --out; the rest participate in the collectives. wait collects every rank's rc. + local base=$(( ${CX_NODE_RANK:-0} * CX_NGPUS )) world=$(( CX_NNODES * CX_NGPUS )) i; local -a pids=() + for i in $(seq 0 $((CX_NGPUS - 1))); do + RANK=$((base + i)) LOCAL_RANK="$i" WORLD_SIZE="$world" \ + timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 tests/run_ep.py "${EPARGS[@]}" & + pids+=($!) + done + rc_run=0; for i in "${pids[@]}"; do wait "$i" || rc_run=$?; done + else + # shellcheck disable=SC2086 + timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \ + torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}" + rc_run=$? + fi + if [ "$rc_run" != 0 ]; then + cx_log "WARN: $backend $phase run failed/timed out rc=$rc_run (CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900}s)" + emit_failed_case "$backend" "$phase" "$rc_run" # preserve the classified failed case + rc=1 + fi + done + return "$rc" +} + +# Build DeepEP V2 (NCCL Gin backend) from source, overriding the image's bundled V1 (1.2.1). +# V2 needs NCCL>=2.30.4 (symmetric memory) STRICTLY matching the NCCL torch loads, and builds JIT +# (no precompile). arch 9.0 for Hopper (H100/H200), 10.0 for Blackwell (B300/B200/GB300). Best-effort: +# on failure the deepep run still fails loudly (preserved failed-case), never a silent V1 fallback. +cx_build_deepep_v2() { + # IDEMPOTENT: SHARD mode calls dispatch_bench (hence this) once PER CASE. Build once per allocation, + # then skip — else a 60-case shard re-runs the from-source build 60x (force-reinstall) and blows the + # slurm --time. Sentinel lives in the container fs (persists across the x86 in-container case loop). 
+ [ -f /tmp/.cx_built_deepep_v2 ] && { cx_log "DeepEP V2 already built this allocation — skip rebuild"; return 0; } + local arch="9.0"; case "$CX_RUNNER" in b300*|gb300*|b200*) arch="10.0";; esac + cx_log "DeepEP V2: building from source (TORCH_CUDA_ARCH_LIST=$arch) — overrides bundled V1" + # PEP 668: newer images (H200/B300) ship an externally-managed Python that refuses `pip install`. + # PIP_BREAK_SYSTEM_PACKAGES is honored by pip>=23.0.1 and silently ignored by older pip (H100), + # so this is safe across every image; --break-system-packages as a flag would error on old pip. + export PIP_BREAK_SYSTEM_PACKAGES=1 + pip install -q "nvidia-nccl-cu13>=2.30.4" >&2 2>&1 || cx_log "WARN: nvidia-nccl-cu13 install warning" + rm -rf /tmp/DeepEP_v2 + git clone --depth 1 https://github.com/deepseek-ai/DeepEP /tmp/DeepEP_v2 >&2 2>&1 \ + || { cx_log "ERROR: DeepEP V2 git clone failed (compute-node network?)"; return 1; } + export DEEPEP_COMMIT="v2-$(git -C /tmp/DeepEP_v2 rev-parse --short HEAD 2>/dev/null || echo main)" + ( cd /tmp/DeepEP_v2 && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \ + pip install -q --no-build-isolation --force-reinstall . ) >&2 2>&1 \ + || { cx_log "ERROR: DeepEP V2 build/install failed (arch=$arch; NCCL/toolchain?)"; return 1; } + python3 -c "import deep_ep; print('built deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \ + || { cx_log "ERROR: DeepEP V2 import failed after build (NCCL version mismatch?)"; return 1; } + : > /tmp/.cx_built_deepep_v2 # sentinel: skip rebuild on subsequent cases in this allocation + cx_log "DeepEP V2 ready ($DEEPEP_COMMIT)" +} + +# Build the DeepEP `hybrid-ep` branch (NVIDIA's TMA-based impl: HybridEPBuffer, intranode NVLink + +# internode RDMA/NIXL). Three container-specific fixes, all probe-confirmed on the B300 sglang image: +# 1. CUDA-13 moved cccl/libcudacxx headers to /include/cccl/ (not on nvcc's default path) — +# its nvshmem_tensor.h #includes -> add that dir via CPATH/NVCC_PREPEND_FLAGS. +# 2. 
+#      The final link wants -l:libnvshmem_host.so but the bundled nvshmem ships only .so.3 -> create
+#      the unversioned symlink.
+#   3. NVSHMEM_DIR set to the bundled nvshmem enables build; unset => intranode-only (internode/LL off).
+# Intranode HybridEPBuffer (single NVLink domain, <=8 ranks) needs no multi-node/NVSHMEM bring-up.
+# Returns 0 when deep_ep imports with HybridEPBuffer (or the sentinel says it was already built),
+# 1 on clone/build/import failure.
+cx_build_deepep_hybrid() {
+  [ -f /tmp/.cx_built_deepep_hybrid ] && { cx_log "hybrid-ep already built this allocation — skip rebuild"; return 0; }
+  # gb200* is listed explicitly: the b200* glob does not match a runner name that starts with "gb",
+  # which would otherwise leave the GB200 (Blackwell) pool building Hopper-only 9.0 kernels.
+  local arch="9.0"; case "$CX_RUNNER" in b300*|gb300*|b200*|gb200*) arch="10.0";; esac
+  cx_log "DeepEP hybrid-ep: building NVIDIA TMA branch from source (TORCH_CUDA_ARCH_LIST=$arch)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  export NVSHMEM_DIR="$(python3 -c 'import os,nvidia.nvshmem as n; print(os.path.dirname(n.__file__))' 2>/dev/null || echo /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem)"
+  # First match of the CUDA cccl include dir; if the glob matches nothing the literal pattern is
+  # returned and the -d test below skips it.
+  local cccl; cccl="$(echo /usr/local/cuda*/targets/*/include/cccl | awk '{print $1}')"
+  [ -d "$cccl" ] && { export CPATH="$cccl:${CPATH:-}"; export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}"; }
+  [ -e "$NVSHMEM_DIR/lib/libnvshmem_host.so.3" ] && ln -sf libnvshmem_host.so.3 "$NVSHMEM_DIR/lib/libnvshmem_host.so" 2>/dev/null || true
+  export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib:${LD_LIBRARY_PATH:-}"
+  rm -rf /tmp/DeepEP_hybrid
+  git clone --depth 1 --branch hybrid-ep https://github.com/deepseek-ai/DeepEP /tmp/DeepEP_hybrid >&2 2>&1 \
+    || { cx_log "ERROR: hybrid-ep git clone failed"; return 1; }
+  export DEEPEP_COMMIT="hybrid-$(git -C /tmp/DeepEP_hybrid rev-parse --short HEAD 2>/dev/null || echo hybrid-ep)"
+  # Install into SITE-PACKAGES so the build persists across srun steps in the pyxis named container. The
+  # EP8 multi-srun runs the build-once and each case as SEPARATE srun steps; only the container rootfs
+  # (site-packages) persists — /tmp does NOT. The old `build_ext --inplace` under /tmp/DeepEP_hybrid +
+  # PYTHONPATH worked for the EP4 single-node path (build+run share one process) but was LOST at EP8,
+  # giving `module deep_ep has no attribute HybridEPBuffer`. pip install mirrors deepep-v2 (which persists
+  # correctly at EP8). Fall back to in-place build (EP4 single-node only) if this branch can't plain-install.
+  if ( cd /tmp/DeepEP_hybrid && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+         pip install -q --no-build-isolation --force-reinstall . ) >&2 2>&1; then
+    cx_log "hybrid-ep installed into site-packages (persists across srun steps)"
+  else
+    cx_log "WARN: hybrid-ep pip install failed — falling back to build_ext --inplace (EP4 single-node only)"
+    ( cd /tmp/DeepEP_hybrid && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 python3 setup.py build_ext --inplace ) >&2 2>&1 \
+      || { cx_log "ERROR: hybrid-ep build failed (arch=$arch; cccl/nvshmem?)"; return 1; }
+    export PYTHONPATH="/tmp/DeepEP_hybrid:${PYTHONPATH:-}"
+  fi
+  python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+    || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; }
+  # nvshmem runtime libs are in site-packages (persistent); the env pointing at them is process-local, and
+  # a PYTHONPATH is needed only if the in-place fallback ran. Persist both to a file the EP8 case-srun WRAP
+  # sources (best-effort; with pip install the package itself is already on the default site-packages path).
+  { printf 'export LD_LIBRARY_PATH=%s/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}\n' "$NVSHMEM_DIR"
+    [ -n "${PYTHONPATH:-}" ] && printf 'export PYTHONPATH=%s\n' "$PYTHONPATH"
+  } > /tmp/.cx_hybrid_env 2>/dev/null || cx_log "WARN: could not write /tmp/.cx_hybrid_env"
+  : > /tmp/.cx_built_deepep_hybrid   # sentinel: skip rebuild on subsequent cases in this allocation
+  cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT)"
+}
+
+# UCCL EP (uccl.ep.Buffer is a DeepEP-API clone). The prebuilt wheel is cu12; on a cu13
+# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668
+# images need PIP_BREAK_SYSTEM_PACKAGES. Best-effort; failure to import fails loudly.
+# Exports UCCL_COMMIT, and CX_UCCL_WRAPPER=1 only when the vendored uccl_deepep wrapper imports.
+cx_build_uccl() {
+  cx_log "UCCL EP: pip install uccl + cu12 runtime shim"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q uccl >&2 2>&1 || { cx_log "ERROR: pip install uccl failed"; return 1; }
+  pip install -q nvidia-cuda-runtime-cu12 >&2 2>&1 || cx_log "WARN: nvidia-cuda-runtime-cu12 warning"
+  local cu12lib
+  cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)"
+  [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}"
+  export UCCL_COMMIT="pkg-$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))' 2>/dev/null || echo uccl)"
+  # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader
+  # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too.
+  python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \
+    || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; }
+  # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name
+  # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes,
+  # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full
+  # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks.
+  rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg
+  # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's
+  # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel
+  # mismatches signatures. Match them.
+  # Function-local so this scratch value does not leak into the script's global scope.
+  local _uccl_tag
+  _uccl_tag="v$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))' 2>/dev/null || echo 0.1.1)"
+  # Try the pinned tag first, then the default branch; either way the wrapper dir must exist.
+  if { git clone --depth 1 --branch "$_uccl_tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \
+       || git clone --depth 1 https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1; } \
+     && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then
+    mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep
+    cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null
+    export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}"
+    if python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2; then
+      export CX_UCCL_WRAPPER=1
+    else
+      cx_log "WARN: uccl_deepep wrapper import failed — falling back to low-level uccl.ep"
+    fi
+  else
+    cx_log "WARN: uccl deep_ep_wrapper not vendored (clone/path) — low-level uccl.ep fallback"
+  fi
+  cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})"
+}
+
+run_deepep_suite() {
+  # CX_DEEPEP_V2=1 -> build the V2 (NCCL Gin) kernels from source first (Hopper+Blackwell only).
+  if [ "${CX_DEEPEP_V2:-0}" = "1" ]; then
+    cx_build_deepep_v2 || { cx_log "WARN: DeepEP V2 setup failed — cannot run V2"; return 1; }
+  fi
+  # DeepEP is not bundled in the multi-arch image. Try to import; if absent,
+  # attempt rebuild-deepep (srt-slurm setup script). Inability to run is a
+  # failure, not a silent skip — the caller asked for deepep.
+  if ! python3 -c "import deep_ep" 2>/dev/null; then
+    if command -v rebuild-deepep.sh >/dev/null 2>&1; then
+      cx_log "building DeepEP via rebuild-deepep.sh"
+      rebuild-deepep.sh >&2 || { cx_log "WARN: rebuild-deepep.sh failed"; return 1; }
+    else
+      cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; cannot run deepep"
+      return 1
+    fi
+  fi
+  run_ep_suite deepep
+}
+
+run_mori_suite() {
+  # MoRI (AMD ROCm EP), bundled in the AMD MoRI image. If absent this is a
+  # failure (MoRI is not rebuildable here), not a silent skip. Single-node
+  # 8x MI355X over XGMI; torch.cuda maps onto ROCm/HIP.
+  if ! python3 -c "import mori" 2>/dev/null; then
+    cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori"
+    return 1
+  fi
+  run_ep_suite mori
+}
+
+run_uccl_suite() {
+  # UCCL EP (NVIDIA) — DeepEP-API clone; build the wheel + cu12 shim, then reuse the generic
+  # EP sweep (run_ep.py --backend uccl). Inability to install/import is a failure, not a skip.
+  cx_build_uccl || { cx_log "WARN: UCCL EP setup failed — cannot run uccl"; return 1; }
+  run_ep_suite uccl
+}
+run_nccl_ep_suite() {
+  # NCCL/RCCL all-to-all EP (tests/ep_nccl.py) — pure torch.distributed collectives, already in every
+  # image (no build). The canonical token-shuffle EP + the only cross-node path that survives without
+  # GPUDirect-RDMA: NCCL host-stages where UCCL's ibv_reg_mr / MoRI's RDMA registration abort. Works
+  # cross-node via the FileStore rendezvous (CX_RDZV_FILE) on both NVIDIA (nccl) and AMD (rccl).
+  run_ep_suite nccl-ep
+}
+run_deepep_hybrid_suite() {
+  # DeepEP hybrid-ep branch (NVIDIA TMA HybridEPBuffer) — build from source (cccl + libnvshmem
+  # fixes), then the generic EP sweep (run_ep.py --backend deepep-hybrid). Intranode NVLink path.
+  cx_build_deepep_hybrid || { cx_log "WARN: hybrid-ep setup failed — cannot run deepep-hybrid"; return 1; }
+  run_ep_suite deepep-hybrid
+}
+
+run_collective_bench() {
+  # Single-process host/GPU memcpy-family collectives (NOT torchrun): CPU-GPU offload,
+  # copy-engine/SDMA, KV-cache transfer. Each emits one family-tagged JSON like run_nccl.py.
+  # $1 selects the bench (offload|copy-engine|kv-cache); returns the bench's exit status.
+  local kind="$1" script out rc=0
+  case "$kind" in
+    offload)     script="tests/offload_bench.py";      out="results/${CX_RUNNER}_offload_${CX_TS}.json" ;;
+    copy-engine) script="tests/copy_engine_bench.py";  out="results/${CX_RUNNER}_copy_engine_${CX_TS}.json" ;;
+    kv-cache)    script="tests/kv_cache_transfer.py";  out="results/${CX_RUNNER}_kvcache_${CX_TS}.json" ;;
+    *) cx_die "unknown collective kind '$kind'" ;;
+  esac
+  cx_log "collective bench=$kind -> $out"
+  # $extra is deliberately unquoted below so "--direction all" splits into two arguments.
+  local extra=""; [ "$kind" = "kv-cache" ] && extra="--direction all"
+  # shellcheck disable=SC2086
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 "$script" $extra \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+    --env-json "$ENVJSON" --out "$out" || rc=$?
+  [ "$rc" = 0 ] || cx_log "WARN: collective $kind failed/timed out rc=$rc"
+  return "$rc"
+}
+
+run_rl_mesh() {
+  # RL trainer<->generator mesh transfer (multi-process: torchrun splits world into two meshes).
+  cx_log "rl-mesh bench ngpus=$CX_NGPUS"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+    torchrun --nproc_per_node="$CX_NGPUS" tests/rl_mesh_bench.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+    --env-json "$ENVJSON" --out "results/${CX_RUNNER}_rl_mesh_${CX_TS}.json"
+  local rc=$?
+  [ "$rc" = 0 ] || cx_log "WARN: rl-mesh failed/timed out rc=$rc"
+  return "$rc"
+}
+
+run_allreduce_fw() {
+  # Framework custom all-reduce (flashinfer one-shot/two-shot + sglang/vllm), multi-process torchrun.
+  cx_log "allreduce-fw bench ngpus=$CX_NGPUS"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+    torchrun --nproc_per_node="$CX_NGPUS" tests/allreduce_fw_bench.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+    --env-json "$ENVJSON" --out "results/${CX_RUNNER}_allreduce_fw_${CX_TS}.json"
+  local rc=$?
+  [ "$rc" = 0 ] || cx_log "WARN: allreduce-fw failed/timed out rc=$rc"
+  return "$rc"
+}
+
+# Upgrade FlashInfer in-container to the latest wheel — the bundled 0.6.8.post1 lacks the
+# quantized-COMBINE OUTPUT path (moe_a2a_combine output_dtype/output_scales, added in a newer
+# release; confirmed in the main-branch source). A combine-quant run needs it; the dispatch path
+# (bf16/fp8/mxfp8/nvfp4) is unaffected and stays on whatever is installed. Best-effort: a failed
+# upgrade leaves the run on the bundled version (the combine-quant adapter then rejects loudly).
+cx_build_flashinfer_latest() {
+  [ -f /tmp/.cx_built_flashinfer ] && { cx_log "FlashInfer quant-combine build already done this allocation — skip"; return 0; }
+  cx_log "FlashInfer: upgrading to latest wheel for quantized-combine output (moe_a2a_combine output_dtype)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  # moe_a2a_combine output_dtype is on flashinfer MAIN but NOT in the latest PyPI release (0.6.13) —
+  # so `pip -U flashinfer-python` (PyPI) is insufficient. Install from the NIGHTLY wheel index
+  # (built from main): flashinfer-python (--no-deps; the container already has torch etc.) + the
+  # matching cubin + cu130 jit-cache. FLASHINFER_DISABLE_VERSION_CHECK=1 bypasses any residual
+  # sub-package skew. Falls back to a PyPI -U (which then asserts-out cleanly if it lacks output_dtype).
+  export FLASHINFER_DISABLE_VERSION_CHECK=1
+  local before after NIDX="https://flashinfer.ai/whl/nightly"
+  before="$(python3 -c 'import flashinfer;print(flashinfer.__version__)' 2>/dev/null || echo none)"
+  { pip install -q -U --pre flashinfer-python --index-url "$NIDX/" --no-deps >&2 2>&1 \
+      && pip install -q -U --pre flashinfer-cubin --index-url "$NIDX/" >&2 2>&1 \
+      && pip install -q -U --pre flashinfer-jit-cache --index-url "$NIDX/cu130" >&2 2>&1; } \
+    || { cx_log "WARN: flashinfer nightly index failed — falling back to PyPI -U"; \
+         pip install -q -U flashinfer-python flashinfer-cubin flashinfer-jit-cache >&2 2>&1 || true; }
+  # The nightly (main) flashinfer's CuTe-DSL kernels import newer cutlass.cute symbols (e.g.
+  # OperandMajorMode) than the bundled nvidia-cutlass-dsl provides — upgrade it to match (PyPI).
+  pip install -q -U nvidia-cutlass-dsl >&2 2>&1 || cx_log "WARN: nvidia-cutlass-dsl upgrade warning"
+  # The cu130 nightly WHEEL (0.6.13.dev20260612) still predates the combine output_dtype PR — if it's
+  # absent, build flashinfer MAIN from source (the container has the cu130 toolchain that built
+  # deep_ep-v2 + hybrid-ep; cutlass-dsl 4.5.2 is now installed; JIT-first build, time-boxed).
+  if ! python3 -c "import inspect, flashinfer.comm as c; assert 'output_dtype' in str(inspect.signature(c.MoeAlltoAll.combine))" 2>/dev/null; then
+    cx_log "FlashInfer nightly wheel lacks combine output_dtype — building flashinfer main from source"
+    # Uninstall the precompiled cubin + jit-cache FIRST: they ship the OLD 10-arg moe_a2a_combine
+    # kernel, which the main Python wrapper (14-arg, with output_dtype) then mis-calls ("Expected 10
+    # but got 14 arguments"). Removing them forces get_moe_alltoall_module() to JIT-compile the
+    # kernel FRESH from main's csrc at runtime (14-arg, matching the wrapper).
+    pip uninstall -y flashinfer-cubin flashinfer-jit-cache >&2 2>&1 || true
+    rm -rf /tmp/fi_main ~/.cache/flashinfer 2>/dev/null || true
+    if git clone --recursive --depth 1 https://github.com/flashinfer-ai/flashinfer.git /tmp/fi_main >&2 2>&1; then
+      ( cd /tmp/fi_main && timeout 2400 pip install -q --no-build-isolation . >&2 2>&1 ) \
+        || cx_log "WARN: flashinfer main source build failed/timed out"
+    else
+      cx_log "WARN: flashinfer main clone failed (compute-node network?)"
+    fi
+  fi
+  after="$(python3 -c 'import flashinfer;print(flashinfer.__version__)' 2>/dev/null || echo none)"
+  export FLASHINFER_COMMIT="pkg-$after"
+  # Record the EXACT upgraded library stack for reproducibility — the upgrade happens AFTER
+  # env_capture, so these versions live nowhere else. CX_FLASHINFER_STACK is read into the result's
+  # backend_provenance by ep_flashinfer. Also logged to the GHA log even if the run later fails.
+  export CX_FLASHINFER_STACK="$(python3 - <<'PY' 2>/dev/null || echo 'capture-failed'
+import importlib.metadata as m
+def v(p):
+    try: return m.version(p)
+    except Exception: return "absent"
+pkgs=["flashinfer-python","flashinfer-cubin","flashinfer-jit-cache","nvidia-cutlass-dsl","torch"]
+print(" ".join(f"{p}={v(p)}" for p in pkgs))
+PY
+)"
+  cx_log "FlashInfer upgrade (nightly): $before -> $after"
+  cx_log "FlashInfer stack: $CX_FLASHINFER_STACK"
+  python3 -c "import inspect, flashinfer.comm as c; assert 'output_dtype' in str(inspect.signature(c.MoeAlltoAll.combine)), 'combine still has no output_dtype'; print('combine output_dtype: present')" >&2 \
+    || { cx_log "ERROR: upgraded FlashInfer combine still lacks output_dtype — cannot quant-combine"; return 1; }
+  : > /tmp/.cx_built_flashinfer   # sentinel: skip rebuild on subsequent cases in this allocation
+}
+
+# NIXL device-EP build-probe — the gated EP item (goal "NIXL EP").
+# The OLD sglang image blocked the
+# meson build on Abseil 20220623; this runs in the dynamo tensorrtllm-runtime image (container switch)
+# and reports whether THIS container clears it. Reports the build deps the meson tree needs (nixl lib,
+# Abseil, meson/ninja/ucx) then attempts `meson setup` (which enumerates any missing dep) + a
+# time-boxed compile. Informational: logs the precise outcome; never fails the suite (the transfer
+# bench is the guaranteed datapoint). If it SUCCEEDS we wire ep_nixl.py against nixl_ep_cpp next.
+cx_probe_nixl_ep() {
+  cx_log "NIXL device-EP build-probe (gated EP item — does examples/device/ep build on this container?)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  # Dependency report only; `|| true` keeps a failing report from aborting the probe.
+  python3 - >&2 2>&1 <<'PY' || true
+import importlib.metadata as m, shutil, glob
+def v(p):
+    try: return m.version(p)
+    except Exception: return "absent"
+print("NIXL_EP_PROBE deps: nixl=%s meson=%s ninja=%s pybind11=%s cmake=%s" %
+      (v("nixl"), shutil.which("meson"), shutil.which("ninja"), v("pybind11"), shutil.which("cmake")))
+# Abseil version was the OLD container's blocker (20220623) — report what THIS container ships.
+hits = glob.glob("/usr/**/libabsl_base*", recursive=True) + glob.glob("/opt/**/libabsl_base*", recursive=True)
+print("NIXL_EP_PROBE abseil libs:", hits[:4] or "not found on /usr,/opt")
+try:
+    import nixl, os; print("NIXL_EP_PROBE nixl at", os.path.dirname(nixl.__file__))
+except Exception as e:
+    print("NIXL_EP_PROBE nixl import:", repr(e))
+PY
+  pip install -q meson ninja pybind11 >&2 2>&1 || cx_log "NIXL_EP_PROBE: meson/ninja/pybind11 pip warn"
+  # The device-EP build needs UCX's GPU device API header (ucp_device_impl.h, searched for below); the
+  # dynamo image's UCX lacks it (meson "UCX GPU Device API: NO"). Build a recent UCX from source WITH
+  # CUDA (ships the device-API header) and point pkg-config at it — the directive's "see if a build
+  # fixes it". If the header is still absent (device-comm needs GPUDirect-Async driver support), the
+  # meson reports NO again and that precise wall is documented.
+  if ! find /usr /opt -name 'ucp_device_impl.h' 2>/dev/null | grep -q .; then
+    cx_log "NIXL_EP_PROBE: building UCX from source with CUDA device API -> /opt/ucx-dev"
+    rm -rf /tmp/ucx_src
+    if git clone --depth 1 https://github.com/openucx/ucx /tmp/ucx_src >&2 2>&1; then
+      # autogen + configure + install run under one 1300 s timeout; only the last 4 lines of the
+      # make output are kept.
+      ( cd /tmp/ucx_src && timeout 1300 bash -c '
+          ./autogen.sh >/dev/null 2>&1
+          ./configure --prefix=/opt/ucx-dev --with-cuda=/usr/local/cuda --enable-mt --without-go --without-java >/dev/null 2>&1
+          make -j"$(nproc)" install 2>&1 | tail -4' ) >&2 2>&1 || cx_log "NIXL_EP_PROBE: UCX build failed/timed out"
+      export PKG_CONFIG_PATH="/opt/ucx-dev/lib/pkgconfig:${PKG_CONFIG_PATH:-}"
+      export LD_LIBRARY_PATH="/opt/ucx-dev/lib:${LD_LIBRARY_PATH:-}"
+    fi
+    find /opt/ucx-dev -name 'ucp_device_impl.h' 2>/dev/null | head -1 | sed 's/^/NIXL_EP_PROBE built-ucx device header: /' >&2 || true
+  fi
+  rm -rf /tmp/nixl_src
+  # A failed clone still returns 0: the probe is informational only.
+  git clone --depth 1 https://github.com/ai-dynamo/nixl /tmp/nixl_src >&2 2>&1 \
+    || { cx_log "NIXL_EP_PROBE: clone failed (compute-node network?)"; return 0; }
+  # meson-setup the whole project (it now sees the source-built UCX via PKG_CONFIG_PATH -> the "UCX
+  # GPU Device API" line shows YES/NO), then a time-boxed compile. tail the decisive lines to the log.
+  ( cd /tmp/nixl_src && timeout 1500 bash -c '
+      echo "--- meson setup ---"; meson setup build 2>&1 | tail -34
+      echo "--- meson compile (time-boxed) ---"; meson compile -C build 2>&1 | tail -40
+    ' ) >&2 2>&1 || true
+  # Success is judged by the built artifact, not by the exit status of the compile above.
+  if find /tmp/nixl_src/build -name 'nixl_ep_cpp*.so' 2>/dev/null | grep -q .; then
+    cx_log "NIXL_EP_PROBE: SUCCESS — nixl_ep_cpp built on this container (wire ep_nixl.py next)"
+  else
+    cx_log "NIXL_EP_PROBE: nixl_ep_cpp NOT produced — see 'meson setup' output above for the blocker"
+  fi
+}
+
+run_mooncake_suite() {
+  # MoonCake KV transfer (the goal's kv-cache 'mooncake' backend). Mooncake is in no CollectiveX
+  # container -> pip-install mooncake-transfer-engine first (the directive's "import a new one", as a
+  # pip import). Then the single-process RDMA loopback bench. Needs an RDMA NIC.
+  # Returns the bench's exit status; a failed pip install is only warned about, so the bench itself
+  # then reports the failure.
+  local out="results/${CX_RUNNER}_mooncake_${CX_TS}.json" rc=0
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  if ! python3 -c "import mooncake.engine" 2>/dev/null; then
+    cx_log "mooncake: pip install mooncake-transfer-engine"
+    pip install -q mooncake-transfer-engine >&2 2>&1 || cx_log "WARN: mooncake pip install failed"
+  fi
+  cx_log "mooncake transfer bench -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 tests/mooncake_transfer.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-rdma}" \
+    --env-json "$ENVJSON" --out "$out" || { rc=$?; cx_log "WARN: mooncake failed/timed out rc=$rc"; }
+  return "$rc"
+}
+
+run_nccl_kv_suite() {
+  # NCCL/RCCL KV-cache transfer (the goal's kv-cache 'nccl'/'rccl' backend). torchrun 2 ranks,
+  # rank0 dist.send -> rank1 dist.recv of KV-block-sized buffers. NCCL on NVIDIA, RCCL on ROCm
+  # (same torch.distributed API). Needs >=2 GPUs.
+  # Returns 1 without launching when CX_NGPUS < 2, otherwise the bench's exit status.
+  local out="results/${CX_RUNNER}_nccl_kv_${CX_TS}.json" rc=0 np=2
+  [ "$CX_NGPUS" -lt 2 ] && { cx_log "WARN: nccl-kv needs >=2 GPUs (have $CX_NGPUS)"; return 1; }
+  cx_log "nccl-kv transfer bench (2-rank send/recv) -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+    torchrun --nproc_per_node="$np" tests/nccl_kv_transfer.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+    --env-json "$ENVJSON" --out "$out" || { rc=$?; cx_log "WARN: nccl-kv failed/timed out rc=$rc"; }
+  return "$rc"
+}
+
+run_mori_io_suite() {
+  # MoRI-IO (ROCm/mori mori.io) — AMD RDMA p2p transfer engine, bundled in the AMD MoRI image. The
+  # WIRED kv-cache 'mori-io' backend (a guaranteed datapoint when mori.io imports + RDMA loopback
+  # works on the ionic_rdma NICs). Single process, 2 IOEngines, GPU0<->GPU1 RDMA read.
+  # Returns 1 when mori.io is not importable, otherwise the bench's exit status.
+  if ! python3 -c "import mori.io" 2>/dev/null; then
+    cx_log "WARN: mori.io not importable — needs the AMD MoRI image; cannot run mori-io"; return 1
+  fi
+  local out="results/${CX_RUNNER}_mori_io_${CX_TS}.json" rc=0
+  cx_log "mori-io transfer bench -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 tests/mori_io_transfer.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-rdma}" \
+    --env-json "$ENVJSON" --out "$out" || { rc=$?; cx_log "WARN: mori-io failed/timed out rc=$rc"; }
+  return "$rc"
+}
+
+run_nixl_suite() {
+  # NIXL (ai-dynamo/nixl) — runs in the dynamo tensorrtllm-runtime image (cx_default_image switched
+  # CX_IMAGE for CX_BENCH=nixl). Two parts: (1) the NIXL point-to-point TRANSFER bench (the wired
+  # KV-cache 'nixl' backend — a guaranteed datapoint when nixl imports); (2) the device-EP build-probe
+  # (the gated NIXL EP item). The transfer result drives the suite's pass/fail; the probe is logged.
+  local out rc=0
+  out="results/${CX_RUNNER}_nixl_${CX_TS}.json"
+  cx_log "nixl transfer bench -> $out"
+  timeout -k 30 "${CX_RUN_TIMEOUT:-900}" python3 tests/nixl_transfer.py --direction all \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "${CX_TRANSPORT:-nvlink}" \
+    --env-json "$ENVJSON" --out "$out" || { rc=$?; cx_log "WARN: nixl transfer failed/timed out rc=$rc"; }
+  cx_probe_nixl_ep || true   # informational; never fails the suite
+  return "$rc"
+}
+
+run_flashinfer_suite() {
+  # FlashInfer EP (flashinfer.comm.MoeAlltoAll) — pre-installed in the sglang image. When a
+  # combine-quant run is requested (CX_COMBINE_DTYPE != bf16), first upgrade FlashInfer to a wheel
+  # that has the quantized-combine OUTPUT path; otherwise run on the bundled version (dispatch path).
+  # Upgrade FlashInfer to the newer wheel when: (a) a combine-quant run needs the output_dtype path, OR
+  # (b) CX_FLASHINFER_UPGRADE=1 — the bundled 0.6.8 MoeAlltoAll MNNVL barrier intermittently deadlocks on
+  # h100 ('Rank N timed out waiting for completion flag' -> CUDA unspecified launch failure); newer
+  # flashinfer carries MNNVL fixes (e.g. socket-collision #36674). Otherwise run on the bundled version.
+  if { [ -n "${CX_COMBINE_DTYPE:-}" ] && [ "${CX_COMBINE_DTYPE}" != "bf16" ]; } || [ "${CX_FLASHINFER_UPGRADE:-}" = "1" ]; then
+    cx_build_flashinfer_latest || { cx_log "WARN: flashinfer upgrade setup failed"; return 1; }
+  fi
+  if ! python3 -c "import flashinfer.comm" 2>/dev/null; then
+    cx_log "WARN: flashinfer.comm not importable — cannot run flashinfer EP"; return 1
+  fi
+  run_ep_suite flashinfer
+}
+
+# dispatch_bench runs the CURRENT CX_BENCH (+ CX_* config env) once. The sweep workflow runs many
+# of these per allocation (SHARD mode below), reusing this single container + its built backend.
+dispatch_bench() {
+  # Runs the suite selected by CX_BENCH once. Returns 1 if any suite it ran reported failure and
+  # 0 otherwise; an unknown CX_BENCH goes to cx_die.
+  local rc=0
+  case "$CX_BENCH" in
+    nccl|deepep|mori|uccl|nccl-ep|flashinfer|deepep-hybrid|nixl|mori-io|nccl-kv|mooncake)
+      # Suite functions follow run_<bench>_suite, with '-' in the bench name mapped to '_'.
+      "run_${CX_BENCH//-/_}_suite" || rc=1
+      ;;
+    offload|copy-engine|kv-cache)
+      # The memcpy-family benches share one runner keyed by the bench name.
+      run_collective_bench "$CX_BENCH" || rc=1
+      ;;
+    rl-mesh)
+      run_rl_mesh || rc=1
+      ;;
+    allreduce-fw)
+      run_allreduce_fw || rc=1
+      ;;
+    all)
+      # "all" covers the NCCL suite followed by the DeepEP suite; both run even if the first fails.
+      run_nccl_suite || rc=1
+      run_deepep_suite || rc=1
+      ;;
+    *)
+      cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|mori|uccl|nccl-ep|flashinfer|deepep-hybrid|nixl|mori-io|nccl-kv|mooncake|offload|copy-engine|kv-cache|rl-mesh|allreduce-fw|all)"
+      ;;
+  esac
+  return "$rc"
+}
+
+rc=0
+# Build-only mode: the rack EP8 launcher runs this ONCE per node inside a PERSISTENT named container
+# to pre-build the from-source kernels (DeepEP V2 / flashinfer quant-combine) that the per-rank
+# multi-srun case loop cannot build itself (8 separate ephemeral containers). Build the requested
+# kernels into this (named, persisting) container's site-packages, then exit — no benchmark run.
# --- Build-only branch -------------------------------------------------------------------------
# When CX_BUILD_ONLY is set, build whichever from-source kernels the CX_* flags ask for, log the
# outcome and exit without running a benchmark. rc accumulates any build failure.
if [ -n "${CX_BUILD_ONLY:-}" ]; then
  [ -n "${CX_DEEPEP_V2:-}" ] && { cx_build_deepep_v2 || rc=1; }
  [ "${CX_BENCH:-}" = "deepep-hybrid" ] && { cx_build_deepep_hybrid || rc=1; }
  [ -n "${CX_COMBINE_DTYPE:-}" ] && [ "${CX_COMBINE_DTYPE}" != "bf16" ] && { cx_build_flashinfer_latest || rc=1; }
  cx_log "CX_BUILD_ONLY: build complete rc=$rc (deepep_v2=${CX_DEEPEP_V2:-} bench=${CX_BENCH:-} combine=${CX_COMBINE_DTYPE:-})"
  exit "$rc"
fi
if [ -n "${CX_SHARD_FILE:-}" ] && [ -f "${CX_SHARD_FILE:-/nonexistent}" ]; then
  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
  # All cases share (sku, backend, v2, nodes) so the backend build (cx_build_*) is paid once and cached
  # for the rest. Each case overrides its own mode/resource_mode/dtype/contract/routing/phase/eplb/
  # workload, then reuses the same per-config path (dispatch_bench). Collapses a whole build-group's
  # cases (all modes/resource_modes) into one allocation — the sweep shard key is now (sku,backend,v2,nodes).
  #
  # The shard path is passed as argv rather than spliced into the Python source, so a path containing
  # a quote cannot break (or inject into) the one-liner. Any failure still degrades to "0 cases".
  ncases="$(python3 -c "import json,sys;print(len(json.load(open(sys.argv[1])).get('cases',[])))" "$CX_SHARD_FILE" 2>/dev/null || echo 0)"
  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
  _cx_ts_base="$CX_TS"   # per-case CX_TS suffix below keeps each case's result file UNIQUE (else
                         # cases sharing backend+phase overwrite each other at the same timestamp).
  ci=0
  while [ "$ci" -lt "$ncases" ]; do
    export CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
    # Map case[ci] fields -> CX_* env (shell-quoted). The setup job pre-resolved hidden/topk/experts
    # + the token ladder into each case, so the loop is config-only (no workloads.yaml lookup here).
    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
import json, sys, shlex
c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
def g(k, d=""):
    v = c.get(k, d); return "" if v is None else str(v)
env = {
    "CX_BENCH": g("backend"), "CX_MODE": g("mode", "normal"),
    "CX_DISPATCH_DTYPE": g("dtype", "bf16"),
    "CX_MEASUREMENT_CONTRACT": g("contract", "layout-and-dispatch-v1"),
    "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
    "CX_RESOURCE_MODE": g("resource_mode", "normalized"),
    "CX_ACTIVATION_PROFILE": g("activation_profile", "normal"),
    "CX_PLACEMENT": g("placement", "packed"), "CX_ROUTING_STEP": g("routing_step", "0"),
    "CX_UNEVEN_TOKENS": g("uneven_tokens", "none"),
    "CX_EPLB": "1" if c.get("eplb") else "",
    "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
    "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
}
print("\n".join(f"export {k}={shlex.quote(v)}" for k, v in env.items()))
PY
)"
    eval "$_exports"
    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset the first case's
    # staged dir is reused for the rest and run_ep.py can't find the later cases' manifests
    # (FileNotFoundError .cx_workloads/.manifest.json). Unset so every case re-stages its own.
    unset CX_WORKLOAD_DIR 2>/dev/null || true
    cx_log " [$((ci+1))/$ncases] $CX_BENCH $CX_PHASE $CX_DISPATCH_DTYPE/$CX_MODE/${CX_MEASUREMENT_CONTRACT/-v1/} rt=$CX_ROUTING eplb=${CX_EPLB:-0}"
    # flashinfer's MoeAlltoAll MNNVL barrier INTERMITTENTLY deadlocks on h100 ('Rank N timed out waiting
    # for completion flag' -> CUDA unspecified launch failure): ~half of cases, scattered across T/routing,
    # the SAME config both crashes AND passes (a transient, not config/pidfd). Upgrade to flashinfer 0.6.14
    # + a between-case shm-drop settle were both TESTED and did NOT fix it (the settle made it worse). Since
    # it's intermittent, RETRY: each fresh torchrun is another independent ~50% shot, so a few retries
    # recover almost all cases. On a retry success, drop this case's intermediate failed-case record so it
    # doesn't pollute the shard. Non-flashinfer backends run ONCE — their failures are deterministic
    # (h200 flashinfer pidfd, aarch64 uccl, deepep-hybrid ll) so retrying only wastes the allocation.
    attempts=1; [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 ))
    a=1
    while :; do
      if dispatch_bench; then
        [ "$a" -gt 1 ] && rm -f results/failed_*"${CX_TS}"*.json 2>/dev/null || true
        break
      fi
      [ "$a" -ge "$attempts" ] && { rc=1; break; }
      cx_log " [$((ci+1))/$ncases] $CX_BENCH attempt $a/$attempts failed — retry (intermittent MNNVL barrier)"
      a=$((a+1))
    done
    ci=$((ci + 1))
  done
  # Restore the shard-wide timestamp. Without this CX_TS still holds the LAST case's "<base>-cNNN"
  # value, and summarize.py (which keeps files whose name contains --ts) would summarise and gate
  # only that final case. The base value is a substring of every per-case suffix, so it matches all.
  export CX_TS="$_cx_ts_base"
else
  dispatch_bench || rc=1
fi

# Summary table for the log; also fails the job if no valid results were produced.
python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1
exit "$rc"
Upgrade to flashinfer 0.6.14 + # + a between-case shm-drop settle were both TESTED and did NOT fix it (the settle made it worse). Since + # it's intermittent, RETRY: each fresh torchrun is another independent ~50% shot, so a few retries + # recover almost all cases. On a retry success, drop this case's intermediate failed-case record so it + # doesn't pollute the shard. Non-flashinfer backends run ONCE — their failures are deterministic + # (h200 flashinfer pidfd, aarch64 uccl, deepep-hybrid ll) so retrying only wastes the allocation. + attempts=1; [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 )) + a=1 + while :; do + if dispatch_bench; then + [ "$a" -gt 1 ] && rm -f results/failed_*"${CX_TS}"*.json 2>/dev/null || true + break + fi + [ "$a" -ge "$attempts" ] && { rc=1; break; } + cx_log " [$((ci+1))/$ncases] $CX_BENCH attempt $a/$attempts failed — retry (intermittent MNNVL barrier)" + a=$((a+1)) + done + ci=$((ci + 1)) + done +else + dispatch_bench || rc=1 +fi + +# Summary table for the log; also fails the job if no valid results were produced. +python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1 +exit "$rc" diff --git a/experimental/CollectiveX/schemas/ep-result-v4.schema.json b/experimental/CollectiveX/schemas/ep-result-v4.schema.json new file mode 100644 index 000000000..0d0035997 --- /dev/null +++ b/experimental/CollectiveX/schemas/ep-result-v4.schema.json @@ -0,0 +1,195 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://semianalysis/collectivex/schemas/ep-result-v4.schema.json", + "title": "CollectiveX EP dispatch/combine result (v4)", + "description": "One (backend, phase, dtype, mode, contract, routing) sweep. v4 adds multi-dimensional validity + machine-derived publication_status, measured roundtrip, dual byte contracts, per-rank diagnostics, raw-sample histograms, and workload identity. 
v3 docs load via compatibility (publication_status absent => treated as legacy/experimental).", + "type": "object", + "required": ["schema_version", "family", "runner", "backend", "mode", "phase", + "ep_size", "measurement_contract", "shape", "rows", + "validity", "publication_status", "workload", "reproduction", + "backend_provenance", "comparison_key"], + "properties": { + "schema_version": {"type": "integer", "minimum": 3}, + "family": {"const": "moe"}, + "runner": {"type": "string"}, + "backend": {"type": "string", "enum": ["deepep", "deepep-hybrid", "mori", "aiter", "uccl", "flashinfer"]}, + "mode": {"type": "string", "enum": ["normal", "ll"]}, + "phase": {"type": "string", "enum": ["decode", "prefill"]}, + "ep_size": {"type": "integer", "minimum": 1}, + "world_size": {"type": "integer", "minimum": 1}, + "nodes": {"type": "integer", "minimum": 1}, + "topology_class": {"type": "string"}, + "transport": {"type": "string"}, + "resource_mode": {"type": "string", "enum": ["normalized", "tuned", "default"]}, + "measurement_contract": {"type": "string", + "enum": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1", + "mori-quant-combine-v1"]}, + "publication_status": {"type": "string", + "enum": ["official", "comparable-experimental", "diagnostic", "invalid", "failed"]}, + "validity": { + "type": "object", + "required": ["execution_status", "semantic_correctness", "workload_identity", + "measurement_conformance", "resource_conformance", "provenance_complete"], + "properties": { + "execution_status": {"type": "string", "enum": ["complete", "failed"]}, + "semantic_correctness": {"type": "string", "enum": ["pass", "fail"]}, + "workload_identity": {"type": "string"}, + "workload_source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]}, + "measurement_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]}, + "resource_conformance": {"type": "string"}, + "provenance_complete": {"type": "boolean"}, + 
"anomaly_free": {"type": "boolean"} + } + }, + "workload": { + "type": "object", + "required": ["source", "trace_signature", "cross_rank_consistent"], + "properties": { + "source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]}, + "workload_id": {"type": ["string", "null"]}, + "manifest_checksums": {"type": ["object", "null"]}, + "trace_signature": {"type": "string"}, + "distinct_per_T_hashes": {"type": "array", "items": {"type": "string"}}, + "cross_rank_consistent": {"type": "boolean"}, + "activation_profile": {"type": "string"}, + "activation_identity": {"type": ["string", "null"]} + } + }, + "shape": { + "type": "object", + "required": ["hidden", "topk", "experts", "experts_per_rank", "dispatch_dtype", "routing"], + "properties": { + "hidden": {"type": "integer"}, "topk": {"type": "integer"}, + "experts": {"type": "integer"}, "experts_per_rank": {"type": "integer"}, + "dispatch_dtype": {"type": "string", "enum": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"]}, + "routing": {"type": "string"}, + "eplb": {"type": "boolean"}, "num_logical_experts": {"type": "integer"}, + "kernel_gen": {"type": "string"}, + "activation_profile": {"type": "string"}, + "quant": { + "type": "object", + "properties": { + "combine_input_dtype": {"type": "string"}, + "combine_accum_dtype": {"type": "string"}, + "combine_output_dtype": {"type": "string"}, + "combine_quant_mode": {"type": "string"}, + "scale_layout": {"type": ["string", "null"]} + } + } + } + }, + "reproduction": { + "type": "object", + "required": ["command", "seed", "warmup", "iters", "trials", "measurement_contract"], + "properties": { + "command": {"type": "string"}, + "image": {"type": ["string", "null"]}, + "image_digest": {"type": ["string", "null"]}, + "image_arch": {"type": ["string", "null"]}, + "squash_sha256": {"type": ["string", "null"]}, + "git_run": {"type": ["object", "null"]}, + "fp8_quant_in_timing": {"type": ["boolean", "null"]}, + 
"combine_quant_in_timing": {"type": ["boolean", "null"]}, + "combine_dequant_in_timing": {"type": ["boolean", "null"]}, + "combine_dtype": {"type": "string"}, "combine_quant_mode": {"type": "string"}, + "activation_profile": {"type": "string"}, + "routing_step": {"type": "integer"}, "uneven_tokens": {"type": "string"}, + "waive_anomaly": {"type": "boolean"}, "roundtrip_anomaly_threshold": {"type": "number"} + } + }, + "backend_provenance": {"type": "object"}, + "phase_profile": {"type": "object"}, + "source_allocation": { + "type": "object", + "properties": { + "mode": {"type": "string", "enum": ["none", "linear", "empty-rank"]}, + "routing_step": {"type": "integer"} + } + }, + "placement": { + "type": "object", + "properties": { + "kind": {"type": "string", "enum": ["packed", "striped", "runtime-native", "adversarial"]}, + "nodes": {"type": "integer"}, "gpus_per_node": {"type": "integer"}, + "scale_up_domain": {"type": "integer"}, "ranks": {"type": "integer"} + } + }, + "eplb": { + "type": "object", + "properties": { + "enabled": {"type": "boolean"}, + "num_logical_experts": {"type": "integer"}, "num_physical_experts": {"type": "integer"}, + "imbalance_before": {"type": "number"}, "imbalance_after": {"type": "number"}, + "mapping_hash": {"type": ["string", "null"]} + } + }, + "anomalies": {"type": "array", "items": {"type": "object"}}, + "anomaly_summary": { + "type": "object", + "properties": { + "count": {"type": "integer"}, "waived": {"type": "boolean"}, + "types": {"type": "array", "items": {"type": "string"}} + } + }, + "rows": { + "type": "array", "minItems": 1, + "items": { + "type": "object", + "required": ["tokens_per_rank", "global_tokens", "dispatch", "combine", "roundtrip", + "isolated_sum", "samples_pooled", "byte_contracts", "correct"], + "properties": { + "tokens_per_rank": {"type": "integer", "minimum": 1}, + "global_tokens": {"type": "integer", "minimum": 1}, + "dispatch": {"$ref": "#/definitions/percentiles"}, + "combine": {"$ref": 
"#/definitions/percentiles"}, + "roundtrip": {"$ref": "#/definitions/percentiles"}, + "isolated_sum": {"type": "object"}, + "samples_pooled": {"type": "integer", "minimum": 1}, + "percentile_interpolation": {"type": "string"}, + "per_rank_dispatch_us": {"type": "object"}, + "raw_samples": {"type": "object"}, + "byte_contracts": { + "type": "object", + "required": ["token_rank_payload_copies", "token_expert_payload_copies", + "dispatch_bytes", "combine_bytes"], + "properties": { + "token_rank_payload_copies": {"type": "integer"}, + "token_expert_payload_copies": {"type": "integer"}, + "dispatch_bytes": {"type": "integer"}, "combine_bytes": {"type": "integer"} + } + }, + "roundtrip_tokens_per_second": {"type": ["number", "null"]}, + "bandwidth": { + "type": "object", + "properties": { + "logical_payload_rate_gbps": {"type": "object"}, + "backend_buffer_rate_gbps": {"type": "object"}, + "algorithm_bandwidth_gbps": {"type": ["number", "null"]}, + "bus_bandwidth_gbps": {"type": ["number", "null"]}, + "wire_utilization": {"type": ["number", "null"]} + } + }, + "fanout_hist": {"type": "array"}, + "rank_load_hist": {"type": "array"}, + "expert_load_cv": {"type": "number"}, "rank_load_cv": {"type": "number"}, + "hotspot_ratio": {"type": "number"}, + "dest_rank_load_max": {"type": "integer"}, "dest_rank_load_mean": {"type": "number"}, + "empty_expert_count": {"type": "integer"}, "empty_rank_count": {"type": "integer"}, + "source_token_stats": {"type": ["object", "null"]}, + "anomalies": {"type": "array", "items": {"type": "object"}}, + "correct": {"type": "boolean"} + } + } + } + }, + "definitions": { + "percentiles": { + "type": "object", + "required": ["p50", "p90", "p95", "p99"], + "properties": { + "p50": {"type": "number"}, "p90": {"type": "number"}, + "p95": {"type": "number"}, "p99": {"type": "number"} + } + } + } +} diff --git a/experimental/CollectiveX/schemas/workload-v1.schema.json b/experimental/CollectiveX/schemas/workload-v1.schema.json new file mode 100644 
index 000000000..5a12b5af0 --- /dev/null +++ b/experimental/CollectiveX/schemas/workload-v1.schema.json @@ -0,0 +1,50 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://semianalysis/collectivex/schemas/workload-v1.schema.json", + "title": "CollectiveX canonical MoE routing workload manifest", + "description": "Manifest for a serialized routing trace (tests/workload.py). The .npz holds topk_idx/topk_weights; this manifest carries the identity, dimensions, routing profile, and SHA-256 checksums that gate cross-hardware comparison.", + "type": "object", + "additionalProperties": false, + "required": ["schema_version", "workload_id", "generator_version", "gate_weight_format", + "dims", "routing_profile", "seed", "checksums"], + "properties": { + "schema_version": {"const": 1}, + "workload_id": {"type": "string", "pattern": "^[0-9a-f]{16}$", + "description": "Immutable id = sha256(generator|routing|hidden|topk|experts|gt|seed)[:16]."}, + "generator_version": {"type": "string", + "description": "Routing generator identity; bump when numerics change so stale files can't masquerade."}, + "gate_weight_format": {"type": "string"}, + "dims": { + "type": "object", + "additionalProperties": false, + "required": ["hidden", "topk", "experts", "global_tokens", "experts_per_rank"], + "properties": { + "hidden": {"type": "integer", "minimum": 1}, + "topk": {"type": "integer", "minimum": 1}, + "experts": {"type": "integer", "minimum": 1}, + "global_tokens": {"type": "integer", "minimum": 1}, + "experts_per_rank": {"type": "integer", "minimum": 1} + } + }, + "routing_profile": {"type": "string", + "enum": ["uniform", "balanced", "balanced-rank-local", "zipf", + "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"]}, + "seed": {"type": "integer"}, + "checksums": { + "type": "object", + "additionalProperties": false, + "required": ["topk_idx", "topk_weights", "trace"], + "properties": { + "topk_idx": {"type": "string", "pattern": "^[0-9a-f]{64}$"}, + 
"topk_weights": {"type": "string", "pattern": "^[0-9a-f]{64}$"}, + "trace": {"type": "string", "pattern": "^[0-9a-f]{64}$"} + } + }, + "routing_stats": {"type": "object", + "description": "Realized fan-out / load / locality stats (advisory; not identity-defining)."}, + "activation_profile": {"type": "string", + "description": "Value distribution of expert inputs (e.g. 'normal'); reserved for the value-sensitivity rig."}, + "activation_identity": {"type": ["string", "null"], + "description": "Deterministic descriptor hash of the activation distribution (profile|seed|dims). Becomes a byte-hash once activations are serialized (model-trace)."} + } +} diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py new file mode 100644 index 000000000..509d22cf8 --- /dev/null +++ b/experimental/CollectiveX/summarize.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +"""CollectiveX — summarize a run's results. + +Two output modes over the same data: + (default) a plain-text table for the Slurm/container log; ALSO the result + gate — exits non-zero if no valid results were produced, so a + failed/skipped benchmark doesn't get reported as a green job. + --markdown GitHub-flavored markdown for a GitHub Actions job summary + (https://github.blog/.../supercharging-github-actions-with-job-summaries/); + reporting only, always exits 0. A workflow step appends this to + $GITHUB_STEP_SUMMARY so the run page shows a rendered table. 
#!/usr/bin/env python3
"""CollectiveX — summarize a run's results.

Two output modes over the same data:
  (default)   a plain-text table for the Slurm/container log; ALSO the result
              gate — exits non-zero if no valid results were produced, so a
              failed/skipped benchmark doesn't get reported as a green job.
  --markdown  GitHub-flavored markdown for a GitHub Actions job summary
              (https://github.blog/.../supercharging-github-actions-with-job-summaries/);
              reporting only, always exits 0. A workflow step appends this to
              $GITHUB_STEP_SUMMARY so the run page shows a rendered table.

    python summarize.py --results-dir results --runner gb200-nv_1 --ts <timestamp>
    python summarize.py --results-dir results --markdown >> "$GITHUB_STEP_SUMMARY"
"""
import argparse
import glob
import json
import os

# Families summarize.py recognizes: EP (moe), NCCL primitives, and the single-process
# memcpy-family collectives (offload/copy-engine/kv-cache). A doc of any other family is
# ignored; a run that produces ONLY recognized families must not be reported as "nothing".
CLI_FAMILIES = ("nccl", "moe", "offload", "copy-engine", "kv-cache", "rl-mesh", "allreduce-fw")
COLLECTIVE_FAMILIES = ("offload", "copy-engine", "kv-cache", "rl-mesh", "allreduce-fw")

# Preferred column/row order for NCCL ops; ops not listed here sort after these.
_OP_ORDER = ["all_reduce", "reduce_scatter", "all_gather", "alltoall"]

# Message sizes (bytes) offered as intermediate rungs of the sweep tables.
_LADDER_CANDIDATES = (16384, 262144, 4194304, 67108864, 268435456, 1073741824, 4294967296)


def load_results(results_dir: str, runner: "str | None", ts: "str | None") -> "list[dict]":
    """Load every recognized result document from ``results_dir``.

    A ``*.json`` file is considered when its basename does not start with ``env_``,
    starts with ``f"{runner}_"`` (if ``runner`` is given) and contains ``ts`` (if given).
    Files that cannot be read or parsed are skipped, as are documents whose top level is
    not a JSON object (a list or ``null`` has no ``family`` and previously raised
    ``AttributeError``). Only documents whose ``family`` is in ``CLI_FAMILIES`` are
    returned, in sorted-path order.
    """
    docs = []
    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
        base = os.path.basename(path)
        if base.startswith("env_"):
            continue
        if runner and not base.startswith(f"{runner}_"):
            continue
        if ts and ts not in base:
            continue
        try:
            with open(path, encoding="utf-8") as fh:
                d = json.load(fh)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        if isinstance(d, dict) and d.get("family") in CLI_FAMILIES:
            docs.append(d)
    return docs


def _peak_busbw(rows):
    """Return the largest ``busbw_gbps`` among ``rows`` (missing/None counts as 0.0)."""
    return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)


def _coll_peak(d) -> float:
    """Peak bandwidth (GB/s) across a collective doc. Field name varies by family:
    offload/copy-engine use top-level peak_bandwidth_gbps + rows[].bandwidth_gbps;
    kv-cache nests rows[].bandwidth_gb_s under groups[]. Defensive: 0.0 if none found."""
    top = d.get("peak_bandwidth_gbps")
    if top:
        return top
    best = 0.0

    def _scan(rows):
        nonlocal best
        for r in rows or []:
            for k in ("bandwidth_gbps", "bandwidth_gb_s", "busbw_gbps"):
                v = r.get(k)
                if v:
                    best = max(best, v)

    _scan(d.get("rows"))
    for g in d.get("groups", []) or []:
        _scan(g.get("rows"))
    return best


def _row_lat(r):
    """Return the smaller of the row's out-of-place / in-place ``time_us``, or None if neither exists."""
    vals = [(r.get(k) or {}).get("time_us") for k in ("out_of_place", "in_place")]
    vals = [v for v in vals if v is not None]
    return min(vals) if vals else None


def _lat_floor(rows):
    # Small-message latency floor: time at the smallest REAL (size>0) message.
    # (Sub-granularity 0-byte rows are a no-op ~1 us and not a real latency.)
    real = [r for r in rows if (r.get("size_bytes") or 0) > 0]
    if not real:
        return float("nan")
    v = _row_lat(min(real, key=lambda r: r["size_bytes"]))
    return v if v is not None else float("nan")


def _at_size(rows, size, fn):
    """Apply ``fn`` to the first row whose ``size_bytes`` equals ``size``; None if there is none."""
    for r in rows:
        if r.get("size_bytes") == size:
            return fn(r)
    return None


def _fmt_bytes(b):
    """Format a byte count as whole GiB/MiB/KiB when it divides evenly, else as plain bytes."""
    for u, s in ((2**30, "GiB"), (2**20, "MiB"), (2**10, "KiB")):
        if b >= u and b % u == 0:
            return f"{b // u} {s}"
    return f"{b} B"


def _fnum(x, fmt):
    """Format a number with ``fmt``; anything that is not an int/float renders as an em dash."""
    return format(x, fmt) if isinstance(x, (int, float)) else "—"


def _op_rank(d):
    """Sort key: position of the doc's op in ``_OP_ORDER``; unknown or missing ops sort last."""
    op = d.get("op")
    return _OP_ORDER.index(op) if op in _OP_ORDER else 99


def _ops_sorted(nccl):
    """Return the ops present in ``nccl``: known ops in ``_OP_ORDER`` order, then the rest.

    The remainder is sorted by ``str`` so a doc without an ``op`` (None) next to
    unknown string ops no longer raises ``TypeError``; all-string sets sort as before.
    """
    present = {d.get("op") for d in nccl}
    ordered = [o for o in _OP_ORDER if o in present]
    return ordered + sorted(present - set(ordered), key=str)


def _ladder(nccl):
    """Pick the message sizes shown in the sweep tables.

    Uses the candidate sizes that were actually measured and are below the largest
    measured size, followed by the largest measured size itself. Empty when no row
    has a positive ``size_bytes``.
    """
    sizes = sorted({r["size_bytes"] for d in nccl for r in d.get("rows", [])
                    if (r.get("size_bytes") or 0) > 0})
    if not sizes:
        return []
    measured, largest = set(sizes), sizes[-1]  # build the lookup set once, not per candidate
    lad = [s for s in _LADDER_CANDIDATES if s in measured and s < largest]
    lad.append(largest)
    return lad


def _sweep_table(nccl, title, rowfn, fmt):
    """Render a markdown table with one row per ladder size and one column per op.

    ``rowfn`` extracts the cell value from a result row and ``fmt`` is its format spec.
    Returns a list of markdown lines, or an empty list when there is no ladder.
    """
    lad = _ladder(nccl)
    if not lad:
        return []
    ops = _ops_sorted(nccl)
    rows_by_op = {d.get("op"): d.get("rows", []) for d in nccl}
    out = [f"\n**{title}**\n",
           "| bytes/rank | " + " | ".join(f"`{o}`" for o in ops) + " |",
           "|---" + "|--:" * len(ops) + "|"]
    for s in lad:
        cells = [_fnum(_at_size(rows_by_op.get(o, []), s, rowfn), fmt) for o in ops]
        out.append(f"| {_fmt_bytes(s)} | " + " | ".join(cells) + " |")
    return out


def _moe_sorted(moe):
    """Order EP docs by (backend, phase, ep_size)."""
    return sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""), x.get("ep_size", 0)))


def _moe_sweep_table(d):
    """Markdown sweep table for one EP doc — the rows already ARE the ladder, so
    emit one row per source-tokens-per-rank point. Skips old single-point docs
    (no rows[])."""
    rows = d.get("rows")
    if not rows:
        return []
    sh = d.get("shape") or {}  # tolerate "shape": null, as render_markdown does
    head = (f"\n**`{d.get('backend')}` · {d.get('phase')} · ep{d.get('ep_size')} · "
            f"H{sh.get('hidden')} top{sh.get('topk')} E{sh.get('experts')} "
            f"{sh.get('dispatch_dtype')} {sh.get('routing')}** — latency vs source tokens/rank\n")
    out = [head,
           "| tokens/rank | fan-out | dispatch µs | combine µs | serial µs (D+C) | tokens/s | recv max | correct |",
           "|--:|--:|--:|--:|--:|--:|--:|:--:|"]
    for r in rows:
        out.append(f"| {r.get('tokens_per_rank')} | {_fnum(r.get('fanout_mean'), '.2f')} | "
                   f"{_fnum(r.get('dispatch_us_p50'), '.2f')} | {_fnum(r.get('combine_us_p50'), '.2f')} | "
                   f"{_fnum(r.get('serial_us_p50', r.get('roundtrip_us_p50')), '.2f')} | "
                   f"{_fnum(r.get('tokens_per_second'), '.3e')} | "
                   f"{r.get('recv_tokens_max', r.get('recv_tokens', '—'))} | {'✅' if r.get('correct') else '❌'} |")
    return out


def render_plain(nccl, moe, coll, n_valid, total) -> str:
    """Render the plain-text summary for the job log.

    ``nccl``/``moe``/``coll`` are the result docs split by family. ``n_valid`` and
    ``total`` are accepted for signature parity with ``render_markdown`` and are not
    used here. NCCL docs without an ``op`` are printed with a blank op instead of
    raising ``KeyError``.
    """
    out = []
    hdr = "CollectiveX results"
    anchor = (nccl + moe + coll)
    if anchor:
        d0 = anchor[0]
        hdr += f" — runner={d0.get('runner')} topology={d0.get('topology_class')} transport={d0.get('transport')}"
    out += ["=" * len(hdr), hdr, "=" * len(hdr)]
    if coll:
        out.append("\nMemcpy-family collectives (offload / copy-engine / kv-cache):")
        out.append(f" {'family':<13}{'status':<9}{'peak bw (GB/s)':>15}")
        for d in sorted(coll, key=lambda x: x.get("family", "")):
            out.append(f" {d.get('family',''):<13}{d.get('status',''):<9}{_coll_peak(d):>15.1f}")
    if nccl:
        out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):")
        out.append(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'lat floor':>10}{'avg busbw':>11}")
        for d in sorted(nccl, key=lambda x: str(x.get("op", ""))):
            rows = d.get("rows", [])
            avg = (d.get("summary") or {}).get("avg_busbw_gbps")
            out.append(f" {str(d.get('op', '')):<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
                       f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
    if moe:
        out.append("\nMoE EP dispatch/combine (DeepEP / MoRI) — headline (* = headline tokens/rank):")
        out.append(f" {'backend':<9}{'phase':<8}{'ep':>3} {'status':<9}{'T*':>5}{'disp_p50':>10}{'comb_p50':>10}{'serial':>9} correct")
        for d in sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""))):
            m, c = d.get("metrics", {}), d.get("correctness", {})
            ser = m.get("serial_us_p50", m.get("roundtrip_us_p50"))
            out.append(f" {d.get('backend',''):<9}{d.get('phase',''):<8}{str(d.get('ep_size','')):>3} {d.get('status',''):<9}"
                       f"{str(m.get('headline_tokens_per_rank','')):>5}"
                       f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}{(m.get('combine_us_p50') or float('nan')):>10.1f}"
                       f"{(ser or float('nan')):>9.1f} {c.get('passed')}")
    return "\n".join(out)


def _emoji(status) -> str:
    """Return the status cell: a check mark for ``"valid"``, a cross plus the raw status otherwise."""
    return "✅ valid" if status == "valid" else f"❌ {status}"


def render_markdown(nccl, moe, coll, n_valid, total) -> str:
    """Render the GitHub job-summary markdown.

    ``nccl``/``moe``/``coll`` are the result docs split by family; ``total`` is the
    number of loaded docs (0 appends a "nothing produced" note). ``n_valid`` is
    accepted but not used. NCCL docs without an ``op`` sort last instead of raising
    ``KeyError``.
    """
    out = []
    anchor = (nccl + moe + coll)
    if anchor:
        d0 = anchor[0]
        out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}")
    if coll:
        out.append("\n### Memcpy-family collectives\n")
        out.append("| family | status | peak bw (GB/s) |")
        out.append("|---|---|--:|")
        for d in sorted(coll, key=lambda x: x.get("family", "")):
            out.append(f"| `{d.get('family','')}` | {_emoji(d.get('status'))} | {_coll_peak(d):.1f} |")
    if nccl:
        out.append(f"\n### NCCL/RCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n")
        out.append("| op | status | peak busbw (GB/s) | lat floor (µs) |")
        out.append("|---|---|--:|--:|")
        for d in sorted(nccl, key=_op_rank):
            rows = d.get("rows", [])
            out.append(f"| `{d.get('op')}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | {_lat_floor(rows):.2f} |")
        out += _sweep_table(nccl, "Bus bandwidth vs bytes/rank (GB/s)", lambda r: r.get("busbw_gbps"), ".1f")
        out += _sweep_table(nccl, "Latency vs bytes/rank (µs)", _row_lat, ".2f")
        out.append("\n> bytes/rank = nccl/rccl-tests message size (= per-rank for all-reduce / "
                   "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). Small "
                   "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.")
    if moe:
        out.append("\n### MoE EP dispatch / combine (DeepEP / MoRI)\n")
        out.append("Headline = the reference point (tokens/rank shown as `T*`); the per-line "
                   "sweep tables below carry the full source-tokens-per-rank curve.\n")
        out.append("| backend | phase | mode | dtype | resource | ep | routing (fan-out) | status | T\\* | dispatch p50 (µs) | combine p50 (µs) | serial p50 (µs) | tokens/s | correct |")
        out.append("|---|---|---|---|---|--:|---|---|--:|--:|--:|--:|--:|:--:|")
        for d in _moe_sorted(moe):
            m, c = d.get("metrics", {}), d.get("correctness", {})
            rp = d.get("routing_profile", {})
            ser = m.get("serial_us_p50", m.get("roundtrip_us_p50"))
            sh = d.get("shape") or {}
            fo = f"{sh.get('routing','?')} ({_fnum(rp.get('fanout_mean'), '.1f')})"
            # dtype shows whether the fp8 cast was inside the timed dispatch (LL) or not.
            dt = sh.get("dispatch_dtype", "?")
            fit = (d.get("reproduction") or {}).get("fp8_quant_in_timing")
            dt += "*" if fit else ("⁺" if fit is False else "")
            out.append(f"| `{d.get('backend')}` | {d.get('phase','')} | {d.get('mode','')} | {dt} | "
                       f"{d.get('resource_mode','')} | {d.get('ep_size','')} | {fo} | {_emoji(d.get('status'))} | "
                       f"{m.get('headline_tokens_per_rank','—')} | {_fnum(m.get('dispatch_us_p50'), '.1f')} | "
                       f"{_fnum(m.get('combine_us_p50'), '.1f')} | {_fnum(ser, '.1f')} | "
                       f"{_fnum(m.get('tokens_per_second'), '.3e')} | {'✅' if c.get('passed') else '❌'} |")
        for d in _moe_sorted(moe):
            out += _moe_sweep_table(d)
        out.append("\n> EP sweep: only source tokens/rank varies along a line. **fan-out** = mean "
                   "destination ranks/token (representativeness — top-k spread, not a permutation). "
                   "Dispatch & combine timed **separately** (staging untimed); **serial = dispatch + "
                   "combine** (a sum, not an independently-measured chained op). dtype `fp8*` = fp8 cast "
                   "IS inside the timed dispatch (LL kernel); `fp8⁺` = cast is untimed preprocessing "
                   "(normal mode). `mode` ll = DeepEP low-latency; `resource` = comm SM/CU regime.")
    if not total:
        out.append("\n> No result files found — the benchmark produced nothing.")
    return "\n".join(out)
dtype `fp8*` = fp8 cast " + "IS inside the timed dispatch (LL kernel); `fp8⁺` = cast is untimed preprocessing " + "(normal mode). `mode` ll = DeepEP low-latency; `resource` = comm SM/CU regime.") + if not total: + out.append("\n> No result files found — the benchmark produced nothing.") + return "\n".join(out) + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX result summary") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--runner", default=None) + ap.add_argument("--ts", default=None) + ap.add_argument("--markdown", action="store_true", + help="emit GitHub job-summary markdown (reporting only; always exits 0)") + args = ap.parse_args() + + docs = load_results(args.results_dir, args.runner, args.ts) + nccl = [d for d in docs if d["family"] == "nccl"] + moe = [d for d in docs if d["family"] == "moe"] + coll = [d for d in docs if d["family"] in COLLECTIVE_FAMILIES] + total = len(docs) + n_valid = sum(d.get("status") == "valid" for d in docs) + + if args.markdown: + print(render_markdown(nccl, moe, coll, n_valid, total)) + return 0 # reporting step — never fail the job here + + print(render_plain(nccl, moe, coll, n_valid, total)) + if total == 0: + print("ERROR: no result files found — benchmark produced nothing.") + return 1 + if n_valid < total: + print(f"ERROR: {total - n_valid} result(s) invalid — failing the job.") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/sweep_matrix.py b/experimental/CollectiveX/sweep_matrix.py new file mode 100644 index 000000000..2c0d98d14 --- /dev/null +++ b/experimental/CollectiveX/sweep_matrix.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +"""CollectiveX — sweep matrix resolver (the `setup` job of collectivex-sweep.yml). + +Resolves the requested suites into the GHA matrix of SHARDS. A shard = one allocation that sweeps +many cases sharing (sku, backend, mode, resource_mode) — generate_matrix's own grouping. 
Big shards +are CHUNKED so no single matrix cell exceeds the GHA 6h job budget. Each case is enriched with its +model dims (hidden/topk/experts from workloads.yaml) + token ladder + canonical flag, so the in- +container shard loop (run_in_container.sh SHARD mode) needs no further config lookup. + +Knobs: --backends sweeps every EP library in ONE matrix; --backend remaps the deepep matrix onto a +single other library (capability-filtered); --deepep-v2 threads kernel_gen=v2. Emits a JSON matrix for `fromJSON` in the +workflow: {"include": [ {id, sku, backend, mode, resource, deepep_v2, n, cases:[...]}, ... ]}. + + python3 sweep_matrix.py --suites all --out matrix.json + python3 sweep_matrix.py --suites all --backend uccl --max-cases 12 --out matrix.json +""" +from __future__ import annotations + +import argparse +import json +import os +import sys + +HERE = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, HERE) +sys.path.insert(0, os.path.join(HERE, "tests")) +import yaml # noqa: E402 +import generate_matrix as gm # noqa: E402 +import capability as cap # noqa: E402 + +# platform key -> workflow `sku` input value (must match the workflow's sku choices + runner label) +SKU = {"h100": "h100-dgxc", "h200": "h200", "b300": "b300", "b200": "b200-dgxc", + "mi355x": "mi355x", "gb300": "gb300", "gb200": "gb200"} + + +def _dims(wl_cfg, name): + for sec in ("synthetic", "model_derived"): + m = (wl_cfg.get(sec) or {}).get(name) + if m: + return m.get("hidden"), m.get("topk"), m.get("experts", m.get("routed_experts")) + return None, None, None + + +def _ladder(suite_cfg, phase): + if phase == "decode" and suite_cfg.get("token_points_decode"): + return " ".join(map(str, suite_cfg["token_points_decode"])) + if phase == "prefill" and suite_cfg.get("token_points_prefill"): + return " ".join(map(str, suite_cfg["token_points_prefill"])) + if suite_cfg.get("token_points"): + return " ".join(map(str, suite_cfg["token_points"])) + return "" + + +def main() -> int: + ap = 
argparse.ArgumentParser(description="CollectiveX sweep matrix resolver") + ap.add_argument("--suites", default="all", help="'all' or comma-list of suite names") + ap.add_argument("--backend", default="", help="remap deepep cases onto ONE EP lib (uccl/flashinfer/deepep-hybrid/nccl-ep)") + ap.add_argument("--backends", default="", + help="combined multi-backend matrix in ONE run: 'all' or a comma-list " + "(deepep,deepep-v2,uccl,flashinfer,deepep-hybrid,nccl-ep). Each deepep-origin " + "case is emitted once per backend (capability-filtered); mori stays AMD-native. " + "Supersedes per-backend dispatches. Overrides --backend/--deepep-v2 when set.") + ap.add_argument("--deepep-v2", action="store_true") + ap.add_argument("--only-sku", default="", help="restrict to one workflow sku value") + ap.add_argument("--min-nodes", type=int, default=0, + help="keep only shards whose tray count (nodes, blank=1) is >= this; " + "e.g. 2 = rack-scale EP8 only (skip the single-tray EP4 cells)") + ap.add_argument("--max-nodes", type=int, default=0, + help="keep only shards whose tray count (nodes, blank=1) is <= this; " + "e.g. 
1 = single-tray EP4 only (skip the rack-scale EP8 cells)") + ap.add_argument("--max-cases", type=int, default=128, help="chunk shards larger than this into sub-cells (128 = effectively no chunking for current suites; each shard's cases run consecutively in ONE allocation, amortizing runner/enroot/build startup)") + ap.add_argument("--out", default="") + ap.add_argument("--slim", action="store_true", + help="emit matrix WITHOUT the per-cell cases list (fits the GHA output size cap); " + "cells re-resolve their own cases via --emit-shard") + ap.add_argument("--emit-shard", default="", + help="write just this shard id's {cases:[...]} (the CX_SHARD_FILE for run_in_container)") + ap.add_argument("--shard-out", default="results/.shard.json") + a = ap.parse_args() + + wl_cfg = yaml.safe_load(open(os.path.join(HERE, "configs", "workloads.yaml"))) + suites_cfg = yaml.safe_load(open(os.path.join(HERE, "configs", "suites.yaml")))["suites"] + suite_names = list(suites_cfg) if a.suites == "all" else [s.strip() for s in a.suites.split(",")] + + # Backend expansion targets for a deepep-origin case, as (backend, deepep_v2) pairs: + # --backends "all"|comma-list -> COMBINED matrix (every backend in ONE run; supersedes the + # per-backend dispatches). 'deepep-v2' is the from-source V2 kernel = deepep + v2 flag. + # else -> the legacy single --backend (+ --deepep-v2) behavior. 
+ NV_EP_ALL = ["deepep", "deepep-v2", "uccl", "flashinfer", "deepep-hybrid", "nccl-ep"] + if a.backends: + names = NV_EP_ALL if a.backends == "all" else [x.strip() for x in a.backends.split(",") if x.strip()] + targets = [("deepep", True) if n == "deepep-v2" else (n, False) for n in names] + else: + targets = [(a.backend or "deepep", a.deepep_v2)] + + # collect enriched cases, deduped globally (a config shared by several suites appears once) + seen = set() + shards: dict = {} + for sname in suite_names: + scfg = suites_cfg[sname] + for c in gm.generate(sname)["cases"]: + plat = c["platform"] + beng0 = c["backend"] + if beng0 not in ("deepep", "mori"): + continue + sku = SKU.get(plat, plat) + if a.only_sku and sku != a.only_sku: + continue + phase = c["phase"] + rmode = c["resource_mode"] + lad = _ladder(scfg, phase) + h, t, e = _dims(wl_cfg, c["workload"]) + # MoRI envelope guard: capped ladder (T=1..16) + tuned for BOTH phases. MoRI prefill IS + # supported (MORI-EP does intra+inter-node, both modes — ROCm/mori); prefill at the capped + # ladder is validated 5/5 (run 28461798511). It was an UNCAPPED ladder to T=128 that timed + # out, not prefill itself — so prefill is capped here, NOT skipped (correcting an earlier + # decode-only assumption). + if sku == "mi355x": + lad, rmode = "1 2 4 8 16", "tuned" + # rack-scale tray->nodes (gb200/gb300 = 4 GPU/tray): EP4 = 1 tray, EP8 = 2 trays. ALWAYS + # set an EXPLICIT count: the gb300 launcher does NODES="${CX_NODES:-2}", so an EMPTY + # CX_NODES coerces to 2 (EP8) — an EP4 cell with nodes="" silently ran EP8 (the rack + # multi-srun, which bypasses cx_build_deepep_v2 / cx_build_flashinfer_latest). nodes="1" + # makes EP4 actually run EP4 (run_in_container, which builds V2/quant-combine). 
+ nodes = "" + if plat in ("gb200", "gb300"): + nodes = str(max(1, int(c.get("ep") or 8) // 4)) + # The broad sweep runs SEEDED-runtime (comparable-experimental), NOT pre-staged canonical: + # a fixed seed + identical params already yields the same cross-SKU trace for a fair + # comparison, without the per-case canonical-manifest staging (overhead + a fragility — the + # official cohort is a separate targeted run). run_in_container also re-stages per case if + # canonical is ever re-enabled (the CX_WORKLOAD_DIR unset fix). + canonical = False + # mori cases stay AMD-native; deepep-origin cases expand across the requested backend set. + case_targets = [("mori", False)] if beng0 == "mori" else targets + for (beng, v2) in case_targets: + ok, _r = cap.resolve(plat, beng, mode=c["mode"], dtype=c["dtype"], contract=c["contract"], + routing=c["routing"], eplb=bool(c.get("eplb")), + activation_profile=c.get("activation_profile", "normal")) + if not ok: + continue + # DeepEP V2 (from-source kernel_gen=v2) is genuine on aarch64 gb200/gb300 at BOTH EP4 + # (single-tray, gb300 run 28429220764) AND EP8 rack (2-tray MNNVL, gb300 run 28434764062 + # -> kernel_gen=v2/ws8/correct). The EP8 rack path builds V2 once-per-node into a persistent + # container (CX_BUILD_ONLY) and the harness passes allow_mnnvl=True (CX_ALLOW_MNNVL) so the + # NVL buffer spans trays — so v2 is now allowed on gb200/gb300 at every EP degree. 
+ case = { + "backend": beng, "deepep_v2": v2, "mode": c["mode"], "dtype": c["dtype"], + "contract": c["contract"], "routing": c["routing"], "phase": phase, + "eplb": bool(c.get("eplb")), "resource_mode": rmode, + "activation_profile": c.get("activation_profile", "normal"), + "placement": c.get("placement", "packed"), "routing_step": str(c.get("routing_step", 0)), + "uneven_tokens": c.get("uneven_tokens", "none"), + "hidden": "" if h in (None, 7168) else str(h), + "topk": "" if t in (None, 8) else str(t), + "experts": "" if e in (None, 256) else str(e), + "ladder": lad, "canonical": canonical, "nodes": nodes, + } + sig = (sku, beng, v2, c["mode"], c["dtype"], c["contract"], c["routing"], phase, + case["eplb"], rmode, case["activation_profile"], case["placement"], + case["routing_step"], case["uneven_tokens"], case["hidden"], case["topk"], + case["experts"], nodes) + if sig in seen: + continue + seen.add(sig) + # shard key = the CONTAINER/allocation-determining fields only: (sku, backend, v2, nodes). + # mode + resource_mode are per-case runtime knobs (run_in_container reads CX_MODE/ + # CX_RESOURCE_MODE per case), so they do NOT split shards — all modes/rmodes of one + # (sku,backend,v2,nodes) run consecutively in ONE allocation, paying the enroot import + + # from-source build ONCE (not once per mode). + key = (sku, beng, v2, nodes) + shards.setdefault(key, []).append(case) + + # PER-BACKEND chunk size. Fast backends (deepep*/nccl-ep/mori/deepep-hybrid) run a whole build-group + # in ONE allocation (max_cases, ~no chunking). flashinfer is SLOW (~3.2 min/case, heavy per-case MNNVL + # workspace setup) and intermittently hits `CUDA error: unspecified launch failure` under rapid + # back-to-back cases — so chunk it small: bounded, PARALLEL jobs, fewer successive setups per + # allocation. (uccl is NOT chunked: it fit a 74-case allocation cleanly; its only misses were a few + # ll-mode per-case timeouts that chunking wouldn't change.) 
+ SLOW_MAX_CASES = {"flashinfer": 12} # 12 (not 16): flashinfer cases retry up to 3x for the intermittent + # MNNVL-barrier deadlock, so smaller chunks keep a chunk within --time. + include = [] + for (sku, beng, v2, nodes), cases in sorted(shards.items()): + if a.min_nodes and max(1, int(nodes or 1)) < a.min_nodes: + continue # --min-nodes: skip single-tray (EP4) shards, keep only rack-scale (EP8+) + if a.max_nodes and max(1, int(nodes or 1)) > a.max_nodes: + continue # --max-nodes: skip rack-scale (EP8+) shards, keep only single-tray (EP4) + tag = beng + ("-v2" if v2 else "") # distinct shard id/runner for the V2 kernel variant + mc = min(a.max_cases, SLOW_MAX_CASES.get(beng, a.max_cases)) + for ci in range(0, len(cases), mc): + chunk = cases[ci:ci + mc] + part = ci // mc + sid = f"{sku}-{tag}" + (f"-n{nodes}" if nodes else "") + (f"-p{part}" if len(cases) > mc else "") + include.append({ + "id": sid, "sku": sku, "backend": beng, + "nodes": nodes, "deepep_v2": v2, + "n": len(chunk), "cases": chunk, + }) + + # --emit-shard: write just one shard's cases (the per-cell CX_SHARD_FILE) and exit. + if a.emit_shard: + match = next((x for x in include if x["id"] == a.emit_shard), None) + if match is None: + print(f"ERROR: shard id '{a.emit_shard}' not found among {len(include)} cells", file=sys.stderr) + return 2 + os.makedirs(os.path.dirname(a.shard_out) or ".", exist_ok=True) + with open(a.shard_out, "w") as fh: + json.dump({"id": match["id"], "sku": match["sku"], "backend": match["backend"], + "nodes": match["nodes"], "deepep_v2": match["deepep_v2"], + "cases": match["cases"]}, fh) + print(f"wrote shard {a.emit_shard} ({match['n']} cases) -> {a.shard_out}", file=sys.stderr) + return 0 + + n_cells = len(include) + n_cases = sum(x["n"] for x in include) + # slim: drop the heavy `cases` from each cell so the matrix fits the GHA job-output size cap; + # each cell re-derives its cases with --emit-shard . 
+ out_include = ([{k: v for k, v in x.items() if k != "cases"} for x in include] + if a.slim else include) + matrix = {"include": out_include} + if a.out: + with open(a.out, "w") as fh: + json.dump(matrix, fh) + print(f"resolved {n_cells} shard-cells, {n_cases} cases " + f"(suites={len(suite_names)} backends={a.backends or a.backend or 'deepep'} v2={a.deepep_v2})", + file=sys.stderr) + # stdout = the matrix JSON (for `$(...)` capture in the workflow) + print(json.dumps(matrix)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/_gb300_ep_probe.py b/experimental/CollectiveX/tests/_gb300_ep_probe.py new file mode 100644 index 000000000..3889c98f5 --- /dev/null +++ b/experimental/CollectiveX/tests/_gb300_ep_probe.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +"""GB300 EP8 GO/NO-GO probe — does DeepEP work across 2 NVL72 trays (8 ranks / 2 nodes)? + +Read-only spike (no artifacts). One PATH per process (CX_PROBE_PATH), because NVSHMEM +inits once per process and the internode/LL buffers each bootstrap it. 
Reports, on rank 0, +which Buffer construction + a 1-shot dispatch/combine round-trip actually runs on this fabric: + + intranode Buffer(group, nvl, 0) (MNNVL-as-one-NVLink-domain hope) + internode Buffer(group, nvl, rdma>0) (DeepEP NVSHMEM path, over NVLink/IB) + ll Buffer(group, 0, rdma, low_latency_mode=True) (decode path; nvlink-LL allowed) + +Env (set per-rank by the srun wrapper): RANK WORLD_SIZE LOCAL_RANK MASTER_ADDR MASTER_PORT + CX_PROBE_PATH=intranode|internode|ll +""" +import os +import socket +import sys +import traceback + +import torch +import torch.distributed as dist + +RANK = int(os.environ["RANK"]) +WORLD = int(os.environ["WORLD_SIZE"]) +LR = int(os.environ["LOCAL_RANK"]) +PATH = os.environ.get("CX_PROBE_PATH", "intranode") +HOST = socket.gethostname() +H = 7168 +TOPK = 8 +EXPERTS = WORLD * 32 # 256 at world=8 — same as the real sweep +T = 8 # tiny: this is a does-it-run probe, not a timing run + + +def log(msg): + print(f"[r{RANK}@{HOST} {PATH}] {msg}", flush=True) + + +def main(): + torch.cuda.set_device(LR) + dev = torch.device(f"cuda:{LR}") + dist.init_process_group("nccl", rank=RANK, world_size=WORLD) + + import deep_ep + from deep_ep import Buffer + if RANK == 0: + import inspect + try: + import importlib.metadata as md + ver = md.version("deep_ep") + except Exception: + ver = getattr(deep_ep, "__version__", "?") + log(f"deep_ep={ver} torch={torch.__version__} cuda={torch.version.cuda}") + log(f"Buffer.__init__{inspect.signature(Buffer.__init__)}") + log(f"caps: internode_dispatch={hasattr(Buffer,'internode_dispatch')} " + f"get_dispatch_config={hasattr(Buffer,'get_dispatch_config')} " + f"low_latency_dispatch={hasattr(Buffer,'low_latency_dispatch')} " + f"ll_rdma_hint={hasattr(Buffer,'get_low_latency_rdma_size_hint')}") + + hosts = [None] * WORLD + dist.all_gather_object(hosts, HOST) + if RANK == 0: + uniq = sorted(set(hosts)) + log(f"world={WORLD} over {len(uniq)} node(s): {uniq}") + + group = dist.group.WORLD + x = torch.randn(T, H, 
dtype=torch.bfloat16, device=dev) + g = torch.Generator(device=dev).manual_seed(1234 + RANK) + idx = torch.stack([torch.randperm(EXPERTS, device=dev, generator=g)[:TOPK] + for _ in range(T)]).to(torch.int64) + w = torch.rand(T, TOPK, device=dev, generator=g).to(torch.float32) + + dist.barrier() + try: + if PATH == "intranode": + buf = Buffer(group, 1 * 1024**3, 0) + try: + Buffer.set_num_sms(24) + except Exception: + pass + ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS) + rx, _ri, rw, _nre, h, _ev = buf.dispatch( + x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr, + num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir, + num_tokens_per_expert=ntpe) + cx, _, _ = buf.combine(rx, h, topk_weights=rw) + rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape + log(f"RESULT intranode OK: recv={tuple(rxs)} combine={tuple(cx.shape)} " + f"rdma_rank_layout={'present' if ntrr is not None else 'None'}") + + elif PATH == "internode": + buf = Buffer(group, 1 * 1024**3, 1 * 1024**3) + try: + Buffer.set_num_sms(24) + except Exception: + pass + ntr, ntrr, ntpe, itir, _ = buf.get_dispatch_layout(idx, EXPERTS) + rx, _ri, rw, _nre, h, _ev = buf.dispatch( + x, topk_idx=idx, topk_weights=w, num_tokens_per_rank=ntr, + num_tokens_per_rdma_rank=ntrr, is_token_in_rank=itir, + num_tokens_per_expert=ntpe) + cx, _, _ = buf.combine(rx, h, topk_weights=rw) + rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape + log(f"RESULT internode OK: recv={tuple(rxs)} combine={tuple(cx.shape)} " + f"rdma_rank_layout={'present' if ntrr is not None else 'None'}") + + elif PATH == "ll": + num_max = 128 + rdma = Buffer.get_low_latency_rdma_size_hint(num_max, H, WORLD, EXPERTS) + nq = max(1, EXPERTS // WORLD) + buf = Buffer(group, 0, rdma, low_latency_mode=True, num_qps_per_rank=nq, + allow_nvlink_for_low_latency_mode=True) + rx, rc, h, _ev, _hook = buf.low_latency_dispatch( + x, idx, num_max, EXPERTS, use_fp8=False, return_recv_hook=False) + cx, _ev2, _hook2 = 
buf.low_latency_combine(rx, idx, w, h) + rxs = rx[0].shape if isinstance(rx, tuple) else rx.shape + log(f"RESULT ll OK: recv={tuple(rxs)} combine={tuple(cx.shape)}") + else: + log(f"unknown CX_PROBE_PATH={PATH}") + return 2 + dist.barrier() + except Exception as exc: + if RANK == 0: + log(f"RESULT {PATH} FAIL: {exc!r}") + tb = traceback.format_exc().strip().splitlines() + for ln in tb[-8:]: + log(f" | {ln}") + # let other ranks print their error too (often the real one is rank-specific) + else: + log(f"FAIL(non0): {exc!r}") + try: + dist.barrier() + except Exception: + pass + return 1 + finally: + try: + dist.destroy_process_group() + except Exception: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/experimental/CollectiveX/tests/allreduce_fw_bench.py b/experimental/CollectiveX/tests/allreduce_fw_bench.py new file mode 100644 index 000000000..609c2c7b1 --- /dev/null +++ b/experimental/CollectiveX/tests/allreduce_fw_bench.py @@ -0,0 +1,531 @@ +#!/usr/bin/env python3 +"""CollectiveX — framework custom all-reduce benchmark (family=allreduce-fw). + +Goal P2 "Low-latency all-reduce suite", framework-integrated tier. The standardized +NCCL all-reduce is already covered by run_nccl.py (nccl-tests); this benchmark times the +CUSTOM all-reduce kernels the serving frameworks ship — the ones that beat NCCL in the +small-to-medium, latency-bound regime (TP all-reduce of activations: a few KiB .. tens of +MiB) by doing a single one-shot or two-shot NVLink reduction instead of a ring. + +It runs under torchrun (multi-process, one rank per GPU) and, for EACH importable +framework, times an all-reduce-sum of a bf16/fp32 tensor across the whole world over a +latency-focused size ladder, CUDA-event timed, validating the result against a known +reference. NCCL (torch.distributed.all_reduce) is the always-present baseline. 
+ +Implementations measured (each IMPORT-GUARDED — a framework that isn't importable in the +container is recorded as skipped, never faked): + * nccl — torch.distributed.all_reduce (baseline) + * flashinfer-oneshot } flashinfer custom all-reduce (trtllm fusion / vLLM-style + * flashinfer-twoshot } custom-allreduce), one-shot and two-shot recorded separately + * sglang — sgl_kernel / sglang custom all-reduce + * vllm — vllm custom all-reduce (vllm may or may not be in the image) + +Each measured impl is one group: + {impl, dtype, world_size, rows:[{size_bytes, latency_us, algbw_gbps, busbw_gbps, correct}]} +busbw uses the all-reduce factor 2*(n-1)/n (same as nccl-tests) so framework and NCCL bus +bandwidth are directly comparable. status=valid iff nccl + >=1 framework impl produced rows +with bw>0. A top-level frameworks_available dict records which frameworks were importable. + +Stdlib + torch; torch (and every framework) is imported lazily so `--help` works on a login +node with no GPU. One provenance-tagged JSON like rl_mesh_bench.py / run_nccl.py. + + torchrun --nproc_per_node=8 tests/allreduce_fw_bench.py --runner h200-dgxc \\ + --topology-class h200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/h200_allreduce_fw.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import sys + +SCHEMA_VERSION = 1 +MEASUREMENT_CONTRACT = "allreduce-fw-v1" +FAMILY = "allreduce-fw" + +# Latency-focused ladder: 1 KiB .. 64 MiB. This is the regime where a custom one-shot / +# two-shot NVLink all-reduce beats the NCCL ring (small messages are latency-bound; the +# ring's 2*(n-1) hops dominate). Above ~tens of MiB NCCL's bandwidth-optimal ring wins, so +# we deliberately stop at 64 MiB — past the crossover the framework kernels stop being the +# point. Geometric x4 keeps the sweep short (9 points) so per-impl warmup cost stays bounded. 
DEFAULT_MIN_BYTES = 1 << 10   # 1 KiB
DEFAULT_MAX_BYTES = 64 << 20  # 64 MiB

# Custom all-reduce kernels are written for fp16/bf16 activations (TP all-reduce); a few also
# take fp32. bf16 is the headline serving dtype. Map to torch dtype lazily (torch imported in main).
_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp32": 4}


def _sizes(lo: int, hi: int, factor: int = 4):
    """Return the geometric size ladder lo, lo*factor, lo*factor**2, ... capped at ``hi``.

    Args:
        lo: first (smallest) size; must be positive.
        hi: inclusive upper bound; ``hi < lo`` yields an empty list.
        factor: multiplicative step between points; must be greater than 1.

    Raises:
        ValueError: if ``lo <= 0`` or ``factor <= 1``. Either value would keep the
            running size from ever exceeding ``hi`` (0 * factor == 0), so the loop
            below would append forever instead of returning.
    """
    if lo <= 0:
        raise ValueError(f"size ladder needs a positive start (got lo={lo})")
    if factor <= 1:
        raise ValueError(f"size ladder needs factor > 1 (got factor={factor})")
    out, s = [], lo
    while s <= hi:
        out.append(s)
        s *= factor
    return out


def comparison_key(meta: dict) -> str:
    """Rows may share a curve only within the same (impl, dtype, world, topology, contract).
    impl + topology-class are part of the key so e.g. flashinfer-oneshot on H200(NVLink) is
    never silently overlaid on sglang or on a different topology.

    Returns the first 16 hex chars of the SHA-256 of those five fields joined with ``|``.
    Raises KeyError if ``meta`` lacks any of them.
    """
    parts = [meta["impl"], meta["dtype"], str(meta["world_size"]),
             meta["topology_class"], meta["measurement_contract"]]
    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]


def _bench(fn, torch, warmup: int, iters: int) -> float:
    """CUDA-event timed mean ms/iter (identical pattern to rl_mesh_bench._bench).

    ``fn`` is called ``warmup`` times untimed, then ``iters`` times between two CUDA
    events; ``torch`` is the (lazily imported) torch module. Exceptions raised by ``fn``
    propagate to the caller.
    """
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    # elapsed_time() is only read after this synchronize, i.e. once `end` has been reached.
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # ms/iter


def _bandwidths(nbytes: int, ms: float, world: int):
    """algbw + busbw (GB/s) for an all-reduce, matching nccl-tests so framework numbers are
    directly comparable to run_nccl.py. algbw = size/time; busbw = algbw * 2*(n-1)/n.

    Returns ``(algbw, busbw)``; a non-positive ``ms`` yields ``(0.0, 0.0)`` rather than
    dividing by zero, and ``world <= 1`` uses a factor of 1.0.
    """
    if ms <= 0:
        return 0.0, 0.0
    sec = ms / 1e3
    algbw = (nbytes / sec) / 1e9
    factor = (2.0 * (world - 1) / world) if world > 1 else 1.0
    return algbw, algbw * factor


# --------------------------------------------------------------------------------------
# Implementation registry.
# Each entry is a builder: given (torch, dist, dev, world, rank,
# dtype_str) it returns either None (framework/kernel not available -> skipped) or a dict
# {"runner": fn(tensor)->None in-place all-reduce-sum, "free": optional teardown}.
# Every builder is fully import-guarded and never raises out — an unavailable framework is a
# recorded skip with a note, never a fake row. Several framework entrypoints are GUESSED
# defensively across plausible API surfaces (flashinfer/sglang/vllm reorganize these often);
# each guess is tried under try/except and simply yields "skipped" if absent, so a wrong guess
# degrades to a skip rather than a crash.
# --------------------------------------------------------------------------------------

def _build_nccl(torch, dist, dev, world, rank, dtype):
    """Baseline: torch.distributed.all_reduce (NCCL). Always available when dist is up.

    Only ``dist`` is used; the other arguments exist so every builder shares one signature.
    """
    def run(t):
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
    return {"runner": run, "note": "torch.distributed.all_reduce (NCCL ring)"}


# FlashInfer custom AR works on a [token_num, hidden_dim] activation tensor (the TP all-reduce
# shape), so the flashinfer impls sweep this fixed hidden and reshape the bench's flat buffer to
# [numel/H, H]. Sizes not a multiple of H (only the smallest 1 KiB point) raise _SkipSize -> the
# bench records a skipped row and continues (does NOT mark the impl failed).
_FI_AR_HIDDEN = 2048


class _SkipSize(Exception):
    """Raised by an impl's run() for a size its kernel can't shape (skip that size, keep the impl)."""


def _build_flashinfer(torch, dist, dev, world, rank, dtype, variant):
    """FlashInfer custom all-reduce, one-shot vs two-shot as distinct impls — the REAL contract
    (pinned on B300, flashinfer 0.6.8.post1): trtllm_allreduce_fusion with pattern_code=
    AllReduceFusionPattern.kAllReduce (pure AR, no fusion) and use_oneshot True/False selecting
    one-shot vs two-shot. The IPC workspace comes from trtllm_create_ipc_workspace_for_all_reduce_
    fusion(tp_rank, tp_size, max_token_num, hidden_dim, group) -> (ipc_handles, workspace_ptrs[7]).
    Both variants validated correct=True at EP2. (These APIs carry a deprecation note toward a future
    allreduce.py, but are the functional one/two-shot entrypoints in this wheel.)

    Returns None when flashinfer.comm cannot be imported, a ``{"runner": None, "skip": ...}``
    dict when a required entrypoint is missing or workspace creation fails, otherwise a dict
    with ``runner``, ``free`` and ``note``. ``variant == "oneshot"`` selects one-shot; any other
    value selects two-shot.
    """
    try:
        import flashinfer.comm as ficomm
        from flashinfer.comm import trtllm_ar as fi_ar
    except Exception:
        return None
    fusion = getattr(ficomm, "trtllm_allreduce_fusion", None)
    mkws = getattr(ficomm, "trtllm_create_ipc_workspace_for_all_reduce_fusion", None)
    rmws = getattr(ficomm, "trtllm_destroy_ipc_workspace_for_all_reduce_fusion", None)
    Pat = getattr(fi_ar, "AllReduceFusionPattern", None) or getattr(ficomm, "AllReduceFusionPattern", None)
    if fusion is None or mkws is None or Pat is None or not hasattr(Pat, "kAllReduce"):
        return {"runner": None,
                "skip": "flashinfer.comm lacks trtllm_allreduce_fusion / IPC workspace / "
                        "AllReduceFusionPattern.kAllReduce"}
    H = _FI_AR_HIDDEN
    use_oneshot = (variant == "oneshot")
    # Workspace is sized for the DEFAULT ladder ceiling, expressed in tokens of width H.
    max_tok = max(1, (DEFAULT_MAX_BYTES // _DTYPE_BYTES[dtype]) // H)
    try:
        ws = mkws(rank, world, max_tok, H, group=dist.group.WORLD)
    except Exception as exc:
        return {"runner": None, "skip": f"fusion IPC workspace creation failed: {exc!r}"}
    ipc_handles = ws[0] if isinstance(ws, (list, tuple)) else None
    ws_ptrs = ws[1] if isinstance(ws, (list, tuple)) and len(ws) >= 2 else None
    pat = Pat.kAllReduce
    out_buf = {}  # token_num -> reusable output tensor, so timed iterations don't allocate

    def run(t, _f=fusion, _pat=pat, _os=use_oneshot, _wp=ws_ptrs):
        numel = t.numel()
        if numel < H or (numel % H) != 0:
            raise _SkipSize(f"size {numel} elems not a multiple of hidden {H}")
        Tn = numel // H
        # Two-shot splits the sequence dim across ranks -> it asserts token_num > tp_size. One-shot
        # has no such floor. Skip (don't fail) the small sizes where two-shot can't run.
        if not _os and Tn <= world:
            raise _SkipSize(f"two-shot needs token_num({Tn}) > tp_size({world})")
        inp = t.view(Tn, H)
        out = out_buf.get(Tn)
        if out is None:
            out = torch.empty_like(inp)
            out_buf[Tn] = out
        _f(allreduce_in=inp, world_size=world, world_rank=rank, token_num=Tn, hidden_dim=H,
           workspace_ptrs=_wp, launch_with_pdl=False, trigger_completion_at_end=True,
           fp32_acc=True, pattern_code=_pat, use_oneshot=_os, allreduce_out=out,
           residual_in=None, residual_out=None, norm_out=None, quant_out=None, scale_out=None,
           rms_gamma=None, rms_eps=None, scale_factor=None, layout_code=None)
        # The kernel is out-of-place; copy back so the bench's in-place run(t) contract + its
        # correctness check (which reads t) hold. The copy is small vs the AR and noted in the row.
        t.copy_(out.view(-1))

    def free():
        # Best-effort teardown: a failed destroy must not fail the benchmark.
        if rmws is not None and ipc_handles is not None:
            try:
                rmws(ipc_handles, group=dist.group.WORLD)
            except Exception:
                pass

    return {"runner": run, "free": free,
            "note": f"flashinfer.comm.trtllm_allreduce_fusion kAllReduce use_oneshot={use_oneshot} "
                    f"(hidden={H}, out-of-place + copy-back)"}


def _sglang_vllm_ca_runner(ps, torch, dev, world, rank, fw):
    """Shared: replicate the framework's SERVING distributed init (init_distributed_environment +
    initialize_model_parallel) on the existing torchrun group, then return a run() that calls the TP
    GroupCoordinator's custom-allreduce. sglang AND vllm expose the identical parallel_state API
    (sglang forked vllm's), so one helper drives both. The serving init is exactly the context the
    CustomAllreduce wrapper needs (it builds ca_comm only after initialize_model_parallel) — which is
    why a bare-wrapper construction skipped before. Fully guarded -> skip dict on any failure.

    ``ps`` is the framework's parallel_state module and ``fw`` its display name (used only in
    messages). Always returns a dict: either ``{"runner": None, "skip": ...}`` or one with
    ``runner``, ``free`` (the TP group's ``destroy`` attribute, if any) and ``note``.
    """
    try:
        if not ps.model_parallel_is_initialized():
            ps.init_distributed_environment(world_size=world, rank=rank,
                                            distributed_init_method="env://",
                                            local_rank=local_device_index(dev), backend="nccl")
            ps.initialize_model_parallel(tensor_model_parallel_size=world)
        tp = ps.get_tp_group()
    except Exception as e:
        return {"runner": None, "skip": f"{fw} distributed init failed: {e!r}"}
    # sglang/vllm expose ca_comm directly on the GroupCoordinator; aiter nests it under
    # device_communicator.ca_comm — try both.
    ca = getattr(tp, "ca_comm", None) or getattr(getattr(tp, "device_communicator", None), "ca_comm", None)
    if ca is None or getattr(ca, "disabled", True):
        return {"runner": None,
                "skip": f"{fw} TP group ca_comm absent/disabled (no custom-AR at world={world}; "
                        f"needs >1 rank + a supported topology/size)"}

    def run(t, _ca=ca):
        if hasattr(_ca, "should_custom_ar") and not _ca.should_custom_ar(t):
            raise _SkipSize(f"{fw} ca_comm: size outside custom-AR range")
        out = _ca.custom_all_reduce(t)
        # Copy back only when the kernel produced a distinct output tensor.
        if out is not None and out.data_ptr() != t.data_ptr():
            t.copy_(out)
    return {"runner": run, "free": getattr(tp, "destroy", None),
            "note": f"{fw} GroupCoordinator.ca_comm.custom_all_reduce (serving init replicated)"}


def _build_sglang(torch, dist, dev, world, rank, dtype):
    """SGLang custom all-reduce. The wrapper builds its IPC buffer only inside the framework's
    distributed init (initialize_model_parallel) — so replicate that on the torchrun group and use
    the TP group's ca_comm (the prior bare-CustomAllreduce construction skipped for exactly this).

    Returns a skip dict when sglang's parallel_state cannot be imported, otherwise whatever
    _sglang_vllm_ca_runner returns.
    """
    try:
        from sglang.srt.distributed import parallel_state as ps
    except Exception as e:
        return {"runner": None, "skip": f"sglang.srt.distributed import failed (not in image?): {e!r}"}
    return _sglang_vllm_ca_runner(ps, torch, dev, world, rank, "sglang")


def _build_vllm(torch, dist, dev, world, rank, dtype):
    """vLLM in-tree custom all-reduce via its GroupCoordinator — same serving-init replication as
    sglang (vllm.distributed.parallel_state has the identical init/get_tp_group/ca_comm API). vLLM
    isn't in the sglang image, so this runs under the vLLM container switch (CX_BENCH=allreduce-fw +
    sku/image -> a vllm image); skips on absence.

    Returns a skip dict when vllm.distributed cannot be imported or the shared helper could not
    wire a runner; otherwise the helper's dict with ``free`` wrapped so it also leaves the vLLM
    config context entered here.
    """
    try:
        from vllm.distributed import parallel_state as ps
    except Exception as e:
        return {"runner": None, "skip": f"vllm.distributed import failed (not in image — needs a vLLM container): {e!r}"}
    # vLLM's CustomAllreduce is a CustomOp that asserts an ACTIVE VllmConfig at instantiation
    # ("Current vLLM config is not set" — observed on vllm/vllm-openai). Enter set_current_vllm_config
    # PERSISTENTLY so the init + the timed run() calls all see the config (it sets a contextvar);
    # free() exits it. Guarded: a vLLM without this API proceeds without (the helper reports failures).
    cm = None
    try:
        from vllm.config import VllmConfig, set_current_vllm_config
        cm = set_current_vllm_config(VllmConfig())
        cm.__enter__()
    except Exception:
        cm = None
    built = _sglang_vllm_ca_runner(ps, torch, dev, world, rank, "vllm")
    if cm is None:
        return built

    def _exit_config(_cm=cm):
        # Best-effort: leaving the config context must never fail the benchmark.
        try:
            _cm.__exit__(None, None, None)
        except Exception:
            pass

    if built.get("runner") is None:
        # Skipped impl: the bench loop `continue`s without ever calling free(), so the
        # config context would stay entered for the rest of the process. Leave it now.
        _exit_config()
        return built

    _orig_free = built.get("free")

    def _free(_of=_orig_free):
        try:
            if _of:
                _of()
        finally:
            _exit_config()

    built["free"] = _free
    return built


def _module_exists(name: str) -> bool:
    """Return True if ``name`` resolves to an importable module spec; False on any lookup error."""
    import importlib.util
    try:
        return importlib.util.find_spec(name) is not None
    except Exception:
        return False


def _build_aiter(torch, dist, dev, world, rank, dtype):
    """AITER (AMD) custom all-reduce via its GroupCoordinator. aiter.dist.parallel_state forked
    vllm's (same init_distributed_environment / initialize_model_parallel / get_tp_group), with
    ca_comm nested under device_communicator — so the shared serving-init helper drives it. The
    first version constructed the wrapper BARE and got a nan; replicating the init gives a working
    ca_comm. Skips on absence (NVIDIA image has no aiter)."""
    try:
        from aiter.dist import parallel_state as ps
    except Exception as e:
        return {"runner": None, "skip": f"aiter.dist import failed (not in image?): {e!r}"}
    return _sglang_vllm_ca_runner(ps, torch, dev, world, rank, "aiter")


def local_device_index(dev) -> int:
    """Return ``dev.index``, or 0 when the attribute is missing or None."""
    return dev.index if getattr(dev, "index", None) is not None else 0


# (impl-name, builder, top-level framework key). flashinfer one/two-shot share the "flashinfer"
# framework key; nccl's framework is "torch". The framework key drives frameworks_available.
def _impl_registry():
    """Return the ordered all-reduce implementations this benchmark can attempt.

    Each entry is ``(impl_name, builder, framework_key)``. ``main`` calls the builder as
    ``builder(torch, dist, dev, world, rank, dtype)`` and expects either ``None`` (framework
    not importable) or a dict carrying ``runner`` plus optional ``skip`` / ``note`` / ``free``.
    The lambdas defer the lookup of the ``_build_*`` helpers until the builder is invoked.
    """
    return [
        ("nccl", lambda *a: _build_nccl(*a), "torch"),
        ("flashinfer-oneshot", lambda *a: _build_flashinfer(*a, variant="oneshot"), "flashinfer"),
        ("flashinfer-twoshot", lambda *a: _build_flashinfer(*a, variant="twoshot"), "flashinfer"),
        ("sglang", lambda *a: _build_sglang(*a), "sglang"),
        ("vllm", lambda *a: _build_vllm(*a), "vllm"),
        ("aiter", lambda *a: _build_aiter(*a), "aiter"),
    ]


def main() -> int:
    """Run the all-reduce sweep for every requested impl and write the result JSON on rank 0.

    Returns the process exit code: 3 when torch cannot be imported, 5 when WORLD_SIZE < 2,
    0 on non-zero ranks and on a valid rank-0 run, 1 when the NCCL baseline produced no
    row with bandwidth > 0.
    """
    ap = argparse.ArgumentParser(description="CollectiveX framework custom all-reduce benchmark")
    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
    ap.add_argument("--dtype", default="bf16", choices=sorted(_DTYPE_BYTES))
    ap.add_argument("--warmup", type=int, default=10)
    ap.add_argument("--iters", type=int, default=50)
    ap.add_argument("--impls", default="",
                    help="comma/space-separated subset of impls to run (default: all). "
                         "e.g. 'nccl,flashinfer-oneshot' — nccl is always included as baseline.")
    ap.add_argument("--runner", required=True)
    ap.add_argument("--topology-class", required=True)
    ap.add_argument("--transport", default="nvlink")
    ap.add_argument("--env-json")
    ap.add_argument("--timestamp")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:  # pragma: no cover
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3

    rank = int(os.environ.get("RANK", "0"))
    world = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if world < 2:
        if rank == 0:
            print(f"ERROR: allreduce-fw needs world_size >= 2 (got {world}); "
                  f"launch under torchrun --nproc_per_node=N", file=sys.stderr)
        return 5
    torch.cuda.set_device(local_rank)
    dev = torch.device(f"cuda:{local_rank}")
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12359")
    if not dist.is_initialized():
        dist.init_process_group("nccl")

    torch_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[args.dtype]
    elem_bytes = _DTYPE_BYTES[args.dtype]
    sizes = _sizes(args.min_bytes, args.max_bytes)

    # Which impls to attempt. nccl baseline is always included.
    want = {s for s in args.impls.replace(",", " ").split() if s}
    registry = _impl_registry()
    if want:
        registry = [e for e in registry if e[0] in want or e[0] == "nccl"]

    # frameworks_available: framework key -> {available: bool, detail: note/skip-reason}.
    frameworks_available: dict = {}

    def _note_framework(fwkey: str, available: bool, detail: str):
        prev = frameworks_available.get(fwkey)
        # importable wins over a per-variant skip (flashinfer may import yet a variant be absent).
        if prev is None or (available and not prev.get("available")):
            frameworks_available[fwkey] = {"available": available, "detail": detail}

    groups = []
    peak_bw = 0.0
    nccl_ok = False
    framework_ok = False

    for impl_name, builder, fwkey in registry:
        # Build the impl on every rank (custom AR needs collective IPC setup on all ranks).
        try:
            built = builder(torch, dist, dev, world, rank, args.dtype)
        except Exception as exc:
            built = {"runner": None, "skip": f"builder raised: {exc!r}"}

        if built is None:
            _note_framework(fwkey, False, "framework not importable")
            if rank == 0:
                print(f"  {impl_name}: skipped (framework '{fwkey}' not importable)", file=sys.stderr)
            continue
        if built.get("runner") is None:
            reason = built.get("skip", "no usable entrypoint")
            # framework imported (we got past `is None`) but this impl/variant isn't wireable.
            _note_framework(fwkey, fwkey == "torch", reason if fwkey != "torch" else "baseline")
            if rank == 0:
                print(f"  {impl_name}: skipped ({reason})", file=sys.stderr)
            continue

        _note_framework(fwkey, True, built.get("note", "available"))
        run = built["runner"]
        rows = []
        impl_failed = False
        for nbytes in sizes:
            numel = max(1, nbytes // elem_bytes)
            actual_bytes = numel * elem_bytes
            # Known inputs so the reduced result has a closed form: every rank fills with its
            # (rank+1); all-reduce-sum -> world*(world+1)/2 in every element. Lets us validate
            # custom kernels against a reference without trusting the kernel to define "correct".
            base = float(rank + 1)
            expected = float(world * (world + 1) // 2)
            try:
                t = torch.full((numel,), base, dtype=torch_dtype, device=dev)

                def step(_t=t):
                    run(_t)
                ms = _bench(step, torch, args.warmup, args.iters)
            except _SkipSize as sk:
                # The kernel can't shape this size (e.g. below the custom-AR hidden) — record a
                # skipped row and CONTINUE; do NOT fail the impl (it works at the other sizes).
                rows.append({"size_bytes": actual_bytes, "latency_us": None,
                             "algbw_gbps": 0.0, "busbw_gbps": 0.0, "correct": None,
                             "skipped": str(sk)})
                continue
            except Exception as exc:
                rows.append({"size_bytes": actual_bytes, "latency_us": None,
                             "algbw_gbps": 0.0, "busbw_gbps": 0.0, "correct": None,
                             "error": repr(exc)})
                impl_failed = True
                break

            # Correctness: re-run once on a fresh known buffer and compare to the reference.
            correct = None
            try:
                chk = torch.full((numel,), base, dtype=torch_dtype, device=dev)
                run(chk)
                ref = torch.full((numel,), expected, dtype=torch_dtype, device=dev)
                # bf16/fp16 accumulate with rounding; tolerance scales with the magnitude.
                atol = 0.0 if args.dtype == "fp32" else max(1.0, expected * 0.02)
                correct = bool(torch.allclose(chk, ref, atol=atol, rtol=0.0))
            except Exception:
                # Best-effort check: a failure here leaves the row's "correct" as None.
                correct = None

            # Reduce timing across ranks (max = slowest rank) for a stable cross-rank number,
            # exactly like rl_mesh_bench. Done with the always-present NCCL collective on a tiny
            # tensor (not the impl under test).
            tt = torch.tensor([ms], device=dev)
            dist.all_reduce(tt, op=dist.ReduceOp.MAX)
            ms_max = float(tt.item())
            algbw, busbw = _bandwidths(actual_bytes, ms_max, world)
            peak_bw = max(peak_bw, busbw)
            rows.append({"size_bytes": actual_bytes,
                         "latency_us": round(ms_max * 1e3, 3),
                         "algbw_gbps": round(algbw, 3),
                         "busbw_gbps": round(busbw, 3),
                         "correct": correct})

        if built.get("free"):
            try:
                built["free"]()
            except Exception:
                # Deliberate best-effort teardown: a failing free() must not abort the sweep.
                pass

        had_bw = any((r.get("busbw_gbps") or 0.0) > 0.0 for r in rows)
        if had_bw:
            if impl_name == "nccl":
                nccl_ok = True
            else:
                framework_ok = True
        meta = {"impl": impl_name, "framework": fwkey, "dtype": args.dtype,
                "world_size": world, "topology_class": args.topology_class,
                "measurement_contract": MEASUREMENT_CONTRACT}
        groups.append({**meta, "comparison_key": comparison_key(meta),
                       "note": built.get("note"), "rows": rows,
                       "incomplete": impl_failed})
        if rank == 0:
            mn = min((r["latency_us"] for r in rows if r.get("latency_us")), default=None)
            # default=0.0: an empty size sweep leaves rows empty, and a bare max() would raise
            # on rank 0 only, leaving the other ranks blocked on the final barrier.
            peak_row_bw = max(((r.get("busbw_gbps") or 0.0) for r in rows), default=0.0)
            print(f"  {impl_name}: {len(rows)} sizes, min latency "
                  f"{mn if mn is not None else float('nan')} us, peak busbw "
                  f"{peak_row_bw:.1f} GB/s", file=sys.stderr)

    if rank != 0:
        dist.barrier()
        dist.destroy_process_group()
        return 0

    env = None
    if args.env_json and os.path.exists(args.env_json):
        with open(args.env_json) as fh:
            env = json.load(fh)

    # valid iff the NCCL baseline produced real (bw>0) rows — the all-reduce curve itself is the
    # deliverable. Which framework custom kernels were importable on this image is recorded in
    # frameworks_available + the `framework_ok` flag (not all frameworks ship in every image); a run
    # with only nccl is a valid latency/bandwidth baseline, not a failure.
    status = "valid" if nccl_ok else "invalid"

    doc = {
        "schema_version": SCHEMA_VERSION, "family": FAMILY,
        "generated_by": "allreduce_fw_bench.py",
        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
        "runner": args.runner, "transport": args.transport,
        "measurement_contract": MEASUREMENT_CONTRACT,
        "world_size": world, "dtype": args.dtype,
        "size_min_bytes": args.min_bytes, "size_max_bytes": args.max_bytes,
        "status": status,
        "peak_busbw_gbps": round(peak_bw, 2),
        "frameworks_available": frameworks_available,
        # True when at least one non-nccl impl produced a row with bandwidth > 0.
        "framework_ok": framework_ok,
        "num_groups": len(groups), "groups": groups, "environment": env,
    }
    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
    with open(args.out, "w") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")

    avail = sorted(k for k, v in frameworks_available.items() if v.get("available"))
    print(f"allreduce-fw: {len(groups)} impl group(s) -> {args.out} "
          f"(status={status}, world={world}, dtype={args.dtype}, "
          f"frameworks_available={avail}, peak_busbw={peak_bw:.1f} GB/s)")
    dist.barrier()
    dist.destroy_process_group()
    return 0 if status == "valid" else 1


if __name__ == "__main__":
    raise SystemExit(main())
This static +table mirrors the adapters' SUPPORTED_* sets so the matrix compiler / a pre-flight step +can REJECT or OMIT invalid combinations BEFORE consuming a runner (review #3). The +adapters still reject at runtime — this just fails fast and keeps the matrix honest. + + python3 tests/capability.py --sku b300 --backend deepep --mode ll --dtype fp8 \ + --contract layout-and-dispatch-v1 # exit 0 if valid, 3 + reason if not + python3 tests/capability.py --list # dump the table +""" +from __future__ import annotations + +import argparse +import json +import sys + +# SKU -> vendor. The runner label's SKU prefix selects the launcher; vendor gates backend. +SKU_VENDOR = { + "h100": "nvidia", "h200": "nvidia", "b200": "nvidia", "b300": "nvidia", + "gb200": "nvidia", "gb300": "nvidia", "h100-dgxc": "nvidia", "b200-dgxc": "nvidia", + "mi355x": "amd", "mi350x": "amd", "mi325x": "amd", "mi300x": "amd", +} + + +def _sku_arch(sku: str) -> str: + s = (sku or "").lower() + if s.startswith(("gb300", "gb200", "b300", "b200")): + return "blackwell" + if s.startswith(("h100", "h200")): + return "hopper" + if s.startswith("mi3"): + return "cdna" + return "unknown" + + +# Dispatch dtypes that need a specific GPU arch. NVFP4 (e2m1 4-bit) is a Blackwell-native tensor +# format — FlashInfer's fp4 quantize/dequantize does NOT round-trip correctly on Hopper sm90 +# (validated: nvfp4 dispatch correct=True on B300, correct=False on H100). mxfp8 (e4m3) is fine on +# Hopper. Gated here so a Hopper nvfp4 dispatch is cleanly REJECTED, not run-and-marked-invalid. +ARCH_ONLY_DTYPES = {"nvfp4": "blackwell", "mxfp4": "blackwell"} + +# Backend capability table — MIRRORS the adapter SUPPORTED_* sets (the runtime source of +# truth). Keep in sync with ep_deepep.py / ep_mori.py. LL is decode-only; cached-layout is +# normal-only; MoRI is bf16/normal/layout-and-dispatch only. +# All synthetic routing distributions (trace transforms — backend-agnostic) + the temporal modes. 
ALL_ROUTINGS = ["uniform", "balanced", "balanced-rank-local", "zipf", "zipf-mild",
                "zipf-moderate", "zipf-heavy", "hotspot-single", "hotspot-moving", "alternating-groups"]
# Activation value profiles. Under bf16 combine all are RUNNABLE but latency-neutral; the
# non-normal ones become latency-relevant only under a quantized combine (PR311 — see quant_modes).
ALL_ACTIVATION_PROFILES = ["normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation"]
# Per-backend EP capability table consulted by resolve(). Every entry carries the same keys:
# vendors, modes, dtypes (dispatch precision), contracts, transports, combine_dtypes,
# quant_modes (combine quantization), routings, eplb, activation_profiles.
CAP = {
    "deepep": {
        "vendors": ["nvidia"],
        "modes": ["normal", "ll"],
        # DISPATCH-side precision + fp8 scale-layout recipe variants (same kernel, different cast):
        # fp8=per-block-128, fp8-pertoken=per-token scale, fp8-directcast=unscaled (no scale transport).
        "dtypes": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast"],
        "contracts": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"],
        "transports": ["nvlink", "rdma"],
        # Combine path is a SEPARATE axis from dispatch dtype (review): today combine is bf16
        # with no quant on every backend regardless of dispatch_dtype. fp8/quantized combine is
        # reserved until a kernel is wired — capability rejects it so it can't be silently faked.
        # Quantized combine (mxfp8/mxfp4/nvfp4) is in flashinfer moe_a2a_combine (PR3376/3643,
        # merged) but MNNVL-gated on x86_64 — reserved, see docs/upstream_precision.md + gated.md.
        "combine_dtypes": ["bf16"],
        "quant_modes": ["none"],
        # routing/EPLB/activation semantics (goal P2 "distribution + quant-combine constraints in
        # capabilities"): DeepEP honors any trace (routing is a pure trace transform) + EPLB.
        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
    },
    "uccl": {
        # UCCL EP (uccl.ep.Buffer) is a DeepEP-API clone on NVIDIA — mirror DeepEP's capability.
        # bf16+fp8 dispatch, normal+ll modes, the same 3 contracts, bf16/none combine.
        "vendors": ["nvidia"],
        "modes": ["normal", "ll"],
        "dtypes": ["bf16", "fp8"],
        "contracts": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"],
        "transports": ["nvlink", "rdma"],
        "combine_dtypes": ["bf16"],
        "quant_modes": ["none"],
        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
    },
    "flashinfer": {
        # FlashInfer EP = flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll (pre-installed) — the
        # TRT-LLM throughput-backend one-sided A2A over an MNNVL symmetric workspace. The A2A is a
        # dtype-agnostic byte-mover taking input_payloads as a LIST, so a quantized dispatch =
        # move [q, scale_factor] + dequant in stage(). DISPATCH precisions:
        #   bf16; fp8/fp8-pertoken/fp8-directcast (e4m3, DeepEP convention); mxfp8/mxfp4/nvfp4
        #   (OCP-microscaling via FlashInfer's native quantize/dequantize kernels).
        "vendors": ["nvidia"],
        "modes": ["normal"],
        "dtypes": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"],
        "contracts": ["layout-and-dispatch-v1"],
        "transports": ["nvlink", "mnnvl"],
        # Combine: bf16 default, OR a quantized COMBINE OUTPUT (fp8 e4m3) via moe_a2a_combine
        # output_dtype — present in a NEWER flashinfer (PR3376/3643), pulled in by the run's
        # cx_build_flashinfer_latest upgrade (the bundled 0.6.8.post1 lacks it). mxfp8 combine is
        # reserved (e8m0 output packing).
        # NOTE(review): the original comment also called nvfp4 combine "reserved until fp8-combine
        # is GHA-validated", yet "nvfp4" is admitted by both lists below — confirm which is intended.
        "combine_dtypes": ["bf16", "fp8", "nvfp4"],
        "quant_modes": ["none", "fp8", "nvfp4"],
        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
    },
    "deepep-hybrid": {
        # DeepEP hybrid-ep branch (NVIDIA TMA HybridEPBuffer), built from source by
        # cx_build_deepep_hybrid. Intranode NVLink path (<=8 ranks, one NVLink domain). bf16 normal
        # layout-and-dispatch only; fp8 (use_fp8) + internode NVLink<->RDMA forwarding are further lift.
        "vendors": ["nvidia"],
        "modes": ["normal"],
        "dtypes": ["bf16"],
        "contracts": ["layout-and-dispatch-v1"],
        "transports": ["nvlink"],
        "combine_dtypes": ["bf16"],
        "quant_modes": ["none"],
        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
    },
    "mori": {
        "vendors": ["amd"],
        "modes": ["normal"],
        # DISPATCH-side precision. fp8 = e4m3fnuz DIRECT-CAST (the ROCm-native FNUZ format) via MoRI's
        # quant_type=fp8_direct_cast — the only fp8 mode this MoRI build accepts (GHA introspection
        # found the valid set is ['none','fp8_direct_cast']; the kernel casts bf16<->e4m3fnuz
        # internally, scale_dim=0). bf16 combine OUTPUT unchanged (combine_dtypes below).
        "dtypes": ["bf16", "fp8"],
        "contracts": ["layout-and-dispatch-v1"],
        "transports": ["xgmi", "rdma"],
        # Not yet admitted: "fp8" combine via MoRI PR311 (merged, QuantType::Fp8BlockwiseQuant) and
        # the matching "fp8_blockwise" quant mode, once wired — see docs/upstream_precision.md.
        "combine_dtypes": ["bf16"],
        "quant_modes": ["none"],
        # MoRI also honors any trace + EPLB (a routing-trace transform), bf16 value-neutral.
        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
    },
    "nccl-ep": {
        # NCCL/RCCL all-to-all EP (tests/ep_nccl.py) — the canonical token-shuffle EP built on pure
        # torch.distributed collectives (all_to_all_single), no custom RDMA. Runs on BOTH vendors
        # (NCCL on NVIDIA, RCCL on AMD — identical API) and is the only EP backend that survives
        # cross-node WITHOUT GPUDirect-RDMA: NCCL/RCCL host-stage the all-to-all, where UCCL's
        # ibv_reg_mr (EINVAL) and MoRI's RDMA registration abort. bf16 / normal / layout-and-dispatch.
        "vendors": ["nvidia", "amd"],
        "modes": ["normal"],
        "dtypes": ["bf16"],
        "contracts": ["layout-and-dispatch-v1"],
        "transports": ["nvlink", "rdma", "xgmi"],
        "combine_dtypes": ["bf16"],
        "quant_modes": ["none"],
        "routings": ALL_ROUTINGS, "eplb": True, "activation_profiles": ALL_ACTIVATION_PROFILES,
    },
}
# nccl/rccl are collective primitives, not EP dispatch/combine — phase is meaningless. The `nccl`
# BENCHMARK runs on BOTH vendors: run_nccl_suite auto-selects nccl-tests on CUDA and rccl-tests on
# ROCm (same binaries/output), so the All-reduce/All-gather tabs get an MI355X line too. (`rccl` is
# kept as an explicit amd-only alias for direct dispatch.)
COLLECTIVE = {"nccl": ["nvidia", "amd"], "rccl": ["amd"]}
# Non-EP benchmarks (family != moe): memcpy-family (offload/copy-engine/kv-cache) + the RL
# trainer<->generator mesh transfer (rl-mesh, multi-process NCCL send/recv). The EP capability
# axes (mode/dtype/contract/phase) don't apply, so they pass validation unconditionally on their
# vendors. Backend name -> list of vendors it may run on.
# NOTE(review): the original comment said "offload/copy-engine are NVIDIA-only", but the table
# admits copy-engine on amd as well — confirm which is intended; only offload is nvidia-only here.
HOST_GPU_BENCH = {"offload": ["nvidia"], "copy-engine": ["nvidia", "amd"],
                  "kv-cache": ["nvidia", "amd"], "rl-mesh": ["nvidia", "amd"],
                  "allreduce-fw": ["nvidia", "amd"],
                  # nixl = the NIXL point-to-point transfer bench (kv-cache family) + the device-EP
                  # build-probe; runs in the dynamo tensorrtllm-runtime container (NVIDIA-only).
                  "nixl": ["nvidia"],
                  # mori-io = MoRI-IO RDMA p2p transfer engine (mori.io); AMD MoRI image only.
                  "mori-io": ["amd"],
                  # nccl-kv = NCCL/RCCL p2p KV transfer (torch.distributed send/recv); both vendors.
                  "nccl-kv": ["nvidia", "amd"],
                  # mooncake = Mooncake transfer-engine RDMA KV transfer (pip-installed); both vendors
                  # (transfer_write_on_cuda / _on_hip), needs an RDMA NIC.
                  "mooncake": ["nvidia", "amd"]}

# 'all' resolves to a DEFINED per-vendor backend set (not the same across vendors).
VENDOR_BACKENDS = {"nvidia": ["nccl", "deepep", "uccl", "flashinfer"], "amd": ["rccl", "mori"]}


def resolve(sku, backend, mode="normal", dtype="bf16",
            contract="layout-and-dispatch-v1", combine_dtype="bf16", combine_quant_mode="none",
            routing="uniform", eplb=False, activation_profile="normal"):
    """Return (ok: bool, reason: str). dtype = DISPATCH precision; combine_dtype/
    combine_quant_mode are the SEPARATE combine-path axes (default bf16/none = today's behavior).
    routing/eplb/activation_profile gate the distribution semantics a backend admits (goal P2).

    `sku` may be a full runner name: everything from the first "_" on is dropped. Lookup is
    case-insensitive (as in _sku_arch), and a hyphenated pool variant that is not listed itself
    (e.g. "b200-multinode") falls back to its base SKU ("b200").
    """
    sku = (sku or "").split("_")[0].strip().lower()
    vendor = SKU_VENDOR.get(sku)
    if vendor is None and "-" in sku:
        # Pool variants share the base SKU's vendor.
        vendor = SKU_VENDOR.get(sku.split("-", 1)[0])
    if vendor is None:
        return False, f"unknown SKU '{sku}'"
    if backend in COLLECTIVE:
        if vendor not in COLLECTIVE[backend]:
            return False, f"{backend} is not the {vendor} collective backend"
        return True, "collective primitive (phase/dtype/mode/contract not applicable)"
    if backend in HOST_GPU_BENCH:
        if vendor not in HOST_GPU_BENCH[backend]:
            return False, f"{backend} bench not available on {vendor}"
        return True, f"{backend} host/GPU memcpy-family bench (EP axes not applicable)"
    cap = CAP.get(backend)
    if cap is None:
        return False, f"unknown backend '{backend}'"
    if vendor not in cap["vendors"]:
        return False, f"{backend} runs on {cap['vendors']}, not {vendor} SKU '{sku}'"
    if mode not in cap["modes"]:
        return False, f"{backend} modes={cap['modes']} (got '{mode}')"
    if dtype not in cap["dtypes"]:
        return False, f"{backend} dispatch dtypes={cap['dtypes']} (got '{dtype}')"
    need_arch = ARCH_ONLY_DTYPES.get(dtype)
    if need_arch and _sku_arch(sku) != need_arch:
        return False, (f"{dtype} dispatch requires {need_arch} (FP4 is Blackwell-native; FlashInfer's "
                       f"fp4 kernels don't round-trip on Hopper); SKU '{sku}' is {_sku_arch(sku)}")
    if contract not in cap["contracts"]:
        return False, f"{backend} contracts={cap['contracts']} (got '{contract}')"
    if mode == "ll" and contract == "cached-layout-comm-only-v1":
        return False, "cached-layout-comm-only-v1 is meaningless for LL (layout is in-kernel)"
    if combine_dtype not in cap.get("combine_dtypes", ["bf16"]):
        return False, f"{backend} combine_dtypes={cap.get('combine_dtypes', ['bf16'])} (got '{combine_dtype}')"
    if combine_quant_mode not in cap.get("quant_modes", ["none"]):
        return False, (f"{backend} quant_modes={cap.get('quant_modes', ['none'])} "
                       f"(got '{combine_quant_mode}') — quant combine not wired yet")
    if routing not in cap.get("routings", ALL_ROUTINGS):
        return False, f"{backend} routings={cap.get('routings', ALL_ROUTINGS)} (got '{routing}')"
    if eplb and not cap.get("eplb", False):
        return False, f"{backend} does not support EPLB"
    if activation_profile not in cap.get("activation_profiles", ["normal"]):
        return False, (f"{backend} activation_profiles={cap.get('activation_profiles', ['normal'])} "
                       f"(got '{activation_profile}')")
    # an activation profile that needs special scaling is only MEANINGFUL under a quantized combine
    # (bf16 is value-independent) — runnable but flagged so it isn't read as a latency result.
    if activation_profile != "normal" and combine_quant_mode == "none":
        return True, (f"ok (note: activation_profile={activation_profile} is latency-neutral under "
                      f"bf16/none combine — value sensitivity needs a quantized combine)")
    return True, "ok"


def main() -> int:
    """CLI entry point: print VALID/INVALID plus the reason, or dump the tables with --list.

    Returns 0 when the combination is valid (or --list was given), 3 otherwise.
    """
    ap = argparse.ArgumentParser(description="CollectiveX capability resolver")
    ap.add_argument("--sku")
    ap.add_argument("--backend")
    ap.add_argument("--mode", default="normal")
    ap.add_argument("--dtype", default="bf16")
    ap.add_argument("--contract", default="layout-and-dispatch-v1")
    ap.add_argument("--combine-dtype", default="bf16")
    ap.add_argument("--combine-quant-mode", default="none")
    ap.add_argument("--routing", default="uniform")
    ap.add_argument("--eplb", action="store_true")
    ap.add_argument("--activation-profile", default="normal")
    ap.add_argument("--list", action="store_true")
    a = ap.parse_args()
    if a.list:
        # Dump every table resolve() consults; the two extra keys are additive.
        print(json.dumps({"sku_vendor": SKU_VENDOR, "cap": CAP,
                          "collective": COLLECTIVE, "vendor_backends": VENDOR_BACKENDS,
                          "host_gpu_bench": HOST_GPU_BENCH,
                          "arch_only_dtypes": ARCH_ONLY_DTYPES}, indent=2))
        return 0
    ok, reason = resolve(a.sku, a.backend, a.mode, a.dtype, a.contract,
                         a.combine_dtype, a.combine_quant_mode,
                         a.routing, a.eplb, a.activation_profile)
    print(f"{'VALID' if ok else 'INVALID'}: sku={a.sku} backend={a.backend} mode={a.mode} "
          f"dtype={a.dtype} contract={a.contract} combine_dtype={a.combine_dtype} "
          f"combine_quant_mode={a.combine_quant_mode} routing={a.routing} eplb={a.eplb} "
          f"activation_profile={a.activation_profile} — {reason}")
    return 0 if ok else 3


if __name__ == "__main__":
    raise SystemExit(main())
+ +Compares the NVIDIA COPY-ENGINE (DMA) path against an SM-based copy: + + * copy-engine path — cudaMemcpyAsync (torch .copy_/Tensor copy that lowers to + cudaMemcpyDeviceToDevice) issued on a DEDICATED copy + stream. Hardware routes device-to-device memcpy through a + copy engine (DMA), not the SMs. + * SM path — an elementwise kernel (torch mul-add) that necessarily + occupies SMs to move the same bytes. + +For each it reports latency + bandwidth across a size sweep (DtoD, and HtoD as a +second op). It then VALIDATES that the copy-engine path uses ~0 SMs: + + Primary : if pynvml is importable, sample SM utilization (nvmlDeviceGetUtilization + / process-SM) during a sustained copy-engine loop vs a sustained SM-copy + loop. copy-engine should read near-zero, SM-copy should read high. + Fallback : a concurrent-kernel NON-INTERFERENCE probe. Run a long SM-bound + "victim" kernel alone (t_victim). Then run it concurrently with a + copy-engine copy on a separate stream (t_with_ce) and with an + SM-copy on a separate stream (t_with_sm). If the copy engine truly + uses no SMs, t_with_ce ~ t_victim (the copy is hidden), whereas + t_with_sm > t_victim (the SM-copy steals SM cycles from the victim). + The ratio is reported as evidence; the proxy is documented in the doc. + +family="copy-engine". NVIDIA only (AMD SDMA is out of scope) — refuses on ROCm. + +Stdlib + torch; --help / --parse-only work without torch (import-safe writer+CLI). 
+ +Run (inside the container, 1 GPU is enough): + python tests/copy_engine_bench.py \\ + --runner h200 --topology-class h200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/h200_copy_engine.json + +Verify offline (no GPU/torch needed): + python tests/copy_engine_bench.py --parse-only --runner h200 \\ + --topology-class h200-nvlink-island --out /tmp/parsed.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import sys + +SCHEMA_VERSION = 1 +FAMILY = "copy-engine" +MEASUREMENT_CONTRACT = "copy-engine-vs-sm-v1" +GENERATED_BY = "copy_engine_bench.py" + +# (op, engine) sub-ops. engine = copy-engine (DMA) vs sm (kernel). +SUBOPS = [ + ("dtod", "copy-engine"), + ("dtod", "sm"), + ("htod", "copy-engine"), + ("htod", "sm"), +] + +DEFAULT_MIN_BYTES = 64 * 1024 +DEFAULT_MAX_BYTES = 256 * 1024 * 1024 +DEFAULT_FACTOR = 4 + + +# --------------------------------------------------------------------------- # +# import-safe helpers (no torch) # +# --------------------------------------------------------------------------- # +def size_ladder(min_bytes: int, max_bytes: int, factor: int) -> list[int]: + sizes, s = [], int(min_bytes) + while s <= int(max_bytes): + sizes.append(s) + s *= factor + return sizes + + +def comparison_key(meta: dict) -> str: + parts = [ + meta["op"], + meta["engine"], + meta["dtype"], + meta["transport"], + meta["topology_class"], + meta["comparison_class"], + meta["measurement_contract"], + ] + return hashlib.sha256("|".join(map(str, parts)).encode()).hexdigest()[:16] + + +def _load_env(path: str | None) -> dict | None: + if path and os.path.exists(path): + with open(path) as fh: + return json.load(fh) + return None + + +def _provenance() -> dict: + import platform as _plat + + arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine()) + run = { + "run_id": os.environ.get("GITHUB_RUN_ID"), + "run_attempt": 
os.environ.get("GITHUB_RUN_ATTEMPT"), + "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"), + "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"), + "repo": os.environ.get("GITHUB_REPOSITORY"), + "job": os.environ.get("GITHUB_JOB"), + "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"), + } + return { + "image": os.environ.get("COLLECTIVEX_IMAGE", ""), + "image_digest": os.environ.get("COLLECTIVEX_IMAGE_DIGEST", ""), + "image_arch": arch, + "squash_sha256": os.environ.get("COLLECTIVEX_SQUASH_SHA256"), + "git_run": run if any(run.values()) else None, + } + + +# --------------------------------------------------------------------------- # +# GPU path (torch only here) # +# --------------------------------------------------------------------------- # +def _copy_engine_copy(torch, dst, src, stream): + """DtoD/HtoD memcpy that lowers to cudaMemcpyAsync on `stream` (copy engine).""" + with torch.cuda.stream(stream): + dst.copy_(src, non_blocking=True) + + +def _sm_copy(torch, dst, src, stream): + """Bytes moved by an elementwise KERNEL (occupies SMs): dst = src * 1 + 0. + + mul/add lowers to a CUDA elementwise kernel scheduled on the SMs — the + deliberate SM-based contrast to the copy engine. 
Same byte volume as .copy_.""" + with torch.cuda.stream(stream): + torch.add(src, 0, out=dst) if dst.dtype == src.dtype else dst.copy_(src) + + +def _time_loop(torch, fn, iters: int) -> float: + torch.cuda.synchronize() + s = torch.cuda.Event(enable_timing=True) + e = torch.cuda.Event(enable_timing=True) + s.record() + for _ in range(iters): + fn() + e.record() + torch.cuda.synchronize() + return s.elapsed_time(e) / iters # ms/iter + + +def _bench_one(torch, op: str, engine: str, nbytes: int, dtype, + warmup: int, iters: int, copy_stream) -> dict: + elem = torch.tensor([], dtype=dtype).element_size() + n = max(1, nbytes // elem) + + dev_dst = torch.empty(n, dtype=dtype, device="cuda") + if op == "dtod": + src = torch.randn(n, dtype=dtype, device="cuda") if dtype.is_floating_point \ + else torch.zeros(n, dtype=dtype, device="cuda") + else: # htod + src = torch.empty(n, dtype=dtype, device="cpu", pin_memory=True) + + if engine == "copy-engine": + fn = lambda: _copy_engine_copy(torch, dev_dst, src, copy_stream) + else: + # SM kernel copy. For HtoD an add kernel can't read host memory directly, + # so stage to device first then SM-copy device->device (still SM-bound). + if op == "htod": + staged = torch.empty(n, dtype=dtype, device="cuda") + staged.copy_(src) + torch.cuda.synchronize() + src = staged + fn = lambda: _sm_copy(torch, dev_dst, src, copy_stream) + + for _ in range(warmup): + fn() + copy_stream.synchronize() + torch.cuda.synchronize() + + avg_ms = _time_loop(torch, fn, iters) + actual_bytes = n * elem + gbps = (actual_bytes / (avg_ms / 1e3)) / 1e9 if avg_ms > 0 else 0.0 + return { + "op": op, + "engine": engine, + "size_bytes": actual_bytes, + "requested_bytes": nbytes, + "latency_us": round(avg_ms * 1e3, 4), + "bandwidth_gbps": round(gbps, 3), + } + + +# ---- SM-utilization validation (primary: nvml; fallback: non-interference) -- # +def _victim_kernel_factory(torch, device): + """A long SM-bound kernel used as the 'victim' in the non-interference probe. 
+ + Repeated matmuls saturate the SMs for a measurable, stable duration; if a + concurrent copy steals SM cycles, the victim slows down.""" + m = 2048 + a = torch.randn(m, m, device=device, dtype=torch.float16) + b = torch.randn(m, m, device=device, dtype=torch.float16) + inner = 8 + + def victim(): + c = a + for _ in range(inner): + c = torch.matmul(c, b) + return c + + return victim, [m, m, m, inner] + + +def _attention_victim_factory(torch, device): + """An SM-bound ATTENTION victim (scaled_dot_product_attention = the flash-attention kernel) for + the copy-vs-attention interference probe (goal "Interference with attention kernels"). Decode-ish + attention shape [batch, heads, seq, head_dim]; repeated to saturate the SMs for a stable duration.""" + import torch.nn.functional as _F + b_, h_, s_, d_ = 8, 32, 2048, 128 + q = torch.randn(b_, h_, s_, d_, device=device, dtype=torch.float16) + k = torch.randn(b_, h_, s_, d_, device=device, dtype=torch.float16) + v = torch.randn(b_, h_, s_, d_, device=device, dtype=torch.float16) + inner = 6 + + def victim(): + o = q + for _ in range(inner): + o = _F.scaled_dot_product_attention(o, k, v) + return o + + return victim, [b_, h_, s_, d_, inner] + + +def _probe_victim(torch, victim, copy_engine_copy, sm_copy, dst, src, copy_stream, iters): + """Time a victim alone vs concurrent with a copy-engine copy vs concurrent with an SM-copy. 
+ Returns (t_victim_us, t_with_ce_us, t_with_sm_us, ce_slowdown, sm_slowdown, near_zero).""" + for _ in range(3): + victim(); copy_engine_copy(); sm_copy() + torch.cuda.synchronize() + t_victim = _time_loop(torch, lambda: victim(), iters) + t_with_ce = _time_loop(torch, lambda: (copy_engine_copy(), victim()), iters) + t_with_sm = _time_loop(torch, lambda: (sm_copy(), victim()), iters) + copy_stream.synchronize() + ce_slow = (t_with_ce / t_victim) if t_victim > 0 else None + sm_slow = (t_with_sm / t_victim) if t_victim > 0 else None + near_zero = (ce_slow is not None and sm_slow is not None + and ce_slow < 1.15 and (sm_slow - ce_slow) > 0.05) + return (round(t_victim * 1e3, 4), round(t_with_ce * 1e3, 4), round(t_with_sm * 1e3, 4), + round(ce_slow, 4) if ce_slow else None, round(sm_slow, 4) if sm_slow else None, bool(near_zero)) + + +def _sm_validation(torch, device, nbytes: int, iters: int) -> dict: + """Return evidence the copy-engine path uses ~0 SMs. + + Tries pynvml SM utilization sampling first; always also runs the + concurrent-kernel non-interference probe and records BOTH. The doc documents + which signal is authoritative.""" + elem = 2 # float16 + n = max(1, nbytes // elem) + src = torch.randn(n, dtype=torch.float16, device=device) + dst = torch.empty(n, dtype=torch.float16, device=device) + copy_stream = torch.cuda.Stream() + victim, gemm_shape = _victim_kernel_factory(torch, device) + + result: dict = { + "method": None, + "nvml": None, + "non_interference": None, + "copy_engine_uses_near_zero_sms": None, + "proxy_doc": ( + "Non-interference proxy: a long SM-bound victim kernel timed alone " + "(t_victim) vs concurrent with a copy-engine copy on a separate " + "stream (t_with_ce) vs concurrent with an SM-copy (t_with_sm). " + "ce_slowdown=t_with_ce/t_victim ~1.0 => the copy engine stole no SM " + "cycles; sm_slowdown=t_with_sm/t_victim >1.0 => the SM-copy did. 
" + "copy_engine_uses_near_zero_sms is asserted when ce_slowdown is " + "materially smaller than sm_slowdown (and < ce_slowdown_threshold)." + ), + } + + # ---- primary: pynvml SM utilization while copying on the copy engine ---- + try: + import pynvml # type: ignore + + pynvml.nvmlInit() + idx = torch.cuda.current_device() + handle = pynvml.nvmlDeviceGetHandleByIndex(idx) + + def _sample_during(fn, n_samples=40) -> float: + # launch a long stream of the op, sample SM util repeatedly, take max + import time + for _ in range(3): + fn() + samples = [] + # keep the queue full while sampling + for _ in range(n_samples): + for _ in range(8): + fn() + u = pynvml.nvmlDeviceGetUtilizationRates(handle) + samples.append(u.gpu) + time.sleep(0.001) + torch.cuda.synchronize() + return max(samples) if samples else 0.0 + + ce_util = _sample_during( + lambda: _copy_engine_copy(torch, dst, src, copy_stream)) + sm_util = _sample_during( + lambda: _sm_copy(torch, dst, src, copy_stream)) + result["nvml"] = { + "source": "pynvml nvmlDeviceGetUtilizationRates (whole-GPU SM util %)", + "copy_engine_max_sm_util_pct": ce_util, + "sm_copy_max_sm_util_pct": sm_util, + "note": "whole-GPU util is a coarse proxy; copy-engine should read low, SM-copy high", + } + pynvml.nvmlShutdown() + except Exception as exc: + result["nvml"] = {"available": False, "error": repr(exc)} + + # ---- always: concurrent-kernel non-interference probe ---- + try: + # warmup + for _ in range(3): + victim() + _copy_engine_copy(torch, dst, src, copy_stream) + _sm_copy(torch, dst, src, copy_stream) + torch.cuda.synchronize() + + t_victim = _time_loop(torch, lambda: victim(), iters) + + def _victim_with_ce(): + _copy_engine_copy(torch, dst, src, copy_stream) + victim() + + def _victim_with_sm(): + _sm_copy(torch, dst, src, copy_stream) + victim() + + t_with_ce = _time_loop(torch, _victim_with_ce, iters) + t_with_sm = _time_loop(torch, _victim_with_sm, iters) + copy_stream.synchronize() + + ce_slow = (t_with_ce / t_victim) 
if t_victim > 0 else None + sm_slow = (t_with_sm / t_victim) if t_victim > 0 else None + threshold = 1.15 + near_zero = ( + ce_slow is not None and sm_slow is not None + and ce_slow < threshold and (sm_slow - ce_slow) > 0.05 + ) + result["non_interference"] = { + "victim_kernel": "matmul x8 (fp16 2048^3)", + "gemm_shape": gemm_shape, + "t_victim_us": round(t_victim * 1e3, 4), + "t_victim_with_copy_engine_us": round(t_with_ce * 1e3, 4), + "t_victim_with_sm_copy_us": round(t_with_sm * 1e3, 4), + "ce_slowdown": round(ce_slow, 4) if ce_slow else None, + "sm_slowdown": round(sm_slow, 4) if sm_slow else None, + "ce_slowdown_threshold": threshold, + } + result["copy_engine_uses_near_zero_sms"] = bool(near_zero) + result["method"] = ("nvml+non-interference" + if result.get("nvml", {}).get("source") else "non-interference") + except Exception as exc: + result["non_interference"] = {"error": repr(exc)} + result["method"] = result["method"] or "failed" + + # ---- copy-vs-ATTENTION interference (goal "Interference with attention kernels") ---- + # Same probe with a flash-attention (scaled_dot_product_attention) victim instead of GEMM, so + # the copy engine's non-interference is shown against BOTH expert-GEMM and attention kernels. 
def run_gpu(args) -> tuple[list[dict], dict, str | None]:
    """Run the copy-engine vs SM copy sweep plus the SM-validation probe on the GPU.

    Args:
        args: parsed CLI namespace. Reads ``dtype``, ``min_bytes``, ``max_bytes``,
            ``factor``, ``warmup``, ``iters`` and ``validation_bytes``.

    Returns:
        ``(rows, diagnostics, error)``. When torch cannot be imported or no GPU is
        visible, ``rows``/``diagnostics`` are empty and ``error`` is a message;
        otherwise ``error`` is ``None``. A size that raises ``RuntimeError`` in
        ``_bench_one`` still yields a row, with null latency/bandwidth and the error repr.
    """
    try:
        import torch
    except Exception as exc:  # pragma: no cover
        return [], {}, f"torch unavailable: {exc!r}"
    if not torch.cuda.is_available():
        return [], {}, "torch.cuda.is_available() is False (no GPU in this container)"
    # Accelerator-aware: on NVIDIA the off-SM DMA path is the copy engine; on AMD/ROCm the same
    # async stream-copy lowers to the SDMA (System DMA) engines (the "AMD SDMA path"). The bench
    # body is identical (torch.cuda maps to HIP); we label the DMA engine honestly per accelerator
    # and let the non-interference probe characterize SDMA-vs-CU interference (pynvml is absent on
    # ROCm, so _sm_validation falls back to the pure-torch non-interference path automatically).
    is_rocm = bool(getattr(torch.version, "hip", None))
    accel = "rocm" if is_rocm else "cuda"
    copy_engine_kind = "sdma" if is_rocm else "copy-engine"

    dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16,
             "float32": torch.float32}[args.dtype]
    sizes = size_ladder(args.min_bytes, args.max_bytes, args.factor)
    copy_stream = torch.cuda.Stream()

    rows: list[dict] = []
    for op, engine in SUBOPS:
        for nbytes in sizes:
            try:
                rows.append(_bench_one(torch, op, engine, nbytes, dtype,
                                       args.warmup, args.iters, copy_stream))
            except RuntimeError as exc:
                # Keep the sweep going: record the failed point instead of aborting the run.
                rows.append({"op": op, "engine": engine, "size_bytes": nbytes,
                             "requested_bytes": nbytes, "latency_us": None,
                             "bandwidth_gbps": None, "error": repr(exc)})

    # Describe the device the benchmark actually ran on. The stream above and the probe below
    # both use the *current* device, so the reported name/SM count must too (was hard-coded 0).
    dev_index = torch.cuda.current_device()
    diagnostics = {
        "sm_validation": _sm_validation(torch, torch.device("cuda"),
                                        args.validation_bytes, max(10, args.iters)),
        "device_name": torch.cuda.get_device_name(dev_index),
        "multiprocessor_count": torch.cuda.get_device_properties(dev_index).multi_processor_count,
        "accelerator": accel,
        "copy_engine_kind": copy_engine_kind,  # "sdma" on AMD/ROCm, "copy-engine" on NVIDIA
        "hip_version": getattr(torch.version, "hip", None),
    }
    return rows, diagnostics, None
"transport": args.transport, "topology_class": args.topology_class, + "comparison_class": args.comparison_class, + "measurement_contract": MEASUREMENT_CONTRACT, + } + curve_keys = {} + for op, engine in SUBOPS: + curve_keys[f"{op}/{engine}"] = comparison_key(dict(meta, op=op, engine=engine)) + for r in rows: + r["comparison_key"] = curve_keys.get(f"{r['op']}/{r['engine']}") + + doc = { + "schema_version": SCHEMA_VERSION, + "family": FAMILY, + "generated_by": GENERATED_BY, + "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), + "runner": args.runner, + "transport": args.transport, + "measurement_contract": MEASUREMENT_CONTRACT, + "topology_class": args.topology_class, + "comparison_class": args.comparison_class, + "dtype": args.dtype, + "sub_ops": [f"{o}/{e}" for o, e in SUBOPS], + "comparison_key": comparison_key(meta), + "curve_keys": curve_keys, + "status": "valid" if transferred else "invalid", + "error": error, + # "copy-engine" on NVIDIA, "sdma" on AMD/ROCm (same off-SM DMA-engine role) — labeled so the + # AMD SDMA result is not conflated with the NVIDIA copy-engine result in the plot. 
def main() -> int:
    """CLI entry point: parse flags, run (or skip) the GPU sweep, and write the JSON doc.

    With ``--parse-only`` no GPU work is done and the rows/diagnostics are empty.
    A one-line summary is printed to stderr. Returns 0 when the written document's
    status is "valid", otherwise 1.
    """
    parser = argparse.ArgumentParser(
        description="CollectiveX copy-engine vs SM copy bench (NVIDIA)")
    add = parser.add_argument
    # sweep shape
    add("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
    add("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
    add("--factor", type=int, default=DEFAULT_FACTOR)
    add("--dtype", default="float16", choices=["float16", "bfloat16", "float32"])
    add("--warmup", type=int, default=5)
    add("--iters", type=int, default=30)
    add("--validation-bytes", type=int, default=16 * 1024 * 1024,
        help="copy size used by the SM-utilization validation probe")
    add("--parse-only", action="store_true",
        help="emit a well-formed (status=invalid) doc with no GPU — schema check")
    # identity / labelling of the run
    add("--runner", required=True)
    add("--topology-class", required=True)
    add("--transport", default="nvlink",
        help="DtoD transport: nvlink (intra-node) | pcie")
    add("--comparison-class", default="standardized",
        choices=["standardized", "backend-optimized", "framework-integrated"])
    add("--env-json")
    add("--timestamp")
    add("--out", required=True)
    args = parser.parse_args()

    # Either a schema-only placeholder result or the real GPU measurement.
    result = ([], {}, "parse-only (no GPU run)") if args.parse_only else run_gpu(args)
    rows, diagnostics, error = result

    doc = build_doc(args, rows, diagnostics, error)

    # Make sure the destination directory exists before writing the artifact.
    out_dir = os.path.dirname(os.path.abspath(args.out))
    os.makedirs(out_dir, exist_ok=True)
    with open(args.out, "w") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")

    sv = doc["diagnostics"].get("sm_validation", {})
    summary = (
        f"copy-engine: {doc['num_rows']} rows -> {args.out} "
        f"(status={doc['status']}, peak_bw={doc['peak_bandwidth_gbps']} GB/s, "
        f"ce_near_zero_sms={doc['copy_engine_uses_near_zero_sms']}, "
        f"method={sv.get('method')}, key={doc['comparison_key']})"
    )
    print(summary, file=sys.stderr)

    if doc["status"] == "valid":
        return 0
    return 1
" + f"{exc!r}", file=sys.stderr) + raise + + +def _deepep_version() -> str: + try: + import importlib.metadata as _md + return _md.version("deep_ep") + except Exception: + return getattr(deep_ep, "__version__", "unknown") + + +# DeepEP's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token +# block-128 scale (deep_ep 1.2.1 ships NO helper for this — utils is empty — so we +# implement the exact convention its kernels expect: scales [T, H//128] float32, e4m3, +# 448 = e4m3 max). Both directions of the cast run OUTSIDE the timed window (cast in +# make_problem, dequant in stage), so fp8 quantization is NOT included in dispatch time. +_FP8_MAX = 448.0 +_FP8_BLOCK = 128 + + +def _per_token_cast_to_fp8(x): + # PER-BLOCK-128 scale layout (DeepEP default): one scale per 128-elem block per token. + # x: [T, H] (H % 128 == 0) -> (x_fp8 [T,H] e4m3fn, scales [T, H//128] f32) + T, H = x.shape + xv = x.float().view(T, H // _FP8_BLOCK, _FP8_BLOCK) + amax = xv.abs().amax(dim=2).clamp(min=1e-4) # [T, H//128] + x_fp8 = (xv * (_FP8_MAX / amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(T, H) + return x_fp8, (amax / _FP8_MAX).contiguous() + + +def _per_token_cast_to_fp8_pertoken(x): + # PER-TOKEN scale layout: ONE amax per token (over all H), broadcast across the H//128 blocks. + # Coarser than block-128 (slightly higher quant error) but the same scale transport cost. + T, H = x.shape + amax = x.float().abs().amax(dim=1, keepdim=True).clamp(min=1e-4) # [T, 1] + x_fp8 = (x.float() * (_FP8_MAX / amax)).to(torch.float8_e4m3fn) + scales = (amax / _FP8_MAX).expand(T, H // _FP8_BLOCK).contiguous() # broadcast per-token + return x_fp8, scales + + +def _directcast_to_fp8(x): + # DIRECT-CAST: clamp to the e4m3 range and cast with NO learned scale (unit scale). Carries no + # scale metadata (zero scale-transport overhead) but truncates activations above e4m3 max — the + # recipe MoRI PR311 replaced for accuracy. scales=ones so _per_block_dequant is the plain cast-back. 
+ T, H = x.shape + x_fp8 = x.float().clamp(-_FP8_MAX, _FP8_MAX).to(torch.float8_e4m3fn) + scales = torch.ones((T, H // _FP8_BLOCK), dtype=torch.float32, device=x.device) + return x_fp8, scales + + +# dispatch_dtype value -> (scale_layout label, cast fn). All feed DeepEP's same (fp8, scales) kernel +# input; they differ only in the quant recipe, so they are distinct OPERATING POINTS, not dtypes. +_FP8_RECIPES = { + "fp8": ("per-block-128", _per_token_cast_to_fp8), + "fp8-pertoken": ("per-token", _per_token_cast_to_fp8_pertoken), + "fp8-directcast": ("direct-cast", _directcast_to_fp8), +} + + +def _per_block_dequant(x_fp8, scales): + # inverse of the above: [R,H] e4m3 + [R, H//128] f32 -> [R,H] bf16 + R, H = x_fp8.shape + xv = x_fp8.float().view(R, H // _FP8_BLOCK, _FP8_BLOCK) + return (xv * scales.unsqueeze(2)).view(R, H).to(torch.bfloat16) + + +def _per_block_dequant_3d(x_fp8, scales): + # LL recv layout: [E, S, H] e4m3 + [E, S, H//128] f32 -> [E, S, H] bf16 + E, S, H = x_fp8.shape + xv = x_fp8.float().view(E, S, H // _FP8_BLOCK, _FP8_BLOCK) + return (xv * scales.unsqueeze(-1)).view(E, S, H).to(torch.bfloat16) + + +def _mnnvl_buffer_kwargs() -> dict: + """Cross-tray (NVL72/MNNVL) Buffer kwargs. + + DeepEP V2's `Buffer` added `allow_mnnvl` (default False); when it is False DeepEP itself sets + `NVSHMEM_DISABLE_MNNVL=1` and the legacy buffer falls onto the intranode-only CUDA-IPC peer path, + which faults across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). On a real + multi-tray MNNVL allocation (the rack launcher exports CX_ALLOW_MNNVL=1) request allow_mnnvl=True + so the NVLink buffer spans trays over the fabric API. The bundled V1 `Buffer` predates the param + (its NVL buffer already spans MNNVL trays), so only pass it when the installed Buffer accepts it — + keeping x86 single-node and bundled-V1 rack paths byte-for-byte unchanged. 
class DeepEPBackend:
    """Harness adapter that drives DeepEP's ``Buffer`` in normal or low-latency (LL) mode.

    The adapter exposes make_problem / dispatch / stage / combine / expected /
    buffer_cap / recv_tokens / finalize plus ``backend_provenance``. Which work sits
    inside ``dispatch`` (layout, fp8 cast, recv dequant) is selected by the measurement
    contract; see SUPPORTED_CONTRACTS below.
    """

    name = "deepep"
    combine_needs_redispatch = False  # DeepEP combine reuses the handle (its own bench does too)
    # Blackwell (B300) drops GPU clocks during the tiny small-T points, so the harness
    # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100.
    wants_warm_burst = True
    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
    # normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink.
    # ll mode: low_latency_dispatch/combine — verified RUNNING intranode over NVLink via
    # allow_nvlink_for_low_latency_mode (IBGDA not required intranode) on 8xH100.
    SUPPORTED_PRECISIONS = {"bf16", "fp8", "fp8-pertoken", "fp8-directcast"}
    SUPPORTED_MODES = {"normal", "ll"}
    # Three contracts (review #3 + goal P1 runtime-visible):
    # layout-and-dispatch-v1 — times get_dispatch_layout INSIDE dispatch; fp8 cast/dequant
    # OUTSIDE (preprocessing mirrors a producer handing quantized x).
    # cached-layout-comm-only-v1 — layout hoisted out (untimed); dispatch = pure comm (DeepEP's
    # own benchmark boundary). normal mode only.
    # runtime-visible-v1 — the serving-realistic boundary: dispatch INCLUDES the fp8
    # quant (cast) + layout + comm + the recv-dequant that makes
    # expert input consumable; combine starts from bf16 expert
    # outputs. (normal mode; LL already times all of this in-kernel.)
    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}

    def __init__(self, args, rank, world_size, local_rank, device):
        """Record the run configuration and build the DeepEP buffer for ``args.mode``.

        ``local_rank`` is accepted for adapter-protocol parity and is not used here.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = args.mode
        self.ll = (args.mode == "ll")
        self.contract = args.measurement_contract
        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
        # runtime-visible-v1: the fp8 cast + recv-dequant move INSIDE the timed dispatch (normal
        # mode). LL already times cast+layout+comm in its single kernel, so it's runtime-visible
        # by construction — the flag only changes normal mode's boundary.
        self.runtime_visible = (self.contract == "runtime-visible-v1") and not self.ll
        self.group = dist.group.WORLD
        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
        # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is
        # recorded in the artifact so the looser fp8 gate is explicit, not hidden.
        self.fp8 = args.dispatch_dtype.startswith("fp8")
        # fp8 scale-layout recipe (per-block-128 default / per-token / direct-cast) — all use the
        # same DeepEP fp8 kernel; only the cast differs. Recorded so they're distinct operating points.
        self.fp8_recipe, self._fp8_cast = _FP8_RECIPES.get(
            args.dispatch_dtype, ("per-block-128", _per_token_cast_to_fp8))
        self.scale_layout = self.fp8_recipe if self.fp8 else None
        # direct-cast truncates above e4m3 (no scale) -> a touch looser gate than scaled recipes.
        self.tolerance = ((1.5e-1 if self.fp8_recipe == "direct-cast" else 1.25e-1)
                          if self.fp8 else 5e-2)
        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
        ver = _deepep_version()
        if self.ll:
            self._init_ll(args, dev_sms, ver)
        else:
            self._init_normal(args, rank, dev_sms, ver)

    def _init_normal(self, args, rank, dev_sms, ver):
        """Build the normal-mode NVLink buffer, pick the SM budget, and record provenance."""
        # fp8 cast: UNTIMED (make_problem) under layout-and-dispatch / cached-layout; TIMED (inside
        # dispatch) under runtime-visible-v1. So fp8_in_timing tracks the contract honestly.
        self.fp8_in_timing = (self.runtime_visible if self.fp8 else None)
        self.combine_needs_redispatch = False  # normal combine reuses the handle
        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points
        # (review: a phase-dependent 2/4 GiB made the shared T=128 point differ between
        # the decode and prefill sweeps). 4 GiB holds T up to 4096 (validated).
        num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES", str(4 * 1024 * 1024 * 1024)))
        mnnvl_kw = _mnnvl_buffer_kwargs()
        self.buffer = Buffer(self.group, num_nvl_bytes, 0, **mnnvl_kw)
        rm = args.resource_mode
        tuned_src = None
        if rm == "normalized":
            num_sms = max(1, round(args.sm_fraction * dev_sms))  # ~same device fraction as MoRI
        elif rm == "tuned":
            # Best-available for the installed DeepEP: its OWN default SM count
            # (Buffer.num_sms — the library's analytic choice; it deliberately uses
            # fewer SMs). get_dispatch_config(num_ranks) returns the recommended Config
            # but doesn't expose num_sms to Python, and the default already reflects it.
            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
            tuned_src = "deepep-default-num_sms"
        else:  # default — the bring-up budget
            num_sms = args.num_sms
        try:
            Buffer.set_num_sms(num_sms)
        except Exception as exc:  # pragma: no cover - version dependent
            if rank == 0:
                print(f"WARN: could not set num_sms={num_sms}: {exc!r}", file=sys.stderr)
        self.backend_provenance = {
            "deepep_version": ver,
            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
            "mode": "normal", "resource_mode": rm, "num_sms": num_sms, "device_sms": dev_sms,
            "sm_fraction": (num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
            "num_nvl_bytes": num_nvl_bytes, "allow_mnnvl": bool(mnnvl_kw),
            "fp8_recipe": self.fp8_recipe if self.fp8 else "n/a",
            "scale_layout": self.scale_layout,
        }

    def _init_ll(self, args, dev_sms, ver):
        """Build the low-latency buffer (fixed max tokens per rank) and record provenance."""
        # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink
        # via allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch
        # so for fp8 the quantization IS inside the timed window (recorded honestly). The
        # buffer is sized for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical),
        # so LL is a decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop).
        # set_num_sms does NOT apply (the LL kernel picks its own occupancy) — recorded n/a.
        self.fp8_in_timing = (True if self.fp8 else None)
        self.combine_needs_redispatch = True  # re-dispatch (untimed) before each timed combine
        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))
        self.experts = args.experts
        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
            self.num_max, args.hidden, self.world_size, args.experts)
        # one QP per local expert is the DeepEP convention for LL
        self.num_qps = max(1, args.experts // self.world_size)
        mnnvl_kw = _mnnvl_buffer_kwargs()
        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,
                             num_qps_per_rank=self.num_qps,
                             allow_nvlink_for_low_latency_mode=True, **mnnvl_kw)
        self.backend_provenance = {
            "deepep_version": ver,
            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
            "mode": "ll", "resource_mode": args.resource_mode,
            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
            "num_max_dispatch_tokens_per_rank": self.num_max,
            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
            "low_latency_mode": True, "use_fp8": self.fp8, "allow_mnnvl": bool(mnnvl_kw),
        }

    def buffer_cap(self, args):
        """Return the per-rank token cap of the sweep: ``num_max`` in LL mode, else ``None``."""
        # LL is sized for a fixed num_max; cap the sweep there (reported, not silent).
        return self.num_max if self.ll else None

    def make_problem(self, T, idx, weights, x):
        """Wrap one rank's routed slice into a problem namespace (untimed preprocessing).

        Depending on the contract this also pre-casts ``x`` to fp8 and/or precomputes the
        dispatch layout, so that ``dispatch`` only times what the contract says it should.
        """
        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared trace slice.
        p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
                                  topk_weights=weights.to(torch.float32), layout=None)
        if self.fp8 and not self.ll and not self.runtime_visible:
            # layout-and-dispatch / cached-layout: per-token block-128 cast, UNTIMED (preprocessing,
            # mirrors the real producer that hands the dispatcher already-quantized activations).
            # runtime-visible does NOT pre-cast (the cast is timed inside dispatch); LL casts in-kernel.
            p.x_fp8, p.x_scales = self._fp8_cast(x)
        if self.cache_layout:
            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed)
            # so the timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None
            # and dispatch computes it inside the timed window.)
            ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
            p.layout = (ntr, ntpe, itir)
        return p

    def dispatch(self, p):
        """Run one dispatch for problem ``p`` and return a handle namespace for stage/combine.

        Normal mode returns ``recv_x``, ``recv_topk_weights``, ``handle``, ``is_token_in_rank``
        and the fp8 reference cast (when it was made here); under runtime-visible fp8 it
        also sets ``combine_input`` and ``rv_staged``. LL mode delegates to ``_dispatch_ll``.
        """
        if self.ll:
            return self._dispatch_ll(p)
        if p.layout is not None:  # cached-layout-comm-only-v1
            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
        else:  # layout-and-dispatch / runtime-visible (timed layout)
            (num_tokens_per_rank, _, num_tokens_per_expert,
             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
        ref_fp8 = ref_scales = None
        if self.fp8:
            if self.runtime_visible:
                # runtime-visible: the per-token block-128 cast is INSIDE the timed dispatch.
                x_fp8, x_scales = self._fp8_cast(p.x)
                ref_fp8, ref_scales = x_fp8, x_scales  # for the correctness reference
            else:
                x_fp8, x_scales = p.x_fp8, p.x_scales  # pre-cast (untimed)
            x_in = (x_fp8, x_scales)
        else:
            x_in = p.x
        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
            num_tokens_per_expert=num_tokens_per_expert)
        out = types.SimpleNamespace(
            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
            is_token_in_rank=is_token_in_rank, ref_fp8=ref_fp8, ref_scales=ref_scales)
        if self.fp8 and self.runtime_visible:
            # dispatch ENDS when expert input is consumable: dequant fp8 recv -> bf16 INSIDE the
            # timed window (the contract's "expert input genuinely consumable" boundary). stage()
            # then no-ops for this contract.
            recv_fp8, recv_scales = recv_x
            out.combine_input = _per_block_dequant(recv_fp8, recv_scales)
            out.rv_staged = True
        return out

    def _dispatch_ll(self, p):
        """Low-latency dispatch; returns a namespace with ``recv_x``, ``recv_count``, ``handle``."""
        # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the
        # cast IS inside this timed op — fp8_in_timing=True). recv is the expert-major
        # 3D layout [num_local_experts, num_max*world, hidden] (+scales when fp8).
        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
            p.x, p.topk_idx, self.num_max, self.experts,
            use_fp8=self.fp8, return_recv_hook=False)
        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)

    def stage(self, p, h):
        """Set ``h.combine_input`` (bf16) from the dispatch result; always returns ``None``."""
        # comm-only contract: "expert outputs" already exist as recv_x. Dequantize fp8 recv
        # to bf16 HERE (untimed) — the expert-compute boundary — so combine moves bf16 in
        # both precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.)
        if getattr(h, "rv_staged", False):
            return None  # runtime-visible already produced bf16 combine_input inside dispatch (timed)
        if self.ll:
            if self.fp8:
                recv_fp8, recv_scales = h.recv_x
                h.combine_input = _per_block_dequant_3d(recv_fp8, recv_scales)
            else:
                h.combine_input = h.recv_x
        elif self.fp8:
            recv_fp8, recv_scales = h.recv_x
            h.combine_input = _per_block_dequant(recv_fp8, recv_scales)
        else:
            h.combine_input = h.recv_x
        return None

    def combine(self, p, h):
        """Run one combine on the staged ``h.combine_input`` and return the combined tensor."""
        if self.ll:
            # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones.
            combined_x, _event, _hook = self.buffer.low_latency_combine(
                h.combine_input, p.topk_idx, p.topk_weights, h.handle)
            return combined_x
        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle,
                                               topk_weights=h.recv_topk_weights)
        return combined_x

    def expected(self, p, h):
        """Return ``(reference tensor, p.T)`` that the combined output is checked against."""
        if self.ll:
            # LL combine reduces each token's topk expert copies weighted by topk_weights;
            # with no expert compute each copy is (the kernel's fp8 cast of) x, so
            # combined ≈ x * sum(topk_weights). fp8 quant error is covered by self.tolerance.
            wsum = p.topk_weights.sum(dim=1, keepdim=True)
            return p.x.float() * wsum, p.T
        # normal: round trip with no expert compute reconstructs x*(#destination ranks);
        # for fp8 compare against the dequantized cast that was actually sent.
        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
        ref = p.x.float()
        if self.fp8:
            # runtime-visible cast lives on the handle (no pre-cast on p); else use the pre-cast.
            x_fp8 = getattr(h, "ref_fp8", None)
            x_scales = getattr(h, "ref_scales", None)
            if x_fp8 is None:
                x_fp8, x_scales = p.x_fp8, p.x_scales
            ref = _per_block_dequant(x_fp8, x_scales).float()
        return ref * ranks_per_token, p.T

    def recv_tokens(self, h):
        """Return how many tokens (LL: token-copies) this rank received in the dispatch."""
        if self.ll:
            return int(h.recv_count.sum().item())  # token-copies received across local experts
        rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x
        return int(rx.shape[0])

    def finalize(self, rc):
        """Best-effort process-group teardown; always returns ``rc`` unchanged."""
        try:
            dist.barrier()
            dist.destroy_process_group()
        except Exception as exc:
            # Teardown stays best-effort so it can never mask the benchmark's own return
            # code, but report the failure instead of swallowing it silently.
            print(f"WARN: [{self.name}] process-group teardown failed: {exc!r}", file=sys.stderr)
        return rc
+ +The hybrid-ep branch (https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is NVIDIA's TMA + +warp-pipeline implementation of expert-parallel all-to-all, exposing `deep_ep.HybridEPBuffer` +(distinct from the mainline `deep_ep.Buffer`). HybridEP is NVIDIA's MoE backend built for NVL72 +rack-scale (Megatron `moe_flex_dispatcher_backend="hybridep"`). This adapter drives the single- +NVLink-domain path (`num_of_hybrid_ep_ranks_per_nvlink_domain == world_size`, <=8 ranks). That domain +is ONE node on x86 — but on a GB200/GB300 NVL72 the MNNVL fabric makes multiple trays a single NVLink +domain, so the SAME path spans trays: gb300 EP8 (8 ranks / 2 trays) is validated `transport=mnnvl`, +decode 8/8 + prefill 6/6 (run 28480519588). The container build is done by runtime/run_in_container.sh +`cx_build_deepep_hybrid` (CUDA-13 cccl include + libnvshmem symlink fixes; pip-installed so it persists +across the EP8 multi-srun's separate srun steps). + +API (pinned on B300, branch e0a5b1d): + HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...) + .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle) + .combine(hidden, handle=) -> [T, hidden] + +CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is +reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8 +round trip gives relerr(combined, x) = 4.28, matching E[distinct ranks] ~ 5.26 exactly. So this uses +the SAME "ranks" factor as ep_flashinfer (per-rank-sum combine, no gate re-weight). bf16 tol 5e-2. + +STATUS: bf16 / normal / layout-and-dispatch-v1. Single-NVLink-domain path (<=8 ranks) validated on x86 +single-node AND across GB300 NVL72 trays at EP8 via MNNVL (one NVLink domain, run 28480519588). fp8 and +the cross-RACK (>1 NVL72, IBGDA/RDMA) path are further lift; docs/gated.md rack-scale. 
class DeepEPHybridBackend:
    """Harness adapter that drives ``deep_ep.HybridEPBuffer`` (bf16, normal mode only).

    Exposes the same adapter protocol as the mainline DeepEP backend: make_problem /
    dispatch / stage / combine / expected / buffer_cap / recv_tokens / finalize plus
    ``backend_provenance``.
    """

    name = "deepep-hybrid"
    # HybridEPBuffer.combine consumes the recv payload + the dispatch handle (no re-dispatch needed
    # before a timed combine); the harness times dispatch and combine separately (like ep_deepep).
    combine_needs_redispatch = False
    wants_warm_burst = True
    # Capabilities — run_ep.py REJECTS anything outside these before construction.
    SUPPORTED_PRECISIONS = {"bf16"}  # fp8 = use_fp8 path, further lift
    SUPPORTED_MODES = {"normal"}
    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
    SUPPORTED_COMBINE_DTYPES = {"bf16"}
    SUPPORTED_COMBINE_QUANT_MODES = {"none"}

    def __init__(self, args, rank, world_size, local_rank, device):
        """Record the run configuration and construct the HybridEPBuffer.

        ``local_rank`` is accepted for adapter-protocol parity and is not used here.

        Raises:
            RuntimeError: if ``HybridEPBuffer`` construction fails (original error chained).
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = args.mode
        self.contract = args.measurement_contract
        self.group = dist.group.WORLD
        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
        self.tolerance = 5e-2
        self.fp8_in_timing = None
        self.top_k = int(args.topk)
        self.num_experts = int(args.experts)
        self.hidden = int(args.hidden)
        self.local_experts = max(1, self.num_experts // world_size)
        # Token cap (per rank) for the symmetric buffer; the sweep is capped here (buffer_cap).
        self.max_tokens = int(os.environ.get("CX_HYBRIDEP_MAX_TOKENS", "4096"))
        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
        ver = _deepep_hybrid_version()

        # Construct the HybridEPBuffer treating all ranks as ONE NVLink domain (default
        # num_of_hybrid_ep_ranks_per_nvlink_domain == world_size). On x86 that domain is one node; on a
        # GB200/GB300 NVL72 the MNNVL fabric makes 2 trays one NVLink domain, so EP8 (8 ranks) is covered
        # by this same path (validated transport=mnnvl). SM counts default.
        try:
            self.buffer = HybridEPBuffer(
                self.group, hidden_dim=self.hidden,
                max_num_of_tokens_per_rank=self.max_tokens,
                num_local_experts=self.local_experts, use_fp8=False)
        except Exception as exc:
            raise RuntimeError(
                f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} "
                f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc
        if rank == 0:
            print(f"[deepep-hybrid] HybridEPBuffer constructed (single NVLink domain, world={world_size}, "
                  f"local_experts={self.local_experts}, hidden={self.hidden})", file=sys.stderr)

        self.backend_provenance = {
            "deepep_commit": ver, "branch": "hybrid-ep",
            "impl": "deep_ep.HybridEPBuffer (NVIDIA TMA + warp-pipeline)",
            "mode": "normal", "transport": "nvlink-domain",  # one node (x86) or one NVL72 MNNVL domain (gb300 EP8)
            "resource_mode": args.resource_mode,
            "num_sms": None, "device_sms": dev_sms, "tuned_source": "fixed-kernel",
            "max_num_tokens": self.max_tokens, "top_k": self.top_k,
            "num_experts": self.num_experts, "local_experts": self.local_experts,
            "routing_factor": "ranks",
        }

    def buffer_cap(self, args):
        """Return the per-rank token cap the buffer was sized for."""
        return self.max_tokens

    def make_problem(self, T, idx, weights, x):
        """Wrap one rank's routed slice (idx -> int64, weights -> float32) into a namespace."""
        return types.SimpleNamespace(
            T=int(T), x=x,
            topk_idx=idx.to(torch.int64),
            topk_weights=weights.to(torch.float32),
        )

    def dispatch(self, p):
        """Run one dispatch and return a namespace with ``recv``, ``recv_payload``, ``handle``.

        The handle is located heuristically: it is the LAST element of the returned
        sequence that is a ``tuple``; it stays ``None`` if the result holds no tuple.
        """
        # HybridEPBuffer.dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) ->
        # (recv_hidden [n_recv, H], recv_x2, None, handle).
        out = self.buffer.dispatch(p.x, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
                                   num_of_experts=self.num_experts)
        is_seq = isinstance(out, (tuple, list))
        recv = out[0] if is_seq else out
        handle = None
        if is_seq:
            for item in out:
                if isinstance(item, tuple):
                    handle = item  # keep scanning: the last tuple wins
        return types.SimpleNamespace(recv=recv, recv_payload=recv, handle=handle, combine_input=None)

    def stage(self, p, h):
        """Stage the received payload as the combine input; always returns ``None``."""
        # Identity expert: the recv hidden IS the "expert output". combine reduces it per source token.
        h.combine_input = h.recv_payload
        return None

    def combine(self, p, h):
        """Run one combine; unwraps the first element if the buffer returns a sequence."""
        # combine(hidden, handle=) -> [T, H] per-source-token reduction (no gate re-weight: "ranks").
        comb = self.buffer.combine(h.combine_input, handle=h.handle)
        return comb[0] if isinstance(comb, (tuple, list)) else comb

    def expected(self, p, h):
        """Return ``(x * number of distinct destination ranks per token, p.T)``."""
        # Round trip, identity expert, per-RANK-sum combine (no gate weights): each source token is
        # x * (distinct ranks among its top_k experts) — same as ep_flashinfer's "ranks" factor.
        ref = p.x.float()
        epr = max(1, self.num_experts // self.world_size)
        ranks = (p.topk_idx.long() // epr).clamp_(0, self.world_size - 1)  # [T, topk]
        # One-hot mark every rank a token is routed to; duplicates collapse, so the row sum
        # counts distinct ranks.
        present = torch.zeros(ranks.shape[0], self.world_size, device=ranks.device, dtype=torch.float32)
        present.scatter_(1, ranks, 1.0)
        factor = present.sum(dim=1, keepdim=True)  # [T, 1] distinct ranks
        return ref * factor, p.T

    def recv_tokens(self, h):
        """Return the leading dimension of the received payload, or 0 if it is not a tensor."""
        rp = h.recv_payload
        if torch.is_tensor(rp) and rp.dim() >= 1:
            return int(rp.shape[0])
        return 0

    def finalize(self, rc):
        """Best-effort process-group teardown; always returns ``rc`` unchanged."""
        try:
            dist.barrier()
            dist.destroy_process_group()
        except Exception as exc:
            # Teardown stays best-effort so it can never mask the benchmark's own return
            # code, but report the failure instead of swallowing it silently.
            print(f"WARN: [{self.name}] process-group teardown failed: {exc!r}", file=sys.stderr)
        return rc
-> recv payload(s) + .combine(payload, runtime_max_tokens_per_rank, payload_in_workspace=False) -> combined + * module-level `flashinfer.comm.trtllm_moe_alltoall` and the lower-level + `moe_a2a_dispatch` / `moe_a2a_combine` / `moe_a2a_initialize` / + `get_workspace_size_per_rank` — the TensorRT-LLM one-sided path. Selected by + env CX_FLASHINFER_TRTLLM=1 (provenance trtllm=True); covers goal's + "TensorRT-LLM NVLink one-sided AllToAll EP". + +The exact kwarg names for dispatch/combine and the Mapping constructor differ across +FlashInfer point releases. This adapter has NO GPU to validate against, so EVERY +FlashInfer API call is wrapped to fail LOUD + SPECIFIC (the call site, the kwargs +tried, and the underlying error) so the parent's GHA smoke shows precisely what to fix +rather than a bare TypeError. See `_call_variants` and `_build_mapping`. + +CORRECTNESS (`expected`): FlashInfer's MoeAlltoAll is expert-centric (TensorRT-LLM MoE +A2A): `dispatch` sends each token to its top_k selected experts; `combine` gathers the +per-expert results back and reduces the top_k copies for each SOURCE token. With an +identity expert (the harness does NO expert compute) and a combine that does NOT apply +the gate weights (the public `combine(payload, ...)` takes no topk_weights — gate +weighting is the MoE epilogue, not the comm), the round trip yields: + combined ≈ x * top_k (sum of top_k identical copies of x) +This is structurally DeepEP-LL-like (per-expert reduce) but WITHOUT LL's weight multiply. +The alternative (combine applies softmax gate weights, like DeepEP LL) would give +`x * sum(topk_weights)`. We LEAD with `x * top_k` and document both; the parent's GHA +validates which FlashInfer actually implements and flips ONE constant (_ROUTING_FACTOR). +Tolerance bf16 ~5e-2 (FlashInfer dispatch keeps bf16 end-to-end; no fp8 round-trip yet). + +STATUS: normal / layout-and-dispatch-v1. 
def _flashinfer_version() -> str:
    """Return the installed FlashInfer version string for provenance.

    Tries the ``flashinfer-python`` distribution metadata, then ``flashinfer``; if neither
    lookup succeeds, falls back to the imported module's ``__version__`` (or ``"unknown"``).
    """
    for dist_name in ("flashinfer-python", "flashinfer"):
        try:
            import importlib.metadata as _md
            return _md.version(dist_name)
        except Exception:
            # Any failure (missing metadata, missing importlib.metadata) moves to the next source.
            continue
    return getattr(flashinfer, "__version__", "unknown")
# Round-trip routing factor used by the correctness reference. Overridable through
# CX_FLASHINFER_ROUTING_FACTOR; the default is "ranks".
_ROUTING_FACTOR = os.environ.get("CX_FLASHINFER_ROUTING_FACTOR", "ranks")  # "ranks" | "topk" | "weight-sum"


def _loud(where: str, attempted, exc: Exception) -> RuntimeError:
    """Build (not raise) a LOUD + SPECIFIC RuntimeError for a failed FlashInfer call.

    Args:
        where: short label of the call site (e.g. ``"Mapping(...)"``).
        attempted: description of the arguments / signatures that were tried.
        exc: the underlying exception, embedded via ``repr``.

    Returns:
        A RuntimeError naming the call site, the installed flashinfer version, what was
        attempted, the underlying error and how to inspect the installed signatures.
    """
    return RuntimeError(
        f"FlashInfer EP adapter: {where} failed against flashinfer {_flashinfer_version()}. "
        f"Attempted: {attempted}. Underlying error: {exc!r}. "
        f"FIX: inspect the installed flashinfer.comm signatures "
        f"(python3 -c 'import flashinfer.comm as c; help(c.MoeAlltoAll)') and adjust the "
        f"kwarg names / Mapping construction in tests/ep_flashinfer.py.")


def _call_variants(where: str, fn, variants):
    """Call ``fn`` with the first ``(args, kwargs)`` signature it accepts.

    Args:
        where: call-site label used in error messages.
        fn: the callable to invoke.
        variants: iterable of ``(args, kwargs)`` candidates, tried in order.

    Returns:
        ``(result, chosen_index)`` for the first variant that does not raise ``TypeError``.

    Raises:
        RuntimeError: immediately (chained from the original) when a variant raises anything
            other than ``TypeError``; or, after every variant raised ``TypeError``, with one line
            per attempt.
    """
    errors = []
    for i, (args, kwargs) in enumerate(variants):
        try:
            return fn(*args, **kwargs), i
        except TypeError as exc:  # wrong kwarg name / arity — try the next signature
            # NOTE: a TypeError raised from INSIDE fn is indistinguishable from a signature
            # mismatch here and is also treated as "try the next variant".
            errors.append(f" variant[{i}] args={_shape_repr(args)} kwargs={list(kwargs)} -> {exc!r}")
        except Exception as exc:
            # Any non-TypeError (e.g. a real CUDA/runtime error) is NOT a signature problem:
            # stop trying and surface it, explicitly chained so the real traceback is the cause.
            raise _loud(where, _shape_repr(args) + f" kwargs={list(kwargs)}", exc) from exc
    raise _loud(where, "all signature variants exhausted:\n" + "\n".join(errors),
                TypeError("no matching signature"))


def _shape_repr(args):
    """Render positional args compactly: tensors as ``Tensor(shape):dtype``, others via ``repr``.

    Lists/tuples are rendered one level deep as ``[item,item]`` with the same tensor shorthand.
    """
    def _one(value):
        if torch.is_tensor(value):
            return f"Tensor{tuple(value.shape)}:{value.dtype}"
        return repr(value)

    rendered = []
    for arg in args:
        if not torch.is_tensor(arg) and isinstance(arg, (list, tuple)):
            rendered.append("[" + ",".join(_one(item) for item in arg) + "]")
        else:
            rendered.append(_one(arg))
    return "(" + ", ".join(rendered) + ")"


def _build_mapping(world_size, rank):
    """Construct the FlashInfer Mapping for PURE EP across all ranks.

    FlashInfer's Mapping REQUIRES world_size == tp_size*pp_size*cp_size and realizes MoE-EP as a
    view over the TP dimension, so pure EP is expressed as tp_size=world_size,
    moe_ep_size=world_size, moe_tp_size=1. The accepted kwarg set varies across releases, so
    several constructor signatures are tried in order.

    Returns:
        ``(mapping, variant_index)`` — the index identifies which signature worked.

    Raises:
        RuntimeError: if no ``Mapping`` class is found, or no signature constructs.
    """
    Mapping = getattr(fi_comm, "Mapping", None) or getattr(flashinfer, "Mapping", None)
    if Mapping is None:
        raise _loud("Mapping lookup",
                    "flashinfer.comm.Mapping / flashinfer.Mapping not found",
                    AttributeError("Mapping"))
    # tp_size=world_size so the world_size==tp*pp*cp invariant holds; moe_ep_size=world_size = full EP.
    variants = [
        ((), dict(world_size=world_size, rank=rank, gpus_per_node=world_size,
                  tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
        ((), dict(world_size=world_size, rank=rank,
                  tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
        ((), dict(world_size=world_size, rank=rank, tp_size=world_size, moe_ep_size=world_size)),
        # Same keyword SET as variant[1] (only the order differs, which a keyword call ignores), so
        # this entry cannot be the first to succeed; kept so recorded variant indices stay stable.
        ((), dict(world_size=world_size, rank=rank, moe_ep_size=world_size, moe_tp_size=1,
                  tp_size=world_size)),
        ((), dict(world_size=world_size, rank=rank, tp_size=world_size)),  # EP defaults from tp
        # positional last-resort: (world_size, rank) with tp=world_size
        ((world_size, rank), dict(tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
    ]
    mapping, idx = _call_variants("Mapping(...)", Mapping, variants)
    return mapping, idx
+# The comm-correctness gate compares against the DEQUANTIZED cast that was actually sent +# (ref = dequant(quant(x)) * factor), so it verifies the COMM, not the quantizer — same as +# ep_deepep.expected(). Tolerance per format (4-bit fp4 is far looser than 8-bit fp8). +_FP8_MAX = 448.0 +_FP8_BLOCK = 128 + + +def _e4m3_block128_cast(x): + # PER-BLOCK-128 e4m3 (DeepEP default convention): scales [T, H//128] f32. + T, H = x.shape + xv = x.float().view(T, H // _FP8_BLOCK, _FP8_BLOCK) + amax = xv.abs().amax(dim=2).clamp(min=1e-4) + x_fp8 = (xv * (_FP8_MAX / amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(T, H) + return x_fp8, (amax / _FP8_MAX).contiguous() + + +def _e4m3_pertoken_cast(x): + T, H = x.shape + amax = x.float().abs().amax(dim=1, keepdim=True).clamp(min=1e-4) + x_fp8 = (x.float() * (_FP8_MAX / amax)).to(torch.float8_e4m3fn) + scales = (amax / _FP8_MAX).expand(T, H // _FP8_BLOCK).contiguous() + return x_fp8, scales + + +def _e4m3_directcast(x): + T, H = x.shape + x_fp8 = x.float().clamp(-_FP8_MAX, _FP8_MAX).to(torch.float8_e4m3fn) + scales = torch.ones((T, H // _FP8_BLOCK), dtype=torch.float32, device=x.device) + return x_fp8, scales + + +def _e4m3_dequant_nd(x_fp8, scales): + # Works for [R,H]+[R,H//128] (2D) and [E,S,H]+[E,S,H//128] (3D recv). Last dim is H; scale + # repeats per 128-block. + *lead, H = x_fp8.shape + blocks = H // _FP8_BLOCK + xv = x_fp8.float().reshape(*lead, blocks, _FP8_BLOCK) + return (xv * scales.reshape(*lead, blocks, 1)).reshape(*lead, H).to(torch.bfloat16) + + +class _MicroscaleRecipe: + """FlashInfer-native mxfp8 / mxfp4 / nvfp4 quant+dequant, validated on the runner via the + library's own kernels. Quantize on a flat [N, H] view (the A2A moves per-token payloads), + keep the swizzled scale-factor as a SECOND payload, dequant the 3D recv by flattening the + [ep, max_tokens] dims to [N, H] (the SF swizzle is per-row so the flatten is layout-safe), + then reshaping back. 
Imports flashinfer lazily so a wheel without these kernels fails LOUD.""" + + _MX_BLOCK = 32 # mxfp8 e8m0 block size + _NV_VEC = 16 # nvfp4 e4m3 scale block size (sf_vec_size) + + _MXFP4_VEC = 32 # mxfp4 e8m0 block size (sf_vec_size) + # OCP e2m1 magnitudes indexed by (exp<<1)|mant (3 low bits); bit3 = sign. + _E2M1_MAG = (0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0) + + def __init__(self, kind): + self.kind = kind # "mxfp8" | "nvfp4" | "mxfp4" + # mxfp4 is reachable after all: mxfp4_quantize() forces a tile-padded SWIZZLED SF, but the + # lower-level fp4_quantize(sf_vec_size=32, sf_use_ue8m0=True, is_sf_swizzled_layout=False) + # emits e2m1 + e8m0 in a LINEAR per-token layout (movable through the A2A). dequant is a manual + # e2m1 LUT * 2^(e8m0-127) (no flashinfer linear-mxfp4 dequant exists; mxfp4_dequantize wants + # swizzled). The dispatch gate is consistency-based, so this validates the comm honestly. + import flashinfer as _fi + self._fi = _fi + need = {"mxfp8": ("mxfp8_quantize",), + "nvfp4": ("fp4_quantize", "e2m1_and_ufp8sf_scale_to_float"), + "mxfp4": ("fp4_quantize",)}[kind] + for fn in need: + if not hasattr(_fi, fn): + raise _loud(f"{kind} quantizer lookup", f"flashinfer.{fn} not found", + AttributeError(fn)) + + def cast(self, x): + # Returns (q, sf) — BOTH per-token (first-dim == T) so the A2A moves them as a payload list. + # mxfp8: q [T,H] e4m3, sf [T, H/32] e8m0(uint8), LINEAR (is_sf_swizzled_layout=False). + # nvfp4: q [T, H/2] uint8 (packed e2m1), sf [T, H/16] uint8 (ufp8 e4m3), per-tensor global sf. + # mxfp4: q [T, H/2] uint8 (packed e2m1), sf [T, H/32] uint8 (e8m0), LINEAR — via fp4_quantize. 
+ fi = self._fi + xt = x.contiguous() + T, H = xt.shape + if self.kind == "mxfp8": + q, sf = fi.mxfp8_quantize(xt, is_sf_swizzled_layout=False) + sf = sf.reshape(T, H // self._MX_BLOCK) + elif self.kind == "mxfp4": + q, sf = fi.fp4_quantize(xt, sf_vec_size=self._MXFP4_VEC, sf_use_ue8m0=True, + is_sf_swizzled_layout=False) + if sf.dim() == 1: + sf = sf.reshape(T, -1) + else: # nvfp4: global_scale maps amax -> the max representable (e4m3max * e2m1max = 448*6); + # dequant divides by it. (the reciprocal — amax/(448*6) — yields ~0 output, relerr~1.) + gsf = ((_FP8_MAX * 6.0) / xt.float().abs().amax().clamp(min=1e-4)).reshape(1) + q, sf = fi.fp4_quantize(xt, global_scale=gsf, sf_vec_size=self._NV_VEC, + sf_use_ue8m0=False, is_sf_swizzled_layout=False) + self._gsf = gsf + if sf.dim() == 1: + sf = sf.reshape(T, -1) + return q.contiguous(), sf.contiguous() + + def dequant_nd(self, q, sf): + # q/sf are recv tensors — 2D [T,*] (the x_ref path) or 3D [E,S,*] (the stage recv path). + # Flatten leading dims to [N,*], dequant on device, reshape back. NO host round-trip. + lead = q.shape[:-1] + N = 1 + for d in lead: + N *= d + if self.kind == "mxfp8": + # Manual DEVICE e8m0 dequant (FlashInfer ships only a CPU mxfp8_dequantize_host, too slow + # in the timing loop): x ~= q_e4m3 * 2^(sf_uint8 - 127), per block-32. Verified to match + # mxfp8_dequantize_host on the runner (see cx_fi_quant_smoke). + H = q.shape[-1] + B = self._MX_BLOCK + qf = q.reshape(N, H // B, B).float() + sff = sf.reshape(N, H // B).float() + out = (qf * torch.pow(torch.tensor(2.0, device=q.device), sff - 127.0).unsqueeze(-1)).reshape(N, H) + elif self.kind == "mxfp4": + # Manual e2m1 (LUT) + e8m0 block-32 decode (no flashinfer linear-mxfp4 dequant exists). 
+ Hp = q.shape[-1] + H = Hp * 2 + qb = q.reshape(N, Hp) + lut = torch.tensor(self._E2M1_MAG, device=q.device, dtype=torch.float32) + def _dec(nib): # nib uint8 [N,Hp] 0..15 -> signed e2m1 magnitude + sign = 1.0 - 2.0 * ((nib >> 3) & 1).float() + return sign * lut[(nib & 0x7).long()] + lo = _dec(qb & 0xF) + hi = _dec((qb >> 4) & 0xF) # byte packs [v_lo, v_hi] + vals = torch.stack([lo, hi], dim=-1).reshape(N, H) + blk = H // self._MXFP4_VEC + scale = torch.pow(torch.tensor(2.0, device=q.device), sf.reshape(N, blk).float() - 127.0) + out = (vals.view(N, blk, self._MXFP4_VEC) * scale.view(N, blk, 1)).reshape(N, H) + else: # nvfp4 — DEVICE dequant (e2m1 + ufp8 e4m3 scale + per-tensor global), linear layout. + qf = q.reshape(N, q.shape[-1]).contiguous() + sff = sf.reshape(N, sf.shape[-1]).contiguous() + # dequant divides by the global scale -> pass its RECIPROCAL (verified on the runner: + # quant gsf=(448*6)/amax + dequant 1/gsf -> relerr ~0.09 = the 4-bit nvfp4 floor). + gsf = getattr(self, "_gsf", None) + out = self._fi.e2m1_and_ufp8sf_scale_to_float( + qf, sff, global_scale_tensor=(1.0 / gsf).cpu() if gsf is not None else None, + sf_vec_size=self._NV_VEC, is_sf_swizzled_layout=False) + H = out.shape[-1] + # e2m1_and_ufp8sf_scale_to_float returns on CPU; move back to the payload's device. + return out.reshape(*lead, H).to(device=q.device, dtype=torch.bfloat16) + + +# dispatch_dtype -> (label, kind). kind selects the cast/dequant path in make_problem/stage. +# mxfp4 uses fp4_quantize(sf_use_ue8m0=True, is_sf_swizzled_layout=False) — a LINEAR e8m0 SF that +# moves per-token through the A2A (mxfp4_quantize's tile-padded swizzled SF does NOT; that was the +# old blocker). mxfp8/mxfp4/nvfp4 + the e4m3 fp8 recipes cover the OCP-microscaling dispatch goal. 
+_QUANT_RECIPES = { + "fp8": ("per-block-128", "e4m3"), + "fp8-pertoken": ("per-token", "e4m3"), + "fp8-directcast": ("direct-cast", "e4m3"), + "mxfp8": ("mxfp8-e8m0-block32", "mxfp8"), + "mxfp4": ("mxfp4-e8m0-block32", "mxfp4"), + "nvfp4": ("nvfp4-e4m3-block16", "nvfp4"), +} +_E4M3_CASTS = {"fp8": _e4m3_block128_cast, "fp8-pertoken": _e4m3_pertoken_cast, + "fp8-directcast": _e4m3_directcast} +# Per-format comm-correctness tolerance (round-trip of the dequantized cast through the comm). +_QUANT_TOL = {"e4m3": 1.25e-1, "mxfp8": 1.5e-1, "mxfp4": 3.5e-1, "nvfp4": 3.0e-1} + + +class FlashInferBackend: + name = "flashinfer" + # FlashInfer combine reuses the dispatch workspace/handle (no re-dispatch needed before + # a timed combine), mirroring DeepEP normal mode — combine consumes the recv payload. + # MoeAlltoAll is a stateful idle->dispatched->idle FSM (asserts "dispatch called twice without + # combine"). The harness times dispatch in isolation (loops it) AND combine in isolation. Setting + # this True makes the combine-timing loop run an untimed dispatch+stage (pre=) before each combine + # sample, so combine always sees a "dispatched" state; dispatch() resets the FSM to idle at its + # start so the dispatch-timing loop + the roundtrip (paired) timing all stay valid. + combine_needs_redispatch = True + # MoeAlltoAll's paired dispatch/combine FSM means isolated/looped dispatch timing corrupts the + # symmetric workspace (CUDA launch failure). Only the PAIRED roundtrip is measurable — the + # harness times the roundtrip and mirrors it into dispatch/combine (isolated_sum is N/A here). + # The roundtrip IS goal P0's headline metric, so this is the right measurement for this backend. + roundtrip_only = True + # Blackwell (B300/GB300) drops GPU clocks during the tiny small-T points, so the harness + # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100/H200. 
+ wants_warm_burst = True + # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no + # fallback/mislabel). + # bf16 : MoeAlltoAll keeps bf16 payloads end-to-end (no quant round trip). + # fp8* : e4m3 dispatch (per-block-128 / per-token / direct-cast) — SAME convention + # as ep_deepep, so FlashInfer-fp8 == DeepEP-fp8 operating point, different + # transport (the TRT-LLM throughput A2A vs DeepEP NVLink). + # mxfp8/mxfp4/nvfp4: OCP-microscaling dispatch via FlashInfer's native quantizers. The A2A + # moves [q, scale_factor] as a payload LIST (byte-agnostic), dequant in + # stage(). Covers goal's "MXFP8 / MXFP4 / NVFP4 dispatch" — reachable on + # this working path because FlashInfer ships the quantize/dequantize kernels. + SUPPORTED_PRECISIONS = {"bf16", "fp8", "fp8-pertoken", "fp8-directcast", + "mxfp8", "mxfp4", "nvfp4"} + SUPPORTED_MODES = {"normal"} + # Only the contract whose timing boundary FlashInfer can honor: layout (the dispatch + # send-counts) is computed inside dispatch and cannot be hoisted to a separate untimed + # step the way DeepEP's get_dispatch_layout can — so cached-layout-comm-only-v1 and + # runtime-visible-v1 (fp8) are NOT offered. + SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"} + # Combine path: bf16 (default) OR a quantized COMBINE OUTPUT via the newer flashinfer + # moe_a2a_combine output_dtype (fp8 e4m3 wired; the bundled 0.6.8.post1 has no output_dtype, so + # a combine-quant run upgrades FlashInfer first via cx_build_flashinfer_latest). nvfp4/mxfp8 + # combine reserved (fp4/e8m0 output packing — extend once fp8-combine is GHA-validated). 
def __init__(self, args, rank, world_size, local_rank, device):
    """Configure quantization, build the Mapping + MoeAlltoAll, and record provenance.

    Args:
        args: parsed run arguments; reads mode, measurement_contract, dispatch_dtype, topk,
            experts, hidden (via _init_moe_alltoall), resource_mode and, when present,
            combine_dtype / combine_quant_mode.
        rank: this process's rank (rank 0 prints the log lines).
        world_size: number of ranks; used as both tp_size and moe_ep_size of the Mapping.
        local_rank: accepted for protocol compatibility; not used here.
        device: CUDA device queried for its SM count.

    Raises:
        RuntimeError: when a quantized combine is requested but the installed
            ``MoeAlltoAll.combine`` has no ``output_dtype``, when ``combine_dtype`` is not
            wired, or when the Mapping / MoeAlltoAll construction fails.
    """
    self.args = args
    self.rank = rank
    self.world_size = world_size
    self.device = device
    self.mode = args.mode
    self.contract = args.measurement_contract
    self.group = dist.group.WORLD
    assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
        "run_ep.py must reject unsupported dtype/mode before constructing the backend"
    # Quant recipe (None for bf16). e4m3 = pure-torch cast (DeepEP convention); mx/nvfp4 =
    # FlashInfer-native quantizer. dispatch passes [q, sf]; stage() dequants (UNTIMED).
    self.dispatch_dtype = args.dispatch_dtype
    self.quant_label, self.quant_kind = _QUANT_RECIPES.get(args.dispatch_dtype, (None, None))
    self._micro = None
    if self.quant_kind in ("mxfp8", "mxfp4", "nvfp4"):
        self._micro = _MicroscaleRecipe(self.quant_kind)  # lazy flashinfer import, LOUD if absent
    elif self.quant_kind == "e4m3":
        self._e4m3_cast = _E4M3_CASTS[args.dispatch_dtype]
    # Per-format tolerance (5e-2 for the unquantized bf16 path), recorded so the looser quant
    # gate is explicit, not hidden.
    self.tolerance = _QUANT_TOL.get(self.quant_kind, 5e-2)
    # The quant CAST + recv-DEQUANT run in make_problem/stage (OUTSIDE the timed comm window) —
    # the layout-and-dispatch-v1 contract (producer hands quantized activations).
    self.fp8_in_timing = False if self.quant_kind else None
    self.scale_layout = self.quant_label

    # Combine-side quant (SEPARATE axis from dispatch): a quantized COMBINE OUTPUT via
    # MoeAlltoAll.combine's output_dtype, which only newer FlashInfer wheels provide.
    self.combine_dtype = getattr(args, "combine_dtype", "bf16")
    self.combine_quant = self.combine_dtype not in ("bf16", None, "")
    self.combine_input_dtype = self.combine_dtype
    self.combine_quant_mode = getattr(args, "combine_quant_mode", "none")
    self.combine_quant_in_timing = True if self.combine_quant else None
    self.combine_dequant_in_timing = False if self.combine_quant else None
    self._qc_out_dtype = None
    self._qc_scale_shape = None  # cached working output_scales shape (discovered on first combine)
    if self.combine_quant:
        import inspect as _inspect
        if "output_dtype" not in str(_inspect.signature(fi_comm.MoeAlltoAll.combine)):
            raise RuntimeError(
                "combine-quant requested but flashinfer.comm.MoeAlltoAll.combine has NO output_dtype — "
                "this wheel (likely 0.6.8.post1) predates PR3376/3643. The run must upgrade FlashInfer "
                "first (CX_COMBINE_DTYPE!=bf16 triggers cx_build_flashinfer_latest in run_in_container.sh).")
        # fp8 -> e4m3 output; nvfp4 -> uint8 (packed e2m1) output.
        self._qc_out_dtype = {"fp8": torch.float8_e4m3fn, "nvfp4": torch.uint8}.get(self.combine_dtype)
        if self._qc_out_dtype is None:
            raise RuntimeError(f"combine_dtype={self.combine_dtype} not wired (fp8|nvfp4)")
        # A quantized combine is looser than the bf16 reconstruction; never tighten below the
        # dispatch tolerance already chosen.
        self.tolerance = max(self.tolerance, 1.6e-1)

    # CX_FLASHINFER_TRTLLM is a provenance label only: both settings construct the same
    # MoeAlltoAll object below.
    self.trtllm = os.environ.get("CX_FLASHINFER_TRTLLM", "0") == "1"

    self.top_k = int(args.topk)
    self.num_experts = int(args.experts)
    # Workspace/buffer ceiling: the sweep is capped at this value (see buffer_cap).
    self.max_num_tokens = int(os.environ.get("CX_FLASHINFER_MAX_TOKENS", "4096"))

    dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
    ver = _flashinfer_version()

    # Build the pure-EP Mapping (defensive over kwarg variants; logs which worked).
    self.mapping, map_variant = _build_mapping(world_size, rank)
    if rank == 0:
        # _build_mapping always passes tp_size=world_size, so report that (not tp=1).
        print(f"[flashinfer] Mapping constructed via variant #{map_variant} "
              f"(world={world_size} rank={rank} tp={world_size} moe_ep={world_size} moe_tp=1)",
              file=sys.stderr)

    # NOTE(review): _init_moe_alltoall unconditionally resets self.path to "moe_alltoall", so the
    # trtllm-dependent value assigned here never reaches backend_provenance["path"]; only the
    # "trtllm" key carries the flag. Confirm which of the two is intended.
    self.path = "trtllm_moe_alltoall" if self.trtllm else "moe_alltoall"
    self.a2a = None
    self.workspace = None
    self.ws_size = None
    self._init_moe_alltoall(ver)

    self.backend_provenance = {
        "flashinfer_version": ver,
        "flashinfer_commit": os.environ.get("FLASHINFER_COMMIT") or f"pkg-{ver}",
        # library-stack description taken verbatim from the environment (None when unset)
        "flashinfer_stack": os.environ.get("CX_FLASHINFER_STACK"),
        "mode": "normal", "path": self.path, "trtllm": self.trtllm,
        "backend_lineage": "flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll",
        "transport": "trtllm-throughput-backend-onesided",
        # quant provenance (bf16 path -> nulls). scale_layout + dispatch_dtype name the recipe.
        "dispatch_dtype": self.dispatch_dtype, "quant_kind": self.quant_kind,
        "scale_layout": self.scale_layout, "quant_in_timing": self.fp8_in_timing,
        # combine-side quant (a SEPARATE axis)
        "combine_dtype": self.combine_dtype, "combine_quant": self.combine_quant,
        "combine_quant_in_timing": self.combine_quant_in_timing,
        "resource_mode": args.resource_mode,
        # occupancy is not set by this adapter; recorded as a fixed-kernel run
        "num_sms": None, "device_sms": dev_sms, "tuned_source": "fixed-kernel",
        "max_num_tokens": self.max_num_tokens, "top_k": self.top_k,
        "num_experts": self.num_experts,
        "mapping_variant": map_variant,
        "routing_factor": _ROUTING_FACTOR,
        "workspace": "mnnvl-symmetric", "mnnvl_comm": getattr(self, "_mnnvl_comm", "n/a"),
    }


def _init_moe_alltoall(self, ver):
    """Construct ``flashinfer.comm.MoeAlltoAll`` and store it on ``self.a2a``.

    First tries to route the MNNVL workspace bootstrap through FlashInfer's TorchDistBackend
    (so no MPI is needed); if that import/registration fails, a warning is printed on rank 0
    and construction proceeds. Sets ``self._mnnvl_comm`` and ``self.path`` as side effects.

    Args:
        ver: FlashInfer version string (accepted but not used by this method).

    Raises:
        RuntimeError: if ``MoeAlltoAll`` is missing or no constructor signature succeeds.
    """
    MoeAlltoAll = getattr(fi_comm, "MoeAlltoAll", None)
    if MoeAlltoAll is None:
        raise _loud("MoeAlltoAll lookup", "flashinfer.comm.MoeAlltoAll not found",
                    AttributeError("MoeAlltoAll"))
    mnnvl_config = None
    try:
        from flashinfer.comm.mnnvl import MnnvlConfig, TorchDistBackend, MnnvlMemory
        mnnvl_config = MnnvlConfig(comm_backend=TorchDistBackend(group=None))
        # Register the torch-dist comm only when no comm has been set on the class yet.
        if MnnvlMemory.comm is None:
            MnnvlMemory.set_comm_from_config(self.mapping, mnnvl_config)
        if self.rank == 0:
            print("[ep_flashinfer] MNNVL via TorchDistBackend (no MPI)", flush=True)
    except Exception as exc:  # e.g. older flashinfer without TorchDistBackend -> continue without it
        if self.rank == 0:
            print(f"[ep_flashinfer] WARN: no TorchDistBackend ({exc!r}); MoeAlltoAll will need MPI",
                  flush=True)
    # NOTE(review): if the failure above happened AFTER MnnvlConfig was built (e.g. inside
    # set_comm_from_config), mnnvl_config is still set, so the label reads "torch-dist" and the
    # config is still passed below even though the warning was printed — confirm this is intended.
    self._mnnvl_comm = "torch-dist" if mnnvl_config else "mpi-default"  # provenance built later
    # kwarg names have drifted across releases; try with mnnvl_config first, then without.
    hs = int(self.args.hidden)
    mc = dict(mnnvl_config=mnnvl_config) if mnnvl_config is not None else {}
    variants = [
        ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
                               num_experts=self.num_experts, hidden_size=hs, **mc)),
        ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
                               num_experts=self.num_experts, hidden_size=hs)),
        ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
                               num_experts=self.num_experts, hidden_size=hs,
                               ep_size=self.world_size)),
        ((self.mapping, self.max_num_tokens, self.top_k, self.num_experts, hs), {}),
        ((self.mapping,), dict(max_num_tokens_per_rank=self.max_num_tokens, top_k=self.top_k,
                               num_experts=self.num_experts, hidden_size=hs)),
    ]
    self.a2a, idx = _call_variants("MoeAlltoAll(...)", MoeAlltoAll, variants)
    self.path = "moe_alltoall"
    if self.rank == 0:
        print(f"[flashinfer] MoeAlltoAll constructed via variant #{idx}", file=sys.stderr)
def make_problem(self, T, idx, weights, x):
    """Bundle this rank's slice of the routing trace into a problem namespace.

    The returned ``types.SimpleNamespace`` carries ``T``, the raw ``x``, the expert ids as
    both int64 (``topk_idx``) and int32 (``topk_idx_i32``), float32 ``topk_weights``, the
    dispatch ``payloads`` list and the correctness reference ``x_ref``.

    ``payloads`` is ``[x]`` when no quantization is configured, otherwise ``[q, scale]``
    from the configured cast; ``x_ref`` is then the dequantized cast, so a later comparison
    checks the all-to-all rather than the quantizer. The cast runs here, i.e. as untimed
    preprocessing.
    """
    ids_i64 = idx.to(torch.int64)
    ids_i32 = idx.to(torch.int32)
    gate = weights.to(torch.float32)

    if self.quant_kind == "e4m3":
        q, sf = self._e4m3_cast(x)
        payloads, reference = [q, sf], _e4m3_dequant_nd(q, sf)
    elif self._micro is not None:
        q, sf = self._micro.cast(x)
        # 2D recv path (lead=(T,)) = source-token reference
        payloads, reference = [q, sf], self._micro.dequant_nd(q, sf)
    else:
        # bf16: the payload is its own reference
        payloads, reference = [x], x

    return types.SimpleNamespace(
        T=int(T), x=x,
        topk_idx=ids_i64,
        topk_idx_i32=ids_i32,
        topk_weights=gate,
        payloads=payloads, x_ref=reference,
    )

def _reset_moe_fsm(self):
    """Best-effort: put the MoeAlltoAll state machine back to "idle".

    Reads ``self.a2a._state.phase`` defensively (every attribute may be absent) and, when
    the phase is anything other than "idle", tries to overwrite it. A failure to assign is
    deliberately ignored — the internal attribute may not exist or may be read-only.
    """
    state = getattr(getattr(self, "a2a", None), "_state", None)
    if state is None:
        return
    if getattr(state, "phase", "idle") == "idle":
        return
    try:
        state.phase = "idle"
    except Exception:
        pass
@staticmethod
def _first_payload(recv):
    """Return the first payload Tensor found in a dispatch result.

    Handles a bare Tensor, a list/tuple whose first element is a Tensor, and a list/tuple
    whose first element is itself a list/tuple starting with a Tensor. Anything else is
    handed back unchanged (callers guard with ``torch.is_tensor``).
    """
    node = recv
    # Look at recv, recv[0] and recv[0][0] — never deeper than that.
    for _ in range(3):
        if torch.is_tensor(node):
            return node
        if not (isinstance(node, (list, tuple)) and node):
            break
        node = node[0]
    return recv
def combine(self, p, h):
    """Run the combine collective for problem ``p`` using the staged handle ``h``.

    When ``self.combine_quant`` is truthy the call is delegated to ``_combine_quant``.
    Otherwise ``self.a2a.combine`` is invoked through ``_call_variants`` with
    ``h.combine_input`` and ``p.T``; the index of the signature that was used is stored on
    ``h.combine_variant`` and the result is normalized with ``_as_tensor``.
    """
    if self.combine_quant:
        return self._combine_quant(p, h)
    # MoeAlltoAll.combine(payload, runtime_max_tokens_per_rank, payload_in_workspace=False)
    # -> the per-source-token reduced result on this rank ([T, hidden] bf16).
    # Every variant below either passes payload_in_workspace=False or omits the flag;
    # payload_in_workspace=True is never attempted. Per the earlier note on this call, True
    # enforces an exact-pointer requirement and failed with a RuntimeError (not a TypeError),
    # which _call_variants would not fall through from — so the explicit-False forms go first
    # and the kernel copies combine_input into the workspace itself.
    # combine_input is whatever stage() placed on the handle (the first recv payload, or a
    # freshly dequantized tensor for a quantized dispatch).
    variants = [
        ((h.combine_input, p.T), dict(payload_in_workspace=False)),
        ((h.combine_input, p.T), {}),
        ((h.combine_input,), dict(runtime_max_tokens_per_rank=p.T, payload_in_workspace=False)),
        ((h.combine_input,), dict(runtime_max_tokens_per_rank=p.T)),
    ]
    combined, idx = _call_variants("MoeAlltoAll.combine(...)", self.a2a.combine, variants)
    h.combine_variant = idx  # which signature was accepted
    return self._as_tensor(combined)
+ blocks = max(1, H // 16) + sc = torch.zeros(T, blocks, device=self.device, dtype=torch.float8_e4m3fn) + self._qc_scalar = float(os.environ.get("CX_QC_NVFP4_SCALAR", "1.0")) + kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype, + output_scales=sc, output_scalar_scale=self._qc_scalar) + label = f"nvfp4 output_scales=e4m3[{T},{blocks}] scalar={self._qc_scalar}" + elif os.environ.get("CX_QC_SCALE") == "scalar": + # DIRECT-CAST fp8 combine: a single per-tensor output_scalar_scale, NO per-block + # output_scales (the unscaled/global-scaled e4m3 emit — goal "Direct-cast FP8 combine"). + # The working mxfp8 path emits SCALED e4m3+e8m0; this probes whether the same kernel also + # supports the scalar-only mode. If the kernel REQUIRES per-block output_scales for fp8 + # output, the call below raises and the run records that (the documented kernel limit). + sc = None + self._qc_scalar = float(os.environ.get("CX_QC_FP8_SCALAR", "1.0")) + kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype, + output_scalar_scale=self._qc_scalar) + label = f"fp8-directcast output_scalar_scale={self._qc_scalar} (no per-block scales)" + else: + # MXFP8 combine: e4m3 output + UE8M0 uint8 scales vec-32 (the main-source spec). 
def _finish_qcombine(self, p, out, sc, H):
    """Dequantize a quantized combine output to bf16, caching the result on ``p``.

    ``out`` is the combine result (a tensor, or a tuple/list whose first item is the
    tensor); ``sc`` holds the per-block scales the kernel wrote, or ``None`` for the
    scalar-only direct-cast mode; ``H`` is the hidden size.

    The first call stores the dequantized tensor as ``p._qc_dequant``; later calls with the
    same ``p`` return that cached tensor without looking at ``out`` again.

      * nvfp4      — decoded with flashinfer's ``e2m1_and_ufp8sf_scale_to_float`` (vec-16,
                     linear layout), global scale = 1 / ``self._qc_scalar``.
      * sc is None — ``out * self._qc_scalar``.
      * otherwise  — e8m0 scales: ``out * 2 ** (scale_uint8 - 127)`` per block, or per row
                     when the scales do not split ``H`` into more than one block.
    """
    out_q = out if not isinstance(out, (tuple, list)) else out[0]

    hit = getattr(p, "_qc_dequant", None)
    if hit is not None:
        return hit

    rows = out_q.shape[0]
    if self.combine_dtype == "nvfp4":
        inv_scalar = 1.0 / max(1e-6, getattr(self, "_qc_scalar", 1.0))
        gsf = torch.tensor([inv_scalar], dtype=torch.float32)
        import flashinfer as _fi
        # The scales were written as float8_e4m3fn, but the decoder wants the raw bytes as
        # uint8: reinterpret the same 1-byte storage instead of casting.
        sc_u8 = sc.reshape(rows, -1).contiguous().view(torch.uint8)
        packed = out_q.reshape(rows, -1).contiguous()
        decoded = _fi.e2m1_and_ufp8sf_scale_to_float(
            packed, sc_u8,
            global_scale_tensor=gsf, sf_vec_size=16, is_sf_swizzled_layout=False)
        deq = decoded.reshape(rows, H).to(device=out_q.device, dtype=torch.bfloat16)
    elif sc is None:
        # direct-cast fp8: one global scalar, no per-block scales
        scalar = float(getattr(self, "_qc_scalar", 1.0))
        deq = (out_q.float() * scalar).to(torch.bfloat16)
    else:
        vals = out_q.float()
        two = torch.tensor(2.0, device=vals.device)
        n_blk = sc.shape[-1] if torch.is_tensor(sc) and sc.dim() >= 2 else 1
        if n_blk > 1 and (H % n_blk) == 0:
            width = H // n_blk
            scale = torch.pow(two, sc.float() - 127.0)  # e8m0
            deq = (vals.view(rows, n_blk, width) * scale.view(rows, n_blk, 1))
            deq = deq.reshape(rows, H).to(torch.bfloat16)
        else:
            scale = torch.pow(two, sc.float().reshape(rows, 1) - 127.0)
            deq = (vals * scale).to(torch.bfloat16)

    p._qc_dequant = deq
    return deq

@staticmethod
def _as_tensor(x):
    """Return ``x`` if it is a Tensor, or its first element if that is one; else raise."""
    candidate = x[0] if isinstance(x, (list, tuple)) and x else x
    if torch.is_tensor(candidate):
        return candidate
    raise _loud("combine result", f"expected a Tensor, got {type(x)}",
                TypeError("non-tensor combine result"))
def recv_tokens(self, h):
    """Return the first-dimension size of ``h.recv_payload``, or 0 when it is unusable.

    "Unusable" means the payload is not a Tensor or is 0-dimensional. The harness only
    reports this number; it is not a correctness gate.

    NOTE(review): the dispatch comment in this file describes the recv tensors as
    [ep_size, max_tokens, *]; if that layout holds, shape[0] is ep_size rather than a token
    count — confirm which quantity the report expects.
    """
    payload = h.recv_payload
    usable = torch.is_tensor(payload) and payload.dim() >= 1
    return int(payload.shape[0]) if usable else 0

def finalize(self, rc):
    """Best-effort teardown of the process group; always returns ``rc`` unchanged.

    Runs ``dist.barrier()`` then ``dist.destroy_process_group()``. Any exception stops the
    sequence and is deliberately swallowed so teardown can never mask the run's exit code.
    """
    try:
        for step in ("barrier", "destroy_process_group"):
            getattr(dist, step)()
    except Exception:
        pass
    return rc
+ +Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`) +implement a small duck-typed protocol; this module owns the source-tokens-per-rank +sweep, the timing, the correctness gate, and the provenance-tagged JSON doc. + +Fair-comparison contract (hardened after review — see notes.md / plan.md): + * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs + + gate weights are generated once from a fixed seed over the *global* batch and are + identical on every SKU; each rank materializes its slice. So every platform runs + the *same* problem (no per-rank/per-platform RNG in the adapters). + * **Explicit measurement contract** (review #3): adapters conform to a NAMED timing + boundary, they do not each choose their own. layout-and-dispatch-v1 times the + routing-layout step inside dispatch (the only contract MoRI can honor); cached- + layout-comm-only-v1 hoists it out (DeepEP). Combine excludes staging in both. + Serial = SUM of the two isolated medians (NOT a measured chained op). + * **Correct collective percentile**: each iteration's latency is reduced MAX across + ranks first (a collective finishes with its slowest rank), THEN percentiled — + `median_i(max_r)`, not `max_r(median_i)`. + * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and + `global_tokens = T * ep_size` are recorded for the weak/strong-scaling x toggle. + +stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported +lazily inside run_sweep) so this file `py_compile`s without torch. 
+ +Backend protocol: + name, mode, combine_needs_redispatch, backend_provenance(dict) + buffer_cap(args) -> int|None + make_problem(T, idx, weights, x) -> problem # materialize this rank's trace slice + dispatch(problem) -> handle # pure dispatch comm (timed) + stage(problem, handle) # untimed expert-output placement + combine(problem, handle) -> tensor # pure combine comm (timed) + expected(problem, handle) -> (tensor, n_cmp) # correctness reference + recv_tokens(handle) -> int # realized tokens received this rank + finalize(rc) -> int|NoReturn +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os + +SCHEMA_VERSION = 3 # v3: explicit contracts, pooled trials p50/p90/p99, routing-identity proof, separated logical bytes + +# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal +# mode; "decode"/"prefill" name the small/large-token regime). Powers of two for a +# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap). +DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128] +PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096] + +# Dispatch-payload element size (bytes/element of hidden) for the derived-bandwidth estimate. +# fp8/mxfp8 = 1B e4m3/e8m0; mxfp4/nvfp4 = 0.5B (4-bit e2m1, 2 values/byte) — the headline metric +# is measured LATENCY (dtype-independent); only the secondary GB/s estimate uses this. +_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp8": 1, "fp8-pertoken": 1, "fp8-directcast": 1, + "mxfp8": 1, "mxfp4": 0.5, "nvfp4": 0.5} + +# Phase profiles (goal P2 "decode/prefill representation"): decode/prefill are token-size REGIMES +# that also carry distinct serving semantics — NOT merely ladder aliases. Emitted into the doc so a +# T=128 point launched under "prefill" is never silently read as decode (the shared-T overlap is +# the same kernel at the same T; the phase records what serving situation it stands in). 
Each point +# is ONE MoE layer, ONE step, a SINGLE dispatch+combine collective pair — not a whole model or +# several concurrent layers. +PHASE_PROFILE = { + "decode": {"regime": "decode", "tokens_per_iter": "1 (or few) per active sequence", + "microbatch": "one decode step across the active sequences", + "routing_variability": "varies step-to-step (temporal routing modes model this)", + "represents": "one MoE layer · one decode step · one dispatch+combine collective"}, + "prefill": {"regime": "prefill", "chunk": "chunked-prefill — many tokens/sequence per MoE layer", + "request_mixture": "tokens of one chunk entering a single MoE layer at once", + "represents": "one MoE layer · one prefill chunk · one dispatch+combine collective"}, +} + + +def add_common_args(ap: argparse.ArgumentParser) -> None: + """CLI args shared by every backend (the entrypoint adds --backend).""" + ap.add_argument("--phase", default="decode", choices=["decode", "prefill"], + help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder") + ap.add_argument("--tokens-ladder", default="", + help="space/comma-separated source-tokens-per-rank sweep; blank = phase default") + ap.add_argument("--hidden", type=int, default=7168) + ap.add_argument("--topk", type=int, default=8) + ap.add_argument("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)") + ap.add_argument("--dispatch-dtype", default="bf16", + choices=["bf16", "fp8", "fp8-pertoken", "fp8-directcast", + "mxfp8", "mxfp4", "nvfp4"]) + # Combine-path precision/quant is a SEPARATE axis from dispatch (review: don't let + # dispatch_dtype=fp8 imply the whole EP path is quantized). Today every backend combines + # bf16 with no quant (combine_quant_mode=none); a future quantized combine (e.g. ROCm/MoRI + # PR311) sets these WITHOUT changing --dispatch-dtype. Defaults reproduce today exactly; + # capability.py gates unsupported values. 
+ ap.add_argument("--combine-dtype", default="bf16", choices=["bf16", "fp8", "nvfp4"], + help="combine OUTPUT precision (bf16 default; fp8=MXFP8 e4m3+e8m0, nvfp4=e2m1 — " + "quantized combine via flashinfer-main moe_a2a_combine output_dtype)") + ap.add_argument("--combine-quant-mode", default="none", + help="combine quantization mode; 'none' today. capability.py rejects unwired modes") + # Activation VALUE distribution of expert inputs (goal P2). normal = seeded N(0,1) (the only + # latency-relevant one under bf16 combine — bf16 is value-independent); the others stress a + # FUTURE quantized combine's scale computation (amax/outliers/saturation). routing.py owns + # the generators; capability.py gates which a backend/mode admits. + ap.add_argument("--activation-profile", default="normal", + choices=["normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation"], + help="value distribution of expert inputs (routing.ACTIVATION_PROFILES)") + # uniform = realistic top-k (fan-out ≈5.3 over EP8); balanced = load-equalized, + # one-expert-per-rank (fan-out = ep_size); balanced-rank-local = fan-out 1 (min + # comm) edge case; zipf = skewed; hotspot-* = adversarial single hot expert (static + # or moving across steps); alternating-groups = expert halves that toggle by step. + ap.add_argument("--routing", default="uniform", + choices=["uniform", "balanced", "balanced-rank-local", "zipf", + "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single", + "hotspot-moving", "alternating-groups"]) + # Temporal snapshot index for the moving/alternating distributions (goal P2 "temporal routing + # changes"). One run = one step; a temporal suite launches steps 0..N and analyze_ep compares + # them. Folds into workload_id only when non-zero (preserves existing canonical ids). 
+ ap.add_argument("--routing-step", type=int, default=0, + help="temporal step for hotspot-moving / alternating-groups (0 = first/static)") + # Uneven source-token allocation (goal P2 "support uneven source-token allocation"): per-rank + # token counts vary (global may not divide EP); empty-source-rank case included. Default 'none' + # = every rank gets exactly the ladder T (perfectly even; source-token CV 0) — no behavior + # change for existing runs. 'linear' ramps counts ~0.5T..1.5T; 'empty-rank' zeroes rank 0. + ap.add_argument("--uneven-tokens", default="none", choices=["none", "linear", "empty-rank"], + help="per-rank source-token allocation skew (records source_token_stats)") + # EPLB (Expert-Parallel Load Balancer): replicate hot experts onto redundant physical + # slots + balanced-place so per-rank load equalizes. A pure routing-trace transform + # (tests/eplb.py); experts becomes num_logical+redundant. The remedy for `zipf` skew. + ap.add_argument("--eplb", action="store_true", + help="apply EPLB expert replication/placement to the routing trace") + ap.add_argument("--num-redundant-experts", type=int, default=32, + help="EPLB: redundant physical expert slots (rounded up to a multiple of ep_size)") + # Canonical serialized workload (goal P1): consume pre-generated trace bytes instead of the + # seeded runtime generator, so a result is provably the SAME workload as another machine's + # (checksum match). Points at a dir of .npz/.manifest.json (make_workloads.py). + ap.add_argument("--workload-dir", default="", + help="dir of canonical workload traces; empty = seeded runtime generation (dev)") + ap.add_argument("--mode", default="normal", choices=["normal", "ll"], + help="kernel path: normal or low-latency (LL); LL is backend-dependent") + # Measurement contract — the EXPLICIT timing boundary every adapter must conform to + # (review #3: adapters must not each decide their own boundary). 
Backends declare + # SUPPORTED_CONTRACTS; run_ep.py rejects an unsupported one. + # layout-and-dispatch-v1 — dispatch timing INCLUDES routing-layout generation + # (the only contract MoRI can honor; its layout is + # computed inside the kernel and cannot be hoisted). + # cached-layout-comm-only-v1 — layout computed ONCE untimed; dispatch times pure + # comm (DeepEP-only; matches DeepEP's own benchmark). + # Combine excludes staging in BOTH (staging is untimed for every backend). + # runtime-visible-v1 — the serving-realistic boundary: dispatch starts from what the + # runtime has right after routing and INCLUDES required quant / + # scale creation / layout / packing / comm / sync; combine starts + # from expert outputs and ends when token outputs are consumable. + # (DeepEP-only today; the FP8 cast moves INSIDE the timed window.) + ap.add_argument("--measurement-contract", default="layout-and-dispatch-v1", + choices=["layout-and-dispatch-v1", "cached-layout-comm-only-v1", + "runtime-visible-v1"]) + ap.add_argument("--num-sms", type=int, default=24, + help="DeepEP comm-SM budget in 'default' resource-mode (MoRI uses block_num/warps)") + # Resource regime (review: budgets were neither normalized nor tuned): + # normalized — each backend restricted to ~sm_fraction of its device's units + # (DeepEP set_num_sms(frac·SMs); MoRI block_num≈frac·CUs). Fraction- + # based, recorded — an approximate apples-to-apples, not identical work. + # tuned — each backend's recommended/auto launch config (best achievable). + # default — DeepEP --num-sms / MoRI 80 blocks (the bring-up budget). 
+ ap.add_argument("--resource-mode", default="normalized", + choices=["normalized", "tuned", "default"]) + ap.add_argument("--sm-fraction", type=float, default=0.18, + help="normalized mode: fraction of device SMs/CUs dedicated to comms (~24/132)") + ap.add_argument("--num-ep-groups", type=int, default=1, + help="concurrent EP groups; >1 is REJECTED (real subgroup PGs unimplemented)") + ap.add_argument("--seed", type=int, default=67) + # 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks + + # establish NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us + # (cold), at warmup>=30 it settles to ~85us (faster than H100, reproducible within + # ~2.5%). H100/MI355X reach steady state much sooner; the extra iters are harmless. + ap.add_argument("--warmup", type=int, default=32) + ap.add_argument("--iters", type=int, default=200, + help="timed iterations PER TRIAL; pooled across trials for percentiles") + # review #3: p99 from ~50 samples is just the max. Pool iters x trials, randomize the + # token-order each trial so warmup/clock drift doesn't correlate with T, report p50/ + # p90/p99 (p99 is the headline). 3 trials x 200 iters = 600 pooled samples per point. + ap.add_argument("--trials", type=int, default=3, + help="independent timed trials, token-order randomized per trial; samples pooled") + ap.add_argument("--allow-unknown-provenance", action="store_true", + help="permit a run with unpinned backend commit/version (default: fail)") + # Anomaly waiver (goal P1: roundtrip/isolated_sum threshold -> diagnostic unless explicitly + # waived). Without this, a measured roundtrip implausibly larger/smaller than its components + # (e.g. the open LL-FP8 anomaly) demotes the result to 'diagnostic'. Pass to keep it + # comparable-experimental/official AFTER the cause is understood + documented. 
def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
    """Resolve the tokens-per-rank sweep and split it at the backend ceiling.

    A non-blank ``spec`` (space- and/or comma-separated integers) wins; otherwise the
    phase default applies (``DECODE_LADDER`` for "decode", ``PREFILL_LADDER`` for anything
    else). Non-positive entries are discarded, duplicates collapsed, and the result sorted
    ascending.

    Returns ``(ladder, dropped)``: with ``cap`` set, ``ladder`` holds the points ``<= cap``
    and ``dropped`` the points above it, so clamping is always reported; with ``cap`` of
    ``None`` nothing is dropped.
    """
    if spec and spec.strip():
        points = map(int, spec.replace(",", " ").split())
    elif phase == "decode":
        points = DECODE_LADDER
    else:
        points = PREFILL_LADDER

    ladder = sorted(set(t for t in points if t > 0))
    if cap is None:
        return ladder, []

    kept: list[int] = []
    dropped: list[int] = []
    for t in ladder:
        (kept if t <= cap else dropped).append(t)
    return kept, dropped
def _stats_vec(xs: list[int]) -> dict:
    """Summarize a per-rank count vector without dumping the whole vector.

    Returns ``min``, ``mean`` (3 decimals), ``max``, ``cv`` (population std / mean,
    4 decimals; 0.0 when the mean is not positive), ``empty_ranks`` (entries equal to 0),
    ``total`` and ``ranks`` (the vector length).

    An empty vector yields all-zero statistics with ``ranks == 0``. The divide-by-zero
    guard is kept internal, so it no longer leaks out as a phantom ``ranks == 1``.
    """
    count = len(xs)
    if count == 0:
        return {"min": 0, "mean": 0.0, "max": 0, "cv": 0.0,
                "empty_ranks": 0, "total": 0, "ranks": 0}
    total = sum(xs)
    mean = total / count
    var = sum((x - mean) ** 2 for x in xs) / count
    cv = (var ** 0.5 / mean) if mean > 0 else 0.0
    return {"min": min(xs), "mean": round(mean, 3),
            "max": max(xs), "cv": round(cv, 4),
            "empty_ranks": sum(1 for x in xs if x == 0), "total": total, "ranks": count}


def percentile(xs: list[float], q: float) -> float:
    """Return the nearest-rank ``q``-th percentile (0..100) of ``xs``; NaN when empty.

    The index is ``round(q / 100 * (n - 1))`` on the sorted samples, clamped into range, so
    an out-of-range ``q`` returns the minimum or maximum. No interpolation is done. Note
    that Python's ``round`` rounds exact halves to the nearest even integer.
    """
    if not xs:
        return float("nan")
    s = sorted(xs)
    i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1)))))
    return s[i]


def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
    """Per-iteration CUDA-event latencies (µs) for THIS rank.

    Without ``pre``: times ``fn()``. With ``pre``: runs ``pre()`` UNTIMED each iteration
    (followed by a synchronize before the start event so its GPU work can't bleed in), then
    times ``fn(pre_result)`` — how combine is isolated when it consumes the dispatch state
    and needs a fresh untimed dispatch+stage before every sample.

    ``torch`` is the torch module, injected by the caller so this file imports without it.
    ``warmup`` untimed iterations run first (negative counts as 0), then ``iters`` timed
    ones. Returns the raw per-iteration series; the caller reduces across ranks per
    iteration before percentiling.
    """
    def sample():
        arg = pre() if pre is not None else None
        if pre is not None:
            torch.cuda.synchronize()
        s = torch.cuda.Event(enable_timing=True)
        e = torch.cuda.Event(enable_timing=True)
        s.record()
        if pre is not None:
            fn(arg)
        else:
            fn()
        e.record()
        torch.cuda.synchronize()  # elapsed_time needs both events completed
        return s.elapsed_time(e) * 1000.0  # ms -> us

    for _ in range(max(0, warmup)):
        if pre is not None:
            a = pre()
            torch.cuda.synchronize()
            fn(a)
        else:
            fn()
        # sync EACH warmup iteration, not just once after the loop: the measured-roundtrip fn
        # interleaves dispatch+combine on a backend's persistent comm buffer, so back-to-back
        # un-synced warmup iterations let iter N+1's dispatch race iter N's combine (CUDA abort
        # on a rank -> NCCL-watchdog SIGABRT). Cheap (warmup is small); timed samples already sync.
        torch.cuda.synchronize()
    return [sample() for _ in range(iters)]
def _reduce_vec(torch, dist, device, vals, op):
    """All-reduce a list of floats across ranks with ``op``; return the reduced list.

    The values travel as one float64 tensor on ``device``, so entry ``i`` of the result is
    the reduction of every rank's entry ``i``.
    """
    t = torch.tensor(vals, device=device, dtype=torch.float64)
    dist.all_reduce(t, op=op)
    return [float(x) for x in t.tolist()]


def _reduce_int(torch, dist, device, v: int, op) -> int:
    """All-reduce a single integer across ranks with ``op`` (int64 on ``device``)."""
    t = torch.tensor([int(v)], device=device, dtype=torch.int64)
    dist.all_reduce(t, op=op)
    return int(t.item())


def _allgather_floats(torch, dist, device, v: float) -> list[float]:
    """Gather one scalar from every rank -> list indexed by rank (for per-rank diagnostics:
    which rank is the straggler, the rank spread). all_reduce can't do this — it collapses."""
    world = dist.get_world_size()
    out = [torch.zeros(1, device=device, dtype=torch.float64) for _ in range(world)]
    dist.all_gather(out, torch.tensor([float(v)], device=device, dtype=torch.float64))
    return [float(x.item()) for x in out]


def _histogram(xs: list[float], nbins: int = 40) -> dict:
    """Compact equal-width histogram of ``xs`` between its min and max.

    Returns ``{"n": 0}`` for no samples. Otherwise the dict holds ``n``, ``min`` and
    ``max`` (3 decimals), ``bins`` and ``counts``, with ``len(counts) == bins`` in every
    case:

      * all samples equal -> a single bin holding everything (``bins == 1``);
      * otherwise ``nbins`` bins (values below 1 are treated as 1), the maximum sample
        falling into the last bin.
    """
    if not xs:
        return {"n": 0}
    lo, hi = min(xs), max(xs)
    if hi <= lo:
        # Zero-width range: one bin is the only honest description. Report bins=1 so the
        # counts length matches, and round like the general case.
        return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3),
                "bins": 1, "counts": [len(xs)]}
    nbins = max(1, int(nbins))  # nbins <= 0 would otherwise index an empty counts list
    counts = [0] * nbins
    span = hi - lo
    for x in xs:
        b = min(nbins - 1, int((x - lo) / span * nbins))  # clamp x == hi into the last bin
        counts[b] += 1
    return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": counts}


def _provenance_unknown(prov: dict) -> list[str]:
    """Return the keys of ``prov`` whose value is the string "unknown".

    The comparison ignores case and surrounding whitespace; non-string values never match.
    """
    return [k for k, v in prov.items() if isinstance(v, str) and v.strip().lower() == "unknown"]
+ fixed_kernel = bool(prov.get("low_latency_mode")) or ("fixed-kernel" in str(prov.get("tuned_source", ""))) + if fixed_kernel: + resource_class, cls = "fixed-kernel", "not-applicable" + elif floored: + resource_class, cls = "resource-constrained", "minimum-functional" # needed MORE than requested + elif args.resource_mode == "normalized": + resource_class, cls = "resource-constrained", "resource-conforming" + elif args.resource_mode == "tuned": + resource_class = "backend-tuned" + cls = "best-known" if "default" not in str(prov.get("tuned_source", "")) else "backend-default" + else: + resource_class, cls = "backend-default", "backend-default" + # within tolerance? (normalized only — did we hit the requested fraction?) + tol = 0.10 + target_achieved = (requested is not None and achieved is not None + and abs(achieved - requested) <= tol) if requested else None + return { + "comm_units_kind": "sm" if prov.get("num_sms") is not None else "cu_block", + "requested_fraction": requested, "configured_units": cfg, "device_units": dev, + "achieved_fraction": round(achieved, 4) if achieved else None, + "warps_dispatch": prov.get("dispatch_warps"), "warps_combine": prov.get("combine_warps"), + "qps_per_rank": prov.get("num_qps_per_rank"), + "persistent_bytes": prov.get("num_nvl_bytes") or prov.get("num_rdma_bytes") or prov.get("heap_size"), + "tuned_source": prov.get("tuned_source"), + # resource_class: fixed-kernel | resource-constrained | backend-tuned | backend-default. + # fixed-kernel + backend-* are NOT normalized resource-constrained runs (excluded from Pareto). + "resource_class": resource_class, + "conformance_class": cls, "tolerance": tol, "target_achieved_within_tol": target_achieved, + "nonconforming": floored, "fixed_kernel": fixed_kernel, + "pareto_eligible": (resource_class == "resource-constrained" and not floored), + } + + +def _derive_publication_status(v: dict) -> str: + """Machine-derive the publication state from the validity dimensions (goal P1). 
No caller + may hand-label a result 'official' — it must earn every gate here.""" + if v["execution_status"] != "complete": + return "failed" + if v["semantic_correctness"] != "pass" or v["measurement_conformance"] != "conformant" \ + or v["workload_identity"] == "inconsistent": + return "invalid" + sound = (v["semantic_correctness"] == "pass" + and v["workload_identity"].startswith("consistent") + and v["measurement_conformance"] == "conformant") + # resource-nonconforming but otherwise sound -> diagnostic (not a fair cross-platform point) + if v["resource_conformance"].endswith("nonconforming"): + return "diagnostic" + # contract-level anomaly (goal P1-e/f): a flagged roundtrip/isolated_sum mismatch demotes to + # diagnostic unless explicitly waived (validity.anomaly_free reflects the waiver). + if not v.get("anomaly_free", True): + return "diagnostic" + if sound and v["provenance_complete"] and v["workload_source"] == "canonical-serialized": + return "official" + if sound: + return "comparable-experimental" # measurement sound, missing a publication requirement + return "diagnostic" + + +def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int: + """Drive the source-tokens-per-rank sweep for one fully-specified line.""" + import routing # torch-based; imported lazily so the module byte-compiles without torch + import eplb # stdlib planner + torch remap (the EPLB transform) + + ep_size = world_size # num_ep_groups removed (was metadata-only; no real subgroups) + # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the + # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL + # experts then remapped to physical (build_trace), so the whole sweep runs over the + # balanced physical placement with no adapter change. 
+ eplb_on = getattr(args, "eplb", False) + num_logical = getattr(args, "num_logical_experts", args.experts) + if args.experts % ep_size != 0: + if rank == 0: + print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})") + return 2 + experts_per_rank = args.experts // ep_size + elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2) + + # Provenance gate (review #1): refuse a comparison run with unpinned backend info. + unknown = _provenance_unknown(backend.backend_provenance) + if unknown and not args.allow_unknown_provenance: + if rank == 0: + print(f"ERROR: unpinned provenance {unknown} in {backend.backend_provenance}; " + f"set the commit/version env or pass --allow-unknown-provenance.") + return 4 + + cap = backend.buffer_cap(args) + ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap) + if rank == 0 and dropped: + print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} " + f"(hidden={args.hidden}); not silently truncated.") + if not ladder: + if rank == 0: + print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})") + return 2 + # MoRI wedges on a COLD dispatch that jumps straight to a large T; it sets + # needs_gradual_ramp so the sweep approaches its max T via a geometric ramp from 1 + # (validated on MI355X). A naturally-gradual ladder (decode) is unchanged. + if getattr(backend, "needs_gradual_ramp", False): + top, ramp, t = ladder[-1], [], 1 + while t < top: + ramp.append(t); t *= 2 + ramp.append(top) + if rank == 0 and ramp != ladder: + print(f"NOTE: {backend.name} sweep ramped gradually 1..{top} (cold-jump-safe): {ramp}") + ladder = ramp + + MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM + # temporal snapshot index — defined BEFORE the EPLB block (which builds a reference trace with + # step=routing_step); the EPLB path runs only when eplb_on, so a late definition raised an + # UnboundLocalError on zipf+eplb canonical runs (caught as a preserved failed-case). 
+ routing_step = int(getattr(args, "routing_step", 0)) + + # EPLB plan (once): estimate logical load from the global logical trace at the largest + # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB + # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps + # to physical when the plan is present; otherwise it's the identity (logical == physical). + eplb_plan = None + if eplb_on: + ref_idx, _ = routing.build_global_routing(max(ladder) * ep_size, num_logical, args.topk, + args.routing, args.seed, num_logical // ep_size, + step=routing_step) + load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist() + eplb_plan = eplb.build_plan(load, args.experts, ep_size) + if rank == 0: + print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); " + f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> " + f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts " + f"replicated (hottest {eplb_plan['max_replicas']}x)") + + canonical = bool(getattr(args, "workload_dir", "")) + uneven = getattr(args, "uneven_tokens", "none") + if canonical and uneven != "none": + if rank == 0: + print(f"ERROR: --uneven-tokens={uneven} is incompatible with --workload-dir " + f"(canonical workloads are serialized at a fixed global-token count per id); " + f"use seeded-runtime for the uneven-allocation study.") + return 2 + loaded_workload_ids, loaded_checksums = [], {} + if canonical: + import workload as _wl + + def build_trace(gt): + # canonical: load pre-serialized trace bytes (verified by checksum) so this run is + # provably the SAME workload as any other consuming the same files. else: seeded gen. 
+ if canonical: + wid = _wl.compute_workload_id(args.routing, args.hidden, args.topk, num_logical, gt, + args.seed, step=routing_step) + idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True) + idx_l = torch.from_numpy(idx_np).to(torch.int64) + w = torch.from_numpy(w_np).to(torch.float32) + if wid not in loaded_workload_ids: + loaded_workload_ids.append(wid) + loaded_checksums[wid] = man.get("checksums") + else: + idx_l, w = routing.build_global_routing(gt, num_logical, args.topk, args.routing, + args.seed, num_logical // ep_size, step=routing_step) + return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w + + # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold + # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually + # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone + # and is also cold-jump-safe for MoRI. + warm_T = min(ladder[-1], 128) + warm_shapes = [t for t in ladder if t <= warm_T] or [ladder[0]] + for wt in warm_shapes: + wi, ww = build_trace(wt * ep_size) + wsi, wsw = routing.rank_slice(wi, ww, rank, wt) + wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16, + profile=args.activation_profile) + wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx) + for _ in range(8): + wh = backend.dispatch(wp); backend.stage(wp, wh); backend.combine(wp, wh) + torch.cuda.synchronize() + try: + dist.barrier() + except Exception: + pass + # Per-point clock-ramp burst (set up below, applied inside the loop): a ONE-TIME burst + # warms clocks, but on Blackwell (B300) the tiny small-T points let clocks drop again, + # so a mid-sweep T=64 reads ~20x cold. Re-ramping at EACH shape keeps every timed point + # steady-state. Gated by backend.wants_warm_burst — MoRI WEDGES on a sustained burst + # (and is already steady at warmup=8), so it opts out. CX_FABRIC_WARM_BURST overrides. 
+ warm_burst = int(os.environ.get("CX_FABRIC_WARM_BURST", "40")) + do_burst = warm_burst > 0 and getattr(backend, "wants_warm_burst", False) + + import random as _random + elem_dispatch = elem_bytes # fp8=1 / bf16=2 (dispatch payload element size) + tol = getattr(backend, "tolerance", 5e-2) + + # ---- Pass 1: build the per-T problem ONCE (deterministic trace + cached layout per + # contract), run the correctness gate ONCE. Timing is Pass 2 (pooled over trials). ---- + problems, gate, gts = {}, {}, {} + routing_hashes = set() + for T in ladder: + # Per-rank source-token counts (goal P2 uneven allocation). mode 'none' => [T]*ep, + # gt = T*ep, offsets = 0,T,2T,... — byte-identical to the even path. Otherwise counts + # vary (global may not divide ep) and rank 0 may be empty. + counts = source_token_counts(T, ep_size, uneven) + offsets = [sum(counts[:r]) for r in range(ep_size)] + gt = sum(counts) + gts[T] = gt + idx_g, w_g = build_trace(gt) + rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g) + gpn = args.gpus_per_node or ep_size + # placement-aware locality (goal P2): packed/striped/adversarial change which physical + # node/domain a rank sits on, so the local/same-node/cross-domain copy fractions differ. 
+ rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T), + gpn, args.scale_up_domain or None, + placement=args.placement) + rstats["source_token_stats"] = _stats_vec(counts) + routing_hashes.add(rstats["routing_hash"]) + my_off, my_cnt = offsets[rank], counts[rank] + idx_s = idx_g[my_off:my_off + my_cnt].contiguous() + w_s = w_g[my_off:my_off + my_cnt].contiguous() + x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16, + profile=args.activation_profile) + problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x) + h = backend.dispatch(problem); backend.stage(problem, h) + combined = backend.combine(problem, h) + torch.cuda.synchronize() + recv_local = backend.recv_tokens(h) + exp, n_cmp = backend.expected(problem, h) + # empty source rank (my_cnt==0): nothing to reconstruct locally — gate passes vacuously. + if n_cmp > 0: + max_abs = (combined[:n_cmp].float() - exp[:n_cmp].float()).abs().max().item() + max_rel = max_abs / (exp[:n_cmp].float().abs().max().item() + 1e-6) + else: + max_rel = 0.0 + problems[T] = problem + gate[T] = {"rstats": rstats, "recv_local": recv_local, + "max_rel": max_rel, "local_ok": 1 if max_rel < tol else 0} + + # ---- Pass 2: N timed trials. Token order is randomized PER TRIAL (seeded ⇒ identical + # on every rank, so collectives stay lock-step) so warmup/clock drift can't correlate + # with T. Per-iteration cross-rank MAX samples are POOLED across trials, then + # percentiled (review #3: p99 from one 50-iter run is just the max). MoRI keeps + # ascending order — it wedges on a cold jump to a large T. ---- + disp_pool = {T: [] for T in ladder} # pooled per-iteration cross-rank MAX (dispatch) + comb_pool = {T: [] for T in ladder} # ... combine + rt_pool = {T: [] for T in ladder} # ... 
INDEPENDENTLY-MEASURED round trip (goal P1) + disp_local = {T: [] for T in ladder} # THIS rank's own dispatch samples (per-rank diag) + order = list(ladder) + rng = _random.Random(args.seed) + shuffle_ok = not getattr(backend, "needs_gradual_ramp", False) + for trial in range(max(1, args.trials)): + if shuffle_ok: + rng.shuffle(order) + for T in order: + problem = problems[T] + if do_burst: # re-ramp clocks at THIS shape before timing (Blackwell) + for _ in range(warm_burst): + bh = backend.dispatch(problem); backend.stage(problem, bh); backend.combine(problem, bh) + torch.cuda.synchronize() + # roundtrip_only backends (stateful paired dispatch/combine FSM, e.g. FlashInfer + # MoeAlltoAll): isolated/looped dispatch timing corrupts the symmetric workspace, so + # ONLY the paired roundtrip is measurable. Mirror rt into disp/comb (flagged) so the + # schema + plot have values; isolated_sum is meaningless for these (== 2x roundtrip). + roundtrip_only = getattr(backend, "roundtrip_only", False) + + def rt_once(p=problem): + hh = backend.dispatch(p); backend.stage(p, hh); return backend.combine(p, hh) + + if roundtrip_only: + rt_iters = time_us(torch, lambda p=problem: rt_once(p), args.warmup, args.iters) + disp_iters = comb_iters = rt_iters + else: + disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p), + args.warmup, args.iters) + + def prep(p=problem): + hh = backend.dispatch(p); backend.stage(p, hh); return hh + if backend.combine_needs_redispatch: + comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh), + args.warmup, args.iters, pre=prep) + else: + hh = prep() + comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx), + args.warmup, args.iters) + # MEASURED round trip (goal P1: not a sum of percentiles): one timed region over + # dispatch -> stage (no-op "expert" transform) -> combine -> output ready. Captures + # shared sync / launch amortization / overlap that the isolated_sum cannot. 
+ rt_iters = time_us(torch, lambda p=problem: rt_once(p), args.warmup, args.iters) + # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled. + disp_pool[T] += _reduce_vec(torch, dist, device, disp_iters, MAX) + comb_pool[T] += _reduce_vec(torch, dist, device, comb_iters, MAX) + rt_pool[T] += _reduce_vec(torch, dist, device, rt_iters, MAX) + disp_local[T] += disp_iters + + # ---- Pass 3: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ---- + def pcts(xs): + return {"p50": percentile(xs, 50), "p90": percentile(xs, 90), + "p95": percentile(xs, 95), "p99": percentile(xs, 99)} + rows = [] + all_anomalies = [] # contract-level anomalies (goal P1) + thr_rt = float(getattr(args, "roundtrip_anomaly_threshold", 3.0)) + for T in ladder: + gt = gts[T] + g = gate[T]; rstats = g["rstats"] + d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T] + dp, cp, rtp = pcts(d), pcts(c), pcts(rt) + # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op + # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput + # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency. + isum = {k: dp[k] + cp[k] for k in dp} + recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM) + recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX) + recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN) + global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN) + max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0] + point_ok = bool(global_ok) and recv_total > 0 + # Per-rank diagnostics: gather each rank's own dispatch median -> spread + straggler. 
+ per_rank_med = _allgather_floats(torch, dist, device, percentile(disp_local[T], 50)) + slowest_rank = max(range(len(per_rank_med)), key=lambda i: per_rank_med[i]) + rmean = sum(per_rank_med) / len(per_rank_med) + # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv + # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy + # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert. + token_rank_copies = rstats["routed_copies"] + token_expert_copies = gt * args.topk + H = args.hidden + # Bandwidth semantics (goal P1 "distinguish all bandwidth concepts"): the ONLY rates we can + # defensibly publish are logical-payload (canonical routed bytes / latency) and backend- + # buffer (recv-tensor bytes / latency). algorithm/bus/wire bandwidth are NULL — EP + # dispatch/combine have no standard busBW model and we have no transport counters, so we + # must NOT imply physical NVLink/XGMI/RDMA utilization. 
+ def _rate(nbytes, us): + return round(nbytes / (us * 1e3), 3) if (us and us > 0) else None + disp_bytes_l = token_rank_copies * H * elem_dispatch + comb_bytes_l = token_rank_copies * H * 2 + buf_disp = recv_max * H * elem_dispatch + buf_comb = recv_max * H * 2 + bandwidth = { + "logical_payload_rate_gbps": { + "dispatch": _rate(disp_bytes_l, dp["p50"]), "combine": _rate(comb_bytes_l, cp["p50"]), + "roundtrip": _rate(disp_bytes_l + comb_bytes_l, rtp["p50"])}, + "backend_buffer_rate_gbps": { + "dispatch": _rate(buf_disp, dp["p50"]), "combine": _rate(buf_comb, cp["p50"])}, + "algorithm_bandwidth_gbps": None, "bus_bandwidth_gbps": None, "wire_utilization": None, + "basis": ("logical = canonical routed-payload copies x hidden x dtype / latency; " + "buffer = backend recv tensor / latency; alg/bus/wire = null (no defined " + "EP busBW formula, no transport counters) — NOT physical link utilization"), + } + # Contract-level anomaly checks (goal P1) — attached to the ROW and rolled into validity. + # roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99 — a + # chained op shouldn't be far larger than its parts (the open LL-FP8 case). + # roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained + # op can't finish faster than its slowest required component (sync semantics violated). 
+ row_anoms = [] + if isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]: + row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T, + "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2), + "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt}) + floor = max(dp["p50"], cp["p50"]) + if rtp["p50"] > 0 and floor > 0 and rtp["p50"] < 0.95 * floor: + row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T, + "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)}) + all_anomalies.extend(row_anoms) + rows.append({ + "tokens_per_rank": T, "global_tokens": gt, + "dispatch": dp, "combine": cp, "roundtrip": rtp, "isolated_sum": isum, + # flat aliases kept for back-compat with v3 readers + "dispatch_us_p50": dp["p50"], "dispatch_us_p90": dp["p90"], "dispatch_us_p99": dp["p99"], + "combine_us_p50": cp["p50"], "combine_us_p90": cp["p90"], "combine_us_p99": cp["p99"], + "roundtrip_us_p50": rtp["p50"], "roundtrip_us_p90": rtp["p90"], + "roundtrip_us_p95": rtp["p95"], "roundtrip_us_p99": rtp["p99"], + "isolated_sum_us_p50": isum["p50"], "isolated_sum_us_p99": isum["p99"], + "samples_pooled": len(d), "trials": max(1, args.trials), + "percentile_interpolation": "nearest-rank", + "recv_tokens_max": recv_max, "recv_tokens_min": recv_min, + "recv_tokens_mean": recv_total / world_size, "recv_tokens_total": recv_total, + "per_rank_dispatch_us": {"min": min(per_rank_med), "mean": rmean, + "max": max(per_rank_med), "spread": max(per_rank_med) - min(per_rank_med), + "slowest_rank": slowest_rank}, + # dispatch carries its dtype's element size; combine input is bf16 (2B). 
+ "dispatch_logical_bytes": token_rank_copies * H * elem_dispatch, + "combine_logical_bytes": token_rank_copies * H * 2, + "byte_contracts": { + "token_rank_payload_copies": token_rank_copies, + "token_expert_payload_copies": token_expert_copies, + "dispatch_bytes": token_rank_copies * H * elem_dispatch, + "combine_bytes": token_rank_copies * H * 2, + "fp8_scale_bytes": (token_rank_copies * (H // 128) * 4) if elem_dispatch == 1 else 0, + "routing_index_bytes": token_expert_copies * 4, # int32 topk_idx + "gate_weight_bytes": token_expert_copies * 4, # f32 topk_weights + }, + "byte_contract": "logical-routed-payload-v1", + # throughput from the MEASURED round trip ONLY (not isolated_sum). + "roundtrip_tokens_per_second": (gt / (rtp["p50"] * 1e-6)) if rtp["p50"] > 0 else None, + "raw_samples": {"dispatch": _histogram(d), "combine": _histogram(c), "roundtrip": _histogram(rt)}, + # distinguished bandwidth concepts (goal P1) — logical + buffer real, alg/bus/wire null. + "bandwidth": bandwidth, + # full load + fanout statistics in EVERY row (goal P2 "report full load and fanout"): + "fanout_mean": rstats["fanout_mean"], "fanout_max": rstats["fanout_max"], + "fanout_min": rstats["fanout_min"], "fanout_hist": rstats["fanout_hist"], + "routed_copies": rstats["routed_copies"], + "expert_load_min": rstats["expert_load_min"], "expert_load_max": rstats["expert_load_max"], + "expert_load_mean": rstats["expert_load_mean"], "expert_load_cv": rstats["expert_load_cv"], + "rank_load_cv": rstats["rank_load_cv"], "hotspot_ratio": rstats["hotspot_ratio"], + "dest_rank_load_max": rstats["dest_rank_load_max"], + "dest_rank_load_mean": rstats["dest_rank_load_mean"], + "empty_expert_count": rstats["empty_expert_count"], + "empty_rank_count": rstats["empty_rank_count"], + "rank_load_hist": rstats["rank_load_hist"], + "source_token_stats": rstats.get("source_token_stats"), + "routing_hash": rstats["routing_hash"], "locality": rstats.get("locality"), + "anomalies": row_anoms, + "correct": 
point_ok, "max_rel_error": max_rel, + }) + if rank == 0: + print(f" T={T:<5} disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} comb {cp['p50']:6.1f}/{cp['p99']:6.1f} " + f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(d)} fanout={rstats['fanout_mean']:.2f} " + f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} " + f"straggler=r{slowest_rank} correct={point_ok}") + + # Cross-rank workload-identity proof: every rank must have built the SAME global routing + # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and + # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing. + trace_sig = int(hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()[:15], 16) + sig_min = _reduce_int(torch, dist, device, trace_sig, MIN) + sig_max = _reduce_int(torch, dist, device, trace_sig, MAX) + routing_consistent = (sig_min == sig_max == trace_sig) + + if rank != 0: + return 0 + + # status=valid requires correctness AND a proven-identical routing trace across ranks. + all_ok = bool(rows) and all(r["correct"] for r in rows) and routing_consistent + + # ---- Multi-dimensional validity (goal P1) -> MACHINE-DERIVED publication_status. Adapters + # never self-label "official"; status is a pure function of these gates. ---- + prov = backend.backend_provenance + prov_unknown = _provenance_unknown(prov) + repro = getattr(args, "reproduction_full", {}) + git_run = getattr(args, "git_run", None) + provenance_complete = (not prov_unknown + and bool(getattr(args, "image_digest", "")) + and bool(git_run) and all((git_run or {}).get(k) for k in ("run_id", "source_sha"))) + floored = bool(prov.get("block_num_floored")) + # fixed-kernel (DeepEP LL) is NOT a normalized resource-constrained run -> conformance N/A + # (immediate P0 "split LL fixed-kernel from normalized-resource"). Not a conformance failure. 
+ fixed_kernel = bool(prov.get("low_latency_mode")) or ("fixed-kernel" in str(prov.get("tuned_source", ""))) + resource_conformance = ("not-applicable" if fixed_kernel + else "minimum-functional-nonconforming" if floored + else ("resource-conforming" if args.resource_mode == "normalized" + else "backend-default" if args.resource_mode in ("tuned", "default") + else "unspecified")) + # record the canonical workload identity consumed (one trace per T -> set of ids/checksums). + if canonical and loaded_workload_ids: + args.workload_id = (loaded_workload_ids[0] if len(loaded_workload_ids) == 1 + else f"set:{len(loaded_workload_ids)}:{loaded_workload_ids[0]}") + args.workload_checksums = loaded_checksums + canonical_workload = bool(getattr(args, "workload_id", None)) + # Activation-value identity (scaffold): today activations are seeded N(0,1) and NOT serialized, + # so identity is the deterministic descriptor (profile|seed|hidden|generator). When a value rig + # (lognormal / model-trace) lands, this becomes the byte-hash of the serialized activations. + activation_identity = hashlib.sha256( + f"{args.activation_profile}|seed={args.seed}|hidden={args.hidden}|gen=collectivex-activation-v1" + .encode()).hexdigest()[:16] + # EPLB mapping identity hash (goal P2) — over the replica placement, not just the counts. + eplb_mapping_hash = None + if eplb_plan is not None: + eplb_mapping_hash = hashlib.sha256(json.dumps( + {"phys2log": eplb_plan["phys2log"], "rank_of_phys": eplb_plan["rank_of_phys"], + "replicas": eplb_plan["replicas"]}, sort_keys=True).encode()).hexdigest()[:16] + # Anomaly roll-up (goal P1-e/f): any flagged row anomaly demotes publication_status to + # diagnostic, unless --waive-anomaly (set AFTER the cause is understood + documented). 
+ waived = bool(getattr(args, "waive_anomaly", False)) + anomaly_free = (len(all_anomalies) == 0) or waived + validity = { + "execution_status": "complete" if rows else "failed", + "semantic_correctness": "pass" if (rows and all(r["correct"] for r in rows)) else "fail", + "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent", + "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime", + "measurement_conformance": "conformant", # run_ep gate rejects nonconformant pre-run + "resource_conformance": resource_conformance, + "provenance_complete": provenance_complete, + # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above). + "anomaly_free": anomaly_free, + } + publication_status = _derive_publication_status(validity) + + shape = { # FIXED line identity (no T, no per-backend resource knobs) + "hidden": args.hidden, "topk": args.topk, "experts": args.experts, + "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype, + "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical, + # DeepEP kernel generation (v1 = NVSHMEM, v2 = NCCL-Gin) — part of line identity so a V2 run + # is never conflated with V1 in comparison_key / plot / cohort. Derived from deepep_version; + # "n-a" for non-DeepEP backends. (Existing V1 docs lack this field -> read as "v1".) + "kernel_gen": ("v2" if str((backend.backend_provenance or {}).get("deepep_version", "")).startswith("2") + else "v1" if str((backend.backend_provenance or {}).get("deepep_version", "")).startswith("1") + else "n-a"), + # temporal snapshot + uneven allocation change the realized workload, so they are part of + # the line identity (fold into comparison_key). Default 0/none reproduce the prior key for + # non-temporal even runs in spirit (the value is recorded either way). 
+ "routing_step": routing_step, "uneven_tokens": uneven, + # value distribution of expert inputs — part of the workload identity (review: quant + # combine can be value-sensitive). "normal" today; folds into comparison_key. + "activation_profile": args.activation_profile, + # Combine contract, SEPARATE from dispatch. Today bf16/none for every backend regardless + # of dispatch_dtype; a quant-combine backend (PR311) reports its actuals via attrs. In + # shape so it folds into comparison_key — a quant-combine run is never compared to a bf16 one. + "quant": { + "combine_input_dtype": getattr(backend, "combine_input_dtype", args.combine_dtype), + "combine_accum_dtype": getattr(backend, "combine_accum_dtype", "fp32"), + "combine_output_dtype": getattr(backend, "combine_output_dtype", "bf16"), + "combine_quant_mode": getattr(backend, "combine_quant_mode", args.combine_quant_mode), + "scale_layout": getattr(backend, "scale_layout", None), + }, + } + meta = { + "op": "ep-dispatch-combine", "backend": backend.name, "mode": args.mode, + "phase": args.phase, "world_size": world_size, "ep_size": ep_size, + "resource_mode": args.resource_mode, + "nodes": int(os.environ.get("SLURM_NNODES", "1")), + "topology_class": args.topology_class, "comparison_class": args.comparison_class, + # honest contract name (was the misleading "comm-only-v1": dispatch INCLUDES layout + # under layout-and-dispatch-v1). Adapters declare which they conform to. + "measurement_contract": args.measurement_contract, "shape": shape, + # structured placement metadata (goal P2 topology) — replaces the bare topology string. 
+ "placement": { + "kind": args.placement, "nodes": int(os.environ.get("SLURM_NNODES", "1")), + "gpus_per_node": args.gpus_per_node or ep_size, + "scale_up_domain": args.scale_up_domain or ((args.gpus_per_node or ep_size) * 1), + "ranks": ep_size, "transport": args.transport, + }, + } + headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2]) + env = None + if args.env_json and os.path.exists(args.env_json): + with open(args.env_json) as fh: + env = json.load(fh) + doc = { + "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py", + "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), + "runner": args.runner, "transport": args.transport, + # Multi-dimensional validity + machine-derived publication status (goal P1). `status` + # is a back-compat alias (legacy v3 readers) — publication_status is authoritative. + "validity": validity, + "publication_status": publication_status, + "status": "valid" if all_ok else "invalid", + "workload": { + "source": validity["workload_source"], + "workload_id": getattr(args, "workload_id", None), + "manifest_checksums": getattr(args, "workload_checksums", None), + "trace_signature": f"{trace_sig:015x}", + "distinct_per_T_hashes": sorted(routing_hashes), + # within-run (cross-rank) identity is PROVEN here; cross-hardware identity holds + # only if another run records the SAME trace_signature / workload_id. + "cross_rank_consistent": routing_consistent, + # value-distribution identity of the expert inputs (scaffold; see activation_identity above). + "activation_profile": args.activation_profile, + "activation_identity": activation_identity, + }, + "comparison_key": comparison_key(meta), + "x_axis": {"primary": "tokens_per_rank", + "global_relation": "global_tokens = tokens_per_rank * ep_size"}, + "backend_provenance": backend.backend_provenance, + # backend-independent resource vocabulary + conformance class (goal P3). 
+ "resource_profile": _resource_profile(backend.backend_provenance, args), + "reproduction": { + "command": getattr(args, "reproduction_command", ""), + "image": getattr(args, "image", "") or None, + "image_digest": getattr(args, "image_digest", "") or None, + "image_arch": getattr(args, "image_arch", None), + "squash_sha256": getattr(args, "squash_sha256", None), + "git_run": getattr(args, "git_run", None), # repo/run/attempt/ref/sha/job/artifact + # redaction (goal P1): command + provenance carry NO hostnames/IPs/UUIDs/private paths; + # per-node env (hostnames, GPU UUIDs, NIC GUIDs) lives in the separate gitignored + # env_json (CI uploads it as a workflow artifact), never inlined into this record. + "redaction": "no hostnames/IPs/UUIDs/private-paths in command or provenance", + "seed": args.seed, "warmup": args.warmup, "iters": args.iters, + "trials": max(1, args.trials), "samples_per_point": (max(1, args.trials) * args.iters), + "measurement_contract": args.measurement_contract, + "dispatch_dtype": args.dispatch_dtype, "mode": args.mode, + "combine_dtype": args.combine_dtype, "combine_quant_mode": args.combine_quant_mode, + "activation_profile": args.activation_profile, + "routing_step": routing_step, "uneven_tokens": uneven, + "waive_anomaly": waived, + "roundtrip_anomaly_threshold": thr_rt, + # whether (de)quantization is inside the timed window. fp8_quant_in_timing kept as a + # back-compat alias (dispatch-side fp8); combine_* are the quant-combine generalization + # (None today — no quant combine is wired). A backend sets these when it quantizes. 
+ "fp8_quant_in_timing": getattr(backend, "fp8_in_timing", None), + "combine_quant_in_timing": getattr(backend, "combine_quant_in_timing", None), + "combine_dequant_in_timing": getattr(backend, "combine_dequant_in_timing", None), + }, + **meta, + "correctness": {"passed": all_ok, + "max_rel_error": max((r["max_rel_error"] for r in rows), default=None), + "tolerance": getattr(backend, "tolerance", 5e-2), "points": len(rows), + # honest scope: round-trip reconstruction + non-silent recv, NOT a full + # per-token routing/ordering/weight/padding proof (review #3). + "scope": "roundtrip-reconstruction-smoke-v1"}, + "routing_identity": { # cryptographic workload-identity proof (review #3) + "consistent_across_ranks": routing_consistent, + "trace_signature": f"{trace_sig:015x}", + "distinct_per_T_hashes": sorted(routing_hashes), + }, + # EPLB plan + the per-rank load imbalance it removes (the headline of the zipf+EPLB + # comparison). enabled=False when the run did not apply EPLB. + # EPLB mapping IDENTITY (goal P2): logical/physical counts + a hash of the replica + # placement (phys2log/rank_of_phys/replicas). Two EPLB runs are only an official comparison + # if their mapping_hash matches (cohort.py enforces); zipf vs zipf+eplb is a RECOVERY + # experiment, not the same raw workload. 
+ "eplb": ({"enabled": True, "num_logical_experts": num_logical, + "num_physical_experts": args.experts, + "num_redundant": args.experts - num_logical, + "imbalance_before": eplb_plan["imbalance_before"], + "imbalance_after": eplb_plan["imbalance_after"], + "replicated_experts": eplb_plan["replicated_experts"], + "max_replicas": eplb_plan["max_replicas"], + "mapping_hash": eplb_mapping_hash} + if eplb_plan else {"enabled": False}), + "routing_profile": { + "routing": args.routing, + "fanout_mean": sum(r["fanout_mean"] for r in rows) / len(rows), + "fanout_max": max(r["fanout_max"] for r in rows), + "headline_hash": headline["routing_hash"], + }, + "metrics": { # p99 is the headline percentile (review #3); p50/p90/p95 also kept per row + "headline_tokens_per_rank": headline["tokens_per_rank"], + "headline_percentile": "p99", + "dispatch_us_p50": headline["dispatch_us_p50"], "dispatch_us_p99": headline["dispatch_us_p99"], + "combine_us_p50": headline["combine_us_p50"], "combine_us_p99": headline["combine_us_p99"], + "roundtrip_us_p50": headline["roundtrip_us_p50"], "roundtrip_us_p99": headline["roundtrip_us_p99"], + "isolated_sum_us_p50": headline["isolated_sum_us_p50"], "isolated_sum_us_p99": headline["isolated_sum_us_p99"], + "isolated_sum_label": "sum of isolated dispatch+combine percentiles — NOT a measured chained op", + "roundtrip_tokens_per_second": headline["roundtrip_tokens_per_second"], + }, + # phase semantics (goal P2): decode/prefill are regimes with distinct serving meaning, not + # just ladder aliases — a point is one MoE layer / one step / one collective. + "phase_profile": PHASE_PROFILE.get(args.phase, {"regime": args.phase}), + # source-token allocation across ranks (goal P2 uneven allocation). 'none' = even. 
+ "source_allocation": { + "mode": uneven, "routing_step": routing_step, + "note": ("even — every rank gets the ladder T (global = T*ep_size)" if uneven == "none" + else "uneven — per-rank source-token counts vary; see rows[].source_token_stats " + "(global may not divide ep_size; empty-source-rank possible)"), + }, + # contract-level timing anomalies (goal P1) — aggregate of the per-row flags; demotes + # publication_status to diagnostic unless --waive-anomaly (validity.anomaly_free). + "anomalies": all_anomalies, + "anomaly_summary": {"count": len(all_anomalies), "waived": waived, + "types": sorted({a["type"] for a in all_anomalies})}, + "rows": rows, "environment": env, + } + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + json.dump(doc, fh, indent=2) + fh.write("\n") + print(f"{backend.name} ep-dispatch-combine [{args.phase}/{args.mode}/{args.measurement_contract}]: " + f"status={doc['status']} {len(rows)} pts, routing_consistent={routing_consistent}, " + f"headline T={headline['tokens_per_rank']} disp_p99={headline['dispatch_us_p99']:.1f}us " + f"-> {args.out}") + return 0 if all_ok else 1 diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py new file mode 100644 index 000000000..eef42ee6d --- /dev/null +++ b/experimental/CollectiveX/tests/ep_mori.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +"""CollectiveX EP backend adapter — MoRI (AMD ROCm), normal mode. + +The harness owns the deterministic shared routing trace and the comm-only timing; +this file owns MoRI's API and the ionic_rdma-fabric constraints found on MI355X +(validated on-node, see CONTAINERS.md): the whole symmetric heap is one RDMA MR +capped at ~4 GiB (hold at 2 GiB; bound buffers via max_num_inp_token_per_rank ⇒ +buffer_cap); combine() resets recv_num (read it before combine; compare only the +first T rows); and the post-shmem_finalize teardown asserts (finalize hard-exits). 
+ +`make_problem` now materializes the harness-provided rank slice, so MoRI honors the +requested routing (it no longer always-uniform) and runs the identical workload to +the NVIDIA SKUs. combine_needs_redispatch=True: combine consumes recv_num, so the +harness re-dispatches (untimed) before each timed combine sample. +""" +from __future__ import annotations + +import os +import sys +import types + +# MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init — set BEFORE +# `import mori`. 2 GiB registers on the MI355X ionic_rdma NICs; larger fails. +os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", + os.environ.get("CX_MORI_HEAP_SIZE", "2G")) + +import torch +import torch.distributed as dist + +try: + import mori # type: ignore +except Exception as exc: # pragma: no cover - needs the AMD MoRI image + print("ERROR: mori import failed — needs the AMD MoRI image " + f"(rocm/sgl-dev:...-mori-...). {exc!r}", file=sys.stderr) + raise + +# e4m3fnuz (the ROCm-native fp8) finite max. AMD's "fnuz" (finite, no -0/Inf/NaN-unsigned) e4m3 +# saturates at 240.0 — the dispatch fp8 cast scales each block so its amax maps to this. +_FP8_FNUZ_MAX = 240.0 +_FP8_BLOCK = 128 # MoRI/DeepSeek blockwise fp8: one scale per 128-elem hidden block (7168%128==0) + + +def _mori_quant_introspect(): + """Describe MoRI's quant API (enum members + ctor/dispatch signatures + quant/scale helpers). + + FNUZ fp8 dispatch on MoRI keys off EpDispatchCombineConfig.quant_type, which PR311 extended with + QuantType::Fp8BlockwiseQuant — but how that value is EXPOSED to Python (enum attr vs accepted + string vs int) differs by build. 
We print this to stderr at construction so a GHA run's log is + self-documenting: even if the run wedges or the quant_type guess is wrong, the next iteration has + MoRI's exact surface without needing interactive SSH (which stalls on the shared cluster).""" + import inspect + info = {} + ops = getattr(mori, "ops", None) + try: + info["config_sig"] = str(inspect.signature(mori.ops.EpDispatchCombineConfig.__init__)) + except Exception as e: + info["config_sig"] = f"" + for meth in ("dispatch", "combine"): + try: + info[f"{meth}_sig"] = str(inspect.signature(getattr(mori.ops.EpDispatchCombineOp, meth))) + except Exception as e: + info[f"{meth}_sig"] = f"" + # Any enum / helper whose name mentions quant or scale (the QuantType enum + any quantize fn). + surface = {} + for nm in (dir(ops) if ops else []): + if nm.startswith("_"): + continue + if "quant" in nm.lower() or "scale" in nm.lower(): + obj = getattr(ops, nm) + members = {} + for m in dir(obj): + if m.startswith("_"): + continue + try: + members[m] = int(getattr(obj, m)) + except Exception: + members[m] = str(type(getattr(obj, m)).__name__) + surface[nm] = members or str(type(obj).__name__) + info["quant_surface"] = surface + return info + + +def _mori_quant_type_validator(): + """MoRI's own quant_type normalizer if exposed (mori.ops.dispatch_combine._normalize_quant_type) + — validates a candidate CHEAPLY (no 2 GiB heap alloc) by raising on an invalid value. The config + ctor stores any string; only the OP normalizes it, so a config-only probe can't tell a valid mode + from an invalid one (that cost us a 90-min MI355X run on the wrong 'fp8_blockwise' guess).""" + try: + from mori.ops.dispatch_combine import _normalize_quant_type # type: ignore + return _normalize_quant_type + except Exception: + return None + + +def _fp8_quant_type_candidates(): + """Ordered (value, label) candidates for MoRI's fp8 quant_type. 
fp8_direct_cast is the validated + mode on the mori-0227-2 image (the GHA self-introspection found the valid set is + ['none','fp8_direct_cast']; 'fp8_blockwise' is in the python map but THIS build's + _normalize_quant_type rejects it). Prefer the direct-cast string, then the typed enum member, then + fallbacks — __init__ keeps the first that MoRI's _normalize_quant_type accepts.""" + ops = mori.ops + out = [("fp8_direct_cast", "str:fp8_direct_cast")] + enum = getattr(ops, "EpDispatchCombineQuantType", None) + if enum is not None: + for pref in ("Fp8DirectCast", "Fp8BlockwiseQuant"): + if hasattr(enum, pref): + out.append((getattr(enum, pref), f"EpDispatchCombineQuantType.{pref}")) + for s in ("fp8", "Fp8", "fp8_blockwise"): + out.append((s, f"str:{s}")) + return out + + +def _quant_blockwise_fp8_fnuz(x, block=_FP8_BLOCK): + """bf16 [T,H] -> (e4m3fnuz [T,H], f32 per-block scales [T,H//block]). Per-128-block amax scaling + onto the fnuz finite range. Caller-side quantization (MoRI transports the fp8 payload + scales; + the combine reduces and the harness dequantizes for the consistency-correctness gate).""" + T, H = x.shape + assert H % block == 0, f"hidden {H} not a multiple of fp8 block {block}" + nb = H // block + xb = x.float().view(T, nb, block) + amax = xb.abs().amax(dim=2).clamp_min(1e-8) # [T, nb] + scale = amax / _FP8_FNUZ_MAX # f32 dequant scale + xq = (xb / scale.unsqueeze(2)).clamp(-_FP8_FNUZ_MAX, _FP8_FNUZ_MAX).to(torch.float8_e4m3fnuz) + return xq.view(T, H), scale + + +def _dequant_blockwise_fp8_fnuz(xq, scale, block=_FP8_BLOCK): + """Inverse of _quant_blockwise_fp8_fnuz: e4m3fnuz [T,H] + f32 [T,H//block] -> bf16-range f32 [T,H].""" + T, H = xq.shape + nb = H // block + return (xq.float().view(T, nb, block) * scale.unsqueeze(2)).view(T, H) + + +class MoRIBackend: + name = "mori" + combine_needs_redispatch = True + # MoRI wedges on a COLD dispatch jumping straight to a large T (validated on + # MI355X); the harness ramps this backend's ladder 
class MoRIBackend:
    """CollectiveX EP backend adapter for MoRI (AMD ROCm), normal mode.

    Implements the harness contract: ``make_problem`` / ``dispatch`` / ``stage`` / ``combine`` /
    ``expected`` / ``buffer_cap`` / ``recv_tokens`` / ``finalize`` plus ``backend_provenance``
    and the ``SUPPORTED_*`` capability sets.
    """

    name = "mori"
    # combine() resets recv_num, so the harness re-dispatches (untimed) before each timed combine.
    combine_needs_redispatch = True
    # MoRI wedges on a COLD dispatch jumping straight to a large T (validated on MI355X); the
    # harness ramps this backend's ladder geometrically from 1.
    needs_gradual_ramp = True
    # MoRI WEDGES under a sustained warm-up burst (the harness's Blackwell clock-ramp) and is
    # already steady at a short warm-up (~44us, reproducible) — so it opts out.
    wants_warm_burst = False
    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
    # fallback/mislabel). DISPATCH precision and the SEPARATE combine path are distinct axes
    # (dispatch_dtype=fp8 must NOT imply quantized combine). bf16 is the default; fp8 routes the
    # AMD-native DIRECT-CAST path (quant_type=fp8_direct_cast): the kernel casts bf16 to
    # e4m3fnuz for transport (scale_dim=0, no caller scales) and returns the recv buffer as bf16
    # again. The combine OUTPUT stays bf16 so SUPPORTED_COMBINE_DTYPES is unchanged.
    # Keep in sync with capability.py CAP["mori"].
    SUPPORTED_DISPATCH_DTYPES = {"bf16", "fp8"}      # fp8 = e4m3fnuz direct-cast
    SUPPORTED_COMBINE_DTYPES = {"bf16"}              # + "fp8" once a quant combine OUTPUT lands
    SUPPORTED_COMBINE_QUANT_MODES = {"none"}         # + the quant-combine mode id once validated
    SUPPORTED_PRECISIONS = SUPPORTED_DISPATCH_DTYPES  # back-compat alias (run_ep.py / older refs)
    SUPPORTED_MODES = {"normal"}                     # MoRI has no separate low-latency entrypoint
    # MoRI computes its routing layout INSIDE the dispatch kernel (block_num/warps launch); it
    # cannot be hoisted, so MoRI honors only the layout-and-dispatch contract.
    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}

    def __init__(self, args, rank, world_size, local_rank, device):
        """Initialise MoRI shmem, pick the launch budget and build the dispatch/combine op.

        Args:
            args: parsed harness arguments (reads dispatch_dtype, mode, combine_dtype,
                combine_quant_mode, experts, hidden, topk, resource_mode, sm_fraction).
            rank: this process's global rank.
            world_size: number of ranks (== EP size).
            local_rank: unused by MoRI; kept for the common backend constructor signature.
            device: the torch device this rank drives.

        Raises:
            AssertionError: on an unsupported dispatch/mode/combine combination or when
                ``args.experts`` is not divisible by ``world_size``.
            RuntimeError: when fp8 is requested and MoRI accepts no quant_type candidate.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = args.mode
        assert (args.dispatch_dtype in self.SUPPORTED_DISPATCH_DTYPES
                and args.mode in self.SUPPORTED_MODES
                and getattr(args, "combine_dtype", "bf16") in self.SUPPORTED_COMBINE_DTYPES
                and getattr(args, "combine_quant_mode", "none") in self.SUPPORTED_COMBINE_QUANT_MODES), \
            "run_ep.py must reject unsupported dispatch/mode/combine before constructing the backend"
        # Same guard as the NCCL backend: a remainder would silently floor experts_per_rank and
        # map the highest expert ids to a rank index past the last rank.
        assert args.experts % world_size == 0, \
            f"MoRI EP needs experts({args.experts}) divisible by world_size({world_size})"
        self.fp8_in_timing = None  # set when fp8 dispatch is used (whether the cast is timed)
        # Combine-path quant timing (None today — no quant combine wired).
        self.combine_quant_in_timing = None
        self.combine_dequant_in_timing = None
        self.ep_size = world_size
        self.experts_per_rank = args.experts // self.ep_size
        dev_cus = torch.cuda.get_device_properties(device).multi_processor_count

        (self.block_num, self._block_target, self._block_floored,
         self._tuned_source) = self._resolve_block_budget(
            args.resource_mode, getattr(args, "sm_fraction", None), dev_cus)
        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))

        world_group = torch.distributed.group.WORLD
        torch._C._distributed_c10d._register_process_group("default", world_group)
        mori.shmem.shmem_torch_process_group_init("default")

        self._cap = self.buffer_cap(args)
        max_tokens = max(512, self._cap)  # config bound; also reported in provenance

        # Dispatch precision: bf16 (quant_type="none") or fp8 (e4m3fnuz DIRECT-CAST). The kernel
        # casts internally, so there are NO caller scales (scale_dim=0) in either path.
        self._fp8 = (args.dispatch_dtype == "fp8")
        self._quant_label = "none"
        quant_type = "none"
        if self._fp8:
            quant_type, self._quant_label = self._select_fp8_quant_type(
                args, rank, world_size, max_tokens)
            self.fp8_in_timing = True  # the e4m3fnuz direct-cast is internal to dispatch (in timing)

        # scale_dim==0 in both bf16 and fp8-direct-cast paths: the 1-byte sentinel element size.
        scale_elt = torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size()
        # zero-copy mode = NOT use_external_inp_buf. MoRI asserts that Fp8DirectCast is not
        # supported in zero-copy mode, so fp8 MUST use the external-input-buf path; bf16 keeps
        # the validated zero-copy path (use_external_inp_buf=False).
        self.config = mori.ops.EpDispatchCombineConfig(
            data_type=torch.bfloat16, rank=rank, world_size=world_size,
            hidden_dim=args.hidden, scale_dim=0,
            scale_type_size=scale_elt,
            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
            max_num_inp_token_per_rank=max_tokens,
            num_experts_per_rank=self.experts_per_rank,
            num_experts_per_token=args.topk,
            use_external_inp_buf=bool(self._fp8), quant_type=quant_type,
        )
        self.op = mori.ops.EpDispatchCombineOp(self.config)
        # fp8 direct-cast carries e4m3 rounding error: loosen the correctness gate to the fp8
        # class (the harness reads backend.tolerance and falls back to its bf16 default).
        if self._fp8:
            self.tolerance = 1.5e-1
        # Provenance: MoRI has no pip version; pin via MORI_COMMIT, else the image tag the
        # launcher exported, so the provenance gate has something real rather than "unknown".
        img = os.environ.get("COLLECTIVEX_IMAGE", "")
        mori_commit = os.environ.get("MORI_COMMIT") or (f"image:{img}" if img else "unknown")
        self.backend_provenance = {
            "mori_commit": mori_commit,
            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
            "max_num_inp_token_per_rank": max_tokens,
            "resource_mode": args.resource_mode, "block_num": self.block_num,
            "block_num_target": self._block_target, "block_num_floored": self._block_floored,
            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
            "device_cus": dev_cus, "sm_fraction": (self.block_num / dev_cus),
            "tuned_source": self._tuned_source,
            "dispatch_dtype": args.dispatch_dtype,
            "quant_type": self._quant_label,
            "fp8_format": ("e4m3fnuz" if self._fp8 else None),
            "fp8_mode": ("direct_cast" if self._fp8 else None),  # internal cast, scale_dim=0
        }

    @staticmethod
    def _resolve_block_budget(resource_mode, sm_fraction, dev_cus):
        """Pick the kernel block budget for a resource mode.

        Returns ``(block_num, block_target, floored, tuned_source)``.

        * ``CX_MORI_BLOCK_NUM`` set: explicit override, labelled as such in provenance so an
          overridden run is never reported as the default budget.
        * ``normalized``: target = sm_fraction * CUs, floored at ``CX_MORI_MIN_BLOCKS``
          (default 80) because MoRI deadlocks at T of 32 and above with fewer blocks; the floor
          is recorded so the unreached target fraction stays visible.
        * anything else (``tuned`` has no launch auto-tune API in this build / ``default``):
          the 80-block bring-up budget.
        """
        floor = int(os.environ.get("CX_MORI_MIN_BLOCKS", "80"))
        env_blocks = os.environ.get("CX_MORI_BLOCK_NUM")
        if env_blocks:
            forced = int(env_blocks)
            return forced, forced, False, "env:CX_MORI_BLOCK_NUM"
        if resource_mode == "normalized":
            target = max(1, round(sm_fraction * dev_cus))
            block_num = max(floor, target)
            floored = block_num > target
            return block_num, target, floored, ("normalized-floored" if floored else "n/a")
        return 80, 80, False, ("default-80" if resource_mode == "tuned" else "n/a")

    def _select_fp8_quant_type(self, args, rank, world_size, max_tokens):
        """Return the first ``(quant_type, label)`` candidate this MoRI build accepts.

        Dumps MoRI's quant API and every rejection to stderr so the CI log is self-documenting.
        Uses MoRI's own normalizer when exposed (cheap, no heap allocation); otherwise falls
        back to constructing a throw-away config.

        Raises:
            RuntimeError: when no candidate is accepted.
        """
        import json as _json
        print("MORI_QUANT_API " + _json.dumps(_mori_quant_introspect()), file=sys.stderr, flush=True)
        validator = _mori_quant_type_validator()
        cands = _fp8_quant_type_candidates()
        print(f"MORI_FP8_CANDIDATES {[label for _, label in cands]}", file=sys.stderr, flush=True)
        for val, label in cands:
            try:
                if validator is not None:
                    validator(val)  # raises on an invalid value (no heap alloc)
                else:
                    mori.ops.EpDispatchCombineConfig(  # fallback: config-construct probe
                        data_type=torch.bfloat16, rank=rank, world_size=world_size,
                        hidden_dim=args.hidden, scale_dim=0,
                        scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
                        max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
                        max_num_inp_token_per_rank=max_tokens,
                        num_experts_per_rank=self.experts_per_rank,
                        num_experts_per_token=args.topk,
                        use_external_inp_buf=False, quant_type=val)
            except Exception as e:
                # Deliberately broad: this is a probe over guessed values of a foreign API.
                print(f"MORI_FP8_REJECT {label}: {e!r}", file=sys.stderr, flush=True)
                continue
            print(f"MORI_FP8_QUANT_TYPE {label}", file=sys.stderr, flush=True)
            return val, label
        raise RuntimeError("no MoRI quant_type candidate accepted for fp8 — see "
                           "MORI_QUANT_API above for this build's actual quant surface")

    def buffer_cap(self, args):
        """Return the largest tokens/rank the registered heap holds (512 unless overridden).

        Override via the CX_MORI_MAX_TOKENS environment variable.
        """
        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))

    def make_problem(self, T, idx, weights, x):
        """Wrap the harness-provided routing-trace slice in MoRI's expected dtypes.

        ``idx`` is converted to int32 expert ids and ``weights`` to float32; ``x`` is passed
        through. ``scales`` is an empty (T, 0) fp8 sentinel because scale_dim == 0 for both the
        bf16 and the fp8 direct-cast paths.
        """
        indices = idx.to(torch.int32)
        scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=self.device)
        return types.SimpleNamespace(T=T, x=x, indices=indices,
                                     weights=weights.to(torch.float32), scales=scales)

    def dispatch(self, p):
        """Run MoRI dispatch and return a handle carrying everything combine needs."""
        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
            p.x, p.weights, p.scales, p.indices,
            block_num=self.block_num, warp_per_block=self.dispatch_warps)
        total_recv = int(recv_num[0].item())  # read BEFORE combine (combine resets recv_num)
        # The recv buffer is converted to bf16 and used as the combine input; with fp8 the e4m3
        # rounding shows up in the correctness gate against the looser tolerance set in __init__.
        return types.SimpleNamespace(
            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
            dispatch_indices=dispatch_indices, total_recv=total_recv,
            combine_input=dispatch_output.to(torch.bfloat16))

    def stage(self, p, h):
        """Copy the "expert outputs" into MoRI's registered combine-input buffer (untimed)."""
        buf = self.op.get_registered_combine_input_buffer(
            torch.bfloat16, hidden_dim=h.combine_input.size(1))
        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])

    def combine(self, p, h):
        """Run MoRI combine on the dispatched handle and return the combined tensor."""
        combined, _w = self.op.combine(
            h.combine_input, h.dispatch_weights, h.dispatch_indices,
            block_num=self.block_num, warp_per_block=self.combine_warps)
        return combined

    def expected(self, p, h):
        """Return ``(reference, valid_rows)`` for the round-trip correctness gate.

        MoRI combine sums one copy per destination RANK, so the reference is
        ``x * (#unique destination ranks among the token's topk experts)``. Under fp8 the
        reference is the e4m3fnuz-cast input so the transport rounding cancels out.
        """
        pes = p.indices.long() // self.experts_per_rank
        # One device-to-host transfer for the whole matrix instead of one per token row.
        unique_pes = torch.tensor(
            [len(set(row)) for row in pes.tolist()], device=self.device, dtype=torch.float32
        ).unsqueeze(1)
        ref = p.x.float()
        if self._fp8:
            ref = p.x.to(torch.float8_e4m3fnuz).float()
        return ref * unique_pes, p.T

    def recv_tokens(self, h):
        """Return the number of tokens this rank received in the dispatch behind ``h``."""
        return int(h.total_recv)

    def finalize(self, rc):
        """Synchronise best-effort, flush, and hard-exit with 0 (rc == 0) or 1.

        MoRI's shmem teardown asserts after shmem_finalize(); results are already written, so
        this skips normal interpreter shutdown via os._exit. Never returns.
        """
        try:
            dist.barrier()
        except Exception as exc:
            # Best-effort: a failed barrier must not block the exit, but leave a trace.
            print(f"WARN: mori finalize barrier failed: {exc!r}", file=sys.stderr)
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(0 if rc == 0 else 1)
class NCCLBackend:
    """Expert-parallel token shuffle built purely on ``dist.all_to_all_single``.

    Dispatch groups every token copy by the rank owning its expert and ships it with an uneven
    all-to-all; combine reverses the exchange and weighted-sums each token's top-k copies. With
    no expert compute the round trip reconstructs ``x * sum(topk_weights)`` per token.
    """

    name = "nccl-ep"
    combine_needs_redispatch = False  # dispatch saves the permutation + splits; combine reuses them
    wants_warm_burst = False
    # Pure-collective token shuffle: bf16 only (no fp8 dispatch path), normal mode, single contract.
    SUPPORTED_PRECISIONS = {"bf16"}
    SUPPORTED_MODES = {"normal"}
    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}

    def __init__(self, args, rank, world_size, local_rank, device):
        """Validate the configuration and record provenance.

        Args:
            args: parsed harness arguments (reads experts, dispatch_dtype, mode, resource_mode).
            rank: this process's global rank.
            world_size: number of ranks (== EP size).
            local_rank: unused; kept for the common backend constructor signature.
            device: the torch device this rank drives.

        Raises:
            AssertionError: if experts is not divisible by world_size, or the requested
                precision/mode is outside the supported sets.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.experts = args.experts
        assert args.experts % world_size == 0, \
            f"NCCL EP needs experts({args.experts}) divisible by world_size({world_size})"
        self.experts_per_rank = args.experts // world_size
        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
            f"NCCL EP supports precisions={sorted(self.SUPPORTED_PRECISIONS)} modes={sorted(self.SUPPORTED_MODES)} only"
        self.tolerance = 5e-2  # bf16 round-trip
        try:
            _nccl = ".".join(str(v) for v in torch.cuda.nccl.version())
        except Exception:
            # Provenance only — a build without NCCL must not prevent construction.
            _nccl = "unknown"
        self.backend_provenance = {
            "backend": "nccl-all2all",
            "nccl_version": _nccl,
            "transport": "nccl-all_to_all_single",
            "resource_mode": args.resource_mode,
            "num_sms": None,
            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
            "tuned_source": "nccl-collective",
        }

    def buffer_cap(self, args):
        """Return None: there is no fixed pre-allocated buffer; each step sizes its own."""
        return None

    def make_problem(self, T, idx, weights, x):
        """Wrap the shared routing-trace slice: idx to int64, weights to float32, x unchanged."""
        return types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
                                     topk_weights=weights.to(torch.float32), layout=None)

    def dispatch(self, p):
        """Send every token copy to the rank owning its expert; return the combine handle.

        The handle keeps the sort permutation and both split lists so ``combine`` can reverse
        the exchange without recomputing the layout.
        """
        ws = self.world_size
        x = p.x
        idx = p.topk_idx
        T, H = int(x.shape[0]), int(x.shape[1])
        topk = int(idx.shape[1])
        dev = x.device
        # Flatten the T*topk token-copies; each goes to the rank owning its expert.
        flat_expert = idx.reshape(-1)
        flat_dest = (flat_expert // self.experts_per_rank).to(torch.int64)
        flat_token = torch.arange(T, device=dev, dtype=torch.int64).repeat_interleave(topk)
        # Group copies by destination rank (stable sort: deterministic, invertible permutation).
        order = torch.argsort(flat_dest, stable=True)
        send_counts = torch.bincount(flat_dest, minlength=ws)
        send_x = x.index_select(0, flat_token.index_select(0, order)).contiguous()
        # Exchange per-rank counts so every rank can size its receive buffer.
        recv_counts = torch.empty_like(send_counts)
        dist.all_to_all_single(recv_counts, send_counts)
        sc = send_counts.tolist()
        rc = recv_counts.tolist()
        total_recv = int(sum(rc))
        recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
        # The dispatch all-to-all (uneven splits): output splits = rc, input splits = sc.
        dist.all_to_all_single(recv_x, send_x, rc, sc)
        return types.SimpleNamespace(recv_x=recv_x, combine_input=None, order=order,
                                     flat_token=flat_token, flat_w=p.topk_weights.reshape(-1),
                                     send_counts=sc, recv_counts=rc, T=T, H=H, total_recv=total_recv)

    def stage(self, p, h):
        """Set the combine input to the received tokens as-is (identity "expert")."""
        h.combine_input = h.recv_x
        return None

    def combine(self, p, h):
        """Return expert outputs to their origin ranks and weighted-sum each token's copies."""
        # Fall back to the received tokens when stage() was not called for this handle.
        src = h.combine_input if h.combine_input is not None else h.recv_x
        send_back = torch.empty((int(h.order.shape[0]), h.H), dtype=src.dtype, device=src.device)
        # Reverse all-to-all: the split lists swap roles relative to dispatch.
        dist.all_to_all_single(send_back, src.contiguous(), h.send_counts, h.recv_counts)
        # send_back is in send (sorted) order; invert the argsort to token-copy order.
        copies = torch.empty_like(send_back)
        copies[h.order] = send_back
        # Weighted reduce of each token's top-k copies into [T, H] (accumulate in fp32).
        out = torch.zeros((h.T, h.H), dtype=torch.float32, device=send_back.device)
        out.index_add_(0, h.flat_token, copies.float() * h.flat_w.unsqueeze(1))
        return out.to(p.x.dtype)

    def recv_tokens(self, h):
        """Return the number of token copies this rank received in the dispatch behind ``h``."""
        return int(h.total_recv)

    def expected(self, p, h):
        """Return ``(x * sum_k w[t, k], T)`` — the identity-expert round-trip reference."""
        wsum = p.topk_weights.sum(dim=1, keepdim=True).float()
        return p.x.float() * wsum, p.T

    def finalize(self, rc):
        """Best-effort barrier + process-group teardown; always returns ``rc`` unchanged.

        The two steps are guarded separately so a failed barrier no longer skips
        destroy_process_group, and failures are reported on stderr instead of vanishing.
        """
        import sys

        for step in (dist.barrier, dist.destroy_process_group):
            try:
                step()
            except Exception as exc:
                print(f"WARN: nccl-ep finalize: {step.__name__} failed: {exc!r}", file=sys.stderr)
        return rc
To finish UCCL: +vendor `deep_ep_wrapper` under a non-colliding name (it uses relative imports + only needs +`uccl.ep`) and import its Buffer here; then this file is a true ep_deepep.py clone. This is +DONE: cx_build_uccl vendors `deep_ep_wrapper` as `uccl_deepep` and the import below uses it; if +that wrapper is ever absent the import falls back to the low-level `uccl.ep.Buffer`, which then +fails loudly (preserved failed-case) — never faked. With the wrapper present, results are genuine. + +The harness contract (make_problem/dispatch/stage/combine/expected/buffer_cap/recv_tokens/ +finalize + backend_provenance + SUPPORTED_*) mirrors ep_deepep.py and is correct once the +wrapper Buffer is wired. + +Install (see launchers/run_in_container.sh cx_build_uccl): `pip install uccl` ships a +prebuilt cp312 wheel; the UCCL EP kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH +(pip install nvidia-cuda-runtime-cu12, prepend its lib dir) even on a cu13 image. + +Correctness (identical to DeepEP's intranode test): a pure dispatch->combine round trip +with no expert compute reconstructs x only after dividing by the number of ranks each +token was sent to, so the harness expects combined ~= x * is_token_in_rank.sum(dim=1). +""" +from __future__ import annotations + +import os +import sys +import types + +import torch +import torch.distributed as dist + +try: + import uccl # for version/provenance + try: + # PREFERRED: vendored deep_ep_wrapper (cx_build_uccl -> uccl_deepep). Buffer(group, ...) + # takes a torch ProcessGroup (matches DeepEP + this adapter's calls) + runs UCCL's full + # proxy/IPC/runtime.sync bootstrap. Fallback: low-level uccl.ep.Buffer(rank,num_ranks,...). 
def _uccl_version() -> str:
    """Return the installed UCCL version string for provenance.

    Prefers the distribution metadata of the ``uccl`` package; if that lookup
    fails for any reason, falls back to the imported module's ``__version__``
    attribute, and finally to the literal ``"unknown"``.
    """
    try:
        from importlib.metadata import version as _dist_version
        found = _dist_version("uccl")
    except Exception:
        # Metadata unavailable: ask the module that was imported at file level.
        found = getattr(uccl, "__version__", "unknown")
    return found
# fp8 e4m3 convention shared by the cast/dequant helpers below: per-token
# block-128 scales, 448 = e4m3 max.
_FP8_MAX = 448.0
_FP8_BLOCK = 128


def _per_token_cast_to_fp8(x):
    """Quantize ``x`` to e4m3 with one scale per 128-wide block of each row.

    x: [T, H] (H % 128 == 0) -> (x_fp8 [T, H] e4m3fn, scales [T, H//128] f32).
    """
    T, H = x.shape
    xv = x.float().view(T, H // _FP8_BLOCK, _FP8_BLOCK)
    # clamp keeps all-zero blocks from producing a zero scale / division by zero
    amax = xv.abs().amax(dim=2).clamp(min=1e-4)  # [T, H//128]
    x_fp8 = (xv * (_FP8_MAX / amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(T, H)
    return x_fp8, (amax / _FP8_MAX).contiguous()


def _per_block_dequant(x_fp8, scales):
    """Inverse of ``_per_token_cast_to_fp8``: [R, H] e4m3 + [R, H//128] f32 -> [R, H] bf16."""
    R, H = x_fp8.shape
    xv = x_fp8.float().view(R, H // _FP8_BLOCK, _FP8_BLOCK)
    return (xv * scales.unsqueeze(2)).view(R, H).to(torch.bfloat16)


def _per_block_dequant_3d(x_fp8, scales):
    """LL recv layout: [E, S, H] e4m3 + [E, S, H//128] f32 -> [E, S, H] bf16."""
    E, S, H = x_fp8.shape
    xv = x_fp8.float().view(E, S, H // _FP8_BLOCK, _FP8_BLOCK)
    return (xv * scales.unsqueeze(-1)).view(E, S, H).to(torch.bfloat16)


class UCCLBackend:
    """CollectiveX EP adapter for UCCL EP: normal and low-latency (LL) modes.

    Implements the harness contract: make_problem / dispatch / stage / combine /
    expected / buffer_cap / recv_tokens / finalize, plus ``backend_provenance``
    and the ``SUPPORTED_*`` capability sets.
    """

    name = "uccl"
    combine_needs_redispatch = False  # UCCL combine reuses the handle (DeepEP-clone semantics)
    # Blackwell (B300) drops GPU clocks during the tiny small-T points, so the harness
    # re-ramps clocks at each shape before timing it. Harmless (just untimed iters) on H100/H200.
    wants_warm_burst = True
    # Capabilities -- run_ep.py REJECTS anything outside these BEFORE construction (no
    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
    #   normal mode: bf16 + fp8 (per-token block-128 cast).
    #   ll mode: low_latency_dispatch/combine via allow_nvlink_for_low_latency_mode.
    SUPPORTED_PRECISIONS = {"bf16", "fp8"}
    SUPPORTED_MODES = {"normal", "ll"}
    # Three contracts:
    #   layout-and-dispatch-v1     -- times get_dispatch_layout INSIDE dispatch; fp8 cast/dequant
    #                                 OUTSIDE (preprocessing mirrors a producer handing quantized x).
    #   cached-layout-comm-only-v1 -- layout hoisted out (untimed); dispatch = pure comm. normal only.
    #   runtime-visible-v1         -- dispatch INCLUDES the fp8 quant (cast) + layout + comm + the
    #                                 recv-dequant that makes expert input consumable; combine starts
    #                                 from bf16 expert outputs. (normal mode; LL times all of it in-kernel.)
    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}

    def __init__(self, args, rank, world_size, local_rank, device):
        """Record the run configuration and build the mode-specific UCCL buffer.

        ``local_rank`` is accepted for harness-signature compatibility and is
        not used by this adapter.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = args.mode
        self.ll = (args.mode == "ll")
        self.contract = args.measurement_contract
        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
        # runtime-visible-v1: the fp8 cast + recv-dequant move INSIDE the timed dispatch (normal
        # mode). LL already times cast+layout+comm in its single kernel, so it's runtime-visible
        # by construction -- the flag only changes normal mode's boundary.
        self.runtime_visible = (self.contract == "runtime-visible-v1") and not self.ll
        self.group = dist.group.WORLD
        # Internal invariant (the harness is the validation boundary), hence an assert.
        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
        # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is
        # recorded in the artifact so the looser fp8 gate is explicit, not hidden.
        self.fp8 = (args.dispatch_dtype == "fp8")
        self.tolerance = 1.25e-1 if self.fp8 else 5e-2
        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
        ver = _uccl_version()
        if self.ll:
            self._init_ll(args, dev_sms, ver)
        else:
            self._init_normal(args, rank, dev_sms, ver)

    def _init_normal(self, args, rank, dev_sms, ver):
        """Build the normal-mode (NVLink-only) buffer and its provenance record."""
        # fp8 cast: UNTIMED (make_problem) under layout-and-dispatch / cached-layout; TIMED (inside
        # dispatch) under runtime-visible-v1. So fp8_in_timing tracks the contract honestly.
        self.fp8_in_timing = (self.runtime_visible if self.fp8 else None)
        self.combine_needs_redispatch = False  # normal combine reuses the handle
        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points (the shared
        # T=128 point must match between the decode and prefill sweeps). 4 GiB holds T up to 4096.
        num_nvl_bytes = int(os.environ.get("CX_UCCL_NVL_BYTES",
                                           os.environ.get("CX_DEEPEP_NVL_BYTES",
                                                          str(4 * 1024 * 1024 * 1024))))
        self.buffer = Buffer(self.group, num_nvl_bytes, 0)
        rm = args.resource_mode
        tuned_src = None
        if rm == "normalized":
            num_sms = max(1, round(args.sm_fraction * dev_sms))  # ~same device fraction as MoRI
        elif rm == "tuned":
            # Best-available for the installed UCCL: its OWN default SM count (Buffer.num_sms --
            # the library's analytic choice). get_dispatch_config(num_ranks) returns the
            # recommended Config but doesn't expose num_sms to Python; the default reflects it.
            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
            tuned_src = "uccl-default-num_sms"
        else:  # default -- the bring-up budget
            num_sms = args.num_sms
        try:
            Buffer.set_num_sms(num_sms)
        except Exception as exc:  # pragma: no cover - version dependent
            # NOTE(review): provenance below still records the REQUESTED num_sms even when
            # this call failed -- confirm whether the artifact should flag that.
            if rank == 0:
                print(f"WARN: could not set num_sms={num_sms}: {exc!r}", file=sys.stderr)
        self.backend_provenance = {
            "uccl_version": ver,
            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{ver}",
            "mode": "normal", "resource_mode": rm, "num_sms": num_sms, "device_sms": dev_sms,
            "sm_fraction": (num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
            "num_nvl_bytes": num_nvl_bytes,
        }

    def _init_ll(self, args, dev_sms, ver):
        """Build the low-latency buffer and its provenance record."""
        # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink via
        # allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch so for
        # fp8 the quantization IS inside the timed window (recorded honestly). The buffer is sized
        # for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical), so LL is a
        # decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop). set_num_sms
        # does NOT apply (the LL kernel picks its own occupancy) -- recorded n/a.
        self.fp8_in_timing = (True if self.fp8 else None)
        self.combine_needs_redispatch = True  # re-dispatch (untimed) before each timed combine
        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))
        self.experts = args.experts
        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
            self.num_max, args.hidden, self.world_size, args.experts)
        # one QP per local expert is the DeepEP/UCCL convention for LL
        self.num_qps = max(1, args.experts // self.world_size)
        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,
                             num_qps_per_rank=self.num_qps,
                             allow_nvlink_for_low_latency_mode=True)
        self.backend_provenance = {
            "uccl_version": ver,
            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{ver}",
            "mode": "ll", "resource_mode": args.resource_mode,
            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
            "num_max_dispatch_tokens_per_rank": self.num_max,
            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
            "low_latency_mode": True, "use_fp8": self.fp8,
        }

    def buffer_cap(self, args):
        """Return the max tokens/rank the buffer supports, or None when uncapped."""
        # LL is sized for a fixed num_max; cap the sweep there (reported, not silent).
        return self.num_max if self.ll else None

    def make_problem(self, T, idx, weights, x):
        """Package one sweep point; does the untimed pre-cast / layout when the contract says so."""
        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 -- the shared trace slice.
        p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
                                  topk_weights=weights.to(torch.float32), layout=None)
        if self.fp8 and not self.ll and not self.runtime_visible:
            # layout-and-dispatch / cached-layout: per-token block-128 cast, UNTIMED (preprocessing,
            # mirrors the real producer that hands the dispatcher already-quantized activations).
            # runtime-visible does NOT pre-cast (the cast is timed inside dispatch); LL casts in-kernel.
            p.x_fp8, p.x_scales = _per_token_cast_to_fp8(x)
        if self.cache_layout:
            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed) so the
            # timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None and dispatch
            # computes it inside the timed window.)
            ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
            p.layout = (ntr, ntpe, itir)
        return p

    def dispatch(self, p):
        """Run the timed dispatch and return a handle namespace for stage/combine/expected."""
        if self.ll:
            return self._dispatch_ll(p)
        if p.layout is not None:  # cached-layout-comm-only-v1
            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
        else:  # layout-and-dispatch / runtime-visible (timed layout)
            (num_tokens_per_rank, _, num_tokens_per_expert,
             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
        ref_fp8 = ref_scales = None
        if self.fp8:
            if self.runtime_visible:
                # runtime-visible: the per-token block-128 cast is INSIDE the timed dispatch.
                x_fp8, x_scales = _per_token_cast_to_fp8(p.x)
                ref_fp8, ref_scales = x_fp8, x_scales  # for the correctness reference
            else:
                x_fp8, x_scales = p.x_fp8, p.x_scales  # pre-cast (untimed)
            x_in = (x_fp8, x_scales)
        else:
            x_in = p.x
        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
            num_tokens_per_expert=num_tokens_per_expert)
        out = types.SimpleNamespace(
            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
            is_token_in_rank=is_token_in_rank, ref_fp8=ref_fp8, ref_scales=ref_scales)
        if self.fp8 and self.runtime_visible:
            # dispatch ENDS when expert input is consumable: dequant fp8 recv -> bf16 INSIDE the
            # timed window (the contract's "expert input genuinely consumable" boundary). stage()
            # then no-ops for this contract.
            recv_fp8, recv_scales = recv_x
            out.combine_input = _per_block_dequant(recv_fp8, recv_scales)
            out.rv_staged = True
        return out

    def _dispatch_ll(self, p):
        """Low-latency dispatch; returns recv buffer, per-expert counts and the handle."""
        # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the cast IS
        # inside this timed op -- fp8_in_timing=True). recv is the expert-major 3D layout
        # [num_local_experts, num_max*world, hidden] (+scales when fp8).
        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
            p.x, p.topk_idx, self.num_max, self.experts,
            use_fp8=self.fp8, return_recv_hook=False)
        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)

    def stage(self, p, h):
        """Produce ``h.combine_input`` (bf16) from the dispatch result; untimed."""
        # comm-only contract: "expert outputs" already exist as recv_x. Dequantize fp8 recv to
        # bf16 HERE (untimed) -- the expert-compute boundary -- so combine moves bf16 in both
        # precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.)
        if getattr(h, "rv_staged", False):
            return None  # runtime-visible already produced bf16 combine_input inside dispatch (timed)
        if self.ll:
            if self.fp8:
                recv_fp8, recv_scales = h.recv_x
                h.combine_input = _per_block_dequant_3d(recv_fp8, recv_scales)
            else:
                h.combine_input = h.recv_x
        elif self.fp8:
            recv_fp8, recv_scales = h.recv_x
            h.combine_input = _per_block_dequant(recv_fp8, recv_scales)
        else:
            h.combine_input = h.recv_x
        return None

    def combine(self, p, h):
        """Run the timed combine and return the combined tensor."""
        if self.ll:
            # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones.
            combined_x, _event, _hook = self.buffer.low_latency_combine(
                h.combine_input, p.topk_idx, p.topk_weights, h.handle)
            return combined_x
        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle,
                                               topk_weights=h.recv_topk_weights)
        return combined_x

    def expected(self, p, h):
        """Return ``(reference, T)`` that the combined output is gated against."""
        if self.ll:
            # LL combine reduces each token's topk expert copies weighted by topk_weights; with no
            # expert compute each copy is (the kernel's fp8 cast of) x, so combined ~= x *
            # sum(topk_weights). fp8 quant error is covered by self.tolerance.
            wsum = p.topk_weights.sum(dim=1, keepdim=True)
            return p.x.float() * wsum, p.T
        # normal: round trip with no expert compute reconstructs x*(#destination ranks); for fp8
        # compare against the dequantized cast that was actually sent.
        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
        ref = p.x.float()
        if self.fp8:
            # runtime-visible cast lives on the handle (no pre-cast on p); else use the pre-cast.
            x_fp8 = getattr(h, "ref_fp8", None)
            x_scales = getattr(h, "ref_scales", None)
            if x_fp8 is None:
                x_fp8, x_scales = p.x_fp8, p.x_scales
            ref = _per_block_dequant(x_fp8, x_scales).float()
        return ref * ranks_per_token, p.T

    def recv_tokens(self, h):
        """Return the number of token-copies this rank received in dispatch."""
        if self.ll:
            return int(h.recv_count.sum().item())  # token-copies received across local experts
        rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x
        return int(rx.shape[0])

    @staticmethod
    def _exit_code(rc):
        """Map the harness return code onto a process exit status.

        A plain int in 0..255 is passed through unchanged, so distinct failure
        codes survive the hard exit. Anything else keeps the previous
        behaviour: 0 when it compares equal to 0, otherwise 1.
        """
        if isinstance(rc, int) and not isinstance(rc, bool) and 0 <= rc <= 255:
            return rc
        return 0 if rc == 0 else 1

    def finalize(self, rc):
        """Hard-exit the process with the run's return code; never returns."""
        # UCCL's symmetric-memory / proxy teardown SIGSEGVs after the sweep completes -- but the
        # result JSON is already written by run_sweep, so hard-exit past the crashy dist/uccl
        # cleanup with the real rc. A clean teardown isn't worth a false 'failed' on a valid
        # result (the H100 smoke produced status=valid, correct=True before the SIGSEGV).
        try:
            dist.barrier()
        except Exception as exc:  # best-effort: the barrier only keeps ranks from exiting early
            print(f"WARN: dist.barrier failed before hard exit: {exc!r}", file=sys.stderr)
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(self._exit_code(rc))
def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
    """Return num_logical + redundant, with redundant rounded UP to a multiple of ep_size.

    Rounding makes the physical experts divide evenly across ranks (symmetric
    dispatch). A negative ``num_redundant`` is treated as 0.
    """
    r = ((max(0, num_redundant) + ep_size - 1) // ep_size) * ep_size
    return num_logical + r


def _contiguous_rank_load(logical_load, ep_size):
    """Per-rank received load WITHOUT EPLB.

    Logical experts are placed contiguously (experts_per_rank =
    num_logical // ep_size), so rank r carries its block's total.
    """
    n = len(logical_load)
    per = n // ep_size
    return [sum(logical_load[r * per:(r + 1) * per]) for r in range(ep_size)]


def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
    """Build the EPLB replication + placement plan.

    logical_load: list[float] of length num_logical (token-copies per logical
    expert). Returns a dict of pure-Python lists describing the plan, plus the
    per-rank load and max/mean imbalance before and after balancing.

    Raises AssertionError when the sizes are inconsistent (empty load,
    non-positive ep_size, or counts that are not multiples of ep_size).
    """
    num_logical = len(logical_load)
    assert ep_size > 0, "ep_size must be positive"
    assert num_logical > 0, "logical_load must not be empty"
    assert num_physical >= num_logical, "num_physical must be >= num_logical"
    assert num_physical % ep_size == 0, "num_physical must be a multiple of ep_size"
    assert num_logical % ep_size == 0, "num_logical must be a multiple of ep_size"
    spp = num_physical // ep_size  # physical slots per rank (fixed)

    # 1) Replica allocation -- start one slot per logical expert, then hand each redundant
    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
    #    max() returns the first maximal element, so ties go to the lowest expert id.
    replicas = [1] * num_logical
    for _ in range(num_physical - num_logical):
        hottest = max(range(num_logical), key=lambda e: logical_load[e] / replicas[e])
        replicas[hottest] += 1

    # 2) Slots = (per-replica load, logical expert), one per replica.
    slots = []
    for e in range(num_logical):
        lps = logical_load[e] / replicas[e]
        slots.extend((lps, e) for _ in range(replicas[e]))

    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
    slots.sort(reverse=True)
    rank_slots = [[] for _ in range(ep_size)]
    rank_load = [0.0] * ep_size
    for lps, e in slots:
        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
                key=lambda r: rank_load[r])
        rank_slots[r].append(e)
        rank_load[r] += lps

    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
    phys2log, rank_of_phys = [], []
    for r in range(ep_size):
        for e in rank_slots[r]:
            phys2log.append(e)
            rank_of_phys.append(r)
    log2phys = [[] for _ in range(num_logical)]
    for pid, e in enumerate(phys2log):
        log2phys[e].append(pid)

    before = _contiguous_rank_load(logical_load, ep_size)
    total = sum(logical_load) or 1.0  # avoid 0/0 when every expert has zero load
    mean = total / ep_size
    return {
        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
        "rank_load_after": rank_load, "rank_load_before": before,
        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
        "replicated_experts": sum(1 for r in replicas if r > 1),
    }


def remap_idx(idx_logical, plan):
    """Map logical expert ids to physical replica ids.

    idx_logical: torch [gt, topk] integer logical-expert ids (global trace).
    Returns idx_physical [gt, topk] on the same device as ``idx_logical``: each
    token's logical target becomes one of that expert's physical replicas,
    SPREAD by global token id (row) so a hot expert's tokens fan out across its
    replicas (= across ranks). Replicas of distinct logical experts are
    disjoint, so a token's top-k physical ids stay distinct (dispatch
    invariant preserved).
    """
    import torch
    replicas = plan["replicas"]
    max_rc = plan["max_replicas"]
    # Build every lookup tensor on the input's device so a CUDA trace does not
    # hit a CPU/GPU index mismatch.
    dev = idx_logical.device
    rc = torch.tensor(replicas, dtype=torch.int64, device=dev)
    # padded [num_logical, max_rc] table of physical ids (pad with replica 0; never indexed
    # past rc[e] because the replica index is taken mod rc[e]). Built as one nested list so
    # the tensor is created in a single call instead of element-by-element writes.
    table = [list(phys) + [phys[0]] * (max_rc - len(phys)) for phys in plan["log2phys"]]
    padded = torch.tensor(table, dtype=torch.int64, device=dev).reshape(len(table), max_rc)
    gt = idx_logical.shape[0]
    rows = torch.arange(gt, dtype=torch.int64, device=dev).unsqueeze(1)  # [gt,1] global token id
    e = idx_logical.to(torch.int64)                                     # [gt,topk]
    ridx = rows % rc[e]                                                  # [gt,topk] replica index
    return padded[e, ridx]                                               # [gt,topk] physical ids


# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    # Synthetic zipf load (popularity proportional to 1/(e+1)) -- the case EPLB targets.
    # No torch needed for the planner gates.
    import sys
    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
    plan = build_plan(load, nphys, EP)
    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
          f"(hottest expert 0 replicas={plan['replicas'][0]})")
    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
    print(f"per-rank load AFTER  (EPLB):       {[round(x,3) for x in plan['rank_load_after']]}")
    print(f"imbalance (max/mean) BEFORE={plan['imbalance_before']:.2f}x AFTER={plan['imbalance_after']:.2f}x")
    # Gates: equal slot cardinality, every logical expert placed, big imbalance cut.
    assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
    assert sum(plan["replicas"]) == nphys
    assert len(plan["phys2log"]) == nphys
    assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
    # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
    try:
        import torch
        g = torch.Generator().manual_seed(0)
        p = torch.tensor(load)
        p = (p / p.sum()).expand(4096, NUM_LOGICAL)
        idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
        idx_p = remap_idx(idx_l, plan)
        assert idx_p.shape == idx_l.shape
        # top-k physical ids distinct per token
        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
        spp = plan["slots_per_rank"]
        recv_before = [0] * EP
        recv_after = [0] * EP
        per_log = NUM_LOGICAL // EP
        for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
            for e in row_l:
                recv_before[e // per_log] += 1
            for pid in row_p:
                recv_after[pid // spp] += 1
        ib = max(recv_before) / (sum(recv_before) / EP)
        ia = max(recv_after) / (sum(recv_after) / EP)
        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x AFTER={ia:.2f}x")
        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
        print("remap self-test: OK")
    except ImportError:
        print("(torch absent — skipped remap self-test; planner gates passed)")
    print("EPLB self-test: PASS")
    sys.exit(0)
# Stable failure-mode vocabulary (goal Part 3). This list only names the modes;
# the order in which text signatures are tried is the order of _SIGNATURES below.
MODES = [
    "unsupported",             # capability rejected the combo (run_ep exit 5)
    "initialization-failure",  # process group / buffer / NVSHMEM bring-up failed
    "out-of-memory",
    "registration-failure",    # MR / symmetric-heap registration (e.g. MoRI errno 22)
    "correctness-failure",     # ran but reconstruction gate failed
    "timeout",                 # killed by the timeout wrapper (rc 124) — bounded hang
    "deadlock",                # collective watchdog abort (NCCL SIGABRT / rc -6 after a stall)
    "teardown-failure",        # post-finalize / shmem_finalize assertion
    "infrastructure",          # slurm / container / FS / node failure
    "unknown",
]

# (mode, signatures), tried top to bottom against the LOWER-CASED text; first hit wins.
# A signature is either a plain substring, or a tuple of substrings that must
# all appear in that order on a single line.
_SIGNATURES = [
    ("unsupported", ("unsupported", "rejects", "not supported", "no fallback")),
    ("out-of-memory", ("out of memory", "outofmemory", "cuda oom", "cudaerrormemoryallocation")),
    ("registration-failure", ("errno 22", "registration", "register", "ibv_reg", "mr ")),
    ("initialization-failure", ("nvshmem", "init_process_group", "ncclcomminit", "bootstrap", "buffer(")),
    # NOTE: a bare "collective" would match the project name in any
    # ".../CollectiveX/..." path and mislabel unrelated failures as deadlocks.
    ("deadlock", ("watchdog", "sigabrt", "signal 6", "collective operation timeout",
                  "timed out waiting", "nccl timeout")),
    ("teardown-failure", ("shmem_finalize", "destroy_process_group", "teardown", "finalize")),
    ("correctness-failure", ("correct=false", "reconstruction", "max_rel", ("assertion", "tol"))),
    ("infrastructure", ("srun: error", "slurm", "node fail", "container", "no such file")),
]


def _matches(sig, text: str) -> bool:
    """Return True when signature ``sig`` occurs in the lower-cased ``text``."""
    if isinstance(sig, str):
        return sig in text
    # Ordered multi-part signature: every part must follow the previous one on the same line.
    for line in text.splitlines():
        pos = 0
        for part in sig:
            pos = line.find(part, pos)
            if pos < 0:
                break
            pos += len(part)
        else:
            return True
    return False


def classify(text: str = "", rc: int | None = None) -> str:
    """Best-effort failure mode from captured stderr/stdout text and/or a process return code.

    Well-known return codes take priority over the text; otherwise the first
    matching entry of ``_SIGNATURES`` wins. Returns ``"unknown"`` when nothing
    matches, including for rc 0 or None with no recognisable text.
    """
    if rc is not None:
        if rc == 5:
            return "unsupported"
        if rc == 124:
            return "timeout"   # GNU timeout SIGTERM
        if rc in (137, -9):
            return "timeout"   # SIGKILL (timeout -k)
        if rc in (134, -6):
            return "deadlock"  # SIGABRT (NCCL watchdog / assertion)
    t = (text or "").lower()
    for mode, sigs in _SIGNATURES:
        if any(_matches(s, t) for s in sigs):
            return mode
    return "unknown"


def record(text="", rc=None, case=None) -> dict:
    """A classified failure record preserving the exact case + signal for reliability views.

    ``evidence`` keeps only the last 400 characters of ``text``.
    """
    return {"failure_mode": classify(text, rc), "return_code": rc,
            "case": case or {}, "evidence": (text or "")[-400:]}


if __name__ == "__main__":
    import sys
    cases = [
        ("RuntimeError: Unsupported number of EP ranks", None, "unsupported"),
        ("", 124, "timeout"),
        ("Signal 6 (SIGABRT) received ... NCCL watchdog", None, "deadlock"),
        ("", -6, "deadlock"),
        ("cuda out of memory", None, "out-of-memory"),
        ("ibv_reg_mr failed errno 22", None, "registration-failure"),
        ("shmem_finalize teardown assertion", None, "teardown-failure"),
        ("srun: error: node failed", None, "infrastructure"),
        ("AssertionError: max diff exceeds tol", None, "correctness-failure"),
        ("correct=False in experimental/CollectiveX/tests/run_ep.py", None, "correctness-failure"),
    ]
    ok = True
    for text, rc, want in cases:
        got = classify(text, rc)
        flag = "OK" if got == want else "FAIL"
        if got != want:
            ok = False
        print(f"  [{flag}] rc={rc} text={text[:40]!r} -> {got} (want {want})")
    print("failure_taxonomy self-test:", "PASS" if ok else "FAIL")
    sys.exit(0 if ok else 1)
128 32 float sum -1 9.64 0.01 0.02 0 9.63 0.01 0.02 0 + 256 64 float sum -1 9.66 0.03 0.05 0 9.64 0.03 0.05 0 + 512 128 float sum -1 9.69 0.05 0.09 0 9.67 0.05 0.09 0 + 1024 256 float sum -1 9.74 0.11 0.18 0 9.72 0.11 0.18 0 + 2048 512 float sum -1 9.82 0.21 0.37 0 9.80 0.21 0.37 0 + 4096 1024 float sum -1 9.97 0.41 0.72 0 9.95 0.41 0.72 0 + 8192 2048 float sum -1 10.22 0.80 1.40 0 10.20 0.80 1.40 0 + 16384 4096 float sum -1 10.81 1.52 2.65 0 10.79 1.52 2.65 0 + 32768 8192 float sum -1 11.93 2.75 4.81 0 11.90 2.75 4.81 0 + 65536 16384 float sum -1 13.62 4.81 8.42 0 13.59 4.82 8.43 0 + 131072 32768 float sum -1 16.94 7.74 13.54 0 16.90 7.76 13.57 0 + 262144 65536 float sum -1 23.14 11.33 19.83 0 23.10 11.35 19.86 0 + 524288 131072 float sum -1 35.62 14.72 25.76 0 35.55 14.75 25.81 0 + 1048576 262144 float sum -1 60.40 17.36 30.38 0 60.30 17.39 30.43 0 + 2097152 524288 float sum -1 76.50 27.41 47.97 0 76.40 27.45 48.04 0 + 4194304 1048576 float sum -1 110.20 38.06 66.61 0 110.05 38.11 66.70 0 + 8388608 2097152 float sum -1 165.80 50.60 88.55 0 165.60 50.66 88.65 0 + 16777216 4194304 float sum -1 250.10 67.08 117.40 0 249.80 67.16 117.54 0 + 33554432 8388608 float sum -1 360.50 93.08 162.90 0 360.10 93.18 163.07 0 + 67108864 16777216 float sum -1 520.80 128.85 225.50 0 520.20 129.00 225.75 0 + 134217728 33554432 float sum -1 720.30 186.34 326.10 0 719.50 186.55 326.46 0 + 268435456 67108864 float sum -1 1080.50 248.43 434.80 0 1079.20 248.73 435.27 0 + 536870912 134217728 float sum -1 1990.20 269.76 472.10 0 1988.50 269.99 472.49 0 + 1073741824 268435456 float sum -1 3940.60 272.48 476.84 0 3938.10 272.65 477.14 0 + 2147483648 536870912 float sum -1 7850.10 273.56 478.73 0 7846.20 273.69 478.96 0 + 4294967296 1073741824 float sum -1 15680.50 273.91 479.34 0 15673.80 274.03 479.55 0 + 8589934592 2147483648 float sum -1 31250.80 274.87 481.02 0 31238.10 274.98 481.22 0 +# +# Out of bounds values : 0 OK +# Avg bus bandwidth : 168.42 +# diff --git 
a/experimental/CollectiveX/tests/kv_cache_transfer.py b/experimental/CollectiveX/tests/kv_cache_transfer.py new file mode 100644 index 000000000..655ece58d --- /dev/null +++ b/experimental/CollectiveX/tests/kv_cache_transfer.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +"""CollectiveX — KV-cache transfer benchmark (family=kv-cache). + +Times raw CUDA memcpy of KV-cache-shaped buffers across the transfer paths a +serving stack actually uses, with CUDA events (GPU-accurate). Adapted from +experimental/kvcache_transfer_DtoH_HtoD/benchmark.py but WITHOUT the vLLM +`swap_blocks` dependency — CollectiveX containers may not ship vLLM, and the goal +asks for the raw CUDA/HIP memcpy + CPU pinned-memory path as the reference. + +Dimensions (goal P2 "KV-cache transfer suite"): + direction : dtoh | htod | dtod-local | dtod-remote (remote needs >=2 GPUs) + layout : contiguous (one copy) | paged (N scattered block copies — the real + paged-KV pattern; captures per-block launch/scatter overhead) + size class: decode-sized (small per-token blocks) .. prefill/prefix-cache-sized (large) + backend : memcpy (raw cudaMemcpy), pinned (CPU pinned host) — WIRED. + nixl / mooncake / mori-io / nccl — declared, NOT wired (stubs; never faked). + +Stdlib + torch; torch is imported lazily so `--help` / `--parse-only`-style use works +without a GPU. One provenance-tagged JSON per run, matching run_nccl.py's structure. + + python tests/kv_cache_transfer.py --direction all --runner h200-dgxc \\ + --topology-class h200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/h200_kvcache.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import sys + +SCHEMA_VERSION = 1 +MEASUREMENT_CONTRACT = "kv-cache-memcpy-v1" +FAMILY = "kv-cache" + +# Backends: which transfer mechanism moves the bytes. 
def _sizes(min_bytes: int, max_bytes: int, factor: int = 4) -> list:
    """Return the geometric byte-size ladder swept by the benchmark.

    The ladder starts at ``min_bytes`` and multiplies by ``factor`` until the
    next value would exceed ``max_bytes`` (``max_bytes`` itself is included
    when it is hit exactly). An empty list is returned when
    ``min_bytes > max_bytes``.

    Raises:
        ValueError: if ``min_bytes < 1`` or ``factor < 2``. With either, the
            running size can never grow past ``max_bytes``, so the loop would
            spin forever while appending to the list.
    """
    if min_bytes < 1:
        raise ValueError(f"min_bytes must be >= 1, got {min_bytes}")
    if factor < 2:
        raise ValueError(f"factor must be >= 2, got {factor}")
    out, s = [], min_bytes
    while s <= max_bytes:
        out.append(s)
        s *= factor
    return out
def run_direction(torch, direction, backend, layout, sizes, block_bytes, warmup, iters,
                  ngpu: int):
    """Benchmark one (direction, backend, layout) combination over every size.

    Returns a ``(rows, note)`` tuple (this is not a generator):

    * ``rows`` - one dict per benchmarked size. A size whose buffers cannot be
      allocated yields a row carrying an ``"error"`` key and ends the sweep.
    * ``note`` - ``None`` on a normal run, otherwise a short reason why the
      combination was not run at all (``rows`` is then empty).

    The combination is rejected up front, before any allocation, when the
    direction or layout is unknown, when ``dtod-remote`` is requested with
    fewer than two GPUs, or when a paged run is given a non-positive
    ``block_bytes``. ``backend == "pinned"`` with no host endpoint returns
    ``([], None)``: pinning is a host-memory property, so there is nothing
    to measure for a pure device-to-device copy.
    """
    endpoints = {
        "dtoh": ("cuda:0", "cpu"),
        "htod": ("cpu", "cuda:0"),
        "dtod-local": ("cuda:0", "cuda:0"),
        "dtod-remote": ("cuda:0", "cuda:1"),
    }
    if direction not in endpoints:
        return [], f"unknown direction {direction}"
    if direction == "dtod-remote" and ngpu < 2:
        return [], "n/a (needs >=2 GPUs)"
    # An unrecognised layout used to fall through to the contiguous copy while
    # being recorded under the caller's label; refuse it instead.
    if layout not in ("contiguous", "paged"):
        return [], f"unknown layout {layout}"
    is_paged = layout == "paged"
    if is_paged and block_bytes <= 0:
        return [], f"invalid block_bytes {block_bytes} (must be > 0)"

    src_dev, dst_dev = endpoints[direction]
    pinned = backend == "pinned"
    if pinned and "cpu" not in (src_dev, dst_dev):
        return [], None  # pinned is a host-memory property; skip for pure DtoD

    rows = []
    for nbytes in sizes:
        try:
            src = _alloc(torch, src_dev, nbytes, pinned and src_dev == "cpu")
            dst = _alloc(torch, dst_dev, nbytes, pinned and dst_dev == "cpu")
        except RuntimeError as exc:  # OOM at the largest sizes - stop, don't crash
            rows.append({"transfer_bytes": nbytes, "error": f"alloc: {exc!r}", "correct": None})
            break
        if is_paged:
            nblk = max(1, nbytes // block_bytes)
            bb = nbytes // nblk
            # Deterministic scatter: source block i lands in destination slot
            # (i * 2654435761) % nblk. This is a permutation of the slots
            # whenever nblk shares no factor with the multiplier.
            paged = [(i * bb, ((i * 2654435761) % nblk) * bb) for i in range(nblk)]
        else:
            # Contiguous: a single copy of the whole buffer.
            nblk, bb, paged = 1, nbytes, None
        ms, gb_s = _bench_one(torch, src, dst, nbytes, bb, layout, paged, warmup, iters)
        rows.append({
            "transfer_bytes": nbytes, "size_class": size_class(nbytes),
            "block_bytes": bb,
            "num_blocks": nblk,
            "time_ms": ms, "bandwidth_gb_s": gb_s,
            "correct": True,  # raw memcpy is exact (uint8); kept for schema parity
        })
        # Drop the buffers before the next (larger) size is allocated.
        del src, dst
        torch.cuda.empty_cache()
    return rows, None
not torch.cuda.is_available(): + print("ERROR: CUDA not available", file=sys.stderr) + return 3 + + ngpu = torch.cuda.device_count() + directions = (["dtoh", "htod", "dtod-local", "dtod-remote"] + if args.direction == "all" else [args.direction]) + backends = [b.strip() for b in args.backends.split(",") if b.strip()] + layouts = [l.strip() for l in args.layouts.split(",") if l.strip()] + sizes = _sizes(args.min_bytes, args.max_bytes) + + groups = [] + notes = [] + peak = 0.0 + for backend in backends: + if backend not in WIRED_BACKENDS: + notes.append(f"backend '{backend}' not wired (declared only)") + continue + for direction in directions: + for layout in layouts: + rows, na = run_direction(torch, direction, backend, layout, sizes, + args.block_bytes, args.warmup, args.iters, ngpu) + if na: + notes.append(f"{direction}/{backend}/{layout}: {na}") + continue + if not rows: + continue + peak = max(peak, max((r.get("bandwidth_gb_s") or 0.0) for r in rows)) + meta = {"direction": direction, "layout": layout, "backend": backend, + "dtype": "uint8", "nodes": args.nodes, + "topology_class": args.topology_class, + "measurement_contract": MEASUREMENT_CONTRACT} + groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows}) + + env = None + if args.env_json and os.path.exists(args.env_json): + with open(args.env_json) as fh: + env = json.load(fh) + + doc = { + "schema_version": SCHEMA_VERSION, + "family": FAMILY, + "generated_by": "kv_cache_transfer.py", + "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), + "runner": args.runner, + "transport": args.transport, + "measurement_contract": MEASUREMENT_CONTRACT, + "nodes": args.nodes, + "num_gpus_visible": ngpu, + "wired_backends": list(WIRED_BACKENDS), + "declared_unwired_backends": list(STUB_BACKENDS), + "status": "valid" if (groups and peak > 0.0) else "invalid", + "num_groups": len(groups), + "groups": groups, + "notes": notes, + "environment": env, + } + 
os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + json.dump(doc, fh, indent=2) + fh.write("\n") + print(f"kv-cache: {len(groups)} (dir,backend,layout) groups -> {args.out} " + f"(status={doc['status']}, peak_bw={peak:.1f} GB/s, gpus={ngpu})") + if notes: + print("notes: " + "; ".join(notes), file=sys.stderr) + return 0 if doc["status"] == "valid" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py new file mode 100644 index 000000000..2e05c0537 --- /dev/null +++ b/experimental/CollectiveX/tests/make_workloads.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""Generate canonical serialized workloads (goal Part 1). Runs build_workload (needs torch) for +each (routing, global_tokens) in a ladder and writes .npz + .manifest.json into a +dir that runs then consume via `run_ep.py --workload-dir`. One trace per global-token count +because the generator is not prefix-consistent across sizes. + + python3 tests/make_workloads.py --out-dir /data/sa-shared/cx_workloads \\ + --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\ + --tokens-ladder "1 2 4 8 16 32 64 128 256 512" + +Or by NAMED model manifest (goal P1 model-shape coverage) — dims resolved from configs/workloads.yaml +(synthetic + model_derived; experts <- experts|routed_experts). 
def resolve_manifest(name):
    """Look a workload name up in configs/workloads.yaml and return (hidden, topk, experts).

    Searches the ``synthetic`` section, then ``model_derived``; the expert
    count is the entry's ``experts`` value, falling back to ``routed_experts``.

    Raises:
        SystemExit: if the YAML file cannot be read or is not a mapping, if
            the entry lacks hidden/topk/experts, or if ``name`` is in neither
            section (the message then lists the known names).

    Pure PyYAML + stdlib; PyYAML is imported lazily so callers that never
    pass a workload name do not need it.
    """
    import yaml
    path = os.path.join(_REPO, "configs", "workloads.yaml")
    try:
        # Context manager closes the handle; the original left it open.
        with open(path, encoding="utf-8") as fh:
            cfg = yaml.safe_load(fh)
    except OSError as exc:
        raise SystemExit(f"cannot read workload manifest {path}: {exc}") from exc
    if cfg is None:
        cfg = {}  # an empty YAML document parses to None
    if not isinstance(cfg, dict):
        raise SystemExit(f"workload manifest {path} is not a mapping")

    known = []
    for section in ("synthetic", "model_derived"):
        sec = cfg.get(section) or {}
        known += list(sec)
        m = sec.get(name)
        if m is None:
            continue
        experts = m.get("experts", m.get("routed_experts"))
        if m.get("hidden") is None or m.get("topk") is None or experts is None:
            raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}")
        return int(m["hidden"]), int(m["topk"]), int(experts)
    raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}")
+ base_h, base_t, base_e = (7168, 8, 256) + if a.workload: + base_h, base_t, base_e = resolve_manifest(a.workload) + hidden = a.hidden if a.hidden is not None else base_h + topk = a.topk if a.topk is not None else base_t + experts = a.experts if a.experts is not None else base_e + + if not a.id_only and not a.out_dir: + ap.error("--out-dir is required unless --id-only") + + ladder = sorted({int(t) for t in a.tokens_ladder.replace(",", " ").split() if int(t) > 0}) + epr = experts // a.ep + label = f"workload={a.workload} " if a.workload else "" + + if a.id_only: + # Identity-only path: the workload_id is a hash of (generator|routing|hidden|topk|experts|gt|seed), + # so it is fully determined WITHOUT generating the trace. Proves cross-SKU identity in CI/login. + made = [] + for T in ladder: + gt = T * a.ep + wid = wl.compute_workload_id(a.routing, hidden, topk, experts, gt, a.seed) + made.append((T, gt, wid)) + print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}") + print(f"{label}id-only: {len(made)} workload_id(s) " + f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})") + return 0 + + os.makedirs(a.out_dir, exist_ok=True) + made = [] + for T in ladder: + gt = T * a.ep + idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr) + wid = wl.save_workload(a.out_dir, idx, w, man) + made.append((T, gt, wid)) + print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid} " + f"(trace sha {man['checksums']['trace'][:12]})") + print(f"{label}wrote {len(made)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/mooncake_transfer.py b/experimental/CollectiveX/tests/mooncake_transfer.py new file mode 100644 index 000000000..9cc8d2931 --- /dev/null +++ b/experimental/CollectiveX/tests/mooncake_transfer.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +"""CollectiveX — Mooncake 
def _sizes(lo: int, hi: int, factor: int = 4) -> list:
    """Return the geometric byte-size ladder lo, lo*factor, ... not exceeding hi.

    ``hi`` is included when the ladder lands on it exactly; the result is
    empty when ``lo > hi``.

    Raises:
        ValueError: if ``lo < 1`` or ``factor < 2``. With either, the running
            size can never grow past ``hi``, so the loop would never end.
    """
    if lo < 1:
        raise ValueError(f"lo must be >= 1, got {lo}")
    if factor < 2:
        raise ValueError(f"factor must be >= 2, got {factor}")
    out, s = [], lo
    while s <= hi:
        out.append(s)
        s *= factor
    return out
def _rdma_devices(sysfs_dir: str = "/sys/class/infiniband") -> list:
    """Return the RDMA device names to try, in order, without duplicates.

    Order: entries of ``sysfs_dir`` whose name contains "bond" (sorted), then
    the remaining entries (sorted), then the hard-coded fallbacks
    ``mlx5_bond_0``, ``mlx5_0``, ``rocep0s0``. A fallback that was already
    detected is not repeated, so no device is tried twice. When ``sysfs_dir``
    cannot be listed only the fallbacks are returned.
    """
    try:
        detected = sorted(os.listdir(sysfs_dir))
    except OSError:
        # No RDMA sysfs tree (or unreadable): rely on the fallbacks alone.
        detected = []
    # prefer a bond if present (the Mooncake test used mlx5_bond_0), then the raw devices.
    bonds = [d for d in detected if "bond" in d]
    raw = [d for d in detected if "bond" not in d]
    fallbacks = ["mlx5_bond_0", "mlx5_0", "rocep0s0"]
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(bonds + raw + fallbacks))
"environment": env} + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + json.dump(doc, fh, indent=2) + fh.write("\n") + print(f"mooncake: {len(groups)} groups -> {args.out} (status={status}, peak_bw={peak:.1f} GB/s)") + if notes: + print("notes: " + "; ".join(notes), file=sys.stderr) + + try: + import torch + except Exception as exc: + _emit([], "invalid", 0.0, [f"torch unavailable: {exc!r}"]) + return 3 + if not torch.cuda.is_available(): + _emit([], "invalid", 0.0, ["CUDA/ROCm not available"]) + return 3 + try: + from mooncake.engine import TransferEngine + except Exception as exc: + _emit([], "invalid", 0.0, + [f"mooncake import failed (run_in_container pip-installs mooncake-transfer-engine): {exc!r}"]) + return 1 + print("MOONCAKE_API methods=" + json.dumps([m for m in dir(TransferEngine) if not m.startswith("_")][:40]), + file=sys.stderr, flush=True) + + is_rocm = bool(getattr(torch.version, "hip", None)) + xfer = "transfer_write_on_hip" if is_rocm else "transfer_write_on_cuda" + eng = TransferEngine() + host = _get_ip() + init_note = None + for dev in _rdma_devices(): + try: + ret = eng.initialize(host, "P2PHANDSHAKE", "rdma", dev) + if ret == 0: + init_note = f"initialized on rdma device {dev}" + break + except Exception as e: + init_note = f"init raised on {dev}: {e!r}" + if init_note is None or "initialized" not in init_note: + _emit([], "invalid", 0.0, [f"mooncake init failed on all RDMA devices: {init_note}"]) + return 1 + print(f"MOONCAKE_INIT {init_note}", file=sys.stderr, flush=True) + if not hasattr(eng, xfer): + _emit([], "invalid", 0.0, [f"mooncake engine has no {xfer} (methods dumped above)"]) + return 1 + rpc = eng.get_rpc_port() + target = f"[{host}]:{rpc}" if ":" in host else f"{host}:{rpc}" + transfer = getattr(eng, xfer) + + dev0 = torch.device("cuda:0") + stream = torch.cuda.Stream(dev0) + sizes = _sizes(args.min_bytes, args.max_bytes) + rows, peak = [], 0.0 + for nbytes in sizes: + 
try: + src = torch.ones(nbytes, dtype=torch.uint8, device=dev0) + dst = torch.zeros(nbytes, dtype=torch.uint8, device=dev0) + if eng.register_memory(src.data_ptr(), src.nbytes) != 0 or \ + eng.register_memory(dst.data_ptr(), dst.nbytes) != 0: + rows.append({"transfer_bytes": nbytes, "error": "register_memory != 0", "correct": None}) + break + + def _once(): + transfer(target, src.data_ptr(), dst.data_ptr(), nbytes, stream.cuda_stream) + for _ in range(args.warmup): + _once() + torch.cuda.synchronize() + t0 = time.perf_counter() + for _ in range(args.iters): + _once() + stream.synchronize() + torch.cuda.synchronize() + dt = time.perf_counter() - t0 + eng.unregister_memory(src.data_ptr()); eng.unregister_memory(dst.data_ptr()) + except Exception as exc: + rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None}) + break + ms = (dt / args.iters) * 1e3 + gb_s = (nbytes / (dt / args.iters)) / 1e9 if dt > 0 else 0.0 + rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes), + "block_bytes": nbytes, "num_blocks": 1, + "time_ms": round(ms, 5), "bandwidth_gb_s": round(gb_s, 2), "correct": True}) + peak = max(peak, gb_s) + del src, dst + torch.cuda.empty_cache() + + groups = [] + if any(r.get("bandwidth_gb_s") for r in rows): + meta = {"direction": "dtod-local", "layout": "contiguous", "backend": BACKEND, + "dtype": "uint8", "nodes": args.nodes, + "topology_class": args.topology_class, + "measurement_contract": MEASUREMENT_CONTRACT} + groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows}) + status = "valid" if (groups and peak > 0.0) else "invalid" + _emit(groups, status, peak, [init_note, f"loopback target={target}"]) + return 0 if status == "valid" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/mori_io_transfer.py b/experimental/CollectiveX/tests/mori_io_transfer.py new file mode 100644 index 000000000..572251b94 --- /dev/null +++ 
b/experimental/CollectiveX/tests/mori_io_transfer.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +"""CollectiveX — MoRI-IO transfer benchmark (family=kv-cache, backend=mori-io). + +MoRI-IO (ROCm/mori `mori.io`) is AMD's RDMA point-to-point transfer engine — the AMD analog of +NIXL, used for disaggregated-serving KV movement between GPUs/nodes. This benches its read path the +way a prefill->decode KV handoff uses it: two IOEngines in one process (initiator + target, RDMA +backend, mutual register_remote_engine), the initiator RDMA-reads the target's GPU buffer, swept +over KV-block-sized payloads. Wall-clock latency + bandwidth (RDMA completion via InProgress()). + +This is the WIRED `mori-io` backend the goal's "KV-cache transfer backends" axis declared a stub. +Runs only on the AMD MoRI image (CX_BENCH=mori-io on mi355x); elsewhere the import fails and the run +records that — never faked. The mori.io API surface is DUMPED to stderr at startup so a GHA run's +log is self-documenting (SSH into the MI355X container stalls on the shared cluster). 
def _sizes(min_bytes: int, max_bytes: int, factor: int = 4) -> list:
    """Return the geometric payload-size ladder swept by the benchmark.

    Starts at ``min_bytes`` and multiplies by ``factor`` while the value does
    not exceed ``max_bytes``; empty when ``min_bytes > max_bytes``.

    Raises:
        ValueError: if ``min_bytes < 1`` or ``factor < 2``. With either, the
            running size can never grow past ``max_bytes``, so the loop would
            never terminate.
    """
    if min_bytes < 1:
        raise ValueError(f"min_bytes must be >= 1, got {min_bytes}")
    if factor < 2:
        raise ValueError(f"factor must be >= 2, got {factor}")
    out, s = [], min_bytes
    while s <= max_bytes:
        out.append(s)
        s *= factor
    return out
def _bench_one(initiator, target, src_t, dst_t, nbytes, warmup, iters):
    """Time `iters` MoRI-IO reads of `nbytes` between two registered tensors.

    `src_t` is registered with `initiator` and `dst_t` with `target`; each
    transfer is `initiator.read(...)` between those two registrations,
    busy-polled via `InProgress()` until it completes. `warmup` untimed
    transfers run first. Timing is wall-clock (`time.perf_counter`).

    Both registrations are released on every exit path: a failure while
    registering `dst_t` still deregisters `src_t`, and a failing initiator
    deregistration does not skip the target's.

    Returns:
        (latency_ms, gb_s): mean per-transfer latency rounded to 5 places and
        bandwidth in GB/s rounded to 2 places.

    Raises:
        ValueError: if `iters < 1` (checked before anything is registered).
        RuntimeError: if a completed read reports a status message containing
            a failure word and no success word.
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    im = initiator.register_torch_tensor(src_t)
    target_registered = False
    try:
        tm = target.register_torch_tensor(dst_t)
        target_registered = True

        def _once():
            uid = initiator.allocate_transfer_uid()
            st = initiator.read(im, 0, tm, 0, nbytes, uid)
            while st.InProgress():
                pass
            msg = st.Message() if hasattr(st, "Message") else ""
            if not msg:
                return
            text = msg.lower()
            # Message() is informational on success; only treat an explicit failure word as fatal.
            if any(w in text for w in ("succ", "ok", "done")):
                return
            if any(w in text for w in ("fail", "error", "abort")):
                raise RuntimeError(f"mori-io read status: {msg}")

        for _ in range(warmup):
            _once()
        t0 = time.perf_counter()
        for _ in range(iters):
            _once()
        dt = time.perf_counter() - t0
    finally:
        # Same release order as before (initiator, then target), but each is
        # attempted independently of the other.
        try:
            initiator.deregister_memory(im)
        finally:
            if target_registered:
                target.deregister_memory(tm)
    ms = (dt / iters) * 1e3
    gb_s = (nbytes / (dt / iters)) / 1e9 if dt > 0 else 0.0
    return round(ms, 5), round(gb_s, 2)
{len(groups)} groups -> {args.out} (status={status}, peak_bw={peak:.1f} GB/s)") + if notes: + print("notes: " + "; ".join(notes), file=sys.stderr) + + try: + import torch + except Exception as exc: + _emit([], "invalid", 0.0, [f"torch unavailable: {exc!r}"]) + return 3 + if not torch.cuda.is_available() or torch.cuda.device_count() < 2: + _emit([], "invalid", 0.0, + [f"mori-io needs >=2 GPUs (RDMA p2p); have {torch.cuda.device_count() if torch.cuda.is_available() else 0}"]) + return 1 + try: + import mori.io as moriio + except Exception as exc: + _emit([], "invalid", 0.0, [f"mori.io import failed (needs the AMD MoRI image): {exc!r}"]) + return 1 + _introspect(moriio) + try: + if hasattr(moriio, "set_log_level"): + moriio.set_log_level("warning") + initiator, target = _make_engines(moriio) + except Exception as exc: + _emit([], "invalid", 0.0, [f"mori.io engine/backend init failed: {exc!r}"]) + return 1 + + sizes = _sizes(args.min_bytes, args.max_bytes) + notes = ["mori.io 2-engine RDMA loopback (GPU0<->GPU1)"] + rows, peak = [], 0.0 + for nbytes in sizes: + try: + src = torch.empty(nbytes, dtype=torch.uint8, device="cuda:0") + dst = torch.empty(nbytes, dtype=torch.uint8, device="cuda:1") + ms, gb_s = _bench_one(initiator, target, src, dst, nbytes, args.warmup, args.iters) + except Exception as exc: + rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None}) + break + rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes), + "block_bytes": nbytes, "num_blocks": 1, + "time_ms": ms, "bandwidth_gb_s": gb_s, "correct": True}) + peak = max(peak, gb_s) + del src, dst + torch.cuda.empty_cache() + + groups = [] + if any(r.get("bandwidth_gb_s") for r in rows): + meta = {"direction": "dtod-remote", "layout": "contiguous", "backend": BACKEND, + "dtype": "uint8", "nodes": args.nodes, + "topology_class": args.topology_class, + "measurement_contract": MEASUREMENT_CONTRACT} + groups.append({**meta, "comparison_key": comparison_key(meta), 
"rows": rows}) + status = "valid" if (groups and peak > 0.0) else "invalid" + _emit(groups, status, peak, notes) + return 0 if status == "valid" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/nccl_kv_transfer.py b/experimental/CollectiveX/tests/nccl_kv_transfer.py new file mode 100644 index 000000000..0e77b88e1 --- /dev/null +++ b/experimental/CollectiveX/tests/nccl_kv_transfer.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""CollectiveX — NCCL/RCCL KV-cache transfer benchmark (family=kv-cache, backend=nccl|rccl). + +The point-to-point KV handoff a disaggregated stack does over the collective library directly: +torchrun with 2 ranks, rank 0 `dist.send`s KV-block-sized buffers to rank 1 (`dist.recv`), timed +with CUDA events. NCCL on NVIDIA, RCCL on AMD/ROCm (same torch.distributed API) — so this is the +WIRED `nccl`/`rccl` KV-cache backend the goal's "KV-cache transfer backends" axis declared a stub +(the NCCL collective suite covers the all_reduce/all_gather primitives; this is the p2p KV path). + +Emits one kv-cache-family JSON (plots in the KV-cache tab next to memcpy/nixl/mori-io). Single +(dir, backend, layout) group per run. Backend label = rccl on ROCm, nccl on CUDA. 
"""CollectiveX NCCL/RCCL KV-cache transfer benchmark (run under torchrun).

    torchrun --nproc_per_node=2 tests/nccl_kv_transfer.py --runner h200-dgxc \\
        --topology-class h200-nvlink-island --transport nvlink \\
        --env-json results/env.json --out results/h200_ncclkv.json
"""
from __future__ import annotations

import argparse
import datetime as _dt
import hashlib
import json
import os
import sys

SCHEMA_VERSION = 1
MEASUREMENT_CONTRACT = "nccl-kv-sendrecv-v1"
FAMILY = "kv-cache"

DEFAULT_MIN_BYTES = 64 * 1024
DEFAULT_MAX_BYTES = 256 * 1024 * 1024
# Payloads up to this size are labelled "decode", larger ones "prefill".
DECODE_MAX_BYTES = 512 * 1024


def size_class(nbytes: int) -> str:
    """Return "decode" for payloads up to DECODE_MAX_BYTES, else "prefill"."""
    return "decode" if nbytes <= DECODE_MAX_BYTES else "prefill"


def _sizes(min_bytes: int, max_bytes: int, factor: int = 4):
    """Return the geometric ladder min_bytes, min_bytes*factor, ... <= max_bytes.

    Raises:
        ValueError: if the ladder could never pass max_bytes (non-positive
            start or factor <= 1); the loop previously spun forever there.
    """
    if min_bytes <= max_bytes and (min_bytes <= 0 or factor <= 1):
        raise ValueError(
            f"size ladder never terminates: min_bytes={min_bytes}, factor={factor} "
            "(need min_bytes >= 1 and factor >= 2)")
    out, s = [], min_bytes
    while s <= max_bytes:
        out.append(s)
        s *= factor
    return out


def comparison_key(meta: dict) -> str:
    """Return a 16-hex-char digest of the fields that identify one curve."""
    parts = [meta["direction"], meta["layout"], meta["backend"], meta["dtype"],
             str(meta["nodes"]), meta["topology_class"], meta["measurement_contract"]]
    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]


def _bench_one(torch, dist, rank, send_buf, recv_buf, nbytes, warmup, iters):
    """Time `iters` rank0 -> rank1 transfers with a CUDA event pair.

    Rank 0 posts ``dist.send`` and rank 1 the matching ``dist.recv``. Any other
    rank posts nothing: no peer ever sends to it, so a recv there would block
    forever. All ranks still join the barrier.

    Returns:
        ``(latency_ms, gb_s)`` on every rank, rounded to 5 and 2 places, taken
        from that rank's own events. gb_s is 0.0 when the time is not positive.
    """
    def _once():
        if rank == 0:
            dist.send(send_buf, dst=1)
        elif rank == 1:
            dist.recv(recv_buf, src=0)

    for _ in range(warmup):
        _once()
    torch.cuda.synchronize()
    dist.barrier()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        _once()
    end.record()
    torch.cuda.synchronize()
    ms = start.elapsed_time(end) / iters
    gb_s = (nbytes / (ms / 1e3)) / 1e9 if ms > 0 else 0.0
    return round(ms, 5), round(gb_s, 2)


def main() -> int:
    """CLI entry point. Returns 0 for a valid run, 1 for invalid, 3 without torch/GPU."""
    ap = argparse.ArgumentParser(description="CollectiveX NCCL/RCCL KV-cache transfer benchmark")
    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
    ap.add_argument("--warmup", type=int, default=5)
    ap.add_argument("--iters", type=int, default=30)
    ap.add_argument("--runner", required=True)
    ap.add_argument("--nodes", type=int, default=1)
    ap.add_argument("--topology-class", required=True)
    ap.add_argument("--transport", default="nvlink")
    ap.add_argument("--env-json")
    ap.add_argument("--timestamp")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # Reject sweeps that cannot run before any collective is posted, so every
    # rank exits the same way instead of one crashing while its peer waits.
    if args.iters < 1:
        ap.error("--iters must be >= 1")
    try:
        sizes = _sizes(args.min_bytes, args.max_bytes)
    except ValueError as exc:
        ap.error(str(exc))

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3
    if not torch.cuda.is_available():
        print("ERROR: CUDA/ROCm not available", file=sys.stderr)
        return 3

    rank = int(os.environ.get("RANK", "0"))
    world = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    dev = torch.device("cuda", local_rank)
    # backend label tracks the actual transport library: RCCL on ROCm, NCCL on CUDA.
    is_rocm = bool(getattr(torch.version, "hip", None))
    backend_label = "rccl" if is_rocm else "nccl"

    if world < 2:
        if rank == 0:
            _emit(args, [], "invalid", 0.0,
                  [f"needs >=2 ranks (torchrun --nproc_per_node>=2); world={world}"],
                  backend_label)
        return 1
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl", init_method="env://", world_size=world, rank=rank)

    rows = []
    peak = 0.0
    for nbytes in sizes:
        try:
            # Only the side that moves the payload allocates it; the other gets a 1-byte stub.
            send_buf = (torch.empty(nbytes, dtype=torch.uint8, device=dev) if rank == 0
                        else torch.empty(1, dtype=torch.uint8, device=dev))
            recv_buf = (torch.empty(nbytes, dtype=torch.uint8, device=dev) if rank == 1
                        else torch.empty(1, dtype=torch.uint8, device=dev))
            ms, gb_s = _bench_one(torch, dist, rank, send_buf, recv_buf, nbytes,
                                  args.warmup, args.iters)
        except RuntimeError as exc:
            # NOTE(review): a failure on only one rank presumably leaves the peer
            # blocked in its send/recv - confirm whether a cross-rank abort is needed.
            if rank == 0:
                rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None})
            break
        if rank == 0:
            rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes),
                         "block_bytes": nbytes, "num_blocks": 1,
                         "time_ms": ms, "bandwidth_gb_s": gb_s, "correct": True})
            peak = max(peak, gb_s)
        del send_buf, recv_buf
        torch.cuda.empty_cache()

    dist.barrier()
    if rank != 0:
        dist.destroy_process_group()
        return 0

    groups = []
    if any(r.get("bandwidth_gb_s") for r in rows):
        meta = {"direction": "dtod-remote", "layout": "contiguous", "backend": backend_label,
                "dtype": "uint8", "nodes": args.nodes,
                "topology_class": args.topology_class,
                "measurement_contract": MEASUREMENT_CONTRACT}
        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})
    status = "valid" if (groups and peak > 0.0) else "invalid"
    _emit(args, groups, status, peak,
          [f"{backend_label} 2-rank send/recv (rank0->rank1)"], backend_label)
    dist.destroy_process_group()
    return 0 if status == "valid" else 1


def _emit(args, groups, status, peak, notes, backend_label):
    """Write the kv-cache-family result JSON to args.out and print a one-line summary."""
    env = None
    if args.env_json and os.path.exists(args.env_json):
        with open(args.env_json, encoding="utf-8") as fh:
            env = json.load(fh)
    doc = {"schema_version": SCHEMA_VERSION, "family": FAMILY,
           "generated_by": "nccl_kv_transfer.py",
           "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
           "runner": args.runner, "transport": args.transport,
           "measurement_contract": MEASUREMENT_CONTRACT, "nodes": args.nodes,
           "wired_backends": [backend_label], "status": status,
           "num_groups": len(groups), "groups": groups, "notes": notes, "environment": env}
    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")
    print(f"{backend_label}-kv: {len(groups)} groups -> {args.out} (status={status}, peak_bw={peak:.1f} GB/s)")


if __name__ == "__main__":
    raise SystemExit(main())
"""CollectiveX — NIXL transfer benchmark (family=kv-cache, backend=nixl).

NIXL (ai-dynamo/nixl) is the transfer fabric dynamo uses for disaggregated-serving KV movement.
This benches its point-to-point transfer engine the way a prefill->decode KV handoff uses it: two
NIXL agents in one process, one registers the source buffer and the other the destination, and the
initiator posts a WRITE over the UCX backend (GPU<->GPU, GPU<->host). It sweeps KV-block-sized
payloads and records wall-clock latency + bandwidth (NIXL transfers run on UCX's own streams, so
CUDA events don't bound them — perf_counter around post+poll-to-DONE is the honest measure).

This is the WIRED `nixl` backend for the goal's "KV-cache transfer backends" axis (kv_cache_transfer
declared it a stub). It runs only in the NIXL/dynamo container (CX_BENCH=nixl switches CX_IMAGE to
the tensorrtllm-runtime image); elsewhere the import fails and the run records that — never faked.

The NIXL Python surface (version, Abseil, backends, agent methods) is DUMPED to stderr at startup so
a GHA run's log is self-documenting even if the API drifted — SSH inspection of the NIXL container is
not available. Emits one kv-cache-family JSON (plots in the KV-cache tab next to raw memcpy).

    python tests/nixl_transfer.py --runner b300 --topology-class b300-nvlink-island \\
        --transport nvlink --env-json results/env.json --out results/b300_nixl.json
"""
from __future__ import annotations

import argparse
import datetime as _dt
import hashlib
import json
import os
import sys
import time

SCHEMA_VERSION = 1
MEASUREMENT_CONTRACT = "nixl-transfer-v1"
FAMILY = "kv-cache"  # same family/schema as kv_cache_transfer.py -> plots in the KV-cache tab
BACKEND = "nixl"

DEFAULT_MIN_BYTES = 64 * 1024
DEFAULT_MAX_BYTES = 256 * 1024 * 1024
# Payloads up to this size are labelled "decode", larger ones "prefill".
DECODE_MAX_BYTES = 512 * 1024

# direction -> (source device, destination device) as passed to _alloc.
_DIRECTION_DEVICES = {
    "dtod-local": ("cuda:0", "cuda:0"),
    "dtod-remote": ("cuda:0", "cuda:1"),
    "dtoh": ("cuda:0", "cpu"),
    "htod": ("cpu", "cuda:0"),
}


def size_class(nbytes: int) -> str:
    """Return "decode" for payloads up to DECODE_MAX_BYTES, else "prefill"."""
    return "decode" if nbytes <= DECODE_MAX_BYTES else "prefill"


def _sizes(min_bytes: int, max_bytes: int, factor: int = 4):
    """Return the geometric ladder min_bytes, min_bytes*factor, ... <= max_bytes.

    Raises:
        ValueError: if the ladder could never pass max_bytes (non-positive
            start or factor <= 1); the loop previously spun forever there.
    """
    if min_bytes <= max_bytes and (min_bytes <= 0 or factor <= 1):
        raise ValueError(
            f"size ladder never terminates: min_bytes={min_bytes}, factor={factor} "
            "(need min_bytes >= 1 and factor >= 2)")
    out, s = [], min_bytes
    while s <= max_bytes:
        out.append(s)
        s *= factor
    return out


def comparison_key(meta: dict) -> str:
    """Return a 16-hex-char digest of the fields that identify one curve."""
    parts = [meta["direction"], meta["layout"], meta["backend"], meta["dtype"],
             str(meta["nodes"]), meta["topology_class"], meta["measurement_contract"]]
    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]


def _import_nixl():
    """Return (nixl_agent, nixl_agent_config, import_path) or raise. Tries both documented paths."""
    try:
        from nixl._api import nixl_agent, nixl_agent_config  # canonical
        return nixl_agent, nixl_agent_config, "nixl._api"
    except Exception:
        from nixl import nixl_agent, nixl_agent_config  # re-export
        return nixl_agent, nixl_agent_config, "nixl"


def _nixl_introspect(nixl_agent, nixl_agent_config):
    """Dump the NIXL surface (version, binding names, agent methods) to stderr as one JSON line.

    Self-documenting so the GHA log resolves any API drift without SSH into the
    NIXL container. Each probe is best-effort: a failure is recorded as its repr.
    """
    info = {}
    try:
        import importlib.metadata as _m
        info["nixl_version"] = _m.version("nixl")
    except Exception as e:
        info["nixl_version"] = f"<{e!r}>"
    try:
        import nixl._bindings as _b  # the pybind core; surfaces the linked Abseil/UCX if present
        info["bindings"] = [n for n in dir(_b) if not n.startswith("_")][:40]
    except Exception as e:
        info["bindings"] = f"<{e!r}>"
    info["agent_methods"] = [n for n in dir(nixl_agent) if not n.startswith("_")]
    print("NIXL_API " + json.dumps(info), file=sys.stderr, flush=True)
    return info


def _make_agents(nixl_agent, nixl_agent_config):
    """Create two local agents (initiator + target) on the UCX backend.

    Metadata is not exchanged here; _bench_one does that after the buffers are
    registered. No IP/listen thread needed in one process.
    """
    try:
        cfg = nixl_agent_config(backends=["UCX"])
    except TypeError:
        cfg = nixl_agent_config(True, True, 0)  # positional fallback (older signature)
    init = nixl_agent("cx_initiator", cfg)
    targ = nixl_agent("cx_target", cfg)
    return init, targ


def _deregister(agent, reg_descs):
    """Best-effort release of one registration; a failure is logged, never raised."""
    if reg_descs is None:
        return
    dereg = getattr(agent, "deregister_memory", None)
    if dereg is None:  # API drift: nothing to call on this build
        return
    try:
        dereg(reg_descs)
    except Exception as exc:
        print(f"nixl: deregister_memory failed: {exc!r}", file=sys.stderr)


def _bench_one(init, targ, src_t, dst_t, nbytes, warmup, iters):
    """Register src (initiator) + dst (target), post WRITE src->dst `iters` times, poll each to DONE.

    Both registrations are released before returning, on success or failure, so
    the caller can free the buffers without leaving them registered with NIXL.

    Returns:
        (latency_ms_per_xfer, gb_s), rounded to 5 and 2 places.

    Raises:
        RuntimeError: when a post or a state poll reports "ERR".
    """
    src_reg = dst_reg = None
    try:
        src_reg = init.register_memory(src_t)
        dst_reg = targ.register_memory(dst_t)
        # Target metadata is read after its registration, then loaded into the initiator.
        init.add_remote_agent(targ.get_agent_metadata())
        src_descs = init.get_xfer_descs([src_t])
        dst_descs = init.get_xfer_descs([dst_t])

        def _once():
            h = init.initialize_xfer("WRITE", src_descs, dst_descs, targ.name, b"cx")
            st = init.transfer(h)
            if st == "ERR":
                init.release_xfer_handle(h)
                raise RuntimeError("nixl transfer post returned ERR")
            while True:
                st = init.check_xfer_state(h)
                if st == "ERR":
                    init.release_xfer_handle(h)
                    raise RuntimeError("nixl transfer state ERR")
                if st == "DONE":
                    break
            init.release_xfer_handle(h)

        for _ in range(warmup):
            _once()
        t0 = time.perf_counter()
        for _ in range(iters):
            _once()
        dt = time.perf_counter() - t0
    finally:
        _deregister(targ, dst_reg)
        _deregister(init, src_reg)
    ms = (dt / iters) * 1e3
    gb_s = (nbytes / (dt / iters)) / 1e9 if dt > 0 else 0.0
    return round(ms, 5), round(gb_s, 2)


def _alloc(torch, where, nbytes):
    """Allocate an uninitialised uint8 buffer of nbytes; host buffers are pinned."""
    if where == "cpu":
        return torch.empty(nbytes, dtype=torch.uint8, device="cpu").pin_memory()
    return torch.empty(nbytes, dtype=torch.uint8, device=where)


def run_direction(torch, init, targ, direction, sizes, warmup, iters, ngpu):
    """Sweep `sizes` for one direction.

    Returns:
        (rows, na). `na` is a reason string (with empty rows) when the direction
        is unknown or needs a second GPU, else None. The sweep stops at the
        first failing size, which is recorded as an error row after the rows
        already measured.
    """
    devices = _DIRECTION_DEVICES.get(direction)
    if devices is None:
        return [], f"unknown direction {direction}"
    if direction == "dtod-remote" and ngpu < 2:
        return [], "n/a (needs >=2 GPUs)"
    src_dev, dst_dev = devices

    rows = []
    for nbytes in sizes:
        try:
            src = _alloc(torch, src_dev, nbytes)
            dst = _alloc(torch, dst_dev, nbytes)
            ms, gb_s = _bench_one(init, targ, src, dst, nbytes, warmup, iters)
        except Exception as exc:
            # Catch everything, not just RuntimeError: any failure type must
            # keep the rows already measured for this direction.
            rows.append({"transfer_bytes": nbytes, "error": f"{exc!r}", "correct": None})
            break
        rows.append({"transfer_bytes": nbytes, "size_class": size_class(nbytes),
                     "block_bytes": nbytes, "num_blocks": 1,
                     "time_ms": ms, "bandwidth_gb_s": gb_s, "correct": True})
        del src, dst
        torch.cuda.empty_cache()
    return rows, None


def main() -> int:
    """CLI entry point. Returns 0 for a valid run, 1 for invalid, 3 without torch/CUDA."""
    ap = argparse.ArgumentParser(description="CollectiveX NIXL transfer benchmark")
    ap.add_argument("--direction", default="all",
                    choices=["all", "dtod-local", "dtod-remote", "dtoh", "htod"])
    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
    ap.add_argument("--warmup", type=int, default=5)
    ap.add_argument("--iters", type=int, default=30)
    ap.add_argument("--runner", required=True)
    ap.add_argument("--nodes", type=int, default=1)
    ap.add_argument("--topology-class", required=True)
    ap.add_argument("--transport", default="")
    ap.add_argument("--env-json")
    ap.add_argument("--timestamp")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # Reject sweeps that would divide by zero or never finish building the ladder.
    if args.iters < 1:
        ap.error("--iters must be >= 1")
    try:
        sizes = _sizes(args.min_bytes, args.max_bytes)
    except ValueError as exc:
        ap.error(str(exc))

    notes = []
    env = None
    if args.env_json and os.path.exists(args.env_json):
        with open(args.env_json, encoding="utf-8") as fh:
            env = json.load(fh)

    def _emit(groups, status, peak, extra_notes):
        doc = {"schema_version": SCHEMA_VERSION, "family": FAMILY,
               "generated_by": "nixl_transfer.py",
               "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
               "runner": args.runner, "transport": args.transport,
               "measurement_contract": MEASUREMENT_CONTRACT, "nodes": args.nodes,
               "wired_backends": [BACKEND], "status": status,
               "num_groups": len(groups), "groups": groups,
               "notes": extra_notes, "environment": env}
        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as fh:
            json.dump(doc, fh, indent=2)
            fh.write("\n")
        print(f"nixl-transfer: {len(groups)} groups -> {args.out} (status={status}, "
              f"peak_bw={peak:.1f} GB/s)")
        if extra_notes:
            print("notes: " + "; ".join(extra_notes), file=sys.stderr)

    try:
        import torch
    except Exception as exc:
        _emit([], "invalid", 0.0, [f"torch unavailable: {exc!r}"])
        return 3
    if not torch.cuda.is_available():
        _emit([], "invalid", 0.0, ["CUDA not available"])
        return 3

    try:
        nixl_agent, nixl_agent_config, path = _import_nixl()
        notes.append(f"nixl imported via {path}")
    except Exception as exc:
        _emit([], "invalid", 0.0,
              [f"nixl import failed (needs the NIXL/dynamo container): {exc!r}"])
        return 1
    _nixl_introspect(nixl_agent, nixl_agent_config)
    try:
        init, targ = _make_agents(nixl_agent, nixl_agent_config)
    except Exception as exc:
        _emit([], "invalid", 0.0, [f"nixl agent init failed: {exc!r}"])
        return 1

    ngpu = torch.cuda.device_count()
    directions = (["dtod-local", "dtod-remote", "dtoh", "htod"]
                  if args.direction == "all" else [args.direction])

    groups, peak = [], 0.0
    for direction in directions:
        try:
            rows, na = run_direction(torch, init, targ, direction, sizes,
                                     args.warmup, args.iters, ngpu)
        except Exception as exc:
            notes.append(f"{direction}: {exc!r}")
            continue
        if na:
            notes.append(f"{direction}: {na}")
            continue
        timed = [r for r in rows if r.get("bandwidth_gb_s")]
        if not timed:
            # Surface why the direction yielded nothing instead of dropping it silently.
            first_error = next((r["error"] for r in rows if r.get("error")), "no sizes in sweep")
            notes.append(f"{direction}: no timed rows ({first_error})")
            continue
        peak = max(peak, max(r["bandwidth_gb_s"] for r in timed))
        meta = {"direction": direction, "layout": "contiguous", "backend": BACKEND,
                "dtype": "uint8", "nodes": args.nodes,
                "topology_class": args.topology_class,
                "measurement_contract": MEASUREMENT_CONTRACT}
        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})

    status = "valid" if (groups and peak > 0.0) else "invalid"
    _emit(groups, status, peak, notes)
    return 0 if status == "valid" else 1


if __name__ == "__main__":
    raise SystemExit(main())
suite (goal P2 "CPU-GPU offload suite"). + +Measures host<->device memcpy bandwidth + latency over a size sweep, for the +four sub-ops {h2d, d2h} x {pinned, pageable}, plus two diagnostics that matter +for real offload (KV spill, weight streaming, activation checkpointing): + + * NUMA locality — which NUMA node the host buffer landed on, and (best + effort, if numactl/affinity is available) a node-pinned + vs default comparison. Recorded, never required. + * overlap-w-compute — a copy stream running concurrently with a dummy GEMM on + a separate compute stream; reports achieved overlap % + (how much of the copy is hidden behind compute). + +Matches run_nccl.py's result CONVENTION (family/runner/op/rows/comparison_key/ +status/transport/environment/generated_at) and env_capture.py's provenance +style, so the plot + collector consume it uniformly. + +Stdlib + torch. torch is needed ONLY at runtime on the GPU; --help and +--parse-only work without it (the JSON writer + CLI are import-safe). + +Run (inside the container, 1 GPU is enough): + python tests/offload_bench.py \\ + --runner h200 --topology-class h200-nvlink-island --transport pcie \\ + --env-json results/env.json --out results/h200_offload.json + +Verify offline (no GPU/torch needed): + python tests/offload_bench.py --parse-only --runner h200 \\ + --topology-class h200-nvlink-island --out /tmp/parsed.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import sys + +SCHEMA_VERSION = 1 +FAMILY = "offload" +MEASUREMENT_CONTRACT = "host-device-memcpy-v1" +GENERATED_BY = "offload_bench.py" + +# (direction, host_memory) sub-ops. h2d = host->device (CPU->GPU), d2h = the reverse. +SUBOPS = [ + ("h2d", "pinned"), + ("h2d", "pageable"), + ("d2h", "pinned"), + ("d2h", "pageable"), +] + +# Default byte sweep: 4 KiB .. 256 MiB by x4. Covers decode-token-sized spills +# up to prefix-cache / weight-shard sized streams. 
# Sweep defaults: 4 KiB .. 256 MiB in x4 steps (decode-token-sized spills up to
# prefix-cache / weight-shard sized streams).
DEFAULT_MIN_BYTES = 4 * 1024
DEFAULT_MAX_BYTES = 256 * 1024 * 1024
DEFAULT_FACTOR = 4


# --------------------------------------------------------------------------- #
# import-safe helpers (no torch)                                              #
# --------------------------------------------------------------------------- #
def _human(n: int) -> str:
    """Format a byte count with a binary unit (B, KiB, MiB, GiB), no decimals."""
    value = n
    for unit in ("B", "KiB", "MiB"):
        if value < 1024:
            return f"{value:.0f}{unit}"
        value /= 1024
    # GiB is the largest unit, so anything bigger stays in GiB.
    return f"{value:.0f}GiB"


def size_ladder(min_bytes: int, max_bytes: int, factor: int) -> list[int]:
    """Return the geometric size sweep min_bytes, min_bytes*factor, ... <= max_bytes.

    An empty list is returned when min_bytes > max_bytes.

    Raises:
        ValueError: if the sweep could never pass max_bytes (non-positive start
            or factor <= 1). That input previously looped forever while the
            list grew without bound.
    """
    lo, hi = int(min_bytes), int(max_bytes)
    if lo <= hi and (lo <= 0 or factor <= 1):
        raise ValueError(
            f"size ladder never terminates: min_bytes={lo}, factor={factor} "
            "(need min_bytes >= 1 and factor >= 2)")
    sizes, s = [], lo
    while s <= hi:
        sizes.append(s)
        s *= factor
    return sizes


def comparison_key(meta: dict) -> str:
    """Deterministic curve key. transport + topology_class are part of the key so
    a PCIe H200 result and an NVLink-C2C GB200 result are labelled distinct rather
    than silently overlaid (mirrors run_nccl.py's intent)."""
    parts = [
        meta["op"],
        meta["host_memory"],
        meta["dtype"],
        meta["transport"],
        meta["topology_class"],
        meta["comparison_class"],
        meta["measurement_contract"],
    ]
    return hashlib.sha256("|".join(map(str, parts)).encode()).hexdigest()[:16]


def _load_env(path: str | None) -> dict | None:
    """Return the parsed JSON at `path`, or None when no path is given or it does not exist."""
    if path and os.path.exists(path):
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    return None


def _provenance() -> dict:
    """GitHub / container provenance (mirrors tests/run_ep.py).

    Everything is read from environment variables except the image architecture,
    which comes from platform.machine(). `git_run` is None when none of the run
    variables are set.
    """
    import platform as _plat

    arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine())
    run = {
        "run_id": os.environ.get("GITHUB_RUN_ID"),
        "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
        "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
        "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
        "repo": os.environ.get("GITHUB_REPOSITORY"),
        "job": os.environ.get("GITHUB_JOB"),
        "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"),
    }
    return {
        "image": os.environ.get("COLLECTIVEX_IMAGE", ""),
        "image_digest": os.environ.get("COLLECTIVEX_IMAGE_DIGEST", ""),
        "image_arch": arch,
        "squash_sha256": os.environ.get("COLLECTIVEX_SQUASH_SHA256"),
        "git_run": run if any(run.values()) else None,
    }


def _numa_locality() -> dict:
    """Best-effort NUMA context. Never required; degrades to nulls off-NUMA.

    Records the process's allowed NUMA node(s) and CPU affinity so a result that
    happened to land cross-socket from the GPU is identifiable after the fact.
    """
    info: dict = {
        "available": False,
        "process_node": None,
        "membind": None,
        "cpus_allowed_list": None,
        "node_count": None,
        "source": None,
    }
    # numactl --show is the clean read; without it the CPU affinity below is
    # the only locality hint recorded.
    import shutil
    import subprocess

    if shutil.which("numactl"):
        try:
            out = subprocess.run(
                ["numactl", "--show"], capture_output=True, text=True, timeout=10, check=False
            )
            if out.returncode == 0:
                info["available"] = True
                info["source"] = "numactl --show"
                for line in out.stdout.splitlines():
                    if line.startswith("nodebind:"):
                        info["process_node"] = line.split(":", 1)[1].strip()
                    elif line.startswith("membind:"):
                        info["membind"] = line.split(":", 1)[1].strip()
        except (OSError, subprocess.TimeoutExpired):
            pass  # diagnostic only: leave the numactl fields as None
    # node count from sysfs (independent of numactl)
    try:
        nodes = [d for d in os.listdir("/sys/devices/system/node") if d.startswith("node")]
        if nodes:
            info["node_count"] = len(nodes)
    except OSError:
        pass
    # CPU affinity of this process (which cores -> which socket -> NUMA hint)
    try:
        if hasattr(os, "sched_getaffinity"):
            info["cpus_allowed_list"] = sorted(os.sched_getaffinity(0))
            if info["source"] is None:
                info["available"] = True
                info["source"] = "os.sched_getaffinity"
    except OSError:
        pass
    return info
# --------------------------------------------------------------------------- #
# GPU path (torch only here)                                                  #
# --------------------------------------------------------------------------- #
def _bench_one(torch, direction: str, host_memory: str, nbytes: int,
               dtype, warmup: int, iters: int) -> dict:
    """Time a single (direction, host_memory, size) point with CUDA events.

    Returns latency (us) and bandwidth (GB/s, decimal). Uses non_blocking=True so
    pinned transfers actually go async on the copy engine; pageable is implicitly
    synchronous (the staging copy serializes), which is the honest contrast.

    The buffer holds max(1, nbytes // element_size) elements, so `size_bytes` in
    the result can differ from `requested_bytes`.
    """
    elem = torch.tensor([], dtype=dtype).element_size()
    n = max(1, nbytes // elem)
    pin = host_memory == "pinned"

    host = torch.empty(n, dtype=dtype, device="cpu", pin_memory=pin)
    dev = torch.empty(n, dtype=dtype, device="cuda")
    if direction == "h2d":
        src, dst = host, dev
    else:
        src, dst = dev, host

    non_blocking = pin  # pageable cannot be truly async

    for _ in range(warmup):
        dst.copy_(src, non_blocking=non_blocking)
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        dst.copy_(src, non_blocking=non_blocking)
    end.record()
    torch.cuda.synchronize()

    elapsed_ms = start.elapsed_time(end)
    avg_ms = elapsed_ms / iters
    actual_bytes = n * elem
    gbps = (actual_bytes / (avg_ms / 1e3)) / 1e9 if avg_ms > 0 else 0.0
    return {
        "size_bytes": actual_bytes,
        "requested_bytes": nbytes,
        "latency_us": round(avg_ms * 1e3, 4),
        "bandwidth_gbps": round(gbps, 3),
    }


def _overlap_with_compute(torch, nbytes: int, dtype, iters: int) -> dict:
    """Run a pinned H2D copy concurrently with a dummy GEMM on a separate stream
    and report achieved overlap %.

    All three timings are per-iteration averages over `iters`:

        hidden      = max(0, (copy_alone + gemm_alone) - concurrent)
        overlap_pct = hidden / min(copy_alone, gemm_alone) * 100, clamped to [0, 100]

    100% means the shorter of the two was fully hidden behind the other; ~0% means
    the copy stream and compute stream serialized (e.g. PCIe contention or no copy
    engine free). Best-effort and labelled — it is a diagnostic, not a curve point.
    """
    elem = torch.tensor([], dtype=dtype).element_size()
    n = max(1, nbytes // elem)
    host = torch.empty(n, dtype=dtype, device="cpu", pin_memory=True)
    dev = torch.empty(n, dtype=dtype, device="cuda")

    # A GEMM big enough to take longer than the copy (so the copy can hide under it).
    m = 2048
    a = torch.randn(m, m, device="cuda", dtype=torch.float16)
    b = torch.randn(m, m, device="cuda", dtype=torch.float16)

    copy_stream = torch.cuda.Stream()
    compute_stream = torch.cuda.Stream()

    def _time(fn) -> float:
        """Per-iteration milliseconds for fn(), which must itself run `iters` rounds."""
        torch.cuda.synchronize()
        s = torch.cuda.Event(enable_timing=True)
        e = torch.cuda.Event(enable_timing=True)
        s.record()
        fn()
        e.record()
        torch.cuda.synchronize()
        return s.elapsed_time(e) / iters

    # warmup both paths
    for _ in range(3):
        dev.copy_(host, non_blocking=True)
        torch.matmul(a, b)
    torch.cuda.synchronize()

    copy_ms = _time(lambda: [dev.copy_(host, non_blocking=True) for _ in range(iters)])
    gemm_ms = _time(lambda: [torch.matmul(a, b) for _ in range(iters)])

    def _overlapped():
        for _ in range(iters):
            with torch.cuda.stream(copy_stream):
                dev.copy_(host, non_blocking=True)
            with torch.cuda.stream(compute_stream):
                torch.matmul(a, b)
        # Drain both side streams before _time records its end event.
        copy_stream.synchronize()
        compute_stream.synchronize()

    both_ms = _time(_overlapped)

    serial = copy_ms + gemm_ms
    # Hidden time = how much shorter "both concurrent" is than running them back to back.
    hidden = max(0.0, serial - both_ms)
    # As a fraction of the SMALLER of the two (the most that can be hidden is min).
    hideable = min(copy_ms, gemm_ms)
    overlap_pct = (hidden / hideable * 100.0) if hideable > 0 else 0.0
    overlap_pct = max(0.0, min(100.0, overlap_pct))
    return {
        "size_bytes": n * elem,
        "copy_alone_us": round(copy_ms * 1e3, 4),
        "gemm_alone_us": round(gemm_ms * 1e3, 4),
        "concurrent_us": round(both_ms * 1e3, 4),
        "serial_sum_us": round(serial * 1e3, 4),
        "overlap_pct": round(overlap_pct, 1),
        "gemm_shape": [m, m, m],
    }


def run_gpu(args) -> tuple[list[dict], dict, str | None]:
    """Returns (rows, diagnostics, error). rows is empty + error set if torch/GPU
    is unavailable — the caller turns that into status=invalid, never a fake row."""
    try:
        import torch
    except Exception as exc:  # pragma: no cover - runtime/GPU only
        return [], {}, f"torch unavailable: {exc!r}"
    if not torch.cuda.is_available():
        return [], {}, "torch.cuda.is_available() is False (no GPU in this container)"

    dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16,
             "float32": torch.float32, "uint8": torch.uint8}[args.dtype]
    sizes = size_ladder(args.min_bytes, args.max_bytes, args.factor)

    rows: list[dict] = []
    for direction, host_memory in SUBOPS:
        for nbytes in sizes:
            try:
                r = _bench_one(torch, direction, host_memory, nbytes, dtype,
                               args.warmup, args.iters)
                r["op"] = direction
                r["host_memory"] = host_memory
                rows.append(r)
            except RuntimeError as exc:  # OOM at the top of the ladder, etc.
                # Recorded as a row without measurements; the sweep continues.
                rows.append({
                    "op": direction, "host_memory": host_memory,
                    "size_bytes": nbytes, "requested_bytes": nbytes,
                    "latency_us": None, "bandwidth_gbps": None,
                    "error": repr(exc),
                })

    diagnostics: dict = {"numa": _numa_locality()}
    if not args.no_overlap:
        try:
            diagnostics["overlap_with_compute"] = _overlap_with_compute(
                torch, args.overlap_bytes, dtype, max(5, args.iters))
        except Exception as exc:  # best-effort diagnostic
            diagnostics["overlap_with_compute"] = {"error": repr(exc)}
    return rows, diagnostics, None


# --------------------------------------------------------------------------- #
# document assembly + CLI                                                     #
# --------------------------------------------------------------------------- #
def build_doc(args, rows: list[dict], diagnostics: dict, error: str | None) -> dict:
    """Assemble the result document. Adds a `comparison_key` to each row in place."""
    # Peak bandwidth across every real measured row gates validity: a run that
    # produced no positive bandwidth did not actually transfer.
    measured = [r for r in rows if r.get("bandwidth_gbps")]
    peak_bw = max((r["bandwidth_gbps"] for r in measured), default=0.0)
    transferred = bool(measured) and peak_bw > 0.0

    meta = {
        "op": "host_device_copy",
        "host_memory": "mixed",
        "dtype": args.dtype,
        "transport": args.transport,
        "topology_class": args.topology_class,
        "comparison_class": args.comparison_class,
        "measurement_contract": MEASUREMENT_CONTRACT,
    }
    # Per-curve keys: one comparison_key per (op, host_memory) so the plotter can
    # overlay pinned-vs-pageable / h2d-vs-d2h as distinct curves.
    curve_keys = {}
    for direction, host_memory in SUBOPS:
        cm = dict(meta, op=direction, host_memory=host_memory)
        curve_keys[f"{direction}/{host_memory}"] = comparison_key(cm)
    for r in rows:
        r["comparison_key"] = curve_keys.get(f"{r['op']}/{r['host_memory']}")

    doc = {
        "schema_version": SCHEMA_VERSION,
        "family": FAMILY,
        "generated_by": GENERATED_BY,
        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
        "runner": args.runner,
        "transport": args.transport,
        "measurement_contract": MEASUREMENT_CONTRACT,
        "topology_class": args.topology_class,
        "comparison_class": args.comparison_class,
        "dtype": args.dtype,
        "sub_ops": [f"{d}/{h}" for d, h in SUBOPS],
        # top-level comparison_key = the whole-suite key (op=host_device_copy);
        # per-row keys (above) drive curve overlays.
        "comparison_key": comparison_key(meta),
        "curve_keys": curve_keys,
        "status": "valid" if transferred else "invalid",
        "error": error,
        "peak_bandwidth_gbps": round(peak_bw, 3),
        "sweep": {"min_bytes": args.min_bytes, "max_bytes": args.max_bytes,
                  "factor": args.factor, "warmup": args.warmup, "iters": args.iters},
        "num_rows": len(rows),
        "rows": rows,
        "diagnostics": diagnostics,
        "provenance": _provenance(),
        "environment": _load_env(args.env_json),
    }
    return doc


def main() -> int:
    """CLI entry point. Writes the document to --out; returns 0 when status is valid, else 1."""
    ap = argparse.ArgumentParser(description="CollectiveX CPU<->GPU offload suite")
    # sweep knobs
    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
    ap.add_argument("--factor", type=int, default=DEFAULT_FACTOR, help="size step factor")
    ap.add_argument("--dtype", default="float16",
                    choices=["float16", "bfloat16", "float32", "uint8"])
    ap.add_argument("--warmup", type=int, default=5)
    ap.add_argument("--iters", type=int, default=20)
    ap.add_argument("--no-overlap", action="store_true",
                    help="skip the overlap-with-compute diagnostic")
    ap.add_argument("--overlap-bytes", type=int, default=16 * 1024 * 1024,
                    help="copy size for the overlap-with-compute diagnostic")
    ap.add_argument("--parse-only", action="store_true",
                    help="emit a well-formed (status=invalid) doc with no GPU — schema check")
    # provenance (mirrors run_nccl.py)
    ap.add_argument("--runner", required=True, help="runner label, e.g. h200")
    ap.add_argument("--topology-class", required=True,
                    help="e.g. h200-nvlink-island, gb200-nvl72-c2c")
    ap.add_argument("--transport", default="pcie",
                    help="observed host<->device transport: pcie | nvlink-c2c")
    ap.add_argument("--comparison-class", default="standardized",
                    choices=["standardized", "backend-optimized", "framework-integrated"])
    ap.add_argument("--env-json", help="path to env_capture.py output to embed")
    ap.add_argument("--timestamp", help="ISO timestamp (default now)")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    if args.parse_only:
        rows, diagnostics, error = [], {"numa": _numa_locality()}, "parse-only (no GPU run)"
    else:
        # Reject sweeps the GPU path cannot run: iters < 1 divides by zero past
        # run_gpu's RuntimeError handler, and a non-growing ladder never ends.
        if args.iters < 1:
            ap.error("--iters must be >= 1")
        if args.factor < 2:
            ap.error("--factor must be >= 2")
        if args.min_bytes < 1:
            ap.error("--min-bytes must be >= 1")
        rows, diagnostics, error = run_gpu(args)

    doc = build_doc(args, rows, diagnostics, error)

    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")

    ov = doc["diagnostics"].get("overlap_with_compute", {})
    print(
        f"offload: {doc['num_rows']} rows -> {args.out} "
        f"(status={doc['status']}, peak_bw={doc['peak_bandwidth_gbps']} GB/s, "
        f"overlap={ov.get('overlap_pct')}%, key={doc['comparison_key']})",
        file=sys.stderr,
    )
    return 0 if doc["status"] == "valid" else 1


if __name__ == "__main__":
    raise SystemExit(main())
def main() -> int:
    """Probe whether MoRI accepts an fp8 ``quant_type`` and print the verdict.

    Reads RANK / WORLD_SIZE / LOCAL_RANK from the environment, initialises a
    gloo+nccl process group, registers it with MoRI's shmem layer, then tries
    each candidate ``quant_type`` (matching enum members first, then a few
    string spellings) with one small dispatch. Rank 0 prints ``MORI_FP8_OK``
    or ``MORI_FP8_FAIL`` followed by a detail string.

    Only ``dispatch`` is exercised here; no combine or reconstruction-error
    check is performed.

    The process leaves through ``os._exit`` (0 on success, 7 on failure), so
    this function never actually returns to its caller.
    """
    rank = int(os.environ.get("RANK", "0"))
    world = int(os.environ.get("WORLD_SIZE", "1"))
    local = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local)
    device = torch.device(f"cuda:{local}")
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12399")
    dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world,
                            device_id=device)
    import mori

    if rank == 0:
        p = torch.cuda.get_device_properties(0)
        print(f"[mori] device={p.name} cus={p.multi_processor_count}")
        print("[mori] EpDispatchCombineConfig sig:")
        try:
            print("  ", inspect.signature(mori.ops.EpDispatchCombineConfig))
        except Exception as e:
            print("  ", repr(e))
        # surface any quant enum the module exposes
        for name in dir(mori.ops):
            # The lowercase test already covers "Quant".
            if "quant" in name.lower():
                obj = getattr(mori.ops, name)
                print(f"[mori] ops.{name} = {obj}")
                if hasattr(obj, "__members__"):
                    print("    members:", list(obj.__members__))

    hidden, topk, experts = 7168, 8, 256
    T = 8
    epr = experts // world
    world_group = torch.distributed.group.WORLD
    torch._C._distributed_c10d._register_process_group("default", world_group)
    mori.shmem.shmem_torch_process_group_init("default")

    # candidate fp8 quant_type values to try (string and enum forms)
    candidates = []
    QT = getattr(mori.ops, "EpDispatchCombineQuantType", None) or getattr(mori.ops, "QuantType", None)
    if QT is not None and hasattr(QT, "__members__"):
        for mname in QT.__members__:
            # Any member containing "fp8"/"FP8" also contains "8".
            if "8" in mname:
                candidates.append((f"enum:{mname}", QT.__members__[mname]))
    for s in ("fp8", "fp8_e4m3", "e4m3"):
        candidates.append((f"str:{s}", s))

    if rank == 0:
        print(f"[mori] fp8 quant_type candidates: {[c[0] for c in candidates]}")

    gi, gw = routing.build_global_routing(T * world, experts, topk, "uniform", 67, epr)
    si, sw = routing.rank_slice(gi, gw, rank, T)
    x = routing.rank_activations(T, hidden, 67, rank, device, torch.bfloat16)
    indices = si.to(device).to(torch.int32)
    weights = sw.to(device).to(torch.float32)

    working = None
    detail = ""
    for label, qt in candidates:
        local_ok = 1
        recv = -1
        try:
            cfg = mori.ops.EpDispatchCombineConfig(
                data_type=torch.bfloat16, rank=rank, world_size=world,
                hidden_dim=hidden, scale_dim=hidden // 128,
                scale_type_size=torch.tensor([], dtype=torch.float32).element_size(),
                max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
                max_num_inp_token_per_rank=512, num_experts_per_rank=epr,
                num_experts_per_token=topk, use_external_inp_buf=False, quant_type=qt)
            op = mori.ops.EpDispatchCombineOp(cfg)
            scales = torch.ones((T, hidden // 128), dtype=torch.float32, device=device)
            out = op.dispatch(x, weights, scales, indices, block_num=80, warp_per_block=16)
            recv = int(out[-1][0].item())
        except Exception as exc:
            local_ok = 0
            detail = f"{type(exc).__name__}: {str(exc)[:160]}"
            if rank == 0:
                print(f"[mori] {label} failed: {detail}")

        # Every rank joins this reduction whether its own attempt succeeded or
        # not, so all ranks take the same break/continue decision. Previously
        # only successful ranks hit a barrier, which let ranks drift apart when
        # they disagreed on a candidate.
        # NOTE(review): if dispatch itself blocks waiting on a peer that raised
        # before entering it, this cannot help — confirm MoRI's behaviour there.
        agreed = torch.tensor([local_ok], device=device)
        dist.all_reduce(agreed, op=dist.ReduceOp.MIN)
        if int(agreed.item()) == 1:
            working = label
            detail = f"quant_type={label} dispatched recv={recv}"
            if rank == 0:
                print(f"[mori] FP8 DISPATCH OK with {label}: recv={recv}")
            break
        if local_ok:
            # This rank was fine but a peer was not: keep the verdict honest.
            detail = f"quant_type={label} dispatched here (recv={recv}) but failed on another rank"
            if rank == 0:
                print(f"[mori] {label} failed on another rank")

    v = torch.tensor([1 if working else 0], device=device)
    dist.all_reduce(v, op=dist.ReduceOp.MIN)
    if rank == 0:
        print(("MORI_FP8_OK " + detail) if int(v.item()) == 1 else ("MORI_FP8_FAIL " + detail))
    sys.stdout.flush()
    sys.stderr.flush()
    # Hard exit: skips interpreter/atexit teardown after the verdict is flushed.
    os._exit(0 if int(v.item()) == 1 else 7)


if __name__ == "__main__":
    raise SystemExit(main())
def expert_scale(e: int, experts: int) -> float:
    """Default deterministic per-expert transform factor, ``1 + e / experts``.

    The factor differs per expert so a copy routed to the wrong expert yields a
    wrong value (an identity transform would hide mis-routing).
    """
    return 1.0 + e / float(experts)


def dispatch_plan(idx: np.ndarray, experts: int, experts_per_rank: int):
    """Independent dispatch model.

    Args:
        idx: ``[T, topk]`` array of selected expert ids per token.
        experts: total expert count (not used by the plan itself).
        experts_per_rank: experts hosted per rank; expert ``e`` lives on rank
            ``e // experts_per_rank``.

    Returns:
        List of ``(token, slot, expert, dest_rank)``, one entry per routed copy.

    Raises:
        AssertionError: a token selects the same expert twice. Raised
            explicitly so the check also holds under ``python -O``.
    """
    T, topk = idx.shape
    plan = []
    for t in range(T):
        seen = set()
        for k in range(topk):
            e = int(idx[t, k])
            if e in seen:
                raise AssertionError(
                    f"token {t} selects expert {e} twice (must be distinct)")
            seen.add(e)
            plan.append((t, k, e, e // experts_per_rank))
    return plan


def reference_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale):
    """Independent combine oracle.

    Computes ``y[t] = sum_k weights[t,k] * transform(idx[t,k], experts) * x[t]``
    in float64, with rows in source-token order. ``experts_per_rank`` is
    accepted for signature symmetry and is not used here.
    """
    T, topk = idx.shape
    y = np.zeros_like(x, dtype=np.float64)
    for t in range(T):
        for k in range(topk):
            e = int(idx[t, k])
            y[t] += float(weights[t, k]) * transform(e, experts) * x[t].astype(np.float64)
    return y


def validate_dispatch(idx, experts, experts_per_rank):
    """Check that every selected (token, expert) is routed validly, exactly once.

    Returns a list of error strings (empty when the routing is valid):

    * duplicate (token, expert) pairs — reported as an error instead of
      tripping the assertion inside ``dispatch_plan``;
    * expert ids outside ``[0, experts)``;
    * destination ranks outside ``[0, ceil(experts / experts_per_rank))``.
    """
    T, topk = idx.shape
    # Enumerated locally (not via dispatch_plan) so a duplicate becomes a
    # reported error rather than an exception.
    copies = []
    for t in range(T):
        for k in range(topk):
            e = int(idx[t, k])
            copies.append((t, e, e // experts_per_rank))

    errs = []
    # exactly-once: no duplicate (token, expert)
    if len({(t, e) for (t, e, _r) in copies}) != len(copies):
        errs.append("duplicate (token,expert) routed copy")
    # expert id must name a real expert
    for (t, e, _r) in copies:
        if not (0 <= e < experts):
            errs.append(f"token {t} expert {e} out of range [0,{experts})")
    # destination rank must exist
    ep = (experts + experts_per_rank - 1) // experts_per_rank
    for (_t, _e, r) in copies:
        if not (0 <= r < ep):
            errs.append(f"dest rank {r} out of range [0,{ep})")
    return errs


def validate_combine(idx, weights, x, experts, experts_per_rank, transform=expert_scale, tol=1e-9):
    """Recompute the combine two ways and confirm they agree.

    One path is ``reference_combine``; the other accumulates explicitly over
    ``dispatch_plan``. Also reports whether any token has several of its
    experts on the same rank.

    Returns:
        ``(errs, info)`` where ``errs`` is a list of error strings and ``info``
        is ``{"has_multi_expert_per_rank": bool}``.
    """
    errs = []
    y_ref = reference_combine(idx, weights, x, experts, experts_per_rank, transform)
    # explicit accumulation over the dispatch plan (independent path)
    T = idx.shape[0]
    y_acc = np.zeros((T, x.shape[1]), dtype=np.float64)
    for (t, k, e, r) in dispatch_plan(idx, experts, experts_per_rank):
        y_acc[t] += float(weights[t, k]) * transform(e, experts) * x[t].astype(np.float64)
    # max() of a zero-size array raises, so an empty batch counts as zero error.
    worst = float(np.abs(y_ref - y_acc).max()) if y_ref.size else 0.0
    if worst > tol:
        errs.append(f"combine reduction mismatch ({worst:.2e})")
    # multiple-experts-on-one-rank present?
    multi = any(len({int(e) // experts_per_rank for e in idx[t]}) < idx.shape[1] for t in range(T))
    return errs, {"has_multi_expert_per_rank": bool(multi)}


# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    import sys
    rng = np.random.default_rng(0)
    E, EPR, T, topk, H = 256, 32, 64, 8, 16
    idx = np.stack([rng.permutation(E)[:topk] for _ in range(T)]).astype(np.int64)
    w = rng.random((T, topk)).astype(np.float32)
    x = rng.standard_normal((T, H)).astype(np.float32)
    de = validate_dispatch(idx, E, EPR)
    assert not de, de
    ce, info = validate_combine(idx, w, x, E, EPR)
    assert not ce, ce
    print(f"dispatch+combine semantics OK (multi_expert_per_rank={info['has_multi_expert_per_rank']})")
    # mis-routing is DETECTED: corrupt one expert id and confirm the oracle value changes
    y0 = reference_combine(idx, w, x, E, EPR)
    idx2 = idx.copy()
    idx2[0, 0] = (idx2[0, 0] + 1) % E
    y1 = reference_combine(idx2, w, x, E, EPR)
    assert np.abs(y0[0] - y1[0]).max() > 1e-6, "per-expert transform must make mis-routing detectable"
    print("mis-routing detectable via distinct per-expert transform OK")
    # a duplicated expert inside one token must be reported, not crash the validator
    idx_dup = idx.copy()
    idx_dup[0, 1] = idx_dup[0, 0]
    assert validate_dispatch(idx_dup, E, EPR), "duplicate expert in a token must be reported"
    print("duplicate (token,expert) reported OK")
    # edge cases (goal Part 3): empty rank, repeated dest rank, non-divisible handled by callers
    idx_hot = np.zeros((4, topk), dtype=np.int64)
    idx_hot[:] = np.arange(topk)  # all tokens -> experts 0..7 (all on rank 0) = hotspot
    assert not validate_dispatch(idx_hot, E, EPR), "single-rank hotspot must validate"
    print("edge case: single-rank hotspot (all topk on rank 0) OK")
    print("reference_ep self-test: PASS")
    sys.exit(0)
+ +In RL post-training the TRAINER mesh (updated weights) must hand parameters to the +GENERATOR/rollout mesh, and rollouts flow back — an NCCL M2N / "NCCL Xfer" pattern between +two DISJOINT device meshes. This benchmark splits the world into a trainer half and a +generator half and times weight-sized tensor transfer between them, both directions, under +two redistribution patterns: + + paired : trainer rank i -> generator rank i (1:1 send/recv, matched ranks) + redistribute : every trainer rank -> every generator rank (disjoint all-to-all reshard, + the realistic case when trainer-TP != generator-TP) + +Run under torchrun (multi-process); world is split in half (needs >=2 ranks, even count). +CUDA-event timed; one provenance-tagged JSON like run_nccl.py. Stdlib + torch (torch only +needed at runtime; --help works without it). + + torchrun --nproc_per_node=8 tests/rl_mesh_bench.py --runner h200-dgxc \\ + --topology-class h200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/h200_rl_mesh.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import sys + +SCHEMA_VERSION = 1 +MEASUREMENT_CONTRACT = "rl-mesh-xfer-v1" +FAMILY = "rl-mesh" + +# Weight-shard byte sizes a trainer->generator handoff moves: a single large tensor (a fused +# QKV / MLP weight) up to a whole layer's params. Sweep 1 MiB .. 1 GiB. 
DEFAULT_MIN_BYTES = 1 << 20
DEFAULT_MAX_BYTES = 1 << 30


def _sizes(lo, hi, factor=4):
    """Return the geometric size ladder ``lo, lo*factor, ...`` up to ``hi`` inclusive.

    Returns an empty list when ``lo > hi``.

    Raises:
        ValueError: ``lo < 1`` or ``factor < 2`` — either would keep the ladder
            from ever exceeding ``hi``, i.e. loop forever.
    """
    if lo < 1:
        raise ValueError(f"min size must be >= 1 byte (got {lo})")
    if factor < 2:
        raise ValueError(f"size step factor must be >= 2 (got {factor})")
    out, s = [], lo
    while s <= hi:
        out.append(s)
        s *= factor
    return out


def comparison_key(meta: dict) -> str:
    """Return a 16-hex-char SHA-256 prefix identifying a comparable curve.

    The key covers direction, pattern, world size, topology class and
    measurement contract, joined with ``|``.
    """
    parts = [meta["direction"], meta["pattern"], str(meta["world_size"]),
             meta["topology_class"], meta["measurement_contract"]]
    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]


def _bench(fn, torch, warmup, iters):
    """Time ``fn`` with CUDA events and return the mean milliseconds per call.

    Runs ``warmup`` untimed calls, synchronises, then brackets ``iters`` calls
    between two recorded events. ``torch`` is passed in because the module
    imports it lazily.
    """
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # ms/iter


def main() -> int:
    """Run the RL mesh-to-mesh transfer benchmark; rank 0 writes the JSON doc.

    The world is split into a trainer half (ranks ``[0, half)``) and a
    generator half (ranks ``[half, world)``). Both directions are timed under
    the ``paired`` and ``redistribute`` patterns over the size ladder.

    Returns:
        0 when the document is valid (non-zero ranks always return 0 once they
        finish), 1 when invalid, 3 when torch cannot be imported, 5 when the
        world size is not an even number >= 2. Bad ``--iters`` / ``--min-bytes``
        exit through argparse (code 2).
    """
    ap = argparse.ArgumentParser(description="CollectiveX RL mesh-to-mesh transfer benchmark")
    ap.add_argument("--min-bytes", type=int, default=DEFAULT_MIN_BYTES)
    ap.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES)
    ap.add_argument("--warmup", type=int, default=5)
    ap.add_argument("--iters", type=int, default=20)
    ap.add_argument("--runner", required=True)
    ap.add_argument("--topology-class", required=True)
    ap.add_argument("--transport", default="nvlink")
    ap.add_argument("--env-json")
    ap.add_argument("--timestamp")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # Reject values that would divide by zero in _bench or never terminate in
    # _sizes, before any process group exists.
    if args.iters < 1:
        ap.error(f"--iters must be >= 1 (got {args.iters})")
    try:
        sizes = _sizes(args.min_bytes, args.max_bytes)
    except ValueError as exc:
        ap.error(str(exc))

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:  # pragma: no cover
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3

    rank = int(os.environ.get("RANK", "0"))
    world = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if world < 2 or world % 2 != 0:
        if rank == 0:
            print(f"ERROR: rl-mesh needs an even world_size >= 2 (got {world})", file=sys.stderr)
        return 5
    torch.cuda.set_device(local_rank)
    dev = torch.device(f"cuda:{local_rank}")
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12357")
    if not dist.is_initialized():
        dist.init_process_group("nccl")

    half = world // 2
    is_trainer = rank < half
    # peer for the paired (1:1) pattern: trainer i <-> generator (i+half)
    paired_peer = (rank + half) if is_trainer else (rank - half)
    groups = []
    peak = 0.0

    def _buf(nbytes):
        return torch.empty(nbytes, dtype=torch.uint8, device=dev)

    # PAIRED 1:1 send/recv. Every rank times its own side; the reported time is
    # the MAX across ranks (the slowest pair).
    for direction in ("trainer_to_generator", "generator_to_trainer"):
        rows = []
        sender_is_trainer = (direction == "trainer_to_generator")
        i_send = (is_trainer == sender_is_trainer)  # this rank sends in this direction
        for nbytes in sizes:
            buf = _buf(nbytes)

            def step():
                if i_send:
                    dist.send(buf, dst=paired_peer)
                else:
                    dist.recv(buf, src=paired_peer)
            try:
                ms = _bench(step, torch, args.warmup, args.iters)
            except RuntimeError as exc:
                # NOTE(review): a rank that breaks here skips the all_reduce its
                # peers still enter — confirm failures are always symmetric.
                rows.append({"transfer_bytes": nbytes, "error": repr(exc), "correct": None})
                break
            # reduce timing across ranks (max = slowest pair) for a stable number
            t = torch.tensor([ms], device=dev)
            dist.all_reduce(t, op=dist.ReduceOp.MAX)
            ms_max = float(t.item())
            gb_s = (nbytes / (ms_max / 1e3)) / 1e9 if ms_max > 0 else 0.0
            peak = max(peak, gb_s)
            rows.append({"transfer_bytes": nbytes, "time_ms": round(ms_max, 5),
                         "bandwidth_gb_s": round(gb_s, 2), "correct": True})
        meta = {"direction": direction, "pattern": "paired", "world_size": world,
                "trainer_ranks": half, "generator_ranks": world - half,
                "topology_class": args.topology_class, "measurement_contract": MEASUREMENT_CONTRACT}
        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})

    # REDISTRIBUTE: disjoint all-to-all (every sender-mesh rank sends to every
    # receiver-mesh rank). Each sender sends nbytes/half to each receiver,
    # issued as individual isend/irecv requests that are then waited on.
    for direction in ("trainer_to_generator", "generator_to_trainer"):
        rows = []
        senders = range(0, half) if direction == "trainer_to_generator" else range(half, world)
        receivers = range(half, world) if direction == "trainer_to_generator" else range(0, half)
        am_sender = rank in senders
        am_receiver = rank in receivers
        for nbytes in sizes:
            chunk = max(1, nbytes // half)
            sbuf = _buf(chunk)
            # One receive buffer per source, allocated once per size so the
            # timed step measures the transfer rather than allocation.
            rbufs = [_buf(chunk) for _ in senders] if am_receiver else []

            def step():
                reqs = []
                if am_sender:
                    for dst in receivers:
                        reqs.append(dist.isend(sbuf, dst=dst))
                if am_receiver:
                    for src, rbuf in zip(senders, rbufs):
                        reqs.append(dist.irecv(rbuf, src=src))
                for r in reqs:
                    r.wait()
            try:
                ms = _bench(step, torch, args.warmup, args.iters)
            except RuntimeError as exc:
                rows.append({"transfer_bytes": nbytes, "error": repr(exc), "correct": None})
                break
            t = torch.tensor([ms], device=dev)
            dist.all_reduce(t, op=dist.ReduceOp.MAX)
            ms_max = float(t.item())
            # effective payload moved per receiver = nbytes (half chunks of nbytes/half)
            gb_s = (nbytes / (ms_max / 1e3)) / 1e9 if ms_max > 0 else 0.0
            peak = max(peak, gb_s)
            rows.append({"transfer_bytes": nbytes, "time_ms": round(ms_max, 5),
                         "bandwidth_gb_s": round(gb_s, 2), "correct": True})
        meta = {"direction": direction, "pattern": "redistribute", "world_size": world,
                "trainer_ranks": half, "generator_ranks": world - half,
                "topology_class": args.topology_class, "measurement_contract": MEASUREMENT_CONTRACT}
        groups.append({**meta, "comparison_key": comparison_key(meta), "rows": rows})

    # Only rank 0 writes the document; the others just join the final barrier.
    if rank != 0:
        dist.barrier()
        dist.destroy_process_group()
        return 0

    env = None
    if args.env_json and os.path.exists(args.env_json):
        with open(args.env_json) as fh:
            env = json.load(fh)
    doc = {
        "schema_version": SCHEMA_VERSION, "family": FAMILY,
        "generated_by": "rl_mesh_bench.py",
        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
        "runner": args.runner, "transport": args.transport,
        "measurement_contract": MEASUREMENT_CONTRACT,
        "world_size": world, "trainer_ranks": half, "generator_ranks": world - half,
        "status": "valid" if (groups and peak > 0.0) else "invalid",
        "peak_bandwidth_gb_s": round(peak, 2),
        "num_groups": len(groups), "groups": groups, "environment": env,
    }
    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
    with open(args.out, "w") as fh:
        json.dump(doc, fh, indent=2)
        fh.write("\n")
    print(f"rl-mesh: {len(groups)} (direction,pattern) groups -> {args.out} "
          f"(status={doc['status']}, peak_bw={peak:.1f} GB/s, world={world} trainer={half})")
    dist.barrier()
    dist.destroy_process_group()
    return 0 if doc["status"] == "valid" else 1


if __name__ == "__main__":
    raise SystemExit(main())
+ Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈ + 8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson. + * balanced — load-equalized AND maximally spread: token i, slot j → + (i + j·experts_per_rank) mod E, so the 8 experts sit one-per-rank + (fan-out = ep_size) and every expert is hit equally. The high-fan-out, + perfectly-balanced reference. + * balanced-rank-local — the OLD degenerate "balanced": (i·top_k + j) mod E, i.e. + top_k consecutive experts, which (top_k ≤ experts/rank, aligned) all + land on ONE rank ⇒ fan-out 1, minimum communication. Kept as an + explicit edge case, honestly named. + * zipf — expert popularity ∝ 1/rank (skewed load), uniform-ish fan-out. + +Temporal classes (goal Part 2 "temporal routing changes" — the hot set MOVES across decode +steps; selected by `step`, which every rank passes identically so the trace stays consistent): + + * hotspot-single — STATIC hotspot: expert 0 hot on every step (the adversarial baseline). + * hotspot-moving — the hot expert is `step % experts` (a hotspot that migrates step-to-step). + * alternating-groups — tokens route within one of two disjoint expert halves, the active half + toggling with `step % 2` (models expert groups that alternate across steps). + * trace-replay — RESERVED: captured per-step routing from real serving (needs a captured + trace loader; not yet wired — `build_global_routing` raises for it). + +Always publish the realized fan-out so the workload is never misread again +(`routing_stats`). 
# Mixed into the activation seed as seed * _RANK_SUBSEED + rank + 1.
_RANK_SUBSEED = 7919


def _cpu_gen(seed: int) -> "torch.Generator":
    """Return a CPU ``torch.Generator`` seeded with ``int(seed)``."""
    g = torch.Generator(device="cpu")
    g.manual_seed(int(seed))
    return g


def build_global_routing(global_tokens: int, experts: int, topk: int,
                         routing: str, seed: int, experts_per_rank: int, step: int = 0):
    """Build the global routing trace on CPU.

    Args:
        global_tokens: number of tokens in the global batch (``gt``).
        experts: total number of experts.
        topk: experts selected per token.
        routing: trace class — ``uniform``, ``balanced``, ``balanced-rank-local``,
            ``zipf`` / ``zipf-mild`` / ``zipf-moderate`` / ``zipf-heavy``,
            ``hotspot-single``, ``hotspot-moving`` or ``alternating-groups``.
        seed: seed of the CPU generator driving every random draw.
        experts_per_rank: experts hosted per rank (only ``balanced`` uses it).
        step: temporal snapshot for ``hotspot-moving`` / ``alternating-groups``.

    Returns:
        ``(idx, weights)``: ``idx`` is ``[gt, topk]`` int64 with distinct
        experts within a token; ``weights`` is ``[gt, topk]`` float32, a
        softmax over seeded normal draws.

    Raises:
        ValueError: ``topk > experts``; unknown routing / zipf level;
            ``alternating-groups`` with ``topk > experts // 2``;
            ``trace-replay`` (reserved); or ``balanced`` when no distinct
            placement exists.
    """
    if topk > experts:
        raise ValueError(f"topk ({topk}) > experts ({experts})")
    gt = int(global_tokens)
    g = _cpu_gen(seed)
    if routing == "uniform":
        keys = torch.rand(gt, experts, generator=g)
        idx = keys.argsort(dim=1)[:, :topk].contiguous().to(torch.int64)
    elif routing == "balanced":
        # one expert per rank ⇒ fan-out = ep_size, perfectly balanced load.
        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
        j = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
        epr = int(experts_per_rank)
        if len({(s * epr) % experts for s in range(topk)}) == topk:
            # Offsets are distinct: keep the original formula bit-for-bit.
            idx = (i + j * epr) % experts
        else:
            # topk exceeds the number of distinct rank offsets (e.g. EP4 with
            # top-k 8), so j * epr wraps onto an already-used expert. Go
            # round-robin over ranks and take the next expert on a rank each
            # time the slot index wraps.
            stride = max(1, epr)
            ep = max(1, experts // stride)
            slot = (j % ep) * stride + j // ep
            if torch.unique(slot % experts).numel() != topk:
                raise ValueError(
                    f"balanced routing cannot place topk ({topk}) distinct experts with "
                    f"experts={experts}, experts_per_rank={experts_per_rank}")
            idx = (i + slot) % experts
    elif routing == "balanced-rank-local":
        # top_k consecutive (mod E) ⇒ all on ONE rank ⇒ fan-out 1 (min comm). Edge case.
        i = torch.arange(gt, dtype=torch.int64).unsqueeze(1)
        j = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
        idx = (i * topk + j) % experts
    elif routing == "zipf" or routing.startswith("zipf-"):
        # popularity ∝ 1/rank^s — s sets the skew. zipf == zipf-moderate (s=1).
        s = {"zipf": 1.0, "zipf-mild": 0.5, "zipf-moderate": 1.0, "zipf-heavy": 2.0}.get(routing)
        if s is None:
            raise ValueError(f"unknown zipf level '{routing}'")
        p = 1.0 / torch.arange(1, experts + 1, dtype=torch.float32).pow(s)
        p = (p / p.sum()).expand(gt, experts)
        idx = torch.multinomial(p, topk, replacement=False, generator=g).to(torch.int64)
    elif routing == "hotspot-single" or routing == "hotspot-moving":
        # adversarial: ONE hot expert is in EVERY token's top-k (max single-rank load), the
        # other topk-1 drawn uniformly from the rest. hotspot-single pins it at expert 0
        # (STATIC); hotspot-moving migrates it to `step % experts` (the hot rank moves across
        # decode steps). Identical math otherwise — `hot` is the only difference.
        hot = 0 if routing == "hotspot-single" else (int(step) % experts)
        others = [e for e in range(experts) if e != hot]
        others_t = torch.tensor(others, dtype=torch.int64)
        if gt == 0:
            # torch.stack rejects an empty list; an empty batch has no draws.
            rest = torch.empty((0, topk - 1), dtype=torch.int64)
        else:
            # One randperm per token, in token order, to keep the RNG stream fixed.
            rest = torch.stack([others_t[torch.randperm(experts - 1, generator=g)[:topk - 1]]
                                for _ in range(gt)]).to(torch.int64)
        idx = torch.cat([torch.full((gt, 1), hot, dtype=torch.int64), rest], dim=1)
    elif routing == "alternating-groups":
        # tokens route ENTIRELY within one disjoint expert half; the active half toggles with
        # `step % 2` (group A = [0, E/2), group B = [E/2, E)). Models expert groups that
        # alternate across steps — half the ranks idle each step (a temporal load shift).
        half = experts // 2
        if topk > half:
            raise ValueError(f"alternating-groups needs topk ({topk}) <= experts/2 ({half})")
        base = 0 if (int(step) % 2 == 0) else half
        keys = torch.rand(gt, half, generator=g)
        idx = (keys.argsort(dim=1)[:, :topk].contiguous().to(torch.int64) + base)
    elif routing == "trace-replay":
        raise ValueError("trace-replay routing is reserved — needs a captured per-step trace "
                         "loader (not yet wired); use make_workloads.py + --workload-dir to "
                         "replay a serialized trace, or pick a synthetic temporal mode")
    else:
        raise ValueError(
            f"unknown routing '{routing}' (uniform|balanced|balanced-rank-local|"
            f"zipf[-mild|-moderate|-heavy]|hotspot-single|hotspot-moving|alternating-groups)")
    # Gate weights are drawn after the indices, from the same generator.
    weights = torch.softmax(torch.randn(gt, topk, generator=g), dim=1).to(torch.float32)
    return idx, weights


# Activation VALUE distributions (goal Part 2 "activation-value sensitivity"). Under bf16 combine
# these are latency-neutral (bf16 is value-independent — the ratio is ~1.0, the expected null
# result); they become latency-relevant only under a quantized combine (PR311), where amax /
# outliers / saturation drive scale computation. Kept here so the rig is ready + the value
# identity (activation_identity) is honest about which distribution was used.
ACTIVATION_PROFILES = ("normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation")
_FP8_E4M3_MAX = 448.0  # e4m3 max magnitude — fp8-saturation pushes values to/over this


def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
    """Return contiguous copies of rows ``[rank*T, (rank+1)*T)`` of ``idx`` and ``weights``."""
    lo = rank * tokens_per_rank
    return idx[lo:lo + tokens_per_rank].contiguous(), weights[lo:lo + tokens_per_rank].contiguous()


def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device,
                     dtype=torch.bfloat16, profile: str = "normal"):
    """Per-rank expert-input activations, deterministic from ``(seed, rank)``.

    Values are drawn on CPU in float32 from a generator seeded with
    ``seed * _RANK_SUBSEED + rank + 1``, then moved to ``device`` / ``dtype``.
    ``profile`` selects the value distribution: ``normal`` N(0,1); ``zeros``;
    ``small-amplitude`` (×0.01); ``wide-dynamic-range`` (N(0,1) plus sparse
    large outliers); ``fp8-saturation`` (uniform scaled to 1.5× the e4m3 max).

    Raises:
        ValueError: ``profile`` is not one of ``ACTIVATION_PROFILES``.
    """
    g = _cpu_gen(int(seed) * _RANK_SUBSEED + int(rank) + 1)
    if profile == "zeros":
        x = torch.zeros(tokens, hidden, dtype=torch.float32)
    elif profile == "small-amplitude":
        x = torch.randn(tokens, hidden, generator=g, dtype=torch.float32) * 0.01
    elif profile == "wide-dynamic-range":
        # heavy-tailed: N(0,1) base with a sparse (~1%) set of large (×~250) outliers, so amax
        # per block swings widely token-to-token (the case that stresses per-block fp8 scaling).
        x = torch.randn(tokens, hidden, generator=g, dtype=torch.float32)
        spikes = (torch.rand(tokens, hidden, generator=g) < 0.01).float()
        x = x + spikes * torch.randn(tokens, hidden, generator=g, dtype=torch.float32) * 250.0
    elif profile == "fp8-saturation":
        # uniform in [-1,1] scaled to ~1.5× the e4m3 max so a naive fp8 cast clips/saturates.
        u = torch.rand(tokens, hidden, generator=g, dtype=torch.float32) * 2.0 - 1.0
        x = u * (_FP8_E4M3_MAX * 1.5)
    elif profile == "normal":
        x = torch.randn(tokens, hidden, generator=g, dtype=torch.float32)
    else:
        raise ValueError(f"unknown activation profile '{profile}' (one of {ACTIVATION_PROFILES})")
    return x.to(device=device, dtype=dtype)
def placement_perm(ep_size: int, gpus_per_node: int, placement: str) -> list:
    """Return ``phys`` where ``phys[logical_rank]`` is that rank's physical slot.

    The result is always a permutation of ``range(ep_size)``. Identity is
    returned when everything fits on one node (``gpus_per_node >= ep_size``),
    when ``gpus_per_node <= 0``, and for any placement other than ``striped``
    or ``adversarial``.

      packed          identity — fill one node/domain before crossing.
      runtime-native  identity for now.
      striped         round-robin logical ranks across nodes.
      adversarial     the striped order applied to the reversed rank space.
    """
    n = ep_size
    if gpus_per_node <= 0 or gpus_per_node >= n or placement in ("packed", "runtime-native"):
        return list(range(n))
    if placement not in ("striped", "adversarial"):
        return list(range(n))
    nodes = (n + gpus_per_node - 1) // gpus_per_node
    # Walk intra-node slot 0 on every node, then slot 1, and so on, skipping
    # slots that do not exist on a partially filled last node. For ep_size
    # divisible by gpus_per_node this equals (r % nodes) * gpus_per_node + r // nodes;
    # otherwise it stays a permutation instead of clamping onto slot n-1.
    striped = [node * gpus_per_node + slot
               for slot in range(gpus_per_node)
               for node in range(nodes)
               if node * gpus_per_node + slot < n]
    if placement == "striped":
        return striped
    # adversarial: logical rank r takes the slot striped would give rank n-1-r.
    return striped[::-1]


def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
                     gpus_per_node: int, scale_up_domain: int = None,
                     placement: str = "packed") -> dict:
    """Locality fractions of the routed (token, dest-rank) copies.

    A token's source rank is ``global_id // tokens_per_rank``; its destination
    ranks are ``idx // experts_per_rank`` (both clamped to ``ep_size - 1``).
    ``placement`` maps logical ranks to physical slots, which decides node and
    scale-up-domain membership.

    ``gpus_per_node <= 0`` is treated as "everything on one node", matching
    ``placement_perm``. ``scale_up_domain`` defaults to
    ``gpus_per_node * ep_size``, i.e. a single domain.

    Returns:
        dict with ``placement``, the local / same-node / same-domain /
        cross-node / cross-domain fractions, ``gpus_per_node`` (as passed),
        the effective ``scale_up_domain`` and the number of ``copies``.
    """
    import torch as _t
    gt = idx.shape[0]
    dest = (idx // experts_per_rank).clamp(max=ep_size - 1)      # [gt, topk] dest logical rank
    src = (_t.arange(gt) // max(1, tokens_per_rank)).clamp(max=ep_size - 1).unsqueeze(1)
    src = src.expand_as(dest)
    # Effective node width: a non-positive gpus_per_node would divide by zero below.
    node_size = gpus_per_node if gpus_per_node > 0 else max(1, ep_size)
    sud = scale_up_domain or max(1, node_size * ep_size)         # default: all one domain
    # physical slot of each logical rank, per placement -> node / domain it lives in.
    perm = placement_perm(ep_size, gpus_per_node, placement)
    phys = _t.tensor(perm, dtype=_t.int64)
    pd, ps = phys[dest], phys[src]
    local = (dest == src)
    same_node = (pd // node_size) == (ps // node_size)
    same_dom = (pd // sud) == (ps // sud)
    n = dest.numel()
    return {
        "placement": placement,
        "local_rank_fraction": float(local.float().mean()),
        "same_node_fraction": float(same_node.float().mean()),
        "same_scaleup_domain_fraction": float(same_dom.float().mean()),
        "cross_node_fraction": float((~same_node).float().mean()),
        "cross_domain_fraction": float((~same_dom).float().mean()),
        "gpus_per_node": gpus_per_node, "scale_up_domain": sud, "copies": int(n),
    }
def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
    """Summarise the realized routing of a global ``[gt, topk]`` trace.

    Reports per-token fan-out (distinct destination ranks), per-expert and
    per-rank load, imbalance summaries (coefficient of variation, hotspot
    ratio), empty expert/rank counts, and 16-hex-char SHA-256 identities of
    the indices and, when ``weights`` is given, of the gate weights and the
    two combined.
    """
    n_ranks = max(1, experts // max(1, experts_per_rank))
    top_rank = n_ranks - 1
    dest_rank = idx // experts_per_rank          # [gt, topk] destination rank per assignment

    # Fan-out: mark every rank a token touches, then count the marks per token.
    touched = torch.zeros(idx.shape[0], n_ranks, dtype=torch.bool)
    touched.scatter_(1, dest_rank.clamp(max=top_rank), True)
    fanout = touched.sum(dim=1)                  # [gt]
    # counts for fan-out 1..n_ranks (slot 0 of the bincount is fan-out 0)
    fanout_hist = torch.bincount(fanout, minlength=n_ranks + 1)[1:n_ranks + 1].tolist()

    # Token-copies per expert and per destination rank.
    expert_load = torch.bincount(idx.reshape(-1), minlength=experts).float()
    rank_load_t = torch.bincount(dest_rank.reshape(-1).clamp(max=top_rank),
                                 minlength=n_ranks).float()
    rank_load_hist = [int(v) for v in rank_load_t.tolist()]

    def _coeff_of_variation(load):
        # Population std over mean; 0.0 when the mean is not positive.
        mean = float(load.mean())
        if mean > 0:
            return float(load.std(unbiased=False) / mean)
        return 0.0

    if float(expert_load.mean()) > 0:
        hotspot_ratio = float(expert_load.max() / expert_load.mean())
    else:
        hotspot_ratio = 0.0

    # Workload identity: hash the indices, the weights, and both together.
    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
    idx_hash = hashlib.sha256(idx_bytes).hexdigest()[:16]
    if weights is None:
        w_hash = None
        routing_hash = idx_hash
    else:
        w_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
        w_hash = hashlib.sha256(w_bytes).hexdigest()[:16]
        routing_hash = hashlib.sha256(idx_bytes + w_bytes).hexdigest()[:16]

    stats = {
        "fanout_mean": float(fanout.float().mean()),
        "fanout_min": int(fanout.min()), "fanout_max": int(fanout.max()),
        "fanout_hist": fanout_hist,              # index k-1 = #tokens with fan-out k
        "rank_load_hist": rank_load_hist,        # token-copies sent to each dest rank
        "routed_copies": int(fanout.sum()),      # total (token, dest-rank) pairs
        "expert_load_min": int(expert_load.min()), "expert_load_max": int(expert_load.max()),
        "expert_load_mean": float(expert_load.mean()),
        "expert_load_cv": _coeff_of_variation(expert_load),
        "rank_load_cv": _coeff_of_variation(rank_load_t), "hotspot_ratio": hotspot_ratio,
        "dest_rank_load_max": int(rank_load_t.max()),
        "dest_rank_load_mean": float(rank_load_t.mean()),
        "empty_expert_count": int((expert_load == 0).sum()),
        "empty_rank_count": int((rank_load_t == 0).sum()),
        "routing_hash": routing_hash, "idx_hash": idx_hash, "weights_hash": w_hash,
    }
    return stats
+ si, _ = build_global_routing(GT, E, TOPK, "hotspot-single", 67, EPR, step=5) + assert (si[:, 0] == 0).all(), "hotspot-single must pin expert 0 on every step" + mi, _ = build_global_routing(GT, E, TOPK, "hotspot-moving", 67, EPR, step=5) + assert (mi[:, 0] == 5).all(), "hotspot-moving step=5 must pin expert 5" + mi0, _ = build_global_routing(GT, E, TOPK, "hotspot-moving", 67, EPR, step=0) + assert (mi0[:, 0] == 0).all(), "hotspot-moving step=0 == static origin" + # all topk distinct (hot + topk-1 from the rest, no collision) + assert all(len(set(r.tolist())) == TOPK for r in mi[:16]), "moving-hotspot topk must stay distinct" + # (2) alternating-groups: even step -> lower half, odd step -> upper half. + a0, _ = build_global_routing(GT, E, TOPK, "alternating-groups", 67, EPR, step=0) + a1, _ = build_global_routing(GT, E, TOPK, "alternating-groups", 67, EPR, step=1) + assert int(a0.max()) < E // 2 and int(a1.min()) >= E // 2, "alternating-groups must toggle halves" + # (3) new stats: uniform low CV / no empties; hotspot high CV + many empty experts. + su = routing_stats(build_global_routing(GT, E, TOPK, "uniform", 67, EPR)[0], E, EPR) + sh = routing_stats(si, E, EPR) + assert su["hotspot_ratio"] < 1.5 and sh["hotspot_ratio"] > 5, "hotspot_ratio must separate uniform/hotspot" + assert sh["empty_expert_count"] >= 0 and "empty_rank_count" in sh and "dest_rank_load_max" in sh + print(f"routing temporal+stats OK (uniform hotspot_ratio={su['hotspot_ratio']:.2f} " + f"hotspot empty_experts={sh['empty_expert_count']} dest_rank_max={sh['dest_rank_load_max']})") + # (4) value profiles: distinct value shapes, all finite, fp8-saturation exceeds e4m3 max. 
+ dev = torch.device("cpu") + z = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32, profile="zeros") + assert float(z.abs().max()) == 0.0, "zeros profile must be all-zero" + sat = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32, profile="fp8-saturation") + assert float(sat.abs().max()) > _FP8_E4M3_MAX, "fp8-saturation must exceed e4m3 max" + sm = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32, profile="small-amplitude") + assert float(sm.abs().max()) < 1.0, "small-amplitude must be tiny" + for prof in ACTIVATION_PROFILES: + v = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32, profile=prof) + assert torch.isfinite(v).all(), f"{prof} produced non-finite values" + print(f"activation profiles OK ({', '.join(ACTIVATION_PROFILES)})") + print("routing self-test: PASS") + sys.exit(0) diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py new file mode 100644 index 000000000..217d9ca80 --- /dev/null +++ b/experimental/CollectiveX/tests/run_ep.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun). + +Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep +via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine +are timed SEPARATELY (see ep_harness); only T varies along the resulting line. 
def main() -> int:
    """Parse arguments, validate the backend's declared capabilities, then run the EP sweep.

    Reads RANK / WORLD_SIZE / LOCAL_RANK from the environment, records provenance on ``args``,
    rejects unsupported backend/precision/mode/contract combinations before any process group
    exists, initializes torch.distributed, and hands off to ``ep_harness.run_sweep``.

    Returns:
        3 when torch cannot be imported, 5 when a requested combination is rejected, otherwise
        whatever ``backend.finalize(rc)`` returns for the sweep's return code.

    Raises:
        Any exception from backend construction or the sweep, after its traceback is printed
        to stdout on the failing rank.
    """
    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
    ap.add_argument("--backend", required=True,
                    choices=["deepep", "deepep-hybrid", "mori", "uccl", "nccl-ep", "flashinfer"])
    ep_harness.add_common_args(ap)
    args = ap.parse_args()

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:  # pragma: no cover
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3

    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")

    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
    if getattr(args, "eplb", False):
        import eplb
        args.num_logical_experts = args.experts
        args.experts = eplb.physical_count(args.experts, args.num_redundant_experts, world_size)

    # Reproduction provenance (recorded in the artifact). shlex.join quotes any argument that
    # contains spaces or shell metacharacters, so the recorded command can be pasted back into a
    # shell; arguments without such characters are emitted unchanged.
    # NOTE(review): --nproc_per_node is filled with WORLD_SIZE, which looks exact only for
    # single-node launches — confirm how multi-node runs should be recorded.
    import shlex
    args.reproduction_command = (f"torchrun --nproc_per_node={world_size} tests/run_ep.py "
                                 + shlex.join(sys.argv[1:]))
    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
    # Container provenance (goal P1): arch (amd64/arm64) + local squash hash for Enroot/Pyxis.
    import platform as _plat
    _arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine())
    args.image_arch = _arch
    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
    # Complete GitHub provenance (goal P1): repo, run id, attempt, ref/branch, source SHA, job,
    # artifact. A result is only publication-'official' when these are present (validity gate).
    _run = {"run_id": os.environ.get("GITHUB_RUN_ID"),
            "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
            "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
            "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
            "repo": os.environ.get("GITHUB_REPOSITORY"),
            "job": os.environ.get("GITHUB_JOB"),
            "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME")}
    args.git_run = _run if any(_run.values()) else None

    # Import the backend CLASS (module-top imports torch + the backend lib; no process
    # group needed) and REJECT unsupported combos BEFORE init — never fall back or
    # mislabel (review/goal). All ranks reject identically.
    if args.backend == "mori":
        from ep_mori import MoRIBackend as Backend
    elif args.backend == "nccl-ep":
        from ep_nccl import NCCLBackend as Backend
    elif args.backend == "uccl":
        from ep_uccl import UCCLBackend as Backend
    elif args.backend == "flashinfer":
        from ep_flashinfer import FlashInferBackend as Backend
    elif args.backend == "deepep-hybrid":
        from ep_deepep_hybrid import DeepEPHybridBackend as Backend
    else:
        from ep_deepep import DeepEPBackend as Backend
    if args.num_ep_groups != 1:
        if rank == 0:
            print(f"ERROR: num_ep_groups={args.num_ep_groups} REJECTED — real subgroup process "
                  f"groups are unimplemented; not faking it.", file=sys.stderr)
        return 5
    sp = getattr(Backend, "SUPPORTED_PRECISIONS", {"bf16"})
    sm = getattr(Backend, "SUPPORTED_MODES", {"normal"})
    if args.dispatch_dtype not in sp or args.mode not in sm:
        if rank == 0:
            print(f"ERROR: {args.backend} REJECTS dispatch-dtype={args.dispatch_dtype} / "
                  f"mode={args.mode} — not supported on this build (no fallback). "
                  f"supported precisions={sorted(sp)} modes={sorted(sm)}.", file=sys.stderr)
        return 5
    # Combine-path capability (review: dispatch_dtype=fp8 must NOT silently imply quantized
    # combine). Defaults (bf16 / none) reproduce today's behavior; a quant-combine backend
    # widens its SUPPORTED_COMBINE_* sets. getattr keeps backends that don't declare them at bf16/none.
    scd = getattr(Backend, "SUPPORTED_COMBINE_DTYPES", {"bf16"})
    sqm = getattr(Backend, "SUPPORTED_COMBINE_QUANT_MODES", {"none"})
    cdt = getattr(args, "combine_dtype", "bf16")
    cqm = getattr(args, "combine_quant_mode", "none")
    if cdt not in scd or cqm not in sqm:
        if rank == 0:
            print(f"ERROR: {args.backend} REJECTS combine-dtype={cdt} / combine-quant-mode={cqm} "
                  f"— quant combine not wired (no fallback). supported combine_dtypes={sorted(scd)} "
                  f"quant_modes={sorted(sqm)}.", file=sys.stderr)
        return 5
    # Measurement-contract capability (review #3): each adapter conforms to a declared
    # contract; reject anything else rather than letting it pick its own timing boundary.
    sc = getattr(Backend, "SUPPORTED_CONTRACTS", {"layout-and-dispatch-v1"})
    if args.measurement_contract not in sc:
        if rank == 0:
            print(f"ERROR: {args.backend} REJECTS measurement-contract="
                  f"{args.measurement_contract} — supported={sorted(sc)}.", file=sys.stderr)
        return 5
    if args.measurement_contract == "cached-layout-comm-only-v1" and args.mode == "ll":
        if rank == 0:
            print("ERROR: cached-layout-comm-only-v1 is meaningless for LL (low_latency_dispatch "
                  "computes its layout internally; nothing to hoist).", file=sys.stderr)
        return 5

    # MoRI inits its shmem on a process group it registers as "default" and wants
    # the gloo+nccl combo with an explicit device_id (per its reference test);
    # DeepEP uses a plain nccl group.
    # Cross-node rendezvous: env:// (TCPStore at MASTER_ADDR:PORT) is the default and is byte-identical
    # to single-node behavior. But on the H100/H200/MI355X fleets the rank-0 MASTER_ADDR (the scontrol
    # management-subnet NodeAddr) is NOT reachable from a peer rank's enroot container net namespace, so
    # the TCPStore bootstrap times out before any RDMA transport engages. When CX_RDZV_FILE points at a
    # path on the COMPUTE-VISIBLE shared mount, init via a FileStore instead: ranks exchange the store
    # (and NCCL's unique-id) through the shared file, and NCCL then connects peers over the IB fabric
    # (which IS routable cross-node) rather than the unreachable management TCP. Opt-in; unset = today.
    if not dist.is_initialized():
        _rdzv = os.environ.get("CX_RDZV_FILE")
        # Built once: empty unless the opt-in file rendezvous is requested.
        _file_init = {"init_method": f"file://{_rdzv}"} if _rdzv else {}
        if args.backend == "mori":
            dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world_size,
                                    device_id=device, **_file_init)
        else:
            # A file:// init needs rank/world_size passed explicitly; env:// reads them itself.
            _fstore = dict(_file_init, rank=rank, world_size=world_size) if _rdzv else {}
            dist.init_process_group("nccl", **_fstore)

    # Construct + run inside a try so a backend exception (esp. a new adapter on GPU) prints its
    # FULL traceback to STDOUT — torchrun captures per-rank stdout but only summarizes stderr, so an
    # uncaught exception is otherwise invisible in CI. Print on every rank (prefixed) then re-raise.
    try:
        backend = Backend(args, rank, world_size, local_rank, device)
        if rank == 0:
            print(f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
                  f"world={world_size} ep_size={world_size} hidden={args.hidden} "
                  f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype} "
                  f"routing={args.routing} seed={args.seed}")
        rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
    except Exception:
        import traceback
        print(f"[run_ep][rank{rank}] backend={args.backend} FAILED:\n" + traceback.format_exc(),
              flush=True)
        raise
    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
    return backend.finalize(rc)


if __name__ == "__main__":
    raise SystemExit(main())
HEADLINE = "uniform"
BEST_CASE = "balanced-rank-local"  # min-comm degenerate case (fan-out 1) — not a stressor


def _none_safe(values) -> tuple:
    """Sort key for a tuple of fields that may be None.

    Each field becomes (is_none, value), so a None is ordered after real values and is never
    compared with ``<`` against a string or number (which would raise TypeError).
    """
    return tuple((v is None, v) for v in values)


def _routing_label(doc: dict) -> str:
    """Routing name from doc["shape"], suffixed with "+eplb" when doc["eplb"]["enabled"] is truthy."""
    sh = doc.get("shape", {}) or {}
    r = sh.get("routing", "?")
    return r + ("+eplb" if (doc.get("eplb") or {}).get("enabled") else "")


def _group_key(doc: dict) -> tuple:
    """Configuration identity of a result doc; everything except the routing distribution.

    The SKU is the runner name up to the first "_" and then up to the first "-".
    """
    sh = doc.get("shape", {}) or {}
    q = sh.get("quant", {}) or {}
    sku = (doc.get("runner") or "?").split("_")[0].split("-")[0]
    return (sku, doc.get("backend"), doc.get("phase"),
            sh.get("dispatch_dtype"), doc.get("mode"), doc.get("measurement_contract"),
            doc.get("ep_size"), q.get("combine_quant_mode", "none"),
            sh.get("activation_profile", "normal"))


def _p99_by_T(doc: dict, metric: str) -> dict:
    """Map tokens_per_rank -> p99 of ``metric`` for every row that carries both values."""
    out = {}
    for r in doc.get("rows", []):
        T = r.get("tokens_per_rank")
        m = r.get(metric) or {}
        if T is not None and isinstance(m, dict) and m.get("p99") is not None:
            out[int(T)] = float(m["p99"])
    return out


def analyze(results_dir: str, metric: str = "roundtrip", anchors=None) -> dict:
    """Build the distribution-sensitivity report from every ``*.json`` under ``results_dir``.

    Files that cannot be read or parsed, whose top level is not a JSON object, whose family is
    not "moe", or that have no rows are skipped. Docs are grouped by ``_group_key``; within a
    group each routing label's p99-by-T series is compared against the uniform headline at the
    T values both share (restricted to ``anchors`` when given).

    Args:
        results_dir: directory searched recursively for result docs.
        metric: row field holding the ``{"p99": ...}`` dict to compare.
        anchors: optional collection of tokens/rank values to restrict the comparison to.

    Returns:
        {"metric", "anchors", "groups"}; one entry per group that has a uniform headline.
    """
    # group docs by identical config; within a group map routing-label -> {T: p99}.
    groups: dict = {}
    pattern = os.path.join(results_dir, "**", "*.json")
    for path in sorted(glob.glob(pattern, recursive=True)):
        try:
            with open(path) as fh:  # closed deterministically, even when parsing fails
                doc = json.load(fh)
        except (ValueError, OSError):  # JSONDecodeError and decode errors are ValueErrors
            continue
        if not isinstance(doc, dict):
            continue  # e.g. a JSON array dropped in the results tree — not a result doc
        if doc.get("family") != "moe" or not doc.get("rows"):
            continue
        gk = _group_key(doc)
        # merge (not overwrite) so multiple files of the same config+routing — e.g. an anchor
        # sensitivity run plus a full-ladder headline run — combine their T points.
        groups.setdefault(gk, {}).setdefault(_routing_label(doc), {}).update(_p99_by_T(doc, metric))

    results = []
    # None-safe ordering: docs lacking a key field (e.g. no measurement_contract) must not
    # make the sort raise when another group has that field set.
    for gk, by_routing in sorted(groups.items(), key=lambda kv: _none_safe(kv[0])):
        sku, backend, phase, dtype, mode, contract, ep, cqm, act = gk
        headline = by_routing.get(HEADLINE)
        if not headline:
            continue  # no uniform headline in this group -> can't form a ratio

        per_dist, worst, best_case, eplb_recovery = {}, None, None, None
        for rlabel, series in by_routing.items():
            if rlabel == HEADLINE:
                continue
            shared = sorted(set(headline) & set(series))
            ratios = {t: series[t] / headline[t] for t in shared
                      if (anchors is None or t in anchors) and headline[t] > 0}
            if not ratios:
                continue
            rmax_T = max(ratios, key=ratios.get)
            per_dist[rlabel] = {"ratio_max": round(ratios[rmax_T], 4), "at_T": rmax_T,
                                "ratio_by_T": {t: round(v, 4) for t, v in ratios.items()}}
            base = rlabel.replace("+eplb", "")
            is_eplb = rlabel.endswith("+eplb")
            if base == BEST_CASE:
                best_case = {"routing": rlabel, "ratio": round(min(ratios.values()), 4)}
            elif not is_eplb:  # a genuine stressor (balanced / zipf* / hotspot-single)
                cand = (ratios[rmax_T], rlabel, rmax_T)
                if worst is None or cand[0] > worst[0]:
                    worst = cand
        # EPLB recovery: zipf vs zipf+eplb worst ratio (the remedy's effect), if both present
        if "zipf" in per_dist and "zipf+eplb" in per_dist:
            eplb_recovery = {"zipf": per_dist["zipf"]["ratio_max"],
                             "zipf+eplb": per_dist["zipf+eplb"]["ratio_max"]}

        results.append({
            "sku": sku, "backend": backend, "phase": phase, "dispatch_dtype": dtype,
            "mode": mode, "contract": contract, "ep": ep,
            "combine_quant_mode": cqm, "activation_profile": act,
            "metric": metric,
            "headline_p99_range_us": [round(min(headline.values()), 2), round(max(headline.values()), 2)],
            "distribution_sensitivity_ratio": round(worst[0], 4) if worst else None,
            "worst_distribution": worst[1] if worst else None,
            "worst_at_T": worst[2] if worst else None,
            "best_case_ratio": best_case, "eplb_recovery": eplb_recovery,
            "per_distribution": per_dist,
        })
    return {"metric": metric, "anchors": sorted(anchors) if anchors else None, "groups": results}


def to_markdown(report: dict) -> str:
    """Render the report as a markdown table; groups without a stressor ratio are only counted."""
    # Only groups that actually have a stressor distribution vs uniform are a sensitivity result;
    # uniform-only groups (other contracts / fp8 / LL that didn't run the routing sweep) are noise.
    rated = [r for r in report["groups"] if r["distribution_sensitivity_ratio"] is not None]
    skipped = len(report["groups"]) - len(rated)
    if not rated:
        return "_no comparable (uniform + stressor) routing groups found_"
    h = (f"### Distribution sensitivity ({report['metric']} p99; ratio = worst stressor / uniform)\n\n"
         "| SKU | backend | phase | dtype·mode·contract | headline p99 µs | worst dist @T | "
         "**sensitivity** | best-case | EPLB (zipf→+eplb) |\n"
         "|---|---|---|---|---|---|---|---|---|\n")
    ordered = sorted(rated, key=lambda x: _none_safe(
        (x["sku"], x["backend"], x["phase"], x["dispatch_dtype"])))
    for r in ordered:
        sr = r["distribution_sensitivity_ratio"]
        cfg = f"{r['dispatch_dtype']}·{r['mode']}·{(r['contract'] or '').replace('-v1','')}"
        worst = f"{r['worst_distribution']} @{r['worst_at_T']}"
        rng = r["headline_p99_range_us"]
        bc = f"{r['best_case_ratio']['ratio']:.2f}×" if r.get("best_case_ratio") else "—"
        ev = (f"{r['eplb_recovery']['zipf']:.2f}→{r['eplb_recovery']['zipf+eplb']:.2f}×"
              if r.get("eplb_recovery") else "—")
        h += (f"| {r['sku']} | {r['backend']} | {r['phase']} | {cfg} | "
              f"{rng[0]}–{rng[1]} | {worst} | **{sr:.2f}×** | {bc} | {ev} |\n")
    if skipped:
        h += f"\n_({skipped} uniform-only group(s) omitted — no stressor distribution run for them.)_\n"
    return h


def main() -> int:
    """CLI entry point: analyze a results directory, optionally write JSON, print markdown."""
    ap = argparse.ArgumentParser(description="CollectiveX distribution-sensitivity summary")
    ap.add_argument("--results-dir", default="results")
    ap.add_argument("--metric", default="roundtrip", choices=["roundtrip", "dispatch", "combine"])
    ap.add_argument("--anchors", default="", help="comma-separated tokens/rank to restrict to; blank = all common T")
    ap.add_argument("--out", default="", help="write the JSON report here (markdown always goes to stdout)")
    a = ap.parse_args()
    anchors = {int(x) for x in a.anchors.replace(",", " ").split()} if a.anchors.strip() else None
    report = analyze(a.results_dir, a.metric, anchors)
    if a.out:
        os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
        with open(a.out, "w") as fh:
            json.dump(report, fh, indent=2, sort_keys=True)
        print(f"wrote {a.out} ({len(report['groups'])} groups)")
    print(to_markdown(report))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
GENERATOR_VERSION = "collectivex-routing-v1"
GATE_WEIGHT_FORMAT = "softmax-of-randn-f32"  # how topk_weights are produced (see routing.py)
ACTIVATION_GENERATOR = "collectivex-activation-v1"  # bump if the activation value-generator changes
ACTIVATION_PROFILE_DEFAULT = "normal"  # seeded N(0,1) per token; the only wired profile


def _sha256(b: bytes) -> str:
    """Hex SHA-256 digest of ``b``."""
    return hashlib.sha256(b).hexdigest()


def compute_workload_id(routing: str, hidden: int, topk: int, experts: int,
                        global_tokens: int, seed: int, generator: str = GENERATOR_VERSION,
                        step: int = 0) -> str:
    """Deterministic id over the identity-defining params. Same params+generator => same id.

    `step` is the temporal snapshot for moving/alternating routing; folded in ONLY when non-zero
    so every existing (step=0) canonical workload keeps its id.

    Returns:
        The first 16 hex characters of the SHA-256 of the parameter key string.
    """
    key = (f"{generator}|routing={routing}|hidden={hidden}|topk={topk}|experts={experts}"
           f"|gt={global_tokens}|seed={seed}")
    if step:
        key += f"|step={step}"
    return _sha256(key.encode())[:16]


def compute_activation_identity(activation_profile, seed, hidden,
                                generator=ACTIVATION_GENERATOR) -> str:
    """Deterministic identity of the activation VALUE distribution (scaffold).

    Today activations are seeded N(0,1) and NOT serialized, so identity = a descriptor hash. The
    formula MUST match the inline one in ep_harness so a manifest and a result doc agree. Becomes
    the byte-hash of the serialized activations once a model-trace value rig lands.
    """
    key = f"{activation_profile}|seed={seed}|hidden={hidden}|gen={generator}"
    return _sha256(key.encode())[:16]


def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
                   idx_np, weights_np, routing_stats=None,
                   activation_profile=ACTIVATION_PROFILE_DEFAULT):
    """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib.

    Checksums are taken over explicitly little-endian int32 / float32 bytes so the same trace
    hashes identically on any host byte order.
    """
    idx_bytes = idx_np.astype("<i4").tobytes()
    w_bytes = weights_np.astype("<f4").tobytes()
    wid = compute_workload_id(routing, hidden, topk, experts, global_tokens, seed)
    return {
        "schema_version": WORKLOAD_SCHEMA_VERSION,
        "workload_id": wid,
        "generator_version": GENERATOR_VERSION,
        "gate_weight_format": GATE_WEIGHT_FORMAT,
        "dims": {"hidden": hidden, "topk": topk, "experts": experts,
                 "global_tokens": int(global_tokens), "experts_per_rank": experts_per_rank},
        "routing_profile": routing,
        "seed": seed,
        "checksums": {  # SHA-256 over the raw little-endian array bytes (int32 / float32)
            "topk_idx": _sha256(idx_bytes),
            "topk_weights": _sha256(w_bytes),  # gate-weight (value) distribution identity
            "trace": _sha256(idx_bytes + w_bytes),  # full-workload identity
        },
        "routing_stats": routing_stats or {},
        # Activation value distribution (scaffold): name + deterministic descriptor identity.
        # NOT under checksums — activations are not byte-serialized today (see compute_activation_identity).
        "activation_profile": activation_profile,
        "activation_identity": compute_activation_identity(activation_profile, seed, hidden),
    }


def build_workload(hidden, topk, experts, routing, global_tokens, seed, experts_per_rank,
                   activation_profile=ACTIVATION_PROFILE_DEFAULT):
    """Generate a canonical trace. Needs torch (routing.py). Returns (idx_np, weights_np, manifest)."""
    import numpy as np
    import routing as _routing
    idx_t, w_t = _routing.build_global_routing(global_tokens, experts, topk, routing, seed,
                                               experts_per_rank)
    rstats = _routing.routing_stats(idx_t, experts, experts_per_rank, weights=w_t)
    idx_np = idx_t.detach().cpu().numpy().astype(np.int32)
    w_np = w_t.detach().cpu().numpy().astype(np.float32)
    manifest = build_manifest(routing, hidden, topk, experts, global_tokens, seed,
                              experts_per_rank, idx_np, w_np, rstats,
                              activation_profile=activation_profile)
    return idx_np, w_np, manifest


def save_workload(out_dir, idx_np, weights_np, manifest) -> str:
    """Write ``<workload_id>.npz`` and ``<workload_id>.manifest.json`` into ``out_dir``.

    The directory is created if missing. Returns the workload_id taken from the manifest.
    """
    import numpy as np
    os.makedirs(out_dir, exist_ok=True)
    wid = manifest["workload_id"]
    np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"),
                        topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32))
    with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh:
        json.dump(manifest, fh, indent=2, sort_keys=True)
    return wid


def load_workload(npz_path, verify=True):
    """Load a canonical trace (numpy + stdlib only). Returns (idx_np, weights_np, manifest).

    ``npz_path`` may be given with or without the ``.npz`` suffix; the manifest is read from the
    same basename.

    Raises:
        ValueError: if verify=True and the on-disk bytes don't match the manifest checksums.
    """
    import numpy as np
    base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path
    with open(base + ".manifest.json") as fh:
        manifest = json.load(fh)
    # NpzFile keeps the archive open until closed; read both arrays, then release the handle.
    with np.load(base + ".npz") as z:
        idx_np, w_np = z["topk_idx"], z["topk_weights"]
    if verify:
        ok, reason = verify_workload(manifest, idx_np, w_np)
        if not ok:
            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
    return idx_np, w_np, manifest


def verify_workload(manifest, idx_np, weights_np):
    """Recompute checksums and the workload_id and compare to the manifest.

    Returns:
        (ok, reason): (True, "ok") on success, otherwise (False, <what differed>). A manifest
        that lacks a field needed to recompute the id is reported as a failure, not raised.
    """
    ib = idx_np.astype("<i4").tobytes()
    wb = weights_np.astype("<f4").tobytes()
    cs = manifest.get("checksums", {})
    if _sha256(ib) != cs.get("topk_idx"):
        return False, "topk_idx hash differs"
    if _sha256(wb) != cs.get("topk_weights"):
        return False, "topk_weights hash differs"
    if _sha256(ib + wb) != cs.get("trace"):
        return False, "trace hash differs"
    try:
        dims = manifest["dims"]
        recorded = manifest["workload_id"]
        wid = compute_workload_id(manifest["routing_profile"], dims["hidden"],
                                  dims["topk"], dims["experts"],
                                  dims["global_tokens"], manifest["seed"],
                                  manifest.get("generator_version", GENERATOR_VERSION))
    except (KeyError, TypeError) as exc:
        return False, f"manifest is missing an identity field ({exc!r})"
    if wid != recorded:
        return False, f"workload_id mismatch (recomputed {wid} != {recorded})"
    return True, "ok"


# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    import sys
    import tempfile
    # (1) workload_id determinism + sensitivity — pure stdlib, always runs.
    a = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
    b = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
    c = compute_workload_id("uniform", 7168, 8, 256, 4096, 67)
    assert a == b, "workload_id must be deterministic"
    assert a != c, "workload_id must depend on routing"
    print(f"workload_id determinism OK (zipf={a} uniform={c})")
    # (2) build/save/load/verify roundtrip + cross-build identity — needs torch+numpy.
    try:
        import numpy as np  # noqa: F401
        try:
            idx, w, man = build_workload(7168, 8, 256, "zipf", 512, 67, 32)
            built = True
        except Exception as exc:  # torch missing on a login node
            print(f"(torch unavailable — synthesizing arrays to test load/verify: {exc!r})")
            idx = np.random.default_rng(0).integers(0, 256, size=(512, 8)).astype(np.int32)
            w = np.random.default_rng(1).random((512, 8)).astype(np.float32)
            man = build_manifest("zipf", 7168, 8, 256, 512, 67, 32, idx, w)
            built = False
        with tempfile.TemporaryDirectory() as d:
            wid = save_workload(d, idx, w, man)
            idx2, w2, man2 = load_workload(os.path.join(d, f"{wid}.npz"), verify=True)
            assert (idx2 == idx).all() and (w2 == w).all(), "roundtrip array mismatch"
            ok, reason = verify_workload(man2, idx2, w2)
            assert ok, reason
            # tamper -> must fail
            idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256
            still_ok, _ = verify_workload(man2, idx2, w2)
            assert not still_ok, "verify must catch tampering"
        print(f"save/load/verify roundtrip OK (workload_id={wid}, built_via_torch={built})")
    except ImportError:
        print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
    print("workload self-test: PASS")
    sys.exit(0)
Exits non-zero when any doc claims +`official` but fails a gate (or, with --require-official, when any doc isn't official). + +Pure stdlib; uses `jsonschema` if importable, else a built-in required-key/type/enum check. +v3 docs (no publication_status) load as legacy/experimental and are reported, not failed. + + python3 validate_results.py results/*.json + python3 validate_results.py --require-official --schema schemas/ep-result-v4.schema.json results/ +""" +from __future__ import annotations + +import argparse +import glob +import json +import os +import sys + +MIN_SAMPLES_OFFICIAL = 100 +KNOWN_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"} +PUB_STATES = {"official", "comparable-experimental", "diagnostic", "invalid", "failed"} + + +def derive_publication_status(v: dict) -> str: + """MUST mirror ep_harness._derive_publication_status — the validator's job is to confirm the + recorded status equals this derivation.""" + if v.get("execution_status") != "complete": + return "failed" + if (v.get("semantic_correctness") != "pass" or v.get("measurement_conformance") != "conformant" + or v.get("workload_identity") == "inconsistent"): + return "invalid" + sound = (v.get("semantic_correctness") == "pass" + and str(v.get("workload_identity", "")).startswith("consistent") + and v.get("measurement_conformance") == "conformant") + if str(v.get("resource_conformance", "")).endswith("nonconforming"): + return "diagnostic" + # contract-level anomaly (goal P1-e/f): demotes to diagnostic unless waived (anomaly_free). 
+ if not v.get("anomaly_free", True): + return "diagnostic" + if sound and v.get("provenance_complete") and v.get("workload_source") == "canonical-serialized": + return "official" + if sound: + return "comparable-experimental" + return "diagnostic" + + +def _schema_check(doc, schema): + """jsonschema if available; else a pragmatic required-keys/enum check of the top level + rows.""" + try: + import jsonschema + jsonschema.validate(doc, schema) + return [] + except ImportError: + errs = [] + for k in schema.get("required", []): + if k not in doc: + errs.append(f"missing required field '{k}'") + # enum spot-checks the built-in path can do cheaply + ms = doc.get("measurement_contract") + if ms is not None and ms not in KNOWN_CONTRACTS: + errs.append(f"unknown measurement_contract '{ms}'") + ps = doc.get("publication_status") + if ps is not None and ps not in PUB_STATES: + errs.append(f"unknown publication_status '{ps}'") + if not doc.get("rows"): + errs.append("no rows") + return errs + except Exception as exc: # jsonschema.ValidationError + return [f"schema: {exc.message if hasattr(exc, 'message') else exc}"] + + +def validate_doc(doc, schema, path): + errs, warns = [], [] + legacy = "publication_status" not in doc + if legacy: + warns.append("legacy (v3, no publication_status) — loads as experimental, not comparable as official") + return errs, warns, "legacy-experimental" + errs += _schema_check(doc, schema) if schema else [] + v = doc.get("validity", {}) + recorded = doc.get("publication_status") + derived = derive_publication_status(v) + if recorded != derived: + errs.append(f"publication_status '{recorded}' != machine-derived '{derived}' (validity tampered or stale)") + # byte + contract + sample gates + if doc.get("measurement_contract") not in KNOWN_CONTRACTS: + errs.append(f"unknown measurement_contract {doc.get('measurement_contract')}") + rows = doc.get("rows", []) + for r in rows: + if "byte_contracts" not in r: + 
errs.append(f"T={r.get('tokens_per_rank')}: missing byte_contracts"); break + for op in ("dispatch", "combine", "roundtrip"): + if op not in r or "p99" not in r.get(op, {}): + errs.append(f"T={r.get('tokens_per_rank')}: missing {op} percentiles"); break + # anomaly self-consistency (goal P1-e): validity.anomaly_free must equal (no anomalies or waived). + anoms = doc.get("anomalies") or [] + waived = (doc.get("anomaly_summary") or {}).get("waived", False) + expect_anomaly_free = (len(anoms) == 0) or bool(waived) + if v.get("anomaly_free", True) != expect_anomaly_free: + errs.append(f"validity.anomaly_free={v.get('anomaly_free')} but {len(anoms)} anomalies " + f"(waived={waived}) imply {expect_anomaly_free}") + if anoms and not waived and recorded not in ("diagnostic", "invalid", "failed"): + errs.append(f"{len(anoms)} unwaived timing anomaly(ies) but status={recorded} (must be diagnostic)") + # official-grade gates + if recorded == "official": + if not v.get("provenance_complete"): + errs.append("official but provenance_complete=false") + if v.get("workload_source") != "canonical-serialized": + errs.append("official but workload not canonical-serialized") + # goal P1: official requires NON-NULL workload identity (id + signature). 
+ wl = doc.get("workload") or {} + if not wl.get("workload_id"): + errs.append("official but workload_id is null (non-null workload identity required)") + if not wl.get("trace_signature"): + errs.append("official but trace_signature is null") + if anoms and not waived: + errs.append("official but has unwaived timing anomalies") + if rows and min((r.get("samples_pooled", 0) for r in rows)) < MIN_SAMPLES_OFFICIAL: + errs.append(f"official but a point has <{MIN_SAMPLES_OFFICIAL} pooled samples") + if not all(r.get("correct") for r in rows): + errs.append("official but a point failed correctness") + return errs, warns, recorded + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX EP result validator") + ap.add_argument("paths", nargs="+", help="result JSON files or dirs") + ap.add_argument("--schema", default=os.path.join(os.path.dirname(__file__), "schemas", "ep-result-v4.schema.json")) + ap.add_argument("--require-official", action="store_true", + help="fail if any non-legacy doc is not 'official'") + ap.add_argument("--regression", action="store_true", + help="also run threshold-based performance-regression detection (regression.py) " + "over the same files and fail if any hard regression (outside run-to-run " + "noise) is found, so one CI step gates on validity AND performance") + ap.add_argument("--regression-metric", default="roundtrip", help="regression op (default roundtrip)") + ap.add_argument("--regression-pct", default="p99", help="regression percentile (default p99)") + ap.add_argument("--regression-threshold", type=float, default=0.10, + help="regression fractional threshold (default 0.10)") + a = ap.parse_args() + schema = None + if a.schema and os.path.exists(a.schema): + schema = json.load(open(a.schema)) + files = [] + for p in a.paths: + if os.path.isdir(p): + files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True) + else: + files.append(p) + files = sorted(f for f in files if not 
os.path.basename(f).startswith("env_")) + + # cross-run workload identity: within a comparison_key, the realized routing must be the SAME + # workload. We check PER-TOKEN routing_hash agreement (not the whole trace_signature) so two + # runs of the same config at DIFFERENT ladders (e.g. a capped cross-vendor sweep 1..16 vs a full + # 1..128 headline) are NOT falsely flagged — only a genuine conflict (same T, different routing + # bytes) is a different workload. + by_ck = {} # ck -> {T: {routing_hash: [files]}} + bad = 0 + for f in files: + try: + doc = json.load(open(f)) + except (json.JSONDecodeError, OSError): + continue + if doc.get("family") != "moe": + continue + # preserved failed-case record (goal immediate P2): a classified failure (run_in_container + # emitted it on a wedge/timeout/crash). Report it as a preserved case, NOT a validation error. + if doc.get("record_type") == "failed-case": + fm = (doc.get("failure") or {}).get("failure_mode", "?") + print(f"[FAILED-CASE] {os.path.basename(f):68s} mode={fm} (preserved, not a validation error)") + continue + errs, warns, status = validate_doc(doc, schema, f) + ck = doc.get("comparison_key") + # routing_step (temporal) + uneven_tokens change the realized workload but are NOT in the + # comparison_key (they live in reproduction) — include them in the cross-run grouping so a + # moving-hotspot step / uneven-allocation variant isn't falsely flagged as a conflicting + # same-config workload. 
+ repro = doc.get("reproduction") or {} + gk = (ck, repro.get("routing_step", 0), repro.get("uneven_tokens", "none")) if ck else None + if gk: + for r in doc.get("rows", []): + T, rh = r.get("tokens_per_rank"), r.get("routing_hash") + if T is not None and rh: + by_ck.setdefault(gk, {}).setdefault(T, {}).setdefault(rh, []).append(os.path.basename(f)) + tag = "OK" if not errs else "FAIL" + if errs: + bad += 1 + if a.require_official and status not in ("official",) and not errs: + tag = "FAIL"; bad += 1; errs = [f"not official (status={status})"] + print(f"[{tag}] {os.path.basename(f):70s} status={status}") + for e in errs: + print(f" ERROR: {e}") + for w in warns: + print(f" note: {w}") + # report cross-run identity CONFLICTS: same comparison_key + same T but DIFFERENT routing bytes + # (a genuine "not the same workload" — different hardware ran different routing for one point). + for gk, perT in by_ck.items(): + ck = gk[0] + conflicts = {T: hs for T, hs in perT.items() if len(hs) > 1} + if conflicts: + bad += 1 + print(f"[FAIL] comparison_key {ck[:12]} (step={gk[1]},uneven={gk[2]}): per-T routing-hash CONFLICT — not the same workload:") + for T, hs in sorted(conflicts.items()): + print(f" T={T}: " + "; ".join(f"{h[:10]}=[{', '.join(fs)}]" for h, fs in hs.items())) + print(f"\n{'FAILED' if bad else 'PASS'}: {len(files)} files, {bad} problem(s)") + + # Optional performance-regression gate (goal P1 "Add regression thresholds"). Imported lazily so + # validation carries no new dependency/behavior unless --regression is passed. A hard regression + # (a >threshold slowdown outside this point's run-to-run noise) folds into the non-zero exit. 
+ if a.regression: + import regression as _reg + rep = _reg.analyze(a.paths, metric=a.regression_metric, pct=a.regression_pct, + threshold=a.regression_threshold) + print() + print(_reg.to_markdown(rep)) + if rep["hard_regressions"]: + bad += rep["hard_regressions"] + return 1 if bad else 0 + + +if __name__ == "__main__": + raise SystemExit(main())