diff --git a/docs/api/primitive-catalog.md b/docs/api/primitive-catalog.md
index 2a65b30..0bc02bf 100644
--- a/docs/api/primitive-catalog.md
+++ b/docs/api/primitive-catalog.md
@@ -7,7 +7,7 @@
 
 # Primitive catalog — the never-stale anti-reinvention inventory
 
-> **GENERATED** from `@tangle-network/agent-runtime@0.85.0` and `@tangle-network/agent-eval@0.103.1` by `scripts/gen-primitive-catalog.mjs`. Do NOT hand-edit — run `pnpm run docs:api`. This is the mechanical companion to the JUDGMENT in `canonical-api.md` (§2 decision table + §1.5 AgentProfile law): that doc says WHICH primitive to reach for and what NOT to build; this catalog proves WHAT exists. Per-symbol signatures + `file:line` live in the per-module pages under `docs/api/`.
+> **GENERATED** from `@tangle-network/agent-runtime@0.86.0` and `@tangle-network/agent-eval@0.103.1` by `scripts/gen-primitive-catalog.mjs`. Do NOT hand-edit — run `pnpm run docs:api`. This is the mechanical companion to the JUDGMENT in `canonical-api.md` (§2 decision table + §1.5 AgentProfile law): that doc says WHICH primitive to reach for and what NOT to build; this catalog proves WHAT exists. Per-symbol signatures + `file:line` live in the per-module pages under `docs/api/`.
 
 ## 1. agent-runtime — own public surface
 
@@ -246,7 +246,7 @@ Import from `@tangle-network/agent-runtime/intelligence` — 63 exports.
 
 ### Recursive atom + loop kernel (alias of ./runtime)
 
-Import from `@tangle-network/agent-runtime/loops` — 425 exports.
+Import from `@tangle-network/agent-runtime/loops` — 426 exports.
 
 | Symbol | Kind | Summary |
 |---|---|---|
@@ -447,10 +447,12 @@ Import from `@tangle-network/agent-runtime/loops` — 425 exports.
 | `LeaderboardBenchScore` | interface | Structurally `BenchScore` (bench registry shape). |
 | `LeaderboardBenchTask` | interface | Structurally `BenchTask` (bench registry shape) — declared locally so this |
 | `LeaderboardFlagSpec` | interface | One extra CLI flag a spec declares. Parsed by `run()` as `--<name> <value>` |
+| `LeaderboardIterationInfo` | interface | Per-shot outcome context passed as `onCellEvents`'s third argument — how a |
 | `LeaderboardRow` | interface | One leaderboard row — a harness×model profile, every measured column. |
 | `LeaderboardRunContext` | interface | Resolved run configuration handed to `setup` / `teardown` / `export`. |
 | `LeaderboardScenario` | interface | The campaign scenario a case is wrapped into: the case rides along so |
 | `LeaderboardScore` | interface | Structured per-case verdict a `score` function may return (a bare number is |
+| `LeaderboardSpec` | interface | The declarative leaderboard spec. `TArtifact` is the artifact channel the |
 | `LoopCampaignDispatchOptions` | interface | Options for adapting plain agent-eval campaign scenarios into runtime `runLoop` cells. |
 | `LoopIterationDispatchPayload` | interface | Where the iteration's worker was placed. `sibling` = a fresh sandbox the |
 | `LoopLineageOptions` | interface | Opt-in box-lineage controls for `runLoop`. Default OFF — with both flags |
@@ -571,7 +573,7 @@ Import from `@tangle-network/agent-runtime/loops` — 425 exports.
 | `WinnerStrategy` | type | Built-in valid-only winner strategies for `selectValidWinner` (selector≠judge): best gated-valid |
 | `WorktreePatchArtifact` | type | Terminal artifact of one worktree-CLI run — the canonical worktree-harness result (the captured |
 
-**Undocumented supporting types** (add a TSDoc line at the declaration to earn a table row): `AgentEnvironment`, `AgentEnvironmentCapabilities`, `AgentEnvironmentEvent`, `AgentEnvironmentProvider`, `AgentEnvironmentQuery`, `AgentEnvironmentSummary`, `AgenticOptions`, `AgenticRunResult`, `AgenticTool`, `AgentSession`, `AgentSessionRef`, `AgentTurnInput`, `AgentTurnResult`, `AnalystRegistry`, `AnytimeReport`, `AnytimeStrategySummary`, `ArtifactHandle`, `AuditIntentOptions`, `AuthoredHarness`, `AuthoredStrategy`, `AuthorStrategyOptions`, `BenchmarkConfig`, `BenchmarkLift`, `BenchmarkStrategySummary`, `BenchmarkTaskRow`, `BudgetPool`, `BusStats`, `ChampionPick`, `CheckpointRef`, `CheckpointRequest`, `CreateAgentEnvironmentInput`, `DefinedLeaderboard`, `Driver`, `EventBus`, `EvolutionArchiveNode`, `EvolutionBandInfo`, `EvolutionCandidate`, `EvolutionGeneration`, `EvolutionReport`, `ExecRequest`, `ExecResult`, `ForkRequest`, `GitWorkspaceOptions`, `HarvestFailure`, `HarvestReport`, `Inbox`, `InProcessSandboxClientOptions`, `IntentAudit`, `Iteration`, `Leaderboard`, `LeaderboardOptions`, `LeaderboardSpec`, `LoopDecisionPayload`, `LoopDispatchOptions`, `LoopEndedPayload`, `LoopIterationEndedPayload`, `LoopIterationStartedPayload`, `LoopPlanDescription`, `LoopResult`, `LoopSandboxPlacement`, `LoopStartedPayload`, `LoopTraceEmitter`, `LoopWinner`, `McpEnvironmentOptions`, `Observation`, `ObserveOptions`, `OpenSandboxRunOptions`, `PairwiseOptions`, `PatchDeliverableOptions`, `PlacementInfo`, `PromotionGateOptions`, `PromotionVerdict`, `PublishOptions`, `ResourceRequest`, `RouterChatResult`, `RouterChatToolsResult`, `RouterToolLoopResult`, `RunAgenticOptions`, `SandboxRun`, `ShotSpec`, `StrategyEvolutionConfig`, `StrategyResult`, `StreamAgentTurnOptions`, `SuperviseOptions`, `SuperviseSurfaceOptions`, `SupervisorAgentDeps`, `SupervisorOpts`, `SurfaceScore`, `ToolSpec`, `TraceSource`, `ValidationCtx`, `Validator`, `WaterfallCollector`, `WaterfallReport`, `Workspace`, `WorkspaceRequest`, `WorkspaceRun`, `WorktreeCliExecutorOptions`, `WorktreeFanoutOptions`, `AgentEnvironmentStatus`, `AgentSessionStatus`, `ChampionPolicy`, `LoopTraceEvent`, `MakeWorkerAgent`, `WorkspaceCommit`.
+**Undocumented supporting types** (add a TSDoc line at the declaration to earn a table row): `AgentEnvironment`, `AgentEnvironmentCapabilities`, `AgentEnvironmentEvent`, `AgentEnvironmentProvider`, `AgentEnvironmentQuery`, `AgentEnvironmentSummary`, `AgenticOptions`, `AgenticRunResult`, `AgenticTool`, `AgentSession`, `AgentSessionRef`, `AgentTurnInput`, `AgentTurnResult`, `AnalystRegistry`, `AnytimeReport`, `AnytimeStrategySummary`, `ArtifactHandle`, `AuditIntentOptions`, `AuthoredHarness`, `AuthoredStrategy`, `AuthorStrategyOptions`, `BenchmarkConfig`, `BenchmarkLift`, `BenchmarkStrategySummary`, `BenchmarkTaskRow`, `BudgetPool`, `BusStats`, `ChampionPick`, `CheckpointRef`, `CheckpointRequest`, `CreateAgentEnvironmentInput`, `DefinedLeaderboard`, `Driver`, `EventBus`, `EvolutionArchiveNode`, `EvolutionBandInfo`, `EvolutionCandidate`, `EvolutionGeneration`, `EvolutionReport`, `ExecRequest`, `ExecResult`, `ForkRequest`, `GitWorkspaceOptions`, `HarvestFailure`, `HarvestReport`, `Inbox`, `InProcessSandboxClientOptions`, `IntentAudit`, `Iteration`, `Leaderboard`, `LeaderboardOptions`, `LoopDecisionPayload`, `LoopDispatchOptions`, `LoopEndedPayload`, `LoopIterationEndedPayload`, `LoopIterationStartedPayload`, `LoopPlanDescription`, `LoopResult`, `LoopSandboxPlacement`, `LoopStartedPayload`, `LoopTraceEmitter`, `LoopWinner`, `McpEnvironmentOptions`, `Observation`, `ObserveOptions`, `OpenSandboxRunOptions`, `PairwiseOptions`, `PatchDeliverableOptions`, `PlacementInfo`, `PromotionGateOptions`, `PromotionVerdict`, `PublishOptions`, `ResourceRequest`, `RouterChatResult`, `RouterChatToolsResult`, `RouterToolLoopResult`, `RunAgenticOptions`, `SandboxRun`, `ShotSpec`, `StrategyEvolutionConfig`, `StrategyResult`, `StreamAgentTurnOptions`, `SuperviseOptions`, `SuperviseSurfaceOptions`, `SupervisorAgentDeps`, `SupervisorOpts`, `SurfaceScore`, `ToolSpec`, `TraceSource`, `ValidationCtx`, `Validator`, `WaterfallCollector`, `WaterfallReport`, `Workspace`, `WorkspaceRequest`, `WorkspaceRun`, `WorktreeCliExecutorOptions`, `WorktreeFanoutOptions`, `AgentEnvironmentStatus`, `AgentSessionStatus`, `ChampionPolicy`, `LoopTraceEvent`, `MakeWorkerAgent`, `WorkspaceCommit`.
 
 ### Environment provider adapters — generic sandbox/compute bridge
 
diff --git a/docs/api/runtime.md b/docs/api/runtime.md
index 64f3100..07429fa 100644
--- a/docs/api/runtime.md
+++ b/docs/api/runtime.md
@@ -1310,7 +1310,7 @@ Minimum confidence a PROBABILISTIC verdict must clear to end. Default 0.8.
 
 ### LeaderboardScore
 
-Defined in: [runtime/define-leaderboard.ts:60](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L60)
+Defined in: [runtime/define-leaderboard.ts:61](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L61)
 
 Structured per-case verdict a `score` function may return (a bare number is
  shorthand for `{ composite }`). `composite` is the [0,1] leaderboard score;
@@ -1322,25 +1322,25 @@ Structured per-case verdict a `score` function may return (a bare number is
 
 > **composite**: `number`
 
-Defined in: [runtime/define-leaderboard.ts:61](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L61)
+Defined in: [runtime/define-leaderboard.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L62)
 
 ##### dimensions?
 
 > `optional` **dimensions?**: `Record`\<`string`, `number`\>
 
-Defined in: [runtime/define-leaderboard.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L62)
+Defined in: [runtime/define-leaderboard.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L63)
 
 ##### notes?
 
 > `optional` **notes?**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L63)
+Defined in: [runtime/define-leaderboard.ts:64](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L64)
 
 ***
 
 ### LeaderboardScenario
 
-Defined in: [runtime/define-leaderboard.ts:68](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L68)
+Defined in: [runtime/define-leaderboard.ts:69](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L69)
 
 The campaign scenario a case is wrapped into: the case rides along so
  judges and hooks can reach the full domain payload, not just its id.
@@ -1361,13 +1361,13 @@ The campaign scenario a case is wrapped into: the case rides along so
 
 > **case**: `TCase`
 
-Defined in: [runtime/define-leaderboard.ts:69](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L69)
+Defined in: [runtime/define-leaderboard.ts:70](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L70)
 
 ***
 
 ### LeaderboardFlagSpec
 
-Defined in: [runtime/define-leaderboard.ts:74](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L74)
+Defined in: [runtime/define-leaderboard.ts:75](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L75)
 
 One extra CLI flag a spec declares. Parsed by `run()` as `--<name> <value>`
  and surfaced to every hook via `ctx.args`.
@@ -1378,19 +1378,19 @@ One extra CLI flag a spec declares. Parsed by `run()` as `--<name> <value>`
 
 > `optional` **default?**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:75](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L75)
+Defined in: [runtime/define-leaderboard.ts:76](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L76)
 
 ##### description
 
 > **description**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:76](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L76)
+Defined in: [runtime/define-leaderboard.ts:77](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L77)
 
 ***
 
 ### LeaderboardRunContext
 
-Defined in: [runtime/define-leaderboard.ts:80](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L80)
+Defined in: [runtime/define-leaderboard.ts:81](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L81)
 
 Resolved run configuration handed to `setup` / `teardown` / `export`.
 
@@ -1400,13 +1400,13 @@ Resolved run configuration handed to `setup` / `teardown` / `export`.
 
 > **name**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:81](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L81)
+Defined in: [runtime/define-leaderboard.ts:82](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L82)
 
 ##### backend
 
 > **backend**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:83](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L83)
+Defined in: [runtime/define-leaderboard.ts:84](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L84)
 
 Execution backend name (`--backend`), a key of `backends`.
 
@@ -1414,19 +1414,19 @@ Execution backend name (`--backend`), a key of `backends`.
 
 > **runDir**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:84](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L84)
+Defined in: [runtime/define-leaderboard.ts:85](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L85)
 
 ##### exportDir
 
 > **exportDir**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:85](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L85)
+Defined in: [runtime/define-leaderboard.ts:86](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L86)
 
 ##### args
 
 > **args**: `Record`\<`string`, `string` \| `undefined`\>
 
-Defined in: [runtime/define-leaderboard.ts:87](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L87)
+Defined in: [runtime/define-leaderboard.ts:88](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L88)
 
 Every parsed flag (standard + `spec.flags`), by name without `--`.
 
@@ -1434,13 +1434,13 @@ Every parsed flag (standard + `spec.flags`), by name without `--`.
 
 > **harnesses**: readonly `HarnessType`[]
 
-Defined in: [runtime/define-leaderboard.ts:88](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L88)
+Defined in: [runtime/define-leaderboard.ts:89](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L89)
 
 ##### models
 
 > **models**: readonly `string`[]
 
-Defined in: [runtime/define-leaderboard.ts:90](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L90)
+Defined in: [runtime/define-leaderboard.ts:91](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L91)
 
 Snapshot-stamped model ids (`name@snapshot`) — the eval identity models.
 
@@ -1448,25 +1448,25 @@ Snapshot-stamped model ids (`name@snapshot`) — the eval identity models.
 
 > **caseIds**: readonly `string`[]
 
-Defined in: [runtime/define-leaderboard.ts:91](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L91)
+Defined in: [runtime/define-leaderboard.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L92)
 
 ##### shots
 
 > **shots**: `number`
 
-Defined in: [runtime/define-leaderboard.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L92)
+Defined in: [runtime/define-leaderboard.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L93)
 
 ##### reps
 
 > **reps**: `number`
 
-Defined in: [runtime/define-leaderboard.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L93)
+Defined in: [runtime/define-leaderboard.ts:94](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L94)
 
 ***
 
 ### LeaderboardBenchTask
 
-Defined in: [runtime/define-leaderboard.ts:98](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L98)
+Defined in: [runtime/define-leaderboard.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L99)
 
 Structurally `BenchTask` (bench registry shape) — declared locally so this
  module adds no dependency on a benchmark package.
@@ -1477,31 +1477,31 @@ Structurally `BenchTask` (bench registry shape) — declared locally so this
 
 > **id**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L99)
+Defined in: [runtime/define-leaderboard.ts:100](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L100)
 
 ##### prompt
 
 > **prompt**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:100](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L100)
+Defined in: [runtime/define-leaderboard.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L101)
 
 ##### split?
 
 > `optional` **split?**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L101)
+Defined in: [runtime/define-leaderboard.ts:102](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L102)
 
 ##### metadata?
 
 > `optional` **metadata?**: `Record`\<`string`, `unknown`\>
 
-Defined in: [runtime/define-leaderboard.ts:102](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L102)
+Defined in: [runtime/define-leaderboard.ts:103](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L103)
 
 ***
 
 ### LeaderboardBenchScore
 
-Defined in: [runtime/define-leaderboard.ts:106](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L106)
+Defined in: [runtime/define-leaderboard.ts:107](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L107)
 
 Structurally `BenchScore` (bench registry shape).
 
@@ -1511,28 +1511,36 @@ Structurally `BenchScore` (bench registry shape).
 
 > **resolved**: `boolean`
 
-Defined in: [runtime/define-leaderboard.ts:107](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L107)
+Defined in: [runtime/define-leaderboard.ts:108](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L108)
 
 ##### score
 
 > **score**: `number`
 
-Defined in: [runtime/define-leaderboard.ts:108](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L108)
+Defined in: [runtime/define-leaderboard.ts:109](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L109)
 
 ##### detail?
 
 > `optional` **detail?**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:109](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L109)
+Defined in: [runtime/define-leaderboard.ts:110](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L110)
 
 ***
 
 ### LeaderboardBenchmarkAdapter
 
-Defined in: [runtime/define-leaderboard.ts:114](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L114)
+Defined in: [runtime/define-leaderboard.ts:117](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L117)
 
 Structurally `BenchmarkAdapter` (bench registry shape): `name`,
  `preflight()`, `loadTasks()`, deterministic `judge()`, `goldArtifact()`.
+ Generic over the artifact channel; the `string` default IS the registry
+ shape, so a default-artifact adapter registers unchanged.
+
+#### Type Parameters
+
+##### TArtifact
+
+`TArtifact` = `string`
 
 #### Properties
 
@@ -1540,7 +1548,7 @@ Structurally `BenchmarkAdapter` (bench registry shape): `name`,
 
 > `readonly` **name**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:115](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L115)
+Defined in: [runtime/define-leaderboard.ts:118](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L118)
 
 #### Methods
 
@@ -1548,7 +1556,7 @@ Defined in: [runtime/define-leaderboard.ts:115](https://github.com/tangle-networ
 
 > **preflight**(): `Promise`\<`void`\>
 
-Defined in: [runtime/define-leaderboard.ts:116](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L116)
+Defined in: [runtime/define-leaderboard.ts:119](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L119)
 
 ###### Returns
 
@@ -1558,7 +1566,7 @@ Defined in: [runtime/define-leaderboard.ts:116](https://github.com/tangle-networ
 
 > **loadTasks**(`opts?`): `Promise`\<[`LeaderboardBenchTask`](#leaderboardbenchtask)[]\>
 
-Defined in: [runtime/define-leaderboard.ts:117](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L117)
+Defined in: [runtime/define-leaderboard.ts:120](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L120)
 
 ###### Parameters
 
@@ -1584,7 +1592,7 @@ Defined in: [runtime/define-leaderboard.ts:117](https://github.com/tangle-networ
 
 > **judge**(`task`, `artifact`): `Promise`\<[`LeaderboardBenchScore`](#leaderboardbenchscore)\>
 
-Defined in: [runtime/define-leaderboard.ts:122](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L122)
+Defined in: [runtime/define-leaderboard.ts:125](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L125)
 
 ###### Parameters
 
@@ -1594,7 +1602,7 @@ Defined in: [runtime/define-leaderboard.ts:122](https://github.com/tangle-networ
 
 ###### artifact
 
-`string`
+`TArtifact`
 
 ###### Returns
 
@@ -1604,7 +1612,7 @@ Defined in: [runtime/define-leaderboard.ts:122](https://github.com/tangle-networ
 
 > **goldArtifact**(`task`): `Promise`\<`string` \| `undefined`\>
 
-Defined in: [runtime/define-leaderboard.ts:123](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L123)
+Defined in: [runtime/define-leaderboard.ts:126](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L126)
 
 ###### Parameters
 
@@ -1618,9 +1626,54 @@ Defined in: [runtime/define-leaderboard.ts:123](https://github.com/tangle-networ
 
 ***
 
+### LeaderboardIterationInfo
+
+Defined in: [runtime/define-leaderboard.ts:132](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L132)
+
+Per-shot outcome context passed as `onCellEvents`'s third argument — how a
+ thrown shot (which never reaches `parseOutput`) stays visible through the
+ facade instead of surfacing only as an empty zero-token cell.
+
+#### Properties
+
+##### index
+
+> **index**: `number`
+
+Defined in: [runtime/define-leaderboard.ts:134](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L134)
+
+0-based shot index within the cell.
+
+##### error?
+
+> `optional` **error?**: `string`
+
+Defined in: [runtime/define-leaderboard.ts:136](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L136)
+
+The shot's thrown error message, when the shot failed before scoring.
+
+##### verdict?
+
+> `optional` **verdict?**: `object`
+
+Defined in: [runtime/define-leaderboard.ts:138](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L138)
+
+The shot's validator verdict, when the shot reached scoring.
+
+###### score?
+
+> `optional` **score?**: `number`
+
+***
+
 ### LeaderboardSpec
 
-Defined in: [runtime/define-leaderboard.ts:126](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L126)
+Defined in: [runtime/define-leaderboard.ts:147](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L147)
+
+The declarative leaderboard spec. `TArtifact` is the artifact channel the
+dispatch produces and the judges score — `string` (the default) is the plain
+agent-response-text path; a structured artifact type flows natively once the
+spec supplies `parseOutput` (or a LEVEL-2 `dispatch`) producing it.
 
 #### Type Parameters
 
@@ -1628,13 +1681,17 @@ Defined in: [runtime/define-leaderboard.ts:126](https://github.com/tangle-networ
 
 `TCase`
 
+##### TArtifact
+
+`TArtifact` = `string`
+
 #### Properties
 
 ##### name
 
 > **name**: `string`
 
-Defined in: [runtime/define-leaderboard.ts:128](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L128)
+Defined in: [runtime/define-leaderboard.ts:149](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L149)
 
 Leaderboard name — the scenario `kind`, default profile name, and report title.
 
@@ -1642,7 +1699,7 @@ Leaderboard name — the scenario `kind`, default profile name, and report title
 
 > **cases**: `TCase`[]
 
-Defined in: [runtime/define-leaderboard.ts:130](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L130)
+Defined in: [runtime/define-leaderboard.ts:151](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L151)
 
 The case corpus. Every case needs a stable string id (see `caseId`).
 
@@ -1650,7 +1707,7 @@ The case corpus. Every case needs a stable string id (see `caseId`).
 
 > `optional` **caseId?**: (`c`) => `string`
 
-Defined in: [runtime/define-leaderboard.ts:133](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L133)
+Defined in: [runtime/define-leaderboard.ts:154](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L154)
 
 Stable id extractor. Default: the case's own `id` property (fail-loud
  when absent or not a string).
@@ -1669,7 +1726,7 @@ Stable id extractor. Default: the case's own `id` property (fail-loud
 
 > **prompt**: (`c`) => `string` \| `Promise`\<`string`\>
 
-Defined in: [runtime/define-leaderboard.ts:136](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L136)
+Defined in: [runtime/define-leaderboard.ts:157](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L157)
 
 The per-case task prompt. May be async (e.g. built by shelling out to a
  reference implementation); resolved ONCE per case before dispatch.
@@ -1688,17 +1745,17 @@ The per-case task prompt. May be async (e.g. built by shelling out to a
 
 > **score**: (`output`, `c`) => `number` \| [`LeaderboardScore`](#leaderboardscore)
 
-Defined in: [runtime/define-leaderboard.ts:140](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L140)
+Defined in: [runtime/define-leaderboard.ts:161](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L161)
 
-The domain grader: agent output text → score. Used BOTH as the per-shot
- validator (a shot with `composite > 0` stops the naive retry loop) and,
- wrapped as a campaign judge, as the recorded leaderboard score.
+The domain grader: agent output artifact → score. Used BOTH as the
+ per-shot validator (a shot with `composite > 0` stops the naive retry
+ loop) and, wrapped as a campaign judge, as the recorded leaderboard score.
 
 ###### Parameters
 
 ###### output
 
-`string`
+`TArtifact`
 
 ###### c
 
@@ -1712,7 +1769,7 @@ The domain grader: agent output text → score. Used BOTH as the per-shot
 
 > `optional` **axis?**: `object`
 
-Defined in: [runtime/define-leaderboard.ts:144](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L144)
+Defined in: [runtime/define-leaderboard.ts:165](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L165)
 
 Harness × model axes for `expandProfileAxes`. Defaults: the canonical
  `CODING_HARNESSES` × the base profile's `model.default`. `--harnesses` /
@@ -1730,7 +1787,7 @@ Harness × model axes for `expandProfileAxes`. Defaults: the canonical
 
 > `optional` **baseProfile?**: `AgentProfile`
 
-Defined in: [runtime/define-leaderboard.ts:147](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L147)
+Defined in: [runtime/define-leaderboard.ts:168](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L168)
 
 Base profile the axes expand over (prompt/tools/skills held fixed).
  Default: a minimal `{ name, model: { default: <first model> } }`.
@@ -1739,7 +1796,7 @@ Base profile the axes expand over (prompt/tools/skills held fixed).
 
 > `optional` **backends?**: `Record`\<`string`, (() => [`SandboxClient`](#sandboxclient-3)) \| `undefined`\>
 
-Defined in: [runtime/define-leaderboard.ts:157](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L157)
+Defined in: [runtime/define-leaderboard.ts:178](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L178)
 
 Execution-backend registry: `--backend <name>` picks the factory that
 yields the `SandboxClient` every cell runs on. Merged over the defaults:
@@ -1753,7 +1810,7 @@ yields the `SandboxClient` every cell runs on. Merged over the defaults:
 
 > `optional` **flags?**: `Record`\<`string`, [`LeaderboardFlagSpec`](#leaderboardflagspec)\>
 
-Defined in: [runtime/define-leaderboard.ts:159](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L159)
+Defined in: [runtime/define-leaderboard.ts:180](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L180)
 
 Extra `--flag value` CLI args `run()` parses and surfaces via `ctx.args`.
 
@@ -1761,7 +1818,7 @@ Extra `--flag value` CLI args `run()` parses and surfaces via `ctx.args`.
 
 > `optional` **modelBackend?**: `Record`\<`string`, `unknown`\>
 
-Defined in: [runtime/define-leaderboard.ts:163](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L163)
+Defined in: [runtime/define-leaderboard.ts:184](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L184)
 
 Extra fields merged into each cell's `backend.model` create override —
  e.g. `{ provider: 'openai-compat', apiKey, baseUrl }` for a router-backed
@@ -1771,7 +1828,7 @@ Extra fields merged into each cell's `backend.model` create override —
 
 > `optional` **setup?**: (`ctx`) => `void` \| `Promise`\<`void`\>
 
-Defined in: [runtime/define-leaderboard.ts:165](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L165)
+Defined in: [runtime/define-leaderboard.ts:186](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L186)
 
 Runs once before the matrix (fetch fixtures, warm caches).
 
@@ -1789,7 +1846,7 @@ Runs once before the matrix (fetch fixtures, warm caches).
 
 > `optional` **teardown?**: (`ctx`) => `void` \| `Promise`\<`void`\>
 
-Defined in: [runtime/define-leaderboard.ts:167](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L167)
+Defined in: [runtime/define-leaderboard.ts:188](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L188)
 
 Runs once after the matrix, even on failure (reap boxes, close handles).
 
@@ -1805,13 +1862,15 @@ Runs once after the matrix, even on failure (reap boxes, close handles).
 
 ##### onCellEvents?
 
-> `optional` **onCellEvents?**: (`events`, `c`) => `void`
+> `optional` **onCellEvents?**: (`events`, `c`, `iteration?`) => `void`
 
-Defined in: [runtime/define-leaderboard.ts:171](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L171)
+Defined in: [runtime/define-leaderboard.ts:194](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L194)
 
-Per-cell event tap: the raw sandbox events of each parsed iteration,
- with the case — the seam for domain metric capture (search counts,
- citations) without a substrate change.
+Per-cell event tap: the raw sandbox events of EVERY shot, with the case —
+ the seam for domain metric capture (search counts, citations) without a
+ substrate change. Fires once per shot after the cell's loop settles, in
+ shot order, including thrown shots (whose events may be partial or empty);
+ the third argument carries the shot's index + error/verdict outcome.
 
 ###### Parameters
 
@@ -1823,19 +1882,25 @@ readonly `SandboxEvent`[]
 
 `TCase`
 
+###### iteration?
+
+[`LeaderboardIterationInfo`](#leaderboarditerationinfo)
+
 ###### Returns
 
 `void`
 
 ##### parseOutput?
 
-> `optional` **parseOutput?**: (`events`, `c`) => `string`
+> `optional` **parseOutput?**: (`events`, `c`) => `TArtifact`
 
-Defined in: [runtime/define-leaderboard.ts:175](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L175)
+Defined in: [runtime/define-leaderboard.ts:204](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L204)
 
-Output decode override: raw events → the scored output text. Default:
- the sandbox SDK's `collectAgentResponseText` (final answer text; empty
- string when the stream carried none — which then scores 0).
+Output decode override: raw events → the scored artifact. Default: the
+ sandbox SDK's `collectAgentResponseText` (final answer text; empty string
+ when the stream carried none — which then scores 0). The default only
+ produces `string`, so a spec with a structured `TArtifact` MUST supply
+ this (or a LEVEL-2 `dispatch`).
 
 ###### Parameters
 
@@ -1849,13 +1914,37 @@ readonly `SandboxEvent`[]
 
 ###### Returns
 
-`string`
+`TArtifact`
+
+##### resolveModel?
+
+> `optional` **resolveModel?**: (`events`) => `string` \| `undefined`
+
+Defined in: [runtime/define-leaderboard.ts:214](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L214)
+
+Resolve the model the backend ACTUALLY served off a shot's raw events.
+Required for HARNESS_NATIVE_MODEL-snapped cells (a vendor-locked harness ×
+an out-of-family model expands to the `default` sentinel): the RunRecord
+must pin a real snapshot-bearing model id, which only the dispatch —
+reading the backend's usage/terminal events — can know. When this returns
+a value the default dispatch reports it via `ctx.cost.observeModel`;
+in-family cells (concrete declared model) never need it.
+
+###### Parameters
+
+###### events
+
+readonly `SandboxEvent`[]
+
+###### Returns
+
+`string` \| `undefined`
 
 ##### export?
 
 > `optional` **export?**: (`result`, `ctx`) => `void` \| `Promise`\<`void`\>
 
-Defined in: [runtime/define-leaderboard.ts:178](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L178)
+Defined in: [runtime/define-leaderboard.ts:217](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L217)
 
 Result export. Default: write `matrix-result.json` under the run dir and
  print (+ write) the ranked leaderboard markdown under the export dir.
@@ -1864,7 +1953,7 @@ Result export. Default: write `matrix-result.json` under the run dir and
 
 ###### result
 
-`RunProfileMatrixResult`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>
+`RunProfileMatrixResult`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>
 
 ###### ctx
 
@@ -1876,18 +1965,18 @@ Result export. Default: write `matrix-result.json` under the run dir and
 
 ##### dispatch?
 
-> `optional` **dispatch?**: `ProfileDispatchFn`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `string`\>
+> `optional` **dispatch?**: `ProfileDispatchFn`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `TArtifact`\>
 
-Defined in: [runtime/define-leaderboard.ts:184](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L184)
+Defined in: [runtime/define-leaderboard.ts:223](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L223)
 
 LEVEL 2 — full dispatch replacement (in-process products bring their own).
  The default is `loopDispatch` + `naiveDriver` over the resolved backend.
 
 ##### judges?
 
-> `optional` **judges?**: `JudgeConfig`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>[]
+> `optional` **judges?**: `JudgeConfig`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>[]
 
-Defined in: [runtime/define-leaderboard.ts:186](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L186)
+Defined in: [runtime/define-leaderboard.ts:225](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L225)
 
 LEVEL 2 — full judge replacement. Default: `score` wrapped as one judge.
 
@@ -1895,7 +1984,7 @@ LEVEL 2 — full judge replacement. Default: `score` wrapped as one judge.
 
 > `optional` **shots?**: `number`
 
-Defined in: [runtime/define-leaderboard.ts:188](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L188)
+Defined in: [runtime/define-leaderboard.ts:227](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L227)
 
 Naive-retry shot cap per cell (`--shots`). Default 1.
 
@@ -1903,15 +1992,15 @@ Naive-retry shot cap per cell (`--shots`). Default 1.
 
 > `optional` **reps?**: `number`
 
-Defined in: [runtime/define-leaderboard.ts:190](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L190)
+Defined in: [runtime/define-leaderboard.ts:229](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L229)
 
 Replicates per cell (`--reps`). Default 1.
 
 ##### matrix?
 
-> `optional` **matrix?**: `Partial`\<`RunProfileMatrixOptions`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `string`\>\>
+> `optional` **matrix?**: `Partial`\<`RunProfileMatrixOptions`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `TArtifact`\>\>
 
-Defined in: [runtime/define-leaderboard.ts:194](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L194)
+Defined in: [runtime/define-leaderboard.ts:233](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L233)
 
 Passthrough overrides spread onto the final `runProfileMatrix` call
  (e.g. `maxConcurrency`, `costCeiling`, `integrity`, `storage`) — spread
@@ -1921,7 +2010,7 @@ Passthrough overrides spread onto the final `runProfileMatrix` call
 
 ### DefinedLeaderboard
 
-Defined in: [runtime/define-leaderboard.ts:197](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L197)
+Defined in: [runtime/define-leaderboard.ts:236](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L236)
 
 #### Type Parameters
 
@@ -1929,13 +2018,17 @@ Defined in: [runtime/define-leaderboard.ts:197](https://github.com/tangle-networ
 
 `TCase`
 
+##### TArtifact
+
+`TArtifact` = `string`
+
 #### Methods
 
 ##### run()
 
-> **run**(`argv?`): `Promise`\<`RunProfileMatrixResult`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\>
+> **run**(`argv?`): `Promise`\<`RunProfileMatrixResult`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\>
 
-Defined in: [runtime/define-leaderboard.ts:211](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L211)
+Defined in: [runtime/define-leaderboard.ts:250](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L250)
 
 Parse flags, run the matrix, export, and return the raw result.
 
@@ -1957,19 +2050,19 @@ only an explicit `--run-dir` opts into that resume behavior.
 
 ###### Returns
 
-`Promise`\<`RunProfileMatrixResult`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\>
+`Promise`\<`RunProfileMatrixResult`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\>
 
 ##### toBenchmarkAdapter()
 
-> **toBenchmarkAdapter**(): [`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter)
+> **toBenchmarkAdapter**(): [`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter)\<`TArtifact`\>
 
-Defined in: [runtime/define-leaderboard.ts:213](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L213)
+Defined in: [runtime/define-leaderboard.ts:252](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L252)
 
 The same domain surface in the structural `BenchmarkAdapter` shape.
 
 ###### Returns
 
-[`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter)
+[`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter)\<`TArtifact`\>
 
 ***
 
@@ -14676,7 +14769,7 @@ Post-reservation pool readout — the shape `Scope.budget` exposes. `tokensLeft`
 
 > **ExecutorConfig** = `object` & `RouterSeam` \| `object` & `RouterToolsSeam` \| `object` & `BridgeSeam` \| `object` & `CliSeam` \| `object` & `CliWorktreeSeam` \| `object` & [`ProviderSeam`](#providerseam) \| `object` & `SandboxSeam`
 
-Defined in: [runtime/supervise/runtime.ts:1501](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1501)
+Defined in: [runtime/supervise/runtime.ts:1534](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1534)
 
 Config for [createExecutor](#createexecutor): the backend is DATA — the cost dial a profile,
 an experiment config, or a replay journal can name — not an import choice. Each
@@ -15164,7 +15257,7 @@ The conserved pool a `delegate()` call applies when the caller does not pass its
 
 > `const` **cliWorktreeExecutor**: [`ExecutorFactory`](#executorfactory)\<`unknown`\>
 
-Defined in: [runtime/supervise/runtime.ts:1465](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1465)
+Defined in: [runtime/supervise/runtime.ts:1498](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1498)
 
 The leaf `createWorktreeCliExecutor` as a backend-as-data factory: a supervisor-authored
 `AgentProfile` driving claude / codex / opencode on its own worktree. `budgetExempt` like
@@ -15527,9 +15620,9 @@ passes. Ground truth — the driver ends directly, no validation. The check read
 
 ### defineLeaderboard()
 
-> **defineLeaderboard**\<`TCase`\>(`spec`): [`DefinedLeaderboard`](#definedleaderboard)\<`TCase`\>
+> **defineLeaderboard**\<`TCase`, `TArtifact`\>(`spec`): [`DefinedLeaderboard`](#definedleaderboard)\<`TCase`, `TArtifact`\>
 
-Defined in: [runtime/define-leaderboard.ts:255](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L255)
+Defined in: [runtime/define-leaderboard.ts:294](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L294)
 
 #### Type Parameters
 
@@ -15537,15 +15630,19 @@ Defined in: [runtime/define-leaderboard.ts:255](https://github.com/tangle-networ
 
 `TCase`
 
+##### TArtifact
+
+`TArtifact` = `string`
+
 #### Parameters
 
 ##### spec
 
-[`LeaderboardSpec`](#leaderboardspec)\<`TCase`\>
+[`LeaderboardSpec`](#leaderboardspec)\<`TCase`, `TArtifact`\>
 
 #### Returns
 
-[`DefinedLeaderboard`](#definedleaderboard)\<`TCase`\>
+[`DefinedLeaderboard`](#definedleaderboard)\<`TCase`, `TArtifact`\>
 
 ***
 
@@ -18054,7 +18151,7 @@ state between runs), so two runs never cross-contaminate their journals/blobs.
 
 > **createExecutor**(`config`): [`ExecutorFactory`](#executorfactory)\<`unknown`\>
 
-Defined in: [runtime/supervise/runtime.ts:1518](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1518)
+Defined in: [runtime/supervise/runtime.ts:1551](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1551)
 
 The single built-in executor factory. Picks a leaf backend by data (`config.backend`),
 injects the matching seam, and delegates to that backend's built-in implementation.
@@ -18079,7 +18176,7 @@ per-vendor adapter or a closed `inline|sandbox|cli` switch — those bypass the
 
 > **createExecutorRegistry**(): [`ExecutorRegistry`](#executorregistry)
 
-Defined in: [runtime/supervise/runtime.ts:1564](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1564)
+Defined in: [runtime/supervise/runtime.ts:1597](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1597)
 
 The open resolver/registry. Pre-registers the three built-ins under their
 runtime tags (`'router'`, `'sandbox'`, `'cli'`) and accepts `register(name,
diff --git a/docs/canonical-api.md b/docs/canonical-api.md
index 5879506..8667e15 100644
--- a/docs/canonical-api.md
+++ b/docs/canonical-api.md
@@ -2,7 +2,7 @@
 
 <!-- This doc is the JUDGMENT layer: the mental model (§1), the AgentProfile law (§1.5), and the anti-reinvention decision table (§2) — WHICH primitive to reach for and what NOT to build. The export INVENTORY (WHAT exists) and per-symbol signatures + `file:line` are GENERATED into `docs/api/` (TypeDoc + `scripts/gen-primitive-catalog.mjs`, do NOT hand-edit) — that is the mechanical reference: `docs/api/primitive-catalog.md` is the never-stale list of every primitive to reuse. The freshness gate (`pnpm docs:freshness`) FAILS CI if a version pin, a cited `file:line`, a decision-table symbol, or the generated catalog drifts from source — see `docs/MAINTAINING.md`. Keep this file the small, hand-curated spine; never re-list the inventory here — point at the catalog. -->
 
-> **Version 0.85.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.101.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under.
+> **Version 0.86.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.101.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under.
 >
 > **`./loops` is the runtime barrel** — `package.json` maps it to `src/runtime/index.ts`. Everything below labelled `/loops` is the recursive-atom + loop-kernel surface.
 >
diff --git a/package.json b/package.json
index 6587ecd..7598f9e 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.85.0",
+  "version": "0.86.0",
   "description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
diff --git a/src/runtime/define-leaderboard.test.ts b/src/runtime/define-leaderboard.test.ts
index 9f7a17b..2319ef2 100644
--- a/src/runtime/define-leaderboard.test.ts
+++ b/src/runtime/define-leaderboard.test.ts
@@ -3,7 +3,11 @@ import { tmpdir } from 'node:os'
 import { join } from 'node:path'
 import type { SandboxEvent } from '@tangle-network/sandbox'
 import { describe, expect, it } from 'vitest'
-import { defineLeaderboard, type LeaderboardRunContext } from './define-leaderboard'
+import {
+  defineLeaderboard,
+  type LeaderboardIterationInfo,
+  type LeaderboardRunContext,
+} from './define-leaderboard'
 import { inProcessSandboxClient } from './in-process-sandbox-client'
 
 interface FakeCase {
@@ -128,6 +132,85 @@ describe('defineLeaderboard', () => {
     for (const s of seen) expect(s.types).toContain('llm_call')
   })
 
+  it('carries per-shot index + verdict to onCellEvents, and error for THROWN shots', async () => {
+    // Shot 0 throws before producing events; shot 1 succeeds. Before the
+    // iteration-metadata seam, the thrown shot was invisible through the facade.
+    let attempts = 0
+    const throwingBackend = inProcessSandboxClient({
+      onPrompt: (prompt): SandboxEvent[] => {
+        if (attempts++ === 0) throw new Error('upstream harness terminated')
+        const answer = /answer=(\S+)/.exec(prompt)?.[1] ?? 'missing'
+        return [
+          { type: 'llm_call', data: { tokensIn: 12, tokensOut: 6, costUsd: 0.002 } },
+          { type: 'result', data: { finalText: `final answer=${answer}` } },
+        ]
+      },
+    })
+    const shots: Array<{ id: string; info: LeaderboardIterationInfo | undefined }> = []
+    await board({
+      backends: { inproc: () => throwingBackend },
+      shots: 2,
+      onCellEvents: (_events, c, info) => {
+        shots.push({ id: c.id, info })
+      },
+    }).run([...AXIS, '--cases', 'case-alpha'])
+
+    expect(shots).toHaveLength(2)
+    expect(shots[0]?.info).toEqual({ index: 0, error: 'upstream harness terminated' })
+    expect(shots[1]?.info).toEqual({ index: 1, verdict: { score: 1 } })
+  })
+
+  it('pins HARNESS_NATIVE_MODEL-snapped cells via the resolveModel seam', async () => {
+    // claude-code is vendor-locked to anthropic/*; a moonshot model snaps the
+    // axis to the 'default' sentinel, and the RunRecord then REQUIRES a
+    // dispatch-reported served model.
+    const snappedAxis = [
+      '--backend',
+      'inproc',
+      '--harnesses',
+      'claude-code',
+      '--models',
+      'moonshot/kimi-k2@2026-01-01',
+    ]
+    await expect(board().run([...snappedAxis, '--cases', 'case-alpha'])).rejects.toThrow(
+      /observeModel/,
+    )
+
+    const result = await board({
+      resolveModel: (events) => {
+        // The served model rides the backend's own usage events — here the fake
+        // backend's llm_call stands in for the harness's terminal event.
+        const call = events.find((e) => (e as { type: string }).type === 'llm_call')
+        return call ? 'kimi-k2@2026-01-01' : undefined
+      },
+    }).run([...snappedAxis, '--cases', 'case-alpha'])
+    expect(result.records[0]?.model).toBe('kimi-k2@2026-01-01')
+  })
+
+  it('flows a structured TArtifact through parseOutput → score → records natively', async () => {
+    interface Structured {
+      answer: string
+      confidence: number
+    }
+    const result = await defineLeaderboard<FakeCase, Structured>({
+      name: 'structured-board',
+      cases: CASES,
+      prompt: async (c) => `solve the task. answer=${c.answer}`,
+      parseOutput: (events): Structured => {
+        const final = events.find((e) => (e as { type: string }).type === 'result') as
+          | { data?: { finalText?: string } }
+          | undefined
+        const text = final?.data?.finalText ?? ''
+        return { answer: /answer=(\S+)/.exec(text)?.[1] ?? '', confidence: 0.9 }
+      },
+      score: (output, c) => (output.answer === c.answer ? output.confidence : 0),
+      backends: { inproc: fakeBackend },
+      export: async () => {},
+    }).run([...AXIS, '--cases', 'case-alpha'])
+
+    expect(Object.values(result.byProfile)[0]?.meanComposite).toBe(0.9)
+  })
+
   it('parses spec.flags and surfaces every flag to the hooks via ctx.args', async () => {
     let args: Record<string, string | undefined> = {}
     await board({
diff --git a/src/runtime/define-leaderboard.ts b/src/runtime/define-leaderboard.ts
index a31dffa..0596413 100644
--- a/src/runtime/define-leaderboard.ts
+++ b/src/runtime/define-leaderboard.ts
@@ -15,7 +15,8 @@
  *
  *   - LEVEL 0 (declarative): `cases` / `prompt` / `score` / `axis`.
  *   - LEVEL 1 (seams): `backends`, `flags`, `parseOutput`, `onCellEvents`,
- *     `setup`/`teardown`, `export`, `modelBackend`, `matrix` passthrough.
+ *     `resolveModel`, `setup`/`teardown`, `export`, `modelBackend`, `matrix`
+ *     passthrough.
  *   - LEVEL 2 (replacement): `dispatch` and `judges` swap out the whole
  *     loop wiring or scoring; `runProfileMatrix` itself stays public as the
  *     escape floor — a product overriding everything just writes what it has
@@ -52,7 +53,7 @@ import { leaderboard, renderLeaderboardMarkdown } from './benchmark-report'
 import { loopDispatch } from './loop-dispatch'
 import { resolveSandboxClient } from './resolve-sandbox-client'
 import { naiveDriver, type SteeringDecision } from './steering-drivers'
-import type { SandboxClient } from './types'
+import type { LoopResult, SandboxClient } from './types'
 
 /** Structured per-case verdict a `score` function may return (a bare number is
  *  shorthand for `{ composite }`). `composite` is the [0,1] leaderboard score;
@@ -110,8 +111,10 @@ export interface LeaderboardBenchScore {
 }
 
 /** Structurally `BenchmarkAdapter` (bench registry shape): `name`,
- *  `preflight()`, `loadTasks()`, deterministic `judge()`, `goldArtifact()`. */
-export interface LeaderboardBenchmarkAdapter {
+ *  `preflight()`, `loadTasks()`, deterministic `judge()`, `goldArtifact()`.
+ *  Generic over the artifact channel; the `string` default IS the registry
+ *  shape, so a default-artifact adapter registers unchanged. */
+export interface LeaderboardBenchmarkAdapter<TArtifact = string> {
   readonly name: string
   preflight(): Promise<void>
   loadTasks(opts?: {
@@ -119,11 +122,29 @@ export interface LeaderboardBenchmarkAdapter {
     split?: string
     ids?: string[]
   }): Promise<LeaderboardBenchTask[]>
-  judge(task: LeaderboardBenchTask, artifact: string): Promise<LeaderboardBenchScore>
+  judge(task: LeaderboardBenchTask, artifact: TArtifact): Promise<LeaderboardBenchScore>
   goldArtifact(task: LeaderboardBenchTask): Promise<string | undefined>
 }
 
-export interface LeaderboardSpec<TCase> {
+/** Per-shot outcome context passed as `onCellEvents`'s third argument — how a
+ *  thrown shot (which never reaches `parseOutput`) stays visible through the
+ *  facade instead of surfacing only as an empty zero-token cell. */
+export interface LeaderboardIterationInfo {
+  /** 0-based shot index within the cell. */
+  index: number
+  /** The shot's thrown error message, when the shot failed before scoring. */
+  error?: string
+  /** The shot's validator verdict, when the shot reached scoring. */
+  verdict?: { score?: number }
+}
+
+/**
+ * The declarative leaderboard spec. `TArtifact` is the artifact channel the
+ * dispatch produces and the judges score — `string` (the default) is the plain
+ * agent-response-text path; a structured artifact type flows natively once the
+ * spec supplies `parseOutput` (or a LEVEL-2 `dispatch`) producing it.
+ */
+export interface LeaderboardSpec<TCase, TArtifact = string> {
   /** Leaderboard name — the scenario `kind`, default profile name, and report title. */
   name: string
   /** The case corpus. Every case needs a stable string id (see `caseId`). */
@@ -134,10 +155,10 @@ export interface LeaderboardSpec<TCase> {
   /** The per-case task prompt. May be async (e.g. built by shelling out to a
    *  reference implementation); resolved ONCE per case before dispatch. */
   prompt: (c: TCase) => string | Promise<string>
-  /** The domain grader: agent output text → score. Used BOTH as the per-shot
-   *  validator (a shot with `composite > 0` stops the naive retry loop) and,
-   *  wrapped as a campaign judge, as the recorded leaderboard score. */
-  score: (output: string, c: TCase) => number | LeaderboardScore
+  /** The domain grader: agent output artifact → score. Used BOTH as the
+   *  per-shot validator (a shot with `composite > 0` stops the naive retry
+   *  loop) and, wrapped as a campaign judge, as the recorded leaderboard score. */
+  score: (output: TArtifact, c: TCase) => number | LeaderboardScore
   /** Harness × model axes for `expandProfileAxes`. Defaults: the canonical
    *  `CODING_HARNESSES` × the base profile's `model.default`. `--harnesses` /
    *  `--models` override per run. */
@@ -165,25 +186,43 @@ export interface LeaderboardSpec<TCase> {
   setup?: (ctx: LeaderboardRunContext) => Promise<void> | void
   /** Runs once after the matrix, even on failure (reap boxes, close handles). */
   teardown?: (ctx: LeaderboardRunContext) => Promise<void> | void
-  /** Per-cell event tap: the raw sandbox events of each parsed iteration,
-   *  with the case — the seam for domain metric capture (search counts,
-   *  citations) without a substrate change. */
-  onCellEvents?: (events: readonly SandboxEvent[], c: TCase) => void
-  /** Output decode override: raw events → the scored output text. Default:
-   *  the sandbox SDK's `collectAgentResponseText` (final answer text; empty
-   *  string when the stream carried none — which then scores 0). */
-  parseOutput?: (events: readonly SandboxEvent[], c: TCase) => string
+  /** Per-cell event tap: the raw sandbox events of EVERY shot, with the case —
+   *  the seam for domain metric capture (search counts, citations) without a
+   *  substrate change. Fires once per shot after the cell's loop settles, in
+   *  shot order, including thrown shots (whose events may be partial or empty);
+   *  the third argument carries the shot's index + error/verdict outcome. */
+  onCellEvents?: (
+    events: readonly SandboxEvent[],
+    c: TCase,
+    iteration?: LeaderboardIterationInfo,
+  ) => void
+  /** Output decode override: raw events → the scored artifact. Default: the
+   *  sandbox SDK's `collectAgentResponseText` (final answer text; empty string
+   *  when the stream carried none — which then scores 0). The default only
+   *  produces `string`, so a spec with a structured `TArtifact` MUST supply
+   *  this (or a LEVEL-2 `dispatch`). */
+  parseOutput?: (events: readonly SandboxEvent[], c: TCase) => TArtifact
+  /**
+   * Resolve the model the backend ACTUALLY served off a shot's raw events.
+   * Required for HARNESS_NATIVE_MODEL-snapped cells (a vendor-locked harness ×
+   * an out-of-family model expands to the `default` sentinel): the RunRecord
+   * must pin a real snapshot-bearing model id, which only the dispatch —
+   * reading the backend's usage/terminal events — can know. When this returns
+   * a value the default dispatch reports it via `ctx.cost.observeModel`;
+   * in-family cells (concrete declared model) never need it.
+   */
+  resolveModel?: (events: readonly SandboxEvent[]) => string | undefined
   /** Result export. Default: write `matrix-result.json` under the run dir and
    *  print (+ write) the ranked leaderboard markdown under the export dir. */
   export?: (
-    result: RunProfileMatrixResult<string, LeaderboardScenario<TCase>>,
+    result: RunProfileMatrixResult<TArtifact, LeaderboardScenario<TCase>>,
     ctx: LeaderboardRunContext,
   ) => Promise<void> | void
   /** LEVEL 2 — full dispatch replacement (in-process products bring their own).
    *  The default is `loopDispatch` + `naiveDriver` over the resolved backend. */
-  dispatch?: ProfileDispatchFn<LeaderboardScenario<TCase>, string>
+  dispatch?: ProfileDispatchFn<LeaderboardScenario<TCase>, TArtifact>
   /** LEVEL 2 — full judge replacement. Default: `score` wrapped as one judge. */
-  judges?: JudgeConfig<string, LeaderboardScenario<TCase>>[]
+  judges?: JudgeConfig<TArtifact, LeaderboardScenario<TCase>>[]
   /** Naive-retry shot cap per cell (`--shots`). Default 1. */
   shots?: number
   /** Replicates per cell (`--reps`). Default 1. */
@@ -191,10 +230,10 @@ export interface LeaderboardSpec<TCase> {
   /** Passthrough overrides spread onto the final `runProfileMatrix` call
    *  (e.g. `maxConcurrency`, `costCeiling`, `integrity`, `storage`) — spread
    *  LAST, so anything the facade wired can be overridden. */
-  matrix?: Partial<RunProfileMatrixOptions<LeaderboardScenario<TCase>, string>>
+  matrix?: Partial<RunProfileMatrixOptions<LeaderboardScenario<TCase>, TArtifact>>
 }
 
-export interface DefinedLeaderboard<TCase> {
+export interface DefinedLeaderboard<TCase, TArtifact = string> {
   /**
    * Parse flags, run the matrix, export, and return the raw result.
    *
@@ -208,9 +247,9 @@ export interface DefinedLeaderboard<TCase> {
    * would silently reuse a prior FAILED zero-token cell and skip dispatch —
    * only an explicit `--run-dir` opts into that resume behavior.
    */
-  run(argv?: string[]): Promise<RunProfileMatrixResult<string, LeaderboardScenario<TCase>>>
+  run(argv?: string[]): Promise<RunProfileMatrixResult<TArtifact, LeaderboardScenario<TCase>>>
   /** The same domain surface in the structural `BenchmarkAdapter` shape. */
-  toBenchmarkAdapter(): LeaderboardBenchmarkAdapter
+  toBenchmarkAdapter(): LeaderboardBenchmarkAdapter<TArtifact>
 }
 
 /** Read `--name <value>` from an argv array. */
@@ -252,7 +291,9 @@ function normalizeScore(s: number | LeaderboardScore): LeaderboardScore {
   return typeof s === 'number' ? { composite: s } : s
 }
 
-export function defineLeaderboard<TCase>(spec: LeaderboardSpec<TCase>): DefinedLeaderboard<TCase> {
+export function defineLeaderboard<TCase, TArtifact = string>(
+  spec: LeaderboardSpec<TCase, TArtifact>,
+): DefinedLeaderboard<TCase, TArtifact> {
   const caseId = (c: TCase): string => {
     const id = spec.caseId ? spec.caseId(c) : (c as { id?: unknown }).id
     if (typeof id !== 'string' || id.length === 0) {
@@ -278,7 +319,7 @@ export function defineLeaderboard<TCase>(spec: LeaderboardSpec<TCase>): DefinedL
     })
   }
 
-  const scoreJudge: JudgeConfig<string, LeaderboardScenario<TCase>> = {
+  const scoreJudge: JudgeConfig<TArtifact, LeaderboardScenario<TCase>> = {
     name: `${spec.name}-score`,
     dimensions: [{ key: 'composite', description: `${spec.name} case score` }],
     score({ artifact, scenario }) {
@@ -293,7 +334,7 @@ export function defineLeaderboard<TCase>(spec: LeaderboardSpec<TCase>): DefinedL
 
   async function run(
     argv: string[] = process.argv.slice(2),
-  ): Promise<RunProfileMatrixResult<string, LeaderboardScenario<TCase>>> {
+  ): Promise<RunProfileMatrixResult<TArtifact, LeaderboardScenario<TCase>>> {
     const args: Record<string, string | undefined> = {}
     for (const name of [
       'backend',
@@ -420,32 +461,38 @@ export function defineLeaderboard<TCase>(spec: LeaderboardSpec<TCase>): DefinedL
     // response-caching of byte-identical prompts across naive-retry shots.
     let shotNonce = 0
 
-    const dispatch =
-      spec.dispatch ??
-      loopDispatch<
+    // The default dispatch wraps loopDispatch per cell (closures only — no
+    // per-cell resource cost) so the loop's finished iterations can be joined
+    // with the campaign ctx: onCellEvents gets EVERY shot's outcome (a thrown
+    // shot never reaches parse, so parse-time tapping would hide it), and a
+    // spec-resolved served model reaches ctx.cost.observeModel (the only
+    // channel that pins HARNESS_NATIVE_MODEL-snapped cells to a real model).
+    const dispatch: ProfileDispatchFn<LeaderboardScenario<TCase>, TArtifact> = spec.dispatch ??
+    ((profile, scenario, dispatchCtx) => {
+      const cellDispatch = loopDispatch<
         LeaderboardScenario<TCase>,
-        string,
+        TArtifact,
         SteeringDecision,
         LeaderboardScenario<TCase>,
-        string
+        TArtifact
       >({
         sandboxClient,
-        toLoopOptions: (scenario, profile) => {
+        toLoopOptions: (cellScenario, cellProfile) => {
           // The cell's harness + model come off the profile's axis stamp set
           // by expandProfileAxes; the sandbox create override carries them to
           // whichever backend client runs the cell.
-          const axis = harnessAxisOf(profile)
+          const axis = harnessAxisOf(cellProfile)
           const modelId = bareModel(axis?.model ?? models[0] ?? '')
           return {
             // naiveDriver = the no-signal retry floor: re-run the same case as
             // an independent attempt until one scores (>0) or the shot cap.
-            driver: naiveDriver<LeaderboardScenario<TCase>, string>({
+            driver: naiveDriver<LeaderboardScenario<TCase>, TArtifact>({
               continuation: '',
               applyContinuation: (task) => task,
               maxIterations: shots,
             }),
             agentRun: {
-              profile,
+              profile: cellProfile,
               taskToPrompt: (s) => `${promptOf(s)}\n\n<!-- independent-attempt:${shotNonce++} -->`,
               ...(axis
                 ? {
@@ -459,28 +506,47 @@ export function defineLeaderboard<TCase>(spec: LeaderboardSpec<TCase>): DefinedL
                 : {}),
             },
             output: {
-              parse: (events) => {
-                spec.onCellEvents?.(events, scenario.case)
-                return spec.parseOutput
-                  ? spec.parseOutput(events, scenario.case)
-                  : (collectAgentResponseText(events) ?? '')
-              },
+              parse: (events) =>
+                spec.parseOutput
+                  ? spec.parseOutput(events, cellScenario.case)
+                  : // The default decode produces string — the TArtifact
+                    // default. A structured-TArtifact spec supplies parseOutput
+                    // (documented on the field), so this cast never lies.
+                    ((collectAgentResponseText(events) ?? '') as TArtifact),
             },
             validator: {
-              validate: async (output: string) => {
-                const s = normalizeScore(spec.score(output, scenario.case))
+              validate: async (output: TArtifact) => {
+                const s = normalizeScore(spec.score(output, cellScenario.case))
                 return { valid: s.composite > 0, score: s.composite }
               },
             },
-            task: scenario,
+            task: cellScenario,
             maxIterations: shots,
           }
         },
+        toArtifact: (result: LoopResult<LeaderboardScenario<TCase>, TArtifact, unknown>) => {
+          for (const iter of result.iterations) {
+            spec.onCellEvents?.(iter.events, scenario.case, {
+              index: iter.index,
+              ...(iter.error ? { error: iter.error.message } : {}),
+              ...(iter.verdict ? { verdict: { score: iter.verdict.score } } : {}),
+            })
+            if (spec.resolveModel) {
+              const served = spec.resolveModel(iter.events)
+              if (served !== undefined) dispatchCtx.cost.observeModel?.(served)
+            }
+          }
+          // Same as loopDispatch's default: no winner → undefined artifact
+          // (judges skip the cell; usage is still reported).
+          return result.winner?.output as TArtifact
+        },
       })
+      return cellDispatch(profile, scenario, dispatchCtx)
+    })
 
     await spec.setup?.(ctx)
     try {
-      const result = await runProfileMatrix<LeaderboardScenario<TCase>, string>({
+      const result = await runProfileMatrix<LeaderboardScenario<TCase>, TArtifact>({
         profiles,
         scenarios,
         dispatch,
@@ -508,7 +574,7 @@ export function defineLeaderboard<TCase>(spec: LeaderboardSpec<TCase>): DefinedL
     }
   }
 
-  function toBenchmarkAdapter(): LeaderboardBenchmarkAdapter {
+  function toBenchmarkAdapter(): LeaderboardBenchmarkAdapter<TArtifact> {
     return {
       name: spec.name,
       async preflight(): Promise<void> {
diff --git a/src/runtime/index.ts b/src/runtime/index.ts
index 388fb7f..6fdffa0 100644
--- a/src/runtime/index.ts
+++ b/src/runtime/index.ts
@@ -97,6 +97,7 @@ export {
   type LeaderboardBenchScore,
   type LeaderboardBenchTask,
   type LeaderboardFlagSpec,
+  type LeaderboardIterationInfo,
   type LeaderboardRunContext,
   type LeaderboardScenario,
   type LeaderboardScore,
diff --git a/src/runtime/supervise/bridge-executor.test.ts b/src/runtime/supervise/bridge-executor.test.ts
new file mode 100644
index 0000000..583236c
--- /dev/null
+++ b/src/runtime/supervise/bridge-executor.test.ts
@@ -0,0 +1,107 @@
+import { createServer, type Server } from 'node:http'
+import type { AddressInfo } from 'node:net'
+import type { AgentProfile } from '@tangle-network/agent-interface'
+import { afterEach, describe, expect, it } from 'vitest'
+import { bridgeExecutor } from './runtime'
+import type { UsageEvent } from './types'
+
+/** Start a loopback HTTP server that answers every request with the same
+ *  canned `body` — status 200 and `text/event-stream` unless `opts` overrides
+ *  them — and return its base URL plus the server handle for teardown. The
+ *  test speaks real HTTP because the upstream-failure shapes it exercises are
+ *  byte-level wire artifacts. */
+async function startBridgeStub(
+  body: string,
+  opts: { status?: number; contentType?: string } = {},
+): Promise<{ url: string; server: Server }> {
+  const headers = { 'content-type': opts.contentType ?? 'text/event-stream' }
+  const server = createServer((_req, res) => {
+    res.writeHead(opts.status ?? 200, headers).end(body)
+  })
+  await new Promise<void>((resolve) => server.listen(0, '127.0.0.1', resolve))
+  const address = server.address() as AddressInfo
+  return { url: `http://127.0.0.1:${address.port}`, server }
+}
+
+/** Build the executor under test: harness-less profile, fixed bridge test creds. */
+function makeExecutor(bridgeUrl: string) {
+  const worker: AgentProfile = { name: 'bridge-test-worker' }
+  const { signal } = new AbortController() // controller is dropped, so never aborted
+  return bridgeExecutor(
+    { profile: worker, harness: null },
+    { signal, seams: { bridge: { bridgeUrl, bridgeBearer: 'test-bearer', model: 'kimi-k2' } } },
+  )
+}
+
+/** Exhaust `stream` and return its events in order; rejects if iteration throws. */
+async function drain(stream: AsyncIterable<UsageEvent>): Promise<UsageEvent[]> {
+  const seen: UsageEvent[] = []
+  for await (const event of stream) seen.push(event)
+  return seen
+}
+
+describe('bridgeExecutor upstream-error propagation', () => {
+  let server: Server | undefined // stub started by the current test; closed in afterEach
+  afterEach(async () => {
+    if (server) await new Promise((resolve) => server?.close(resolve))
+    server = undefined
+  })
+
+  it('throws the upstream error from a bare JSON error body (no SSE framing)', async () => {
+    // The kimi failure shape: HTTP 200, plain JSON error object, zero SSE frames.
+    // Before the tail was parsed, this body drained as one empty zero-token result.
+    const stub = await startBridgeStub(
+      JSON.stringify({ error: { type: 'access_terminated_error', message: 'account terminated' } }),
+      { contentType: 'application/json' },
+    )
+    server = stub.server
+    const executor = makeExecutor(stub.url)
+    const stream = executor.execute('do the task', new AbortController().signal)
+    await expect(drain(stream as AsyncIterable<UsageEvent>)).rejects.toThrow(
+      /bridge upstream error: account terminated/,
+    )
+    // The run still fails loud end-to-end: no artifact was produced.
+    expect(() => executor.resultArtifact()).toThrow(/before stream drained/)
+  })
+
+  it('throws from an UNTERMINATED final SSE error frame (no trailing blank line)', async () => {
+    const frame = `data: ${JSON.stringify({ error: { type: 'access_terminated_error' } })}\n`
+    const stub = await startBridgeStub(frame)
+    server = stub.server
+    const executor = makeExecutor(stub.url)
+    const stream = executor.execute('do the task', new AbortController().signal)
+    // No `message` on the payload — the error `type` must still surface, never 'unknown'.
+    await expect(drain(stream as AsyncIterable<UsageEvent>)).rejects.toThrow(
+      /bridge stream error: access_terminated_error/,
+    )
+  })
+
+  it('still throws on a mid-stream terminated SSE error frame', async () => {
+    const body = `data: ${JSON.stringify({ error: { message: 'quota exhausted' } })}\n\n`
+    const stub = await startBridgeStub(body)
+    server = stub.server
+    const executor = makeExecutor(stub.url)
+    const stream = executor.execute('do the task', new AbortController().signal)
+    await expect(drain(stream as AsyncIterable<UsageEvent>)).rejects.toThrow(
+      /bridge stream error: quota exhausted/,
+    )
+  })
+
+  it('drains a healthy stream unchanged and settles the artifact (tail parse is inert)', async () => {
+    const chunks = [
+      `data: ${JSON.stringify({ choices: [{ delta: { content: 'final answer' } }] })}`,
+      `data: ${JSON.stringify({ usage: { prompt_tokens: 10, completion_tokens: 4, cost: 0.01 } })}`,
+      'data: [DONE]',
+    ]
+    const stub = await startBridgeStub(`${chunks.join('\n\n')}\n\n`) // every frame terminated
+    server = stub.server
+    const executor = makeExecutor(stub.url)
+    const events = await drain(
+      executor.execute('do the task', new AbortController().signal) as AsyncIterable<UsageEvent>,
+    )
+    expect(events).toContainEqual({ kind: 'tokens', input: 10, output: 4 })
+    const artifact = executor.resultArtifact()
+    expect(artifact.out).toMatchObject({ content: 'final answer' })
+    expect(artifact.spent.tokens).toEqual({ input: 10, output: 4 })
+  })
+})
diff --git a/src/runtime/supervise/runtime.ts b/src/runtime/supervise/runtime.ts
index d417ced..6749807 100644
--- a/src/runtime/supervise/runtime.ts
+++ b/src/runtime/supervise/runtime.ts
@@ -1212,11 +1212,42 @@ async function* parseSseChatStream(
         sep = buf.indexOf('\n\n')
       }
     }
+    // Upstream failures routinely arrive UNTERMINATED: a final `data:` frame
+    // with no trailing blank line, or a bare JSON error body with no SSE
+    // framing at all (kimi's access_terminated_error). Dropping the tail here
+    // ends the stream as one empty zero-token turn — the integrity guard still
+    // fails the run, but the diagnostic dies with the buffer. Parse the tail so
+    // the upstream error message rides the thrown event instead.
+    const tail = parseSseStreamTail(buf)
+    if (tail !== undefined && tail !== 'done') yield tail
   } finally {
     reader.releaseLock()
   }
 }
 
+/** Parse the stream's unterminated tail: an SSE frame missing its trailing
+ *  blank line, or a bare (non-SSE) JSON body — the shape bridge upstreams use
+ *  for terminal failures. Throws `ValidationError` on an error payload; returns
+ *  `undefined` for keepalive noise, non-JSON leftovers, or JSON with no `error`. */
+function parseSseStreamTail(buf: string): BridgeStreamChunk | 'done' | undefined {
+  const tail = buf.trim()
+  if (!tail) return undefined
+  const framed = parseSseFrame(tail)
+  if (framed !== undefined) return framed
+  let parsed: { error?: { message?: string; type?: string } } | null
+  try {
+    parsed = JSON.parse(tail)
+  } catch {
+    return undefined
+  }
+  // A bare `null` body is valid JSON: `?.` treats it as "no error", not a TypeError.
+  const upstream = parsed?.error
+  if (!upstream) return undefined
+  throw new ValidationError(
+    `bridgeExecutor: bridge upstream error: ${upstream.message ?? upstream.type ?? 'unknown'}`,
+  )
+}
+
 /** Parse one SSE frame (possibly multi-line `data:`/comment) into a chunk, `'done'`,
  *  or undefined (comment/keepalive/empty). */
 function parseSseFrame(frame: string): BridgeStreamChunk | 'done' | undefined {
@@ -1237,7 +1268,7 @@ function parseSseFrame(frame: string): BridgeStreamChunk | 'done' | undefined {
       }
       message?: { content?: string | null }
     }>
-    error?: { message?: string }
+    error?: { message?: string; type?: string }
     usage?: { prompt_tokens?: number; completion_tokens?: number; cost?: number }
   }
   try {
@@ -1246,8 +1277,10 @@ function parseSseFrame(frame: string): BridgeStreamChunk | 'done' | undefined {
     return undefined
   }
   if (parsed.error) {
+    // `type` is the upstream's error class (e.g. kimi's access_terminated_error)
+    // — carry it when the payload has no message, never collapse to 'unknown'.
     throw new ValidationError(
-      `bridgeExecutor: bridge stream error: ${parsed.error.message ?? 'unknown'}`,
+      `bridgeExecutor: bridge stream error: ${parsed.error.message ?? parsed.error.type ?? 'unknown'}`,
     )
   }
   const out: BridgeStreamChunk = {}