diff --git a/docs/api/primitive-catalog.md b/docs/api/primitive-catalog.md index 2a65b30..0bc02bf 100644 --- a/docs/api/primitive-catalog.md +++ b/docs/api/primitive-catalog.md @@ -7,7 +7,7 @@ # Primitive catalog — the never-stale anti-reinvention inventory -> **GENERATED** from `@tangle-network/agent-runtime@0.85.0` and `@tangle-network/agent-eval@0.103.1` by `scripts/gen-primitive-catalog.mjs`. Do NOT hand-edit — run `pnpm run docs:api`. This is the mechanical companion to the JUDGMENT in `canonical-api.md` (§2 decision table + §1.5 AgentProfile law): that doc says WHICH primitive to reach for and what NOT to build; this catalog proves WHAT exists. Per-symbol signatures + `file:line` live in the per-module pages under `docs/api/`. +> **GENERATED** from `@tangle-network/agent-runtime@0.86.0` and `@tangle-network/agent-eval@0.103.1` by `scripts/gen-primitive-catalog.mjs`. Do NOT hand-edit — run `pnpm run docs:api`. This is the mechanical companion to the JUDGMENT in `canonical-api.md` (§2 decision table + §1.5 AgentProfile law): that doc says WHICH primitive to reach for and what NOT to build; this catalog proves WHAT exists. Per-symbol signatures + `file:line` live in the per-module pages under `docs/api/`. ## 1. agent-runtime — own public surface @@ -246,7 +246,7 @@ Import from `@tangle-network/agent-runtime/intelligence` — 63 exports. ### Recursive atom + loop kernel (alias of ./runtime) -Import from `@tangle-network/agent-runtime/loops` — 425 exports. +Import from `@tangle-network/agent-runtime/loops` — 426 exports. | Symbol | Kind | Summary | |---|---|---| @@ -447,10 +447,12 @@ Import from `@tangle-network/agent-runtime/loops` — 425 exports. | `LeaderboardBenchScore` | interface | Structurally `BenchScore` (bench registry shape). | | `LeaderboardBenchTask` | interface | Structurally `BenchTask` (bench registry shape) — declared locally so this | | `LeaderboardFlagSpec` | interface | One extra CLI flag a spec declares. 
Parsed by `run()` as `--<name> <value>` | +| `LeaderboardIterationInfo` | interface | Per-shot outcome context passed as `onCellEvents`'s third argument — how a | | `LeaderboardRow` | interface | One leaderboard row — a harness×model profile, every measured column. | | `LeaderboardRunContext` | interface | Resolved run configuration handed to `setup` / `teardown` / `export`. | | `LeaderboardScenario` | interface | The campaign scenario a case is wrapped into: the case rides along so | | `LeaderboardScore` | interface | Structured per-case verdict a `score` function may return (a bare number is | +| `LeaderboardSpec` | interface | The declarative leaderboard spec. `TArtifact` is the artifact channel the | | `LoopCampaignDispatchOptions` | interface | Options for adapting plain agent-eval campaign scenarios into runtime `runLoop` cells. | | `LoopIterationDispatchPayload` | interface | Where the iteration's worker was placed. `sibling` = a fresh sandbox the | | `LoopLineageOptions` | interface | Opt-in box-lineage controls for `runLoop`. Default OFF — with both flags | @@ -571,7 +573,7 @@ Import from `@tangle-network/agent-runtime/loops` — 425 exports. 
| `WinnerStrategy` | type | Built-in valid-only winner strategies for `selectValidWinner` (selector≠judge): best gated-valid | | `WorktreePatchArtifact` | type | Terminal artifact of one worktree-CLI run — the canonical worktree-harness result (the captured | -**Undocumented supporting types** (add a TSDoc line at the declaration to earn a table row): `AgentEnvironment`, `AgentEnvironmentCapabilities`, `AgentEnvironmentEvent`, `AgentEnvironmentProvider`, `AgentEnvironmentQuery`, `AgentEnvironmentSummary`, `AgenticOptions`, `AgenticRunResult`, `AgenticTool`, `AgentSession`, `AgentSessionRef`, `AgentTurnInput`, `AgentTurnResult`, `AnalystRegistry`, `AnytimeReport`, `AnytimeStrategySummary`, `ArtifactHandle`, `AuditIntentOptions`, `AuthoredHarness`, `AuthoredStrategy`, `AuthorStrategyOptions`, `BenchmarkConfig`, `BenchmarkLift`, `BenchmarkStrategySummary`, `BenchmarkTaskRow`, `BudgetPool`, `BusStats`, `ChampionPick`, `CheckpointRef`, `CheckpointRequest`, `CreateAgentEnvironmentInput`, `DefinedLeaderboard`, `Driver`, `EventBus`, `EvolutionArchiveNode`, `EvolutionBandInfo`, `EvolutionCandidate`, `EvolutionGeneration`, `EvolutionReport`, `ExecRequest`, `ExecResult`, `ForkRequest`, `GitWorkspaceOptions`, `HarvestFailure`, `HarvestReport`, `Inbox`, `InProcessSandboxClientOptions`, `IntentAudit`, `Iteration`, `Leaderboard`, `LeaderboardOptions`, `LeaderboardSpec`, `LoopDecisionPayload`, `LoopDispatchOptions`, `LoopEndedPayload`, `LoopIterationEndedPayload`, `LoopIterationStartedPayload`, `LoopPlanDescription`, `LoopResult`, `LoopSandboxPlacement`, `LoopStartedPayload`, `LoopTraceEmitter`, `LoopWinner`, `McpEnvironmentOptions`, `Observation`, `ObserveOptions`, `OpenSandboxRunOptions`, `PairwiseOptions`, `PatchDeliverableOptions`, `PlacementInfo`, `PromotionGateOptions`, `PromotionVerdict`, `PublishOptions`, `ResourceRequest`, `RouterChatResult`, `RouterChatToolsResult`, `RouterToolLoopResult`, `RunAgenticOptions`, `SandboxRun`, `ShotSpec`, `StrategyEvolutionConfig`, 
`StrategyResult`, `StreamAgentTurnOptions`, `SuperviseOptions`, `SuperviseSurfaceOptions`, `SupervisorAgentDeps`, `SupervisorOpts`, `SurfaceScore`, `ToolSpec`, `TraceSource`, `ValidationCtx`, `Validator`, `WaterfallCollector`, `WaterfallReport`, `Workspace`, `WorkspaceRequest`, `WorkspaceRun`, `WorktreeCliExecutorOptions`, `WorktreeFanoutOptions`, `AgentEnvironmentStatus`, `AgentSessionStatus`, `ChampionPolicy`, `LoopTraceEvent`, `MakeWorkerAgent`, `WorkspaceCommit`. +**Undocumented supporting types** (add a TSDoc line at the declaration to earn a table row): `AgentEnvironment`, `AgentEnvironmentCapabilities`, `AgentEnvironmentEvent`, `AgentEnvironmentProvider`, `AgentEnvironmentQuery`, `AgentEnvironmentSummary`, `AgenticOptions`, `AgenticRunResult`, `AgenticTool`, `AgentSession`, `AgentSessionRef`, `AgentTurnInput`, `AgentTurnResult`, `AnalystRegistry`, `AnytimeReport`, `AnytimeStrategySummary`, `ArtifactHandle`, `AuditIntentOptions`, `AuthoredHarness`, `AuthoredStrategy`, `AuthorStrategyOptions`, `BenchmarkConfig`, `BenchmarkLift`, `BenchmarkStrategySummary`, `BenchmarkTaskRow`, `BudgetPool`, `BusStats`, `ChampionPick`, `CheckpointRef`, `CheckpointRequest`, `CreateAgentEnvironmentInput`, `DefinedLeaderboard`, `Driver`, `EventBus`, `EvolutionArchiveNode`, `EvolutionBandInfo`, `EvolutionCandidate`, `EvolutionGeneration`, `EvolutionReport`, `ExecRequest`, `ExecResult`, `ForkRequest`, `GitWorkspaceOptions`, `HarvestFailure`, `HarvestReport`, `Inbox`, `InProcessSandboxClientOptions`, `IntentAudit`, `Iteration`, `Leaderboard`, `LeaderboardOptions`, `LoopDecisionPayload`, `LoopDispatchOptions`, `LoopEndedPayload`, `LoopIterationEndedPayload`, `LoopIterationStartedPayload`, `LoopPlanDescription`, `LoopResult`, `LoopSandboxPlacement`, `LoopStartedPayload`, `LoopTraceEmitter`, `LoopWinner`, `McpEnvironmentOptions`, `Observation`, `ObserveOptions`, `OpenSandboxRunOptions`, `PairwiseOptions`, `PatchDeliverableOptions`, `PlacementInfo`, `PromotionGateOptions`, 
`PromotionVerdict`, `PublishOptions`, `ResourceRequest`, `RouterChatResult`, `RouterChatToolsResult`, `RouterToolLoopResult`, `RunAgenticOptions`, `SandboxRun`, `ShotSpec`, `StrategyEvolutionConfig`, `StrategyResult`, `StreamAgentTurnOptions`, `SuperviseOptions`, `SuperviseSurfaceOptions`, `SupervisorAgentDeps`, `SupervisorOpts`, `SurfaceScore`, `ToolSpec`, `TraceSource`, `ValidationCtx`, `Validator`, `WaterfallCollector`, `WaterfallReport`, `Workspace`, `WorkspaceRequest`, `WorkspaceRun`, `WorktreeCliExecutorOptions`, `WorktreeFanoutOptions`, `AgentEnvironmentStatus`, `AgentSessionStatus`, `ChampionPolicy`, `LoopTraceEvent`, `MakeWorkerAgent`, `WorkspaceCommit`. ### Environment provider adapters — generic sandbox/compute bridge diff --git a/docs/api/runtime.md b/docs/api/runtime.md index 64f3100..07429fa 100644 --- a/docs/api/runtime.md +++ b/docs/api/runtime.md @@ -1310,7 +1310,7 @@ Minimum confidence a PROBABILISTIC verdict must clear to end. Default 0.8. ### LeaderboardScore -Defined in: [runtime/define-leaderboard.ts:60](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L60) +Defined in: [runtime/define-leaderboard.ts:61](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L61) Structured per-case verdict a `score` function may return (a bare number is shorthand for `{ composite }`). `composite` is the [0,1] leaderboard score; @@ -1322,25 +1322,25 @@ Structured per-case verdict a `score` function may return (a bare number is > **composite**: `number` -Defined in: [runtime/define-leaderboard.ts:61](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L61) +Defined in: [runtime/define-leaderboard.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L62) ##### dimensions? 
> `optional` **dimensions?**: `Record`\<`string`, `number`\> -Defined in: [runtime/define-leaderboard.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L62) +Defined in: [runtime/define-leaderboard.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L63) ##### notes? > `optional` **notes?**: `string` -Defined in: [runtime/define-leaderboard.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L63) +Defined in: [runtime/define-leaderboard.ts:64](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L64) *** ### LeaderboardScenario -Defined in: [runtime/define-leaderboard.ts:68](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L68) +Defined in: [runtime/define-leaderboard.ts:69](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L69) The campaign scenario a case is wrapped into: the case rides along so judges and hooks can reach the full domain payload, not just its id. @@ -1361,13 +1361,13 @@ The campaign scenario a case is wrapped into: the case rides along so > **case**: `TCase` -Defined in: [runtime/define-leaderboard.ts:69](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L69) +Defined in: [runtime/define-leaderboard.ts:70](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L70) *** ### LeaderboardFlagSpec -Defined in: [runtime/define-leaderboard.ts:74](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L74) +Defined in: [runtime/define-leaderboard.ts:75](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L75) One extra CLI flag a spec declares. Parsed by `run()` as `--<name> <value>` and surfaced to every hook via `ctx.args`. 
@@ -1378,19 +1378,19 @@ One extra CLI flag a spec declares. Parsed by `run()` as `-- ` > `optional` **default?**: `string` -Defined in: [runtime/define-leaderboard.ts:75](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L75) +Defined in: [runtime/define-leaderboard.ts:76](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L76) ##### description > **description**: `string` -Defined in: [runtime/define-leaderboard.ts:76](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L76) +Defined in: [runtime/define-leaderboard.ts:77](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L77) *** ### LeaderboardRunContext -Defined in: [runtime/define-leaderboard.ts:80](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L80) +Defined in: [runtime/define-leaderboard.ts:81](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L81) Resolved run configuration handed to `setup` / `teardown` / `export`. @@ -1400,13 +1400,13 @@ Resolved run configuration handed to `setup` / `teardown` / `export`. > **name**: `string` -Defined in: [runtime/define-leaderboard.ts:81](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L81) +Defined in: [runtime/define-leaderboard.ts:82](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L82) ##### backend > **backend**: `string` -Defined in: [runtime/define-leaderboard.ts:83](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L83) +Defined in: [runtime/define-leaderboard.ts:84](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L84) Execution backend name (`--backend`), a key of `backends`. 
@@ -1414,19 +1414,19 @@ Execution backend name (`--backend`), a key of `backends`. > **runDir**: `string` -Defined in: [runtime/define-leaderboard.ts:84](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L84) +Defined in: [runtime/define-leaderboard.ts:85](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L85) ##### exportDir > **exportDir**: `string` -Defined in: [runtime/define-leaderboard.ts:85](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L85) +Defined in: [runtime/define-leaderboard.ts:86](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L86) ##### args > **args**: `Record`\<`string`, `string` \| `undefined`\> -Defined in: [runtime/define-leaderboard.ts:87](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L87) +Defined in: [runtime/define-leaderboard.ts:88](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L88) Every parsed flag (standard + `spec.flags`), by name without `--`. @@ -1434,13 +1434,13 @@ Every parsed flag (standard + `spec.flags`), by name without `--`. > **harnesses**: readonly `HarnessType`[] -Defined in: [runtime/define-leaderboard.ts:88](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L88) +Defined in: [runtime/define-leaderboard.ts:89](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L89) ##### models > **models**: readonly `string`[] -Defined in: [runtime/define-leaderboard.ts:90](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L90) +Defined in: [runtime/define-leaderboard.ts:91](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L91) Snapshot-stamped model ids (`name@snapshot`) — the eval identity models. 
@@ -1448,25 +1448,25 @@ Snapshot-stamped model ids (`name@snapshot`) — the eval identity models. > **caseIds**: readonly `string`[] -Defined in: [runtime/define-leaderboard.ts:91](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L91) +Defined in: [runtime/define-leaderboard.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L92) ##### shots > **shots**: `number` -Defined in: [runtime/define-leaderboard.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L92) +Defined in: [runtime/define-leaderboard.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L93) ##### reps > **reps**: `number` -Defined in: [runtime/define-leaderboard.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L93) +Defined in: [runtime/define-leaderboard.ts:94](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L94) *** ### LeaderboardBenchTask -Defined in: [runtime/define-leaderboard.ts:98](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L98) +Defined in: [runtime/define-leaderboard.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L99) Structurally `BenchTask` (bench registry shape) — declared locally so this module adds no dependency on a benchmark package. 
@@ -1477,31 +1477,31 @@ Structurally `BenchTask` (bench registry shape) — declared locally so this > **id**: `string` -Defined in: [runtime/define-leaderboard.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L99) +Defined in: [runtime/define-leaderboard.ts:100](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L100) ##### prompt > **prompt**: `string` -Defined in: [runtime/define-leaderboard.ts:100](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L100) +Defined in: [runtime/define-leaderboard.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L101) ##### split? > `optional` **split?**: `string` -Defined in: [runtime/define-leaderboard.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L101) +Defined in: [runtime/define-leaderboard.ts:102](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L102) ##### metadata? > `optional` **metadata?**: `Record`\<`string`, `unknown`\> -Defined in: [runtime/define-leaderboard.ts:102](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L102) +Defined in: [runtime/define-leaderboard.ts:103](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L103) *** ### LeaderboardBenchScore -Defined in: [runtime/define-leaderboard.ts:106](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L106) +Defined in: [runtime/define-leaderboard.ts:107](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L107) Structurally `BenchScore` (bench registry shape). @@ -1511,28 +1511,36 @@ Structurally `BenchScore` (bench registry shape). 
> **resolved**: `boolean` -Defined in: [runtime/define-leaderboard.ts:107](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L107) +Defined in: [runtime/define-leaderboard.ts:108](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L108) ##### score > **score**: `number` -Defined in: [runtime/define-leaderboard.ts:108](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L108) +Defined in: [runtime/define-leaderboard.ts:109](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L109) ##### detail? > `optional` **detail?**: `string` -Defined in: [runtime/define-leaderboard.ts:109](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L109) +Defined in: [runtime/define-leaderboard.ts:110](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L110) *** ### LeaderboardBenchmarkAdapter -Defined in: [runtime/define-leaderboard.ts:114](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L114) +Defined in: [runtime/define-leaderboard.ts:117](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L117) Structurally `BenchmarkAdapter` (bench registry shape): `name`, `preflight()`, `loadTasks()`, deterministic `judge()`, `goldArtifact()`. + Generic over the artifact channel; the `string` default IS the registry + shape, so a default-artifact adapter registers unchanged. 
+ +#### Type Parameters + +##### TArtifact + +`TArtifact` = `string` #### Properties @@ -1540,7 +1548,7 @@ Structurally `BenchmarkAdapter` (bench registry shape): `name`, > `readonly` **name**: `string` -Defined in: [runtime/define-leaderboard.ts:115](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L115) +Defined in: [runtime/define-leaderboard.ts:118](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L118) #### Methods @@ -1548,7 +1556,7 @@ Defined in: [runtime/define-leaderboard.ts:115](https://github.com/tangle-networ > **preflight**(): `Promise`\<`void`\> -Defined in: [runtime/define-leaderboard.ts:116](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L116) +Defined in: [runtime/define-leaderboard.ts:119](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L119) ###### Returns @@ -1558,7 +1566,7 @@ Defined in: [runtime/define-leaderboard.ts:116](https://github.com/tangle-networ > **loadTasks**(`opts?`): `Promise`\<[`LeaderboardBenchTask`](#leaderboardbenchtask)[]\> -Defined in: [runtime/define-leaderboard.ts:117](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L117) +Defined in: [runtime/define-leaderboard.ts:120](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L120) ###### Parameters @@ -1584,7 +1592,7 @@ Defined in: [runtime/define-leaderboard.ts:117](https://github.com/tangle-networ > **judge**(`task`, `artifact`): `Promise`\<[`LeaderboardBenchScore`](#leaderboardbenchscore)\> -Defined in: [runtime/define-leaderboard.ts:122](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L122) +Defined in: [runtime/define-leaderboard.ts:125](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L125) ###### Parameters @@ -1594,7 +1602,7 @@ 
Defined in: [runtime/define-leaderboard.ts:122](https://github.com/tangle-networ ###### artifact -`string` +`TArtifact` ###### Returns @@ -1604,7 +1612,7 @@ Defined in: [runtime/define-leaderboard.ts:122](https://github.com/tangle-networ > **goldArtifact**(`task`): `Promise`\<`string` \| `undefined`\> -Defined in: [runtime/define-leaderboard.ts:123](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L123) +Defined in: [runtime/define-leaderboard.ts:126](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L126) ###### Parameters @@ -1618,9 +1626,54 @@ Defined in: [runtime/define-leaderboard.ts:123](https://github.com/tangle-networ *** +### LeaderboardIterationInfo + +Defined in: [runtime/define-leaderboard.ts:132](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L132) + +Per-shot outcome context passed as `onCellEvents`'s third argument — how a + thrown shot (which never reaches `parseOutput`) stays visible through the + facade instead of surfacing only as an empty zero-token cell. + +#### Properties + +##### index + +> **index**: `number` + +Defined in: [runtime/define-leaderboard.ts:134](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L134) + +0-based shot index within the cell. + +##### error? + +> `optional` **error?**: `string` + +Defined in: [runtime/define-leaderboard.ts:136](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L136) + +The shot's thrown error message, when the shot failed before scoring. + +##### verdict? + +> `optional` **verdict?**: `object` + +Defined in: [runtime/define-leaderboard.ts:138](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L138) + +The shot's validator verdict, when the shot reached scoring. + +###### score? 
+ +> `optional` **score?**: `number` + +*** + ### LeaderboardSpec -Defined in: [runtime/define-leaderboard.ts:126](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L126) +Defined in: [runtime/define-leaderboard.ts:147](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L147) + +The declarative leaderboard spec. `TArtifact` is the artifact channel the +dispatch produces and the judges score — `string` (the default) is the plain +agent-response-text path; a structured artifact type flows natively once the +spec supplies `parseOutput` (or a LEVEL-2 `dispatch`) producing it. #### Type Parameters @@ -1628,13 +1681,17 @@ Defined in: [runtime/define-leaderboard.ts:126](https://github.com/tangle-networ `TCase` +##### TArtifact + +`TArtifact` = `string` + #### Properties ##### name > **name**: `string` -Defined in: [runtime/define-leaderboard.ts:128](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L128) +Defined in: [runtime/define-leaderboard.ts:149](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L149) Leaderboard name — the scenario `kind`, default profile name, and report title. @@ -1642,7 +1699,7 @@ Leaderboard name — the scenario `kind`, default profile name, and report title > **cases**: `TCase`[] -Defined in: [runtime/define-leaderboard.ts:130](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L130) +Defined in: [runtime/define-leaderboard.ts:151](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L151) The case corpus. Every case needs a stable string id (see `caseId`). @@ -1650,7 +1707,7 @@ The case corpus. Every case needs a stable string id (see `caseId`). 
> `optional` **caseId?**: (`c`) => `string` -Defined in: [runtime/define-leaderboard.ts:133](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L133) +Defined in: [runtime/define-leaderboard.ts:154](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L154) Stable id extractor. Default: the case's own `id` property (fail-loud when absent or not a string). @@ -1669,7 +1726,7 @@ Stable id extractor. Default: the case's own `id` property (fail-loud > **prompt**: (`c`) => `string` \| `Promise`\<`string`\> -Defined in: [runtime/define-leaderboard.ts:136](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L136) +Defined in: [runtime/define-leaderboard.ts:157](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L157) The per-case task prompt. May be async (e.g. built by shelling out to a reference implementation); resolved ONCE per case before dispatch. @@ -1688,17 +1745,17 @@ The per-case task prompt. May be async (e.g. built by shelling out to a > **score**: (`output`, `c`) => `number` \| [`LeaderboardScore`](#leaderboardscore) -Defined in: [runtime/define-leaderboard.ts:140](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L140) +Defined in: [runtime/define-leaderboard.ts:161](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L161) -The domain grader: agent output text → score. Used BOTH as the per-shot - validator (a shot with `composite > 0` stops the naive retry loop) and, - wrapped as a campaign judge, as the recorded leaderboard score. +The domain grader: agent output artifact → score. Used BOTH as the + per-shot validator (a shot with `composite > 0` stops the naive retry + loop) and, wrapped as a campaign judge, as the recorded leaderboard score. 
###### Parameters ###### output -`string` +`TArtifact` ###### c @@ -1712,7 +1769,7 @@ The domain grader: agent output text → score. Used BOTH as the per-shot > `optional` **axis?**: `object` -Defined in: [runtime/define-leaderboard.ts:144](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L144) +Defined in: [runtime/define-leaderboard.ts:165](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L165) Harness × model axes for `expandProfileAxes`. Defaults: the canonical `CODING_HARNESSES` × the base profile's `model.default`. `--harnesses` / @@ -1730,7 +1787,7 @@ Harness × model axes for `expandProfileAxes`. Defaults: the canonical > `optional` **baseProfile?**: `AgentProfile` -Defined in: [runtime/define-leaderboard.ts:147](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L147) +Defined in: [runtime/define-leaderboard.ts:168](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L168) Base profile the axes expand over (prompt/tools/skills held fixed). Default: a minimal `{ name, model: { default: <model> } }`. @@ -1739,7 +1796,7 @@ Base profile the axes expand over (prompt/tools/skills held fixed). > `optional` **backends?**: `Record`\<`string`, (() => [`SandboxClient`](#sandboxclient-3)) \| `undefined`\> -Defined in: [runtime/define-leaderboard.ts:157](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L157) +Defined in: [runtime/define-leaderboard.ts:178](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L178) Execution-backend registry: `--backend <name>` picks the factory that yields the `SandboxClient` every cell runs on. Merged over the defaults: @@ -1753,7 +1810,7 @@ yields the `SandboxClient` every cell runs on. 
Merged over the defaults: > `optional` **flags?**: `Record`\<`string`, [`LeaderboardFlagSpec`](#leaderboardflagspec)\> -Defined in: [runtime/define-leaderboard.ts:159](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L159) +Defined in: [runtime/define-leaderboard.ts:180](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L180) Extra `--flag value` CLI args `run()` parses and surfaces via `ctx.args`. @@ -1761,7 +1818,7 @@ Extra `--flag value` CLI args `run()` parses and surfaces via `ctx.args`. > `optional` **modelBackend?**: `Record`\<`string`, `unknown`\> -Defined in: [runtime/define-leaderboard.ts:163](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L163) +Defined in: [runtime/define-leaderboard.ts:184](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L184) Extra fields merged into each cell's `backend.model` create override — e.g. `{ provider: 'openai-compat', apiKey, baseUrl }` for a router-backed @@ -1771,7 +1828,7 @@ Extra fields merged into each cell's `backend.model` create override — > `optional` **setup?**: (`ctx`) => `void` \| `Promise`\<`void`\> -Defined in: [runtime/define-leaderboard.ts:165](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L165) +Defined in: [runtime/define-leaderboard.ts:186](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L186) Runs once before the matrix (fetch fixtures, warm caches). @@ -1789,7 +1846,7 @@ Runs once before the matrix (fetch fixtures, warm caches). 
> `optional` **teardown?**: (`ctx`) => `void` \| `Promise`\<`void`\> -Defined in: [runtime/define-leaderboard.ts:167](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L167) +Defined in: [runtime/define-leaderboard.ts:188](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L188) Runs once after the matrix, even on failure (reap boxes, close handles). @@ -1805,13 +1862,15 @@ Runs once after the matrix, even on failure (reap boxes, close handles). ##### onCellEvents? -> `optional` **onCellEvents?**: (`events`, `c`) => `void` +> `optional` **onCellEvents?**: (`events`, `c`, `iteration?`) => `void` -Defined in: [runtime/define-leaderboard.ts:171](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L171) +Defined in: [runtime/define-leaderboard.ts:194](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L194) -Per-cell event tap: the raw sandbox events of each parsed iteration, - with the case — the seam for domain metric capture (search counts, - citations) without a substrate change. +Per-cell event tap: the raw sandbox events of EVERY shot, with the case — + the seam for domain metric capture (search counts, citations) without a + substrate change. Fires once per shot after the cell's loop settles, in + shot order, including thrown shots (whose events may be partial or empty); + the third argument carries the shot's index + error/verdict outcome. ###### Parameters @@ -1823,19 +1882,25 @@ readonly `SandboxEvent`[] `TCase` +###### iteration? + +[`LeaderboardIterationInfo`](#leaderboarditerationinfo) + ###### Returns `void` ##### parseOutput? 
-> `optional` **parseOutput?**: (`events`, `c`) => `string` +> `optional` **parseOutput?**: (`events`, `c`) => `TArtifact` -Defined in: [runtime/define-leaderboard.ts:175](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L175) +Defined in: [runtime/define-leaderboard.ts:204](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L204) -Output decode override: raw events → the scored output text. Default: - the sandbox SDK's `collectAgentResponseText` (final answer text; empty - string when the stream carried none — which then scores 0). +Output decode override: raw events → the scored artifact. Default: the + sandbox SDK's `collectAgentResponseText` (final answer text; empty string + when the stream carried none — which then scores 0). The default only + produces `string`, so a spec with a structured `TArtifact` MUST supply + this (or a LEVEL-2 `dispatch`). ###### Parameters @@ -1849,13 +1914,37 @@ readonly `SandboxEvent`[] ###### Returns -`string` +`TArtifact` + +##### resolveModel? + +> `optional` **resolveModel?**: (`events`) => `string` \| `undefined` + +Defined in: [runtime/define-leaderboard.ts:214](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L214) + +Resolve the model the backend ACTUALLY served off a shot's raw events. +Required for HARNESS_NATIVE_MODEL-snapped cells (a vendor-locked harness × +an out-of-family model expands to the `default` sentinel): the RunRecord +must pin a real snapshot-bearing model id, which only the dispatch — +reading the backend's usage/terminal events — can know. When this returns +a value the default dispatch reports it via `ctx.cost.observeModel`; +in-family cells (concrete declared model) never need it. + +###### Parameters + +###### events + +readonly `SandboxEvent`[] + +###### Returns + +`string` \| `undefined` ##### export? 
> `optional` **export?**: (`result`, `ctx`) => `void` \| `Promise`\<`void`\> -Defined in: [runtime/define-leaderboard.ts:178](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L178) +Defined in: [runtime/define-leaderboard.ts:217](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L217) Result export. Default: write `matrix-result.json` under the run dir and print (+ write) the ranked leaderboard markdown under the export dir. @@ -1864,7 +1953,7 @@ Result export. Default: write `matrix-result.json` under the run dir and ###### result -`RunProfileMatrixResult`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\> +`RunProfileMatrixResult`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\> ###### ctx @@ -1876,18 +1965,18 @@ Result export. Default: write `matrix-result.json` under the run dir and ##### dispatch? -> `optional` **dispatch?**: `ProfileDispatchFn`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `string`\> +> `optional` **dispatch?**: `ProfileDispatchFn`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `TArtifact`\> -Defined in: [runtime/define-leaderboard.ts:184](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L184) +Defined in: [runtime/define-leaderboard.ts:223](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L223) LEVEL 2 — full dispatch replacement (in-process products bring their own). The default is `loopDispatch` + `naiveDriver` over the resolved backend. ##### judges? 
-> `optional` **judges?**: `JudgeConfig`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>[] +> `optional` **judges?**: `JudgeConfig`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>[] -Defined in: [runtime/define-leaderboard.ts:186](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L186) +Defined in: [runtime/define-leaderboard.ts:225](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L225) LEVEL 2 — full judge replacement. Default: `score` wrapped as one judge. @@ -1895,7 +1984,7 @@ LEVEL 2 — full judge replacement. Default: `score` wrapped as one judge. > `optional` **shots?**: `number` -Defined in: [runtime/define-leaderboard.ts:188](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L188) +Defined in: [runtime/define-leaderboard.ts:227](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L227) Naive-retry shot cap per cell (`--shots`). Default 1. @@ -1903,15 +1992,15 @@ Naive-retry shot cap per cell (`--shots`). Default 1. > `optional` **reps?**: `number` -Defined in: [runtime/define-leaderboard.ts:190](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L190) +Defined in: [runtime/define-leaderboard.ts:229](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L229) Replicates per cell (`--reps`). Default 1. ##### matrix? 
-> `optional` **matrix?**: `Partial`\<`RunProfileMatrixOptions`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `string`\>\> +> `optional` **matrix?**: `Partial`\<`RunProfileMatrixOptions`\<[`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>, `TArtifact`\>\> -Defined in: [runtime/define-leaderboard.ts:194](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L194) +Defined in: [runtime/define-leaderboard.ts:233](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L233) Passthrough overrides spread onto the final `runProfileMatrix` call (e.g. `maxConcurrency`, `costCeiling`, `integrity`, `storage`) — spread @@ -1921,7 +2010,7 @@ Passthrough overrides spread onto the final `runProfileMatrix` call ### DefinedLeaderboard -Defined in: [runtime/define-leaderboard.ts:197](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L197) +Defined in: [runtime/define-leaderboard.ts:236](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L236) #### Type Parameters @@ -1929,13 +2018,17 @@ Defined in: [runtime/define-leaderboard.ts:197](https://github.com/tangle-networ `TCase` +##### TArtifact + +`TArtifact` = `string` + #### Methods ##### run() -> **run**(`argv?`): `Promise`\<`RunProfileMatrixResult`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\> +> **run**(`argv?`): `Promise`\<`RunProfileMatrixResult`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\> -Defined in: [runtime/define-leaderboard.ts:211](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L211) +Defined in: [runtime/define-leaderboard.ts:250](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L250) Parse flags, run the matrix, export, and return the raw result. 
@@ -1957,19 +2050,19 @@ only an explicit `--run-dir` opts into that resume behavior. ###### Returns -`Promise`\<`RunProfileMatrixResult`\<`string`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\> +`Promise`\<`RunProfileMatrixResult`\<`TArtifact`, [`LeaderboardScenario`](#leaderboardscenario)\<`TCase`\>\>\> ##### toBenchmarkAdapter() -> **toBenchmarkAdapter**(): [`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter) +> **toBenchmarkAdapter**(): [`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter)\<`TArtifact`\> -Defined in: [runtime/define-leaderboard.ts:213](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L213) +Defined in: [runtime/define-leaderboard.ts:252](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L252) The same domain surface in the structural `BenchmarkAdapter` shape. ###### Returns -[`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter) +[`LeaderboardBenchmarkAdapter`](#leaderboardbenchmarkadapter)\<`TArtifact`\> *** @@ -14676,7 +14769,7 @@ Post-reservation pool readout — the shape `Scope.budget` exposes. `tokensLeft` > **ExecutorConfig** = `object` & `RouterSeam` \| `object` & `RouterToolsSeam` \| `object` & `BridgeSeam` \| `object` & `CliSeam` \| `object` & `CliWorktreeSeam` \| `object` & [`ProviderSeam`](#providerseam) \| `object` & `SandboxSeam` -Defined in: [runtime/supervise/runtime.ts:1501](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1501) +Defined in: [runtime/supervise/runtime.ts:1534](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1534) Config for [createExecutor](#createexecutor): the backend is DATA — the cost dial a profile, an experiment config, or a replay journal can name — not an import choice. 
Each @@ -15164,7 +15257,7 @@ The conserved pool a `delegate()` call applies when the caller does not pass its > `const` **cliWorktreeExecutor**: [`ExecutorFactory`](#executorfactory)\<`unknown`\> -Defined in: [runtime/supervise/runtime.ts:1465](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1465) +Defined in: [runtime/supervise/runtime.ts:1498](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1498) The leaf `createWorktreeCliExecutor` as a backend-as-data factory: a supervisor-authored `AgentProfile` driving claude / codex / opencode on its own worktree. `budgetExempt` like @@ -15527,9 +15620,9 @@ passes. Ground truth — the driver ends directly, no validation. The check read ### defineLeaderboard() -> **defineLeaderboard**\<`TCase`\>(`spec`): [`DefinedLeaderboard`](#definedleaderboard)\<`TCase`\> +> **defineLeaderboard**\<`TCase`, `TArtifact`\>(`spec`): [`DefinedLeaderboard`](#definedleaderboard)\<`TCase`, `TArtifact`\> -Defined in: [runtime/define-leaderboard.ts:255](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L255) +Defined in: [runtime/define-leaderboard.ts:294](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L294) #### Type Parameters @@ -15537,15 +15630,19 @@ Defined in: [runtime/define-leaderboard.ts:255](https://github.com/tangle-networ `TCase` +##### TArtifact + +`TArtifact` = `string` + #### Parameters ##### spec -[`LeaderboardSpec`](#leaderboardspec)\<`TCase`\> +[`LeaderboardSpec`](#leaderboardspec)\<`TCase`, `TArtifact`\> #### Returns -[`DefinedLeaderboard`](#definedleaderboard)\<`TCase`\> +[`DefinedLeaderboard`](#definedleaderboard)\<`TCase`, `TArtifact`\> *** @@ -18054,7 +18151,7 @@ state between runs), so two runs never cross-contaminate their journals/blobs. 
> **createExecutor**(`config`): [`ExecutorFactory`](#executorfactory)\<`unknown`\> -Defined in: [runtime/supervise/runtime.ts:1518](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1518) +Defined in: [runtime/supervise/runtime.ts:1551](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1551) The single built-in executor factory. Picks a leaf backend by data (`config.backend`), injects the matching seam, and delegates to that backend's built-in implementation. @@ -18079,7 +18176,7 @@ per-vendor adapter or a closed `inline|sandbox|cli` switch — those bypass the > **createExecutorRegistry**(): [`ExecutorRegistry`](#executorregistry) -Defined in: [runtime/supervise/runtime.ts:1564](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1564) +Defined in: [runtime/supervise/runtime.ts:1597](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1597) The open resolver/registry. Pre-registers the three built-ins under their runtime tags (`'router'`, `'sandbox'`, `'cli'`) and accepts `register(name, diff --git a/docs/canonical-api.md b/docs/canonical-api.md index 5879506..8667e15 100644 --- a/docs/canonical-api.md +++ b/docs/canonical-api.md @@ -2,7 +2,7 @@ -> **Version 0.85.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.101.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). 
The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under. +> **Version 0.86.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.101.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under. > > **`./loops` is the runtime barrel** — `package.json` maps it to `src/runtime/index.ts`. Everything below labelled `/loops` is the recursive-atom + loop-kernel surface. 
> diff --git a/package.json b/package.json index 6587ecd..7598f9e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.85.0", + "version": "0.86.0", "description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { diff --git a/src/runtime/define-leaderboard.test.ts b/src/runtime/define-leaderboard.test.ts index 9f7a17b..2319ef2 100644 --- a/src/runtime/define-leaderboard.test.ts +++ b/src/runtime/define-leaderboard.test.ts @@ -3,7 +3,11 @@ import { tmpdir } from 'node:os' import { join } from 'node:path' import type { SandboxEvent } from '@tangle-network/sandbox' import { describe, expect, it } from 'vitest' -import { defineLeaderboard, type LeaderboardRunContext } from './define-leaderboard' +import { + defineLeaderboard, + type LeaderboardIterationInfo, + type LeaderboardRunContext, +} from './define-leaderboard' import { inProcessSandboxClient } from './in-process-sandbox-client' interface FakeCase { @@ -128,6 +132,85 @@ describe('defineLeaderboard', () => { for (const s of seen) expect(s.types).toContain('llm_call') }) + it('carries per-shot index + verdict to onCellEvents, and error for THROWN shots', async () => { + // Shot 0 throws before producing events; shot 1 succeeds. Before the + // iteration-metadata seam, the thrown shot was invisible through the facade. + let attempts = 0 + const throwingBackend = inProcessSandboxClient({ + onPrompt: (prompt): SandboxEvent[] => { + if (attempts++ === 0) throw new Error('upstream harness terminated') + const answer = /answer=(\S+)/.exec(prompt)?.[1] ?? 
'missing' + return [ + { type: 'llm_call', data: { tokensIn: 12, tokensOut: 6, costUsd: 0.002 } }, + { type: 'result', data: { finalText: `final answer=${answer}` } }, + ] + }, + }) + const shots: Array<{ id: string; info: LeaderboardIterationInfo | undefined }> = [] + await board({ + backends: { inproc: () => throwingBackend }, + shots: 2, + onCellEvents: (_events, c, info) => { + shots.push({ id: c.id, info }) + }, + }).run([...AXIS, '--cases', 'case-alpha']) + + expect(shots).toHaveLength(2) + expect(shots[0]?.info).toEqual({ index: 0, error: 'upstream harness terminated' }) + expect(shots[1]?.info).toEqual({ index: 1, verdict: { score: 1 } }) + }) + + it('pins HARNESS_NATIVE_MODEL-snapped cells via the resolveModel seam', async () => { + // claude-code is vendor-locked to anthropic/*; a moonshot model snaps the + // axis to the 'default' sentinel, and the RunRecord then REQUIRES a + // dispatch-reported served model. + const snappedAxis = [ + '--backend', + 'inproc', + '--harnesses', + 'claude-code', + '--models', + 'moonshot/kimi-k2@2026-01-01', + ] + await expect(board().run([...snappedAxis, '--cases', 'case-alpha'])).rejects.toThrow( + /observeModel/, + ) + + const result = await board({ + resolveModel: (events) => { + // The served model rides the backend's own usage events — here the fake + // backend's llm_call stands in for the harness's terminal event. + const call = events.find((e) => (e as { type: string }).type === 'llm_call') + return call ? 'kimi-k2@2026-01-01' : undefined + }, + }).run([...snappedAxis, '--cases', 'case-alpha']) + expect(result.records[0]?.model).toBe('kimi-k2@2026-01-01') + }) + + it('flows a structured TArtifact through parseOutput → score → records natively', async () => { + interface Structured { + answer: string + confidence: number + } + const result = await defineLeaderboard({ + name: 'structured-board', + cases: CASES, + prompt: async (c) => `solve the task. 
answer=${c.answer}`, + parseOutput: (events): Structured => { + const final = events.find((e) => (e as { type: string }).type === 'result') as + | { data?: { finalText?: string } } + | undefined + const text = final?.data?.finalText ?? '' + return { answer: /answer=(\S+)/.exec(text)?.[1] ?? '', confidence: 0.9 } + }, + score: (output, c) => (output.answer === c.answer ? output.confidence : 0), + backends: { inproc: fakeBackend }, + export: async () => {}, + }).run([...AXIS, '--cases', 'case-alpha']) + + expect(Object.values(result.byProfile)[0]?.meanComposite).toBe(0.9) + }) + it('parses spec.flags and surfaces every flag to the hooks via ctx.args', async () => { let args: Record = {} await board({ diff --git a/src/runtime/define-leaderboard.ts b/src/runtime/define-leaderboard.ts index a31dffa..0596413 100644 --- a/src/runtime/define-leaderboard.ts +++ b/src/runtime/define-leaderboard.ts @@ -15,7 +15,8 @@ * * - LEVEL 0 (declarative): `cases` / `prompt` / `score` / `axis`. * - LEVEL 1 (seams): `backends`, `flags`, `parseOutput`, `onCellEvents`, - * `setup`/`teardown`, `export`, `modelBackend`, `matrix` passthrough. + * `resolveModel`, `setup`/`teardown`, `export`, `modelBackend`, `matrix` + * passthrough. * - LEVEL 2 (replacement): `dispatch` and `judges` swap out the whole * loop wiring or scoring; `runProfileMatrix` itself stays public as the * escape floor — a product overriding everything just writes what it has @@ -52,7 +53,7 @@ import { leaderboard, renderLeaderboardMarkdown } from './benchmark-report' import { loopDispatch } from './loop-dispatch' import { resolveSandboxClient } from './resolve-sandbox-client' import { naiveDriver, type SteeringDecision } from './steering-drivers' -import type { SandboxClient } from './types' +import type { LoopResult, SandboxClient } from './types' /** Structured per-case verdict a `score` function may return (a bare number is * shorthand for `{ composite }`). 
`composite` is the [0,1] leaderboard score; @@ -110,8 +111,10 @@ export interface LeaderboardBenchScore { } /** Structurally `BenchmarkAdapter` (bench registry shape): `name`, - * `preflight()`, `loadTasks()`, deterministic `judge()`, `goldArtifact()`. */ -export interface LeaderboardBenchmarkAdapter { + * `preflight()`, `loadTasks()`, deterministic `judge()`, `goldArtifact()`. + * Generic over the artifact channel; the `string` default IS the registry + * shape, so a default-artifact adapter registers unchanged. */ +export interface LeaderboardBenchmarkAdapter { readonly name: string preflight(): Promise loadTasks(opts?: { @@ -119,11 +122,29 @@ export interface LeaderboardBenchmarkAdapter { split?: string ids?: string[] }): Promise - judge(task: LeaderboardBenchTask, artifact: string): Promise + judge(task: LeaderboardBenchTask, artifact: TArtifact): Promise goldArtifact(task: LeaderboardBenchTask): Promise } -export interface LeaderboardSpec { +/** Per-shot outcome context passed as `onCellEvents`'s third argument — how a + * thrown shot (which never reaches `parseOutput`) stays visible through the + * facade instead of surfacing only as an empty zero-token cell. */ +export interface LeaderboardIterationInfo { + /** 0-based shot index within the cell. */ + index: number + /** The shot's thrown error message, when the shot failed before scoring. */ + error?: string + /** The shot's validator verdict, when the shot reached scoring. */ + verdict?: { score?: number } +} + +/** + * The declarative leaderboard spec. `TArtifact` is the artifact channel the + * dispatch produces and the judges score — `string` (the default) is the plain + * agent-response-text path; a structured artifact type flows natively once the + * spec supplies `parseOutput` (or a LEVEL-2 `dispatch`) producing it. + */ +export interface LeaderboardSpec { /** Leaderboard name — the scenario `kind`, default profile name, and report title. */ name: string /** The case corpus. 
Every case needs a stable string id (see `caseId`). */ @@ -134,10 +155,10 @@ export interface LeaderboardSpec { /** The per-case task prompt. May be async (e.g. built by shelling out to a * reference implementation); resolved ONCE per case before dispatch. */ prompt: (c: TCase) => string | Promise - /** The domain grader: agent output text → score. Used BOTH as the per-shot - * validator (a shot with `composite > 0` stops the naive retry loop) and, - * wrapped as a campaign judge, as the recorded leaderboard score. */ - score: (output: string, c: TCase) => number | LeaderboardScore + /** The domain grader: agent output artifact → score. Used BOTH as the + * per-shot validator (a shot with `composite > 0` stops the naive retry + * loop) and, wrapped as a campaign judge, as the recorded leaderboard score. */ + score: (output: TArtifact, c: TCase) => number | LeaderboardScore /** Harness × model axes for `expandProfileAxes`. Defaults: the canonical * `CODING_HARNESSES` × the base profile's `model.default`. `--harnesses` / * `--models` override per run. */ @@ -165,25 +186,43 @@ export interface LeaderboardSpec { setup?: (ctx: LeaderboardRunContext) => Promise | void /** Runs once after the matrix, even on failure (reap boxes, close handles). */ teardown?: (ctx: LeaderboardRunContext) => Promise | void - /** Per-cell event tap: the raw sandbox events of each parsed iteration, - * with the case — the seam for domain metric capture (search counts, - * citations) without a substrate change. */ - onCellEvents?: (events: readonly SandboxEvent[], c: TCase) => void - /** Output decode override: raw events → the scored output text. Default: - * the sandbox SDK's `collectAgentResponseText` (final answer text; empty - * string when the stream carried none — which then scores 0). 
*/ - parseOutput?: (events: readonly SandboxEvent[], c: TCase) => string + /** Per-cell event tap: the raw sandbox events of EVERY shot, with the case — + * the seam for domain metric capture (search counts, citations) without a + * substrate change. Fires once per shot after the cell's loop settles, in + * shot order, including thrown shots (whose events may be partial or empty); + * the third argument carries the shot's index + error/verdict outcome. */ + onCellEvents?: ( + events: readonly SandboxEvent[], + c: TCase, + iteration?: LeaderboardIterationInfo, + ) => void + /** Output decode override: raw events → the scored artifact. Default: the + * sandbox SDK's `collectAgentResponseText` (final answer text; empty string + * when the stream carried none — which then scores 0). The default only + * produces `string`, so a spec with a structured `TArtifact` MUST supply + * this (or a LEVEL-2 `dispatch`). */ + parseOutput?: (events: readonly SandboxEvent[], c: TCase) => TArtifact + /** + * Resolve the model the backend ACTUALLY served off a shot's raw events. + * Required for HARNESS_NATIVE_MODEL-snapped cells (a vendor-locked harness × + * an out-of-family model expands to the `default` sentinel): the RunRecord + * must pin a real snapshot-bearing model id, which only the dispatch — + * reading the backend's usage/terminal events — can know. When this returns + * a value the default dispatch reports it via `ctx.cost.observeModel`; + * in-family cells (concrete declared model) never need it. + */ + resolveModel?: (events: readonly SandboxEvent[]) => string | undefined /** Result export. Default: write `matrix-result.json` under the run dir and * print (+ write) the ranked leaderboard markdown under the export dir. */ export?: ( - result: RunProfileMatrixResult>, + result: RunProfileMatrixResult>, ctx: LeaderboardRunContext, ) => Promise | void /** LEVEL 2 — full dispatch replacement (in-process products bring their own). 
* The default is `loopDispatch` + `naiveDriver` over the resolved backend. */ - dispatch?: ProfileDispatchFn, string> + dispatch?: ProfileDispatchFn, TArtifact> /** LEVEL 2 — full judge replacement. Default: `score` wrapped as one judge. */ - judges?: JudgeConfig>[] + judges?: JudgeConfig>[] /** Naive-retry shot cap per cell (`--shots`). Default 1. */ shots?: number /** Replicates per cell (`--reps`). Default 1. */ @@ -191,10 +230,10 @@ export interface LeaderboardSpec { /** Passthrough overrides spread onto the final `runProfileMatrix` call * (e.g. `maxConcurrency`, `costCeiling`, `integrity`, `storage`) — spread * LAST, so anything the facade wired can be overridden. */ - matrix?: Partial, string>> + matrix?: Partial, TArtifact>> } -export interface DefinedLeaderboard { +export interface DefinedLeaderboard { /** * Parse flags, run the matrix, export, and return the raw result. * @@ -208,9 +247,9 @@ export interface DefinedLeaderboard { * would silently reuse a prior FAILED zero-token cell and skip dispatch — * only an explicit `--run-dir` opts into that resume behavior. */ - run(argv?: string[]): Promise>> + run(argv?: string[]): Promise>> /** The same domain surface in the structural `BenchmarkAdapter` shape. */ - toBenchmarkAdapter(): LeaderboardBenchmarkAdapter + toBenchmarkAdapter(): LeaderboardBenchmarkAdapter } /** Read `--name ` from an argv array. */ @@ -252,7 +291,9 @@ function normalizeScore(s: number | LeaderboardScore): LeaderboardScore { return typeof s === 'number' ? { composite: s } : s } -export function defineLeaderboard(spec: LeaderboardSpec): DefinedLeaderboard { +export function defineLeaderboard( + spec: LeaderboardSpec, +): DefinedLeaderboard { const caseId = (c: TCase): string => { const id = spec.caseId ? 
spec.caseId(c) : (c as { id?: unknown }).id if (typeof id !== 'string' || id.length === 0) { @@ -278,7 +319,7 @@ export function defineLeaderboard(spec: LeaderboardSpec): DefinedL }) } - const scoreJudge: JudgeConfig> = { + const scoreJudge: JudgeConfig> = { name: `${spec.name}-score`, dimensions: [{ key: 'composite', description: `${spec.name} case score` }], score({ artifact, scenario }) { @@ -293,7 +334,7 @@ export function defineLeaderboard(spec: LeaderboardSpec): DefinedL async function run( argv: string[] = process.argv.slice(2), - ): Promise>> { + ): Promise>> { const args: Record = {} for (const name of [ 'backend', @@ -420,32 +461,38 @@ export function defineLeaderboard(spec: LeaderboardSpec): DefinedL // response-caching of byte-identical prompts across naive-retry shots. let shotNonce = 0 - const dispatch = - spec.dispatch ?? - loopDispatch< + // The default dispatch wraps loopDispatch per cell (closures only — no + // per-cell resource cost) so the loop's finished iterations can be joined + // with the campaign ctx: onCellEvents gets EVERY shot's outcome (a thrown + // shot never reaches parse, so parse-time tapping would hide it), and a + // spec-resolved served model reaches ctx.cost.observeModel (the only + // channel that pins HARNESS_NATIVE_MODEL-snapped cells to a real model). + const dispatch: ProfileDispatchFn, TArtifact> = spec.dispatch ?? + ((profile, scenario, dispatchCtx) => { + const cellDispatch = loopDispatch< LeaderboardScenario, - string, + TArtifact, SteeringDecision, LeaderboardScenario, - string + TArtifact >({ sandboxClient, - toLoopOptions: (scenario, profile) => { + toLoopOptions: (cellScenario, cellProfile) => { // The cell's harness + model come off the profile's axis stamp set // by expandProfileAxes; the sandbox create override carries them to // whichever backend client runs the cell. - const axis = harnessAxisOf(profile) + const axis = harnessAxisOf(cellProfile) const modelId = bareModel(axis?.model ?? models[0] ?? 
'') return { // naiveDriver = the no-signal retry floor: re-run the same case as // an independent attempt until one scores (>0) or the shot cap. - driver: naiveDriver, string>({ + driver: naiveDriver, TArtifact>({ continuation: '', applyContinuation: (task) => task, maxIterations: shots, }), agentRun: { - profile, + profile: cellProfile, taskToPrompt: (s) => `${promptOf(s)}\n\n`, ...(axis ? { @@ -459,28 +506,47 @@ export function defineLeaderboard(spec: LeaderboardSpec): DefinedL : {}), }, output: { - parse: (events) => { - spec.onCellEvents?.(events, scenario.case) - return spec.parseOutput - ? spec.parseOutput(events, scenario.case) - : (collectAgentResponseText(events) ?? '') - }, + parse: (events) => + spec.parseOutput + ? spec.parseOutput(events, cellScenario.case) + : // The default decode produces string — the TArtifact + // default. A structured-TArtifact spec supplies parseOutput + // (documented on the field), so this cast never lies. + ((collectAgentResponseText(events) ?? '') as TArtifact), }, validator: { - validate: async (output: string) => { - const s = normalizeScore(spec.score(output, scenario.case)) + validate: async (output: TArtifact) => { + const s = normalizeScore(spec.score(output, cellScenario.case)) return { valid: s.composite > 0, score: s.composite } }, }, - task: scenario, + task: cellScenario, maxIterations: shots, } }, + toArtifact: (result: LoopResult, TArtifact, unknown>) => { + for (const iter of result.iterations) { + spec.onCellEvents?.(iter.events, scenario.case, { + index: iter.index, + ...(iter.error ? { error: iter.error.message } : {}), + ...(iter.verdict ? { verdict: { score: iter.verdict.score } } : {}), + }) + if (spec.resolveModel) { + const served = spec.resolveModel(iter.events) + if (served !== undefined) dispatchCtx.cost.observeModel?.(served) + } + } + // Same as loopDispatch's default: no winner → undefined artifact + // (judges skip the cell; usage is still reported). 
+ return result.winner?.output as TArtifact + }, }) + return cellDispatch(profile, scenario, dispatchCtx) + }) await spec.setup?.(ctx) try { - const result = await runProfileMatrix, string>({ + const result = await runProfileMatrix, TArtifact>({ profiles, scenarios, dispatch, @@ -508,7 +574,7 @@ export function defineLeaderboard(spec: LeaderboardSpec): DefinedL } } - function toBenchmarkAdapter(): LeaderboardBenchmarkAdapter { + function toBenchmarkAdapter(): LeaderboardBenchmarkAdapter { return { name: spec.name, async preflight(): Promise { diff --git a/src/runtime/index.ts b/src/runtime/index.ts index 388fb7f..6fdffa0 100644 --- a/src/runtime/index.ts +++ b/src/runtime/index.ts @@ -97,6 +97,7 @@ export { type LeaderboardBenchScore, type LeaderboardBenchTask, type LeaderboardFlagSpec, + type LeaderboardIterationInfo, type LeaderboardRunContext, type LeaderboardScenario, type LeaderboardScore, diff --git a/src/runtime/supervise/bridge-executor.test.ts b/src/runtime/supervise/bridge-executor.test.ts new file mode 100644 index 0000000..583236c --- /dev/null +++ b/src/runtime/supervise/bridge-executor.test.ts @@ -0,0 +1,107 @@ +import { createServer, type Server } from 'node:http' +import type { AddressInfo } from 'node:net' +import type { AgentProfile } from '@tangle-network/agent-interface' +import { afterEach, describe, expect, it } from 'vitest' +import { bridgeExecutor } from './runtime' +import type { UsageEvent } from './types' + +/** Serve one canned cli-bridge response body per request (HTTP 200 unless told + * otherwise) and hand back the bridge URL — the upstream-failure shapes under + * test are byte-level wire artifacts, so the test speaks real HTTP. */ +async function startBridgeStub( + body: string, + opts: { status?: number; contentType?: string } = {}, +): Promise<{ url: string; server: Server }> { + const server = createServer((_req, res) => { + res.writeHead(opts.status ?? 200, { + 'content-type': opts.contentType ?? 
'text/event-stream', + }) + res.end(body) + }) + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)) + const { port } = server.address() as AddressInfo + return { url: `http://127.0.0.1:${port}`, server } +} + +function makeExecutor(bridgeUrl: string) { + const profile: AgentProfile = { name: 'bridge-test-worker' } + return bridgeExecutor( + { profile, harness: null }, + { + signal: new AbortController().signal, + seams: { bridge: { bridgeUrl, bridgeBearer: 'test-bearer', model: 'kimi-k2' } }, + }, + ) +} + +async function drain(stream: AsyncIterable): Promise { + const events: UsageEvent[] = [] + for await (const ev of stream) events.push(ev) + return events +} + +describe('bridgeExecutor upstream-error propagation', () => { + let server: Server | undefined + afterEach(async () => { + if (server) await new Promise((resolve) => server?.close(resolve)) + server = undefined + }) + + it('throws the upstream error from a bare JSON error body (no SSE framing)', async () => { + // The kimi failure shape: HTTP 200, plain JSON error object, zero SSE frames. + // Before the tail parse this drained as one empty zero-token result. + const stub = await startBridgeStub( + JSON.stringify({ error: { type: 'access_terminated_error', message: 'account terminated' } }), + { contentType: 'application/json' }, + ) + server = stub.server + const executor = makeExecutor(stub.url) + const stream = executor.execute('do the task', new AbortController().signal) + await expect(drain(stream as AsyncIterable)).rejects.toThrow( + /bridge upstream error: account terminated/, + ) + // The run still fails loud end-to-end: no artifact was produced. 
+ expect(() => executor.resultArtifact()).toThrow(/before stream drained/) + }) + + it('throws from an UNTERMINATED final SSE error frame (no trailing blank line)', async () => { + const frame = `data: ${JSON.stringify({ error: { type: 'access_terminated_error' } })}\n` + const stub = await startBridgeStub(frame) + server = stub.server + const executor = makeExecutor(stub.url) + const stream = executor.execute('do the task', new AbortController().signal) + // No `message` on the payload — the error class must still surface, never 'unknown'. + await expect(drain(stream as AsyncIterable)).rejects.toThrow( + /bridge stream error: access_terminated_error/, + ) + }) + + it('still throws on a mid-stream terminated SSE error frame', async () => { + const body = `data: ${JSON.stringify({ error: { message: 'quota exhausted' } })}\n\n` + const stub = await startBridgeStub(body) + server = stub.server + const executor = makeExecutor(stub.url) + const stream = executor.execute('do the task', new AbortController().signal) + await expect(drain(stream as AsyncIterable)).rejects.toThrow( + /bridge stream error: quota exhausted/, + ) + }) + + it('drains a healthy stream unchanged and settles the artifact (tail parse is inert)', async () => { + const chunks = [ + `data: ${JSON.stringify({ choices: [{ delta: { content: 'final answer' } }] })}`, + `data: ${JSON.stringify({ usage: { prompt_tokens: 10, completion_tokens: 4, cost: 0.01 } })}`, + 'data: [DONE]', + ] + const stub = await startBridgeStub(`${chunks.join('\n\n')}\n\n`) + server = stub.server + const executor = makeExecutor(stub.url) + const events = await drain( + executor.execute('do the task', new AbortController().signal) as AsyncIterable, + ) + expect(events).toContainEqual({ kind: 'tokens', input: 10, output: 4 }) + const artifact = executor.resultArtifact() + expect(artifact.out).toMatchObject({ content: 'final answer' }) + expect(artifact.spent.tokens).toEqual({ input: 10, output: 4 }) + }) +}) diff --git 
a/src/runtime/supervise/runtime.ts b/src/runtime/supervise/runtime.ts index d417ced..6749807 100644 --- a/src/runtime/supervise/runtime.ts +++ b/src/runtime/supervise/runtime.ts @@ -1212,11 +1212,42 @@ async function* parseSseChatStream( sep = buf.indexOf('\n\n') } } + // Upstream failures routinely arrive UNTERMINATED: a final `data:` frame + // with no trailing blank line, or a bare JSON error body with no SSE + // framing at all (kimi's access_terminated_error). Dropping the tail here + // ends the stream as one empty zero-token turn — the integrity guard still + // fails the run, but the diagnostic dies with the buffer. Parse the tail so + // the upstream error message rides the thrown event instead. + const tail = parseSseStreamTail(buf) + if (tail !== undefined && tail !== 'done') yield tail } finally { reader.releaseLock() } } +/** Parse the stream's unterminated tail: an SSE frame missing its trailing + * blank line, or a bare (non-SSE) JSON body — the shape bridge upstreams use + * for terminal failures. Throws `ValidationError` on an error payload; returns + * `undefined` for keepalive noise or non-JSON leftovers. */ +function parseSseStreamTail(buf: string): BridgeStreamChunk | 'done' | undefined { + const tail = buf.trim() + if (!tail) return undefined + const framed = parseSseFrame(tail) + if (framed !== undefined) return framed + let parsed: { error?: { message?: string; type?: string } } + try { + parsed = JSON.parse(tail) + } catch { + return undefined + } + if (parsed.error) { + throw new ValidationError( + `bridgeExecutor: bridge upstream error: ${parsed.error.message ?? parsed.error.type ?? 'unknown'}`, + ) + } + return undefined +} + /** Parse one SSE frame (possibly multi-line `data:`/comment) into a chunk, `'done'`, * or undefined (comment/keepalive/empty). 
*/ function parseSseFrame(frame: string): BridgeStreamChunk | 'done' | undefined { @@ -1237,7 +1268,7 @@ function parseSseFrame(frame: string): BridgeStreamChunk | 'done' | undefined { } message?: { content?: string | null } }> - error?: { message?: string } + error?: { message?: string; type?: string } usage?: { prompt_tokens?: number; completion_tokens?: number; cost?: number } } try { @@ -1246,8 +1277,10 @@ function parseSseFrame(frame: string): BridgeStreamChunk | 'done' | undefined { return undefined } if (parsed.error) { + // `type` is the upstream's error class (e.g. kimi's access_terminated_error) + // — carry it when the payload has no message, never collapse to 'unknown'. throw new ValidationError( - `bridgeExecutor: bridge stream error: ${parsed.error.message ?? 'unknown'}`, + `bridgeExecutor: bridge stream error: ${parsed.error.message ?? parsed.error.type ?? 'unknown'}`, ) } const out: BridgeStreamChunk = {}