diff --git a/docs/api/primitive-catalog.md b/docs/api/primitive-catalog.md index 80ffe43..d06d544 100644 --- a/docs/api/primitive-catalog.md +++ b/docs/api/primitive-catalog.md @@ -7,7 +7,7 @@ # Primitive catalog — the never-stale anti-reinvention inventory -> **GENERATED** from `@tangle-network/agent-runtime@0.87.0` and `@tangle-network/agent-eval@0.103.1` by `scripts/gen-primitive-catalog.mjs`. Do NOT hand-edit — run `pnpm run docs:api`. This is the mechanical companion to the JUDGMENT in `canonical-api.md` (§2 decision table + §1.5 AgentProfile law): that doc says WHICH primitive to reach for and what NOT to build; this catalog proves WHAT exists. Per-symbol signatures + `file:line` live in the per-module pages under `docs/api/`. +> **GENERATED** from `@tangle-network/agent-runtime@0.87.0` and `@tangle-network/agent-eval@0.103.2` by `scripts/gen-primitive-catalog.mjs`. Do NOT hand-edit — run `pnpm run docs:api`. This is the mechanical companion to the JUDGMENT in `canonical-api.md` (§2 decision table + §1.5 AgentProfile law): that doc says WHICH primitive to reach for and what NOT to build; this catalog proves WHAT exists. Per-symbol signatures + `file:line` live in the per-module pages under `docs/api/`. ## 1. agent-runtime — own public surface @@ -92,7 +92,7 @@ Import from `@tangle-network/agent-runtime` — 211 exports. | `DELEGATED_LOOP_MODES` | const | All valid delegated-loop mode names — used for validation and CLI surfaces. | | `FORWARD_HEADERS` | const | Standard names — lowercased so Headers maps interop on every runtime. | | `INTELLIGENCE_WIRE_VERSION` | const | Wire version the eval-runs ingest enforces (X-Tangle-Wire-Version + body). | -| `AgentEvalError` | class | _(no summary — add a TSDoc line at the declaration)_ | +| `AgentEvalError` | class | Base class for every contract error this package throws — carries the stable | | `BackendTransportError` | class | A backend transport call (HTTP, gRPC, sidecar IPC) failed with a non-success | | `CircuitBreakerState` | class | Live circuit-breaker state — one instance per (participant, conversation run). | | `CircuitOpenError` | class | Thrown when the circuit breaker is open for a participant and no retry is allowed yet. | @@ -276,7 +276,7 @@ Import from `@tangle-network/agent-runtime/loops` — 430 exports. | `createMcpEnvironment` | function | Wrap any MCP server as an `Environment`: `tools/list` becomes `AgenticTool[]` with provider-safe schemas; the domain supplies only the artifact lifecycle hooks. | | `createPushTraceSource` | function | A push source for OWNED tool loops (router-tools / cli-bridge tool dispatch): the loop calls | | `createSandboxLineage` | function | Build a lineage bound to one client + its probed capabilities. The | -| `createSandboxToolPartState` | function | _(no summary — add a TSDoc line at the declaration)_ | +| `createSandboxToolPartState` | function | Fresh per-turn {@link SandboxToolPartState} for {@link mapSandboxToolEvent} — an | | `createScope` | function | Create the reactive `Scope` a driver's `Agent.act` runs inside: spawn children on an atomically reserved conserved budget, settle via the `next()` cursor, journal for replay. | | `createScopeAnalyst` | function | Build a `ScopeAnalyst` that spawns the analyst agent through `Scope.spawn` (so its compute is | | `createShapeRegistry` | function | Build a fresh open `ShapeRegistry`. A factory is stored type-erased and re-cast on resolve — the | @@ -287,7 +287,7 @@ Import from `@tangle-network/agent-runtime/loops` — 430 exports. | `decodeToolPart` | function | Decode a part with a specific harness's adapter when known, else try every registered adapter | | `defaultSelectWinner` | function | The kernel's winner argmax — best-valid-score, ties broken by earliest index, | | `defaultToolDetectors` | function | The default online panel for a tool-call pipe: a worker repeating the same call, or hammering | -| `defineLeaderboard` | function | _(no summary — add a TSDoc line at the declaration)_ | +| `defineLeaderboard` | function | Assemble a declarative spec (`cases` + `prompt` + `score`) into a runnable | | `definePersona` | function | Build a frozen `Persona`. Fails loud on the executors-supplied invariant: a persona with | | `defineStrategy` | function | Author a Strategy from the composable steps — the open, compact way. | | `delegate` | function | Delegate an INTENT to a default authoring supervisor and return its `SupervisedResult` unchanged. | @@ -967,7 +967,7 @@ Import from `@tangle-network/agent-eval/campaign` — 226 exports. | `defaultRenderDiff` | function | Default surface diff renderer: produces a unified baseline/winner text diff for prompt surfaces or a worktree-ref summary for code surfaces. | | `detectScale` | function | Detect the native scale of a set of scores: 0-100 when any magnitude clears | | `dimensionRegressions` | function | Per-critical-dimension regression guard. For each dimension, pair the | -| `discoverEvalFixtures` | function | _(no summary — add a TSDoc line at the declaration)_ | +| `discoverEvalFixtures` | function | Walk `evalsDir` and return the relative name of every fixture directory (one containing an exact-case `PROMPT.md`). | | `emitLoopProvenance` | function | Build the provenance record + OTel spans and persist them durably under the | | `evolutionaryProposer` | function | Wrap a stateless `Mutator` (GEPA, AxGEPA, reflective-mutation) as a `SurfaceProposer` that mutates the current best surface into N candidates each generation. | | `extractFapoAttributionSignals` | function | Scan a findings array and extract FAPO attribution signals — per-level counts and failure clusters used to decide which optimization level to escalate to next. | @@ -987,8 +987,8 @@ Import from `@tangle-network/agent-eval/campaign` — 226 exports. | `isProposedCandidate` | function | Type guard: a proposal carrying its rationale vs a bare | | `labelTrustRank` | function | Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). | | `llmJudge` | function | Build a campaign-shaped `JudgeConfig` whose `score()` makes ONE LLM call | -| `loadEvalFixture` | function | _(no summary — add a TSDoc line at the declaration)_ | -| `loadEvalFixtureScenarios` | function | _(no summary — add a TSDoc line at the declaration)_ | +| `loadEvalFixture` | function | Load ONE fixture by name: reads `PROMPT.md` (plus `EVAL.ts`/`EVAL.tsx` and `package.json` under | +| `loadEvalFixtureScenarios` | function | Load fixtures (all discovered, or just `names`) as campaign `Scenario`s tagged `eval-fixture`. | | `loopProvenanceSpans` | function | Build the loop's OTLP-ingestable spans from a provenance record. One root | | `makePlaybackDispatch` | function | Adapt a `PlaybackDriver` into a `runProfileMatrix` dispatch. The artifact the | | `memoryCurationProposer` | function | Build the CURATOR proposer. | @@ -998,9 +998,9 @@ Import from `@tangle-network/agent-eval/campaign` — 226 exports. | `paretoSignificanceGate` | function | Wrap the bus + a policy as a `Gate`. Plugs into the existing | | `parseSkillPatchResponse` | function | Parse a SkillOpt LLM response into validated `SkillPatch` objects, throwing `SkillPatchParseError` on malformed JSON and silently dropping ops that violate the edit budget. | | `patchEditCount` | function | Total ops in a patch — the edit-budget axis (SkillOpt's "textual learning | -| `planCampaignRun` | function | _(no summary — add a TSDoc line at the declaration)_ | -| `planEvalFixtureRun` | function | _(no summary — add a TSDoc line at the declaration)_ | -| `policyEditProposer` | function | _(no summary — add a TSDoc line at the declaration)_ | +| `planCampaignRun` | function | Plan a campaign WITHOUT dispatching: computes the manifest hash and the per-cell | +| `planEvalFixtureRun` | function | Dry-run planner for a fixture campaign: loads the scenarios, delegates to `planCampaignRun`, | +| `policyEditProposer` | function | `SurfaceProposer` that admission-checks typed analyst `PolicyEdit`s and applies each | | `provenanceRecordPath` | function | Canonical durable paths under the run dir. | | `provenanceSpansPath` | function | Canonical path for the durable OTLP spans JSONL file under a loop run directory. | | `renderScoreboardMarkdown` | function | Render the scoreboard as a launch-readiness Markdown document — the literal | @@ -1024,11 +1024,11 @@ Import from `@tangle-network/agent-eval/campaign` — 226 exports. | `traceAnalystProposer` | function | Wrap agent-eval's trace-analyst registry as a SurfaceProposer (prompt-tier). | | `userStoryScoreboard` | function | Flatten story verdicts into the per-requirement scoreboard — the literal | | `paretoPolicy` | const | The default strategy: symmetric multi-objective Pareto significance. Ship iff | -| `FsLabeledScenarioStore` | class | _(no summary — add a TSDoc line at the declaration)_ | -| `LabeledScenarioStoreError` | class | _(no summary — add a TSDoc line at the declaration)_ | +| `FsLabeledScenarioStore` | class | Filesystem `LabeledScenarioStore`: appends one JSONL file per source with provenance and | +| `LabeledScenarioStoreError` | class | Typed rejection from a labeled-scenario store (bad provenance, rate limit, invalid sample args) — carries a stable string `code`. | | `ProfileMatrixError` | class | Thrown when the matrix is misconfigured (no profiles, a profile whose model | | `SkillPatchParseError` | class | Parse + validate the patch response. Throws `SkillPatchParseError` when the | -| `WorktreeAdapterError` | class | _(no summary — add a TSDoc line at the declaration)_ | +| `WorktreeAdapterError` | class | Typed failure from a `WorktreeAdapter` operation (create/finalize/discard) — wraps the underlying git error as `cause`. | | `AceProposerOptions` | interface | `aceProposer` — Agentic Context Engineering: an APPEND-MOSTLY curator, the | | `AnalystArtifact` | interface | The analyst's output for one scenario — the artifact the judge scores. | | `AnalystScenario` | interface | A labeled trace scenario: a FIXED trace corpus plus the failure modes a | diff --git a/docs/api/runtime.md b/docs/api/runtime.md index 3693a0b..830b5e1 100644 --- a/docs/api/runtime.md +++ b/docs/api/runtime.md @@ -15815,7 +15815,11 @@ passes. Ground truth — the driver ends directly, no validation. The check read > **defineLeaderboard**\<`TCase`, `TArtifact`\>(`spec`): [`DefinedLeaderboard`](#definedleaderboard)\<`TCase`, `TArtifact`\> -Defined in: [runtime/define-leaderboard.ts:294](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L294) +Defined in: [runtime/define-leaderboard.ts:299](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/define-leaderboard.ts#L299) + +Assemble a declarative spec (`cases` + `prompt` + `score`) into a runnable +harness×model leaderboard — `run()` executes the matrix, `toBenchmarkAdapter()` +exposes the same domain as a structural `BenchmarkAdapter`. #### Type Parameters @@ -17219,10 +17223,13 @@ readonly `SandboxEvent`[] > **createSandboxToolPartState**(): [`SandboxToolPartState`](#sandboxtoolpartstate) -Defined in: [runtime/sandbox-events.ts:155](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/sandbox-events.ts#L155) +Defined in: [runtime/sandbox-events.ts:160](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/sandbox-events.ts#L160) **`Experimental`** +Fresh per-turn [SandboxToolPartState](#sandboxtoolpartstate) for [mapSandboxToolEvent](#mapsandboxtoolevent) — an +empty call-status map so each turn projects tool frames independently. + #### Returns [`SandboxToolPartState`](#sandboxtoolpartstate) @@ -17233,7 +17240,7 @@ Defined in: [runtime/sandbox-events.ts:155](https://github.com/tangle-network/ag > **mapSandboxToolEvent**(`event`, `state`): [`RuntimeStreamEvent`](index.md#runtimestreamevent) & `object`[] -Defined in: [runtime/sandbox-events.ts:186](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/sandbox-events.ts#L186) +Defined in: [runtime/sandbox-events.ts:191](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/sandbox-events.ts#L191) **`Experimental`** @@ -17277,7 +17284,7 @@ Returns `[]` for every non-tool event. > **mapSandboxEvent**(`event`, `opts?`): [`RuntimeStreamEvent`](index.md#runtimestreamevent) \| `undefined` -Defined in: [runtime/sandbox-events.ts:313](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/sandbox-events.ts#L313) +Defined in: [runtime/sandbox-events.ts:318](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/sandbox-events.ts#L318) Project one `SandboxEvent` onto the `RuntimeStreamEvent` chat-UX vocabulary, for runtimes that bridge a sandbox `streamPrompt` into the diff --git a/package.json b/package.json index de346bb..620c4bd 100644 --- a/package.json +++ b/package.json @@ -94,7 +94,7 @@ }, "devDependencies": { "@biomejs/biome": "^2.4.15", - "@tangle-network/agent-eval": "^0.103.1", + "@tangle-network/agent-eval": "^0.103.2", "@tangle-network/agent-interface": ">=0.14.0 <1.0.0", "@tangle-network/sandbox": ">=0.8.0 <1.0.0", "@types/node": "^25.9.3", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cce5f93..0bcd864 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -12,14 +12,14 @@ importers: specifier: ^2.4.15 version: 2.4.15 '@tangle-network/agent-eval': - specifier: ^0.103.1 - version: 0.103.1(typescript@5.9.3) + specifier: ^0.103.2 + version: 0.103.2(typescript@5.9.3) '@tangle-network/agent-interface': specifier: '>=0.14.0 <1.0.0' version: 0.14.0 '@tangle-network/sandbox': specifier: '>=0.8.0 <1.0.0' - version: 0.9.5(viem@2.52.2(typescript@5.9.3)(zod@4.4.3)) + version: 0.9.5(viem@2.54.2(typescript@5.9.3)(zod@4.4.3)) '@types/node': specifier: ^25.9.3 version: 25.9.3 @@ -432,8 +432,8 @@ packages: '@gerrit0/mini-shiki@3.23.0': resolution: {integrity: sha512-bEMORlG0cqdjVyCEuU0cDQbORWX+kYCeo0kV1lbxF5bt4r7SID2l9bqsxJEM0zndaxpOUT7riCyIVEuqq/Ynxg==} - '@hono/node-server@2.0.4': - resolution: {integrity: sha512-Ut3y0dMMPWy6bZ2kVfx25EOVbZlm15dhF4mOsezMlhpNHy+4MkU1qN9Y6lnruYi4wPmFzimGX2X7LF/FwHli4A==} + '@hono/node-server@2.0.8': + resolution: {integrity: sha512-GuCWzLxwg218fy1JaHculFsdcuY12hxit83V+algozTPnwhNjLrRL/Alg9OYjLZLoUZ1rw/S4CdTMsnkSKCmFA==} engines: {node: '>=20'} peerDependencies: hono: ^4 @@ -636,13 +636,13 @@ packages: '@tangle-network/agent-core@0.3.4': resolution: {integrity: sha512-Hvz3ABRouNtBmRvGqPxifAO2yuILneJMylWH5jW/jeS2F03RvqkGYuXyGXWWLqosYbb3hVAvSEe4Ykm2FMGEDQ==} - '@tangle-network/agent-eval@0.103.1': - resolution: {integrity: sha512-9V37IcaRixSfIUkZ50pgU8a5nSVrkVmq5BimNLwVzbi3USwOkkJ9RcecMScpLUnrYNeaoe5Sac8lS6kzL1uTDQ==} + '@tangle-network/agent-eval@0.103.2': + resolution: {integrity: sha512-ANdqOcd171PUSzPP8ul8AB8Y2uJM16XYP8A8aAxs3Jdr9vznENiajzm9b8ibNG+VwLiclnD9RPbaYUdZzDqUBg==} engines: {node: '>=20'} hasBin: true - '@tangle-network/agent-interface@0.10.0': - resolution: {integrity: sha512-oiREgihkeX/xcGEtFfi9AkAfU2VzuF7SSla2s0iliXPUXyHCIIx6jwzHiYdwb1ZGCfvC+T+0SWOIa6fN5u195g==} + '@tangle-network/agent-interface@0.10.1': + resolution: {integrity: sha512-yehY/0EgKvu8lG6jIVoZCtMPLkj8VEWwasuAtuph2RaB9MKE5wuxRF647O6jw8KufNZ3aQ2UVVWpZ19dGCbs6w==} '@tangle-network/agent-interface@0.13.0': resolution: {integrity: sha512-CeTPGRLoXqpt0h+BCyFgZPkfU1zyRpWmqfD+85i/uk+uvbqxkfI+JprfKVf3tBsQuCgJPSjPt5qjdW8n3h2BVg==} @@ -862,8 +862,8 @@ packages: engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} os: [darwin] - hono@4.12.25: - resolution: {integrity: sha512-2NFaIyNVgJmBs/ecmtGzlmluTFs5cHEWGTdu0t1HBwYzoGXOL5nUQBRMXsXWla5i4KkG//QMzVP88m1+I3fdAQ==} + hono@4.12.27: + resolution: {integrity: sha512-1yrb/+w6HWQJrUCLkJ2IF5jNIPvvFkblV5RNOYl6bV+OA6p9GLcMpHFFGTosSvHvcAUibuUukRqhlYI4z32C7Q==} engines: {node: '>=16.9.0'} isows@1.0.7: @@ -1123,8 +1123,8 @@ packages: undici-types@7.24.6: resolution: {integrity: sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg==} - viem@2.52.2: - resolution: {integrity: sha512-HSU12p5aD/kAPZfrlbCUqdiP4P/c6hQ9AhfTS51VbLUQIjkWd1d5EjrCx/SCxZ0zhZVRn4Iv5X5WDqXPG8Ubew==} + viem@2.54.2: + resolution: {integrity: sha512-o0+5dEAUekBMTbixXy2mKbSDPnwsCJ+8+mOeMBDjkuS9iM4fcr3yKUWb2zlOy2NKInkg3anl1W11sxYspLiXig==} peerDependencies: typescript: '>=5.0.4' peerDependenciesMeta: @@ -1209,8 +1209,8 @@ packages: engines: {node: '>=8'} hasBin: true - ws@8.20.1: - resolution: {integrity: sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==} + ws@8.21.0: + resolution: {integrity: sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==} engines: {node: '>=10.0.0'} peerDependencies: bufferutil: ^4.0.1 @@ -1444,9 +1444,9 @@ snapshots: '@shikijs/types': 3.23.0 '@shikijs/vscode-textmate': 10.0.2 - '@hono/node-server@2.0.4(hono@4.12.25)': + '@hono/node-server@2.0.8(hono@4.12.27)': dependencies: - hono: 4.12.25 + hono: 4.12.27 '@jridgewell/gen-mapping@0.3.13': dependencies: @@ -1604,14 +1604,14 @@ snapshots: '@tangle-network/agent-interface': 0.14.0 zod: 4.4.3 - '@tangle-network/agent-eval@0.103.1(typescript@5.9.3)': + '@tangle-network/agent-eval@0.103.2(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.3) '@ax-llm/ax': 19.0.45(zod@4.4.3) - '@hono/node-server': 2.0.4(hono@4.12.25) - '@tangle-network/agent-interface': 0.10.0 + '@hono/node-server': 2.0.8(hono@4.12.27) + '@tangle-network/agent-interface': 0.10.1 '@tangle-network/tcloud': 0.4.14(typescript@5.9.3)(zod@4.4.3) - hono: 4.12.25 + hono: 4.12.27 zod: 4.4.3 transitivePeerDependencies: - '@mastra/core' @@ -1622,7 +1622,7 @@ snapshots: - typescript - utf-8-validate - '@tangle-network/agent-interface@0.10.0': + '@tangle-network/agent-interface@0.10.1': dependencies: zod: 4.4.3 @@ -1634,12 +1634,12 @@ snapshots: dependencies: zod: 4.4.3 - '@tangle-network/sandbox@0.9.5(viem@2.52.2(typescript@5.9.3)(zod@4.4.3))': + '@tangle-network/sandbox@0.9.5(viem@2.54.2(typescript@5.9.3)(zod@4.4.3))': dependencies: '@tangle-network/agent-core': 0.3.4 '@tangle-network/agent-interface': 0.13.0 optionalDependencies: - viem: 2.52.2(typescript@5.9.3)(zod@4.4.3) + viem: 2.54.2(typescript@5.9.3)(zod@4.4.3) '@tangle-network/tcloud-attestation@0.1.1': {} @@ -1647,10 +1647,10 @@ snapshots: dependencies: '@scure/bip32': 2.2.0 '@scure/bip39': 2.2.0 - '@tangle-network/sandbox': 0.9.5(viem@2.52.2(typescript@5.9.3)(zod@4.4.3)) + '@tangle-network/sandbox': 0.9.5(viem@2.54.2(typescript@5.9.3)(zod@4.4.3)) '@tangle-network/tcloud-attestation': 0.1.1 commander: 14.0.3 - viem: 2.52.2(typescript@5.9.3)(zod@4.4.3) + viem: 2.54.2(typescript@5.9.3)(zod@4.4.3) transitivePeerDependencies: - '@mastra/core' - '@modelcontextprotocol/sdk' @@ -1864,11 +1864,11 @@ snapshots: fsevents@2.3.3: optional: true - hono@4.12.25: {} + hono@4.12.27: {} - isows@1.0.7(ws@8.20.1): + isows@1.0.7(ws@8.21.0): dependencies: - ws: 8.20.1 + ws: 8.21.0 joycon@3.1.1: {} @@ -2126,16 +2126,16 @@ snapshots: undici-types@7.24.6: {} - viem@2.52.2(typescript@5.9.3)(zod@4.4.3): + viem@2.54.2(typescript@5.9.3)(zod@4.4.3): dependencies: '@noble/curves': 1.9.1 '@noble/hashes': 1.8.0 '@scure/bip32': 1.7.0 '@scure/bip39': 1.6.0 abitype: 1.2.3(typescript@5.9.3)(zod@4.4.3) - isows: 1.0.7(ws@8.20.1) + isows: 1.0.7(ws@8.21.0) ox: 0.14.29(typescript@5.9.3)(zod@4.4.3) - ws: 8.20.1 + ws: 8.21.0 optionalDependencies: typescript: 5.9.3 transitivePeerDependencies: @@ -2224,7 +2224,7 @@ snapshots: siginfo: 2.0.0 stackback: 0.0.2 - ws@8.20.1: {} + ws@8.21.0: {} yaml@2.9.0: {} diff --git a/scripts/gen-primitive-catalog.mjs b/scripts/gen-primitive-catalog.mjs index 0fd1851..6ed9f83 100644 --- a/scripts/gen-primitive-catalog.mjs +++ b/scripts/gen-primitive-catalog.mjs @@ -271,7 +271,7 @@ for (let i = 0; i < allModules.length; i++) bySpecifier.set(allModules[i].specif // The ceiling is the exact current count; when a backfill lowers the real number, // lower the constant to match. Exceeding it (a new undocumented callable) exits 1. -const maxUndocumentedCallables = 34 +const maxUndocumentedCallables = 0 const ratchetKinds = new Set(['function', 'class', 'const']) // ───────────────────────────────────────────────────────────────────────────── diff --git a/src/runtime/define-leaderboard.ts b/src/runtime/define-leaderboard.ts index 0596413..6095dbf 100644 --- a/src/runtime/define-leaderboard.ts +++ b/src/runtime/define-leaderboard.ts @@ -291,6 +291,11 @@ function normalizeScore(s: number | LeaderboardScore): LeaderboardScore { return typeof s === 'number' ? { composite: s } : s } +/** + * Assemble a declarative spec (`cases` + `prompt` + `score`) into a runnable + * harness×model leaderboard — `run()` executes the matrix, `toBenchmarkAdapter()` + * exposes the same domain as a structural `BenchmarkAdapter`. + */ export function defineLeaderboard( spec: LeaderboardSpec, ): DefinedLeaderboard { diff --git a/src/runtime/sandbox-events.ts b/src/runtime/sandbox-events.ts index 829c640..d2b6c86 100644 --- a/src/runtime/sandbox-events.ts +++ b/src/runtime/sandbox-events.ts @@ -151,7 +151,12 @@ export interface SandboxToolPartState { seq: number } -/** @experimental */ +/** + * Fresh per-turn {@link SandboxToolPartState} for {@link mapSandboxToolEvent} — an + * empty call-status map so each turn projects tool frames independently. + * + * @experimental + */ export function createSandboxToolPartState(): SandboxToolPartState { return { statusByCall: new Map(), seq: 0 } }