tangle-network · drewstone · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026
diff --git a/docs/api/primitive-catalog.md b/docs/api/primitive-catalog.md
diff --git a/docs/api/runtime.md b/docs/api/runtime.md
diff --git a/docs/canonical-api.md b/docs/canonical-api.md
@@ -2,7 +2,7 @@
 
 <!-- This doc is the JUDGMENT layer: the mental model (§1), the AgentProfile law (§1.5), and the anti-reinvention decision table (§2) — WHICH primitive to reach for and what NOT to build. The export INVENTORY (WHAT exists) and per-symbol signatures + `file:line` are GENERATED into `docs/api/` (TypeDoc + `scripts/gen-primitive-catalog.mjs`, do NOT hand-edit) — that is the mechanical reference: `docs/api/primitive-catalog.md` is the never-stale list of every primitive to reuse. The freshness gate (`pnpm docs:freshness`) FAILS CI if a version pin, a cited `file:line`, a decision-table symbol, or the generated catalog drifts from source — see `docs/MAINTAINING.md`. Keep this file the small, hand-curated spine; never re-list the inventory here — point at the catalog. -->
 
-> **Version 0.83.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.97.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under.
+> **Version 0.84.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.101.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under.
 >
 > **`./loops` is the runtime barrel** — `package.json` maps it to `src/runtime/index.ts`. Everything below labelled `/loops` is the recursive-atom + loop-kernel surface.
 >

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.83.0",
+  "version": "0.84.0",
   "description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
@@ -94,7 +94,7 @@
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.15",
-    "@tangle-network/agent-eval": ">=0.100.0 <1.0.0",
+    "@tangle-network/agent-eval": "^0.103.1",
     "@tangle-network/agent-interface": ">=0.14.0 <1.0.0",
     "@tangle-network/sandbox": ">=0.8.0 <1.0.0",
     "@types/node": "^25.9.3",
@@ -123,7 +123,7 @@
   "license": "MIT",
   "packageManager": "pnpm@10.28.0",
   "peerDependencies": {
-    "@tangle-network/agent-eval": ">=0.97.0 <1.0.0",
+    "@tangle-network/agent-eval": ">=0.101.0 <1.0.0",
     "@tangle-network/agent-interface": ">=0.14.0 <1.0.0",
     "@tangle-network/sandbox": ">=0.8.0 <1.0.0",
     "playwright": "^1.40.0"

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/src/runtime/define-leaderboard.test.ts b/src/runtime/define-leaderboard.test.ts
@@ -0,0 +1,182 @@
+import { mkdtempSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import type { SandboxEvent } from '@tangle-network/sandbox'
+import { describe, expect, it } from 'vitest'
+import { defineLeaderboard, type LeaderboardRunContext } from './define-leaderboard'
+import { inProcessSandboxClient } from './in-process-sandbox-client'
+
+interface FakeCase {
+  id: string // case identifier; the tests select cases by it via `--cases`
+  answer: string // token board() embeds in the prompt and its score() looks for in the output
+}
+
+const CASES: FakeCase[] = [
+  { id: 'case-alpha', answer: 'ALPHA-42' }, // answers are distinct: neither string contains the other
+  { id: 'case-beta', answer: 'BETA-7' },
+]
+
+/** Offline stand-in for a sandbox backend. Each prompt yields a metered `llm_call`
+ *  followed by a `result` echoing the prompt's `answer=` token ('missing' if absent),
+ *  so the matrix integrity guard sees a real (non-stub) backend. */
+function fakeBackend() {
+  const replyTo = (prompt: string): SandboxEvent[] => {
+    const echoed = prompt.match(/answer=(\S+)/)?.[1] ?? 'missing'
+    return [
+      { type: 'llm_call', data: { tokensIn: 12, tokensOut: 6, costUsd: 0.002 } },
+      { type: 'result', data: { finalText: `final answer=${echoed}` } },
+    ]
+  }
+  return inProcessSandboxClient({ onPrompt: replyTo })
+}
+
+function board(overrides: Partial<Parameters<typeof defineLeaderboard<FakeCase>>[0]> = {}) {
+  return defineLeaderboard<FakeCase>({ // shared fixture board; each test overrides only what it needs
+    name: 'fake-board',
+    cases: CASES,
+    prompt: async (c) => `solve the task. answer=${c.answer}`, // embeds the answer so fakeBackend can echo it
+    score: (output, c) => (output.includes(c.answer) ? 1 : 0), // 1 when the output contains the answer, else 0
+    backends: { inproc: fakeBackend },
+    export: async () => {}, // silence the default table print in tests
+    ...overrides, // spread last so caller-supplied fields replace the defaults above
+  })
+}
+
+const AXIS = ['--backend', 'inproc', '--harnesses', 'opencode', '--models', 'test-model@2026-01-01'] // baseline argv reused by most runs
+
+describe('defineLeaderboard', () => {
+  it('runs the matrix end-to-end offline and scores every (profile, case) cell', async () => {
+    const result = await board().run([...AXIS])
+
+    expect(result.records).toHaveLength(2) // one record per entry in CASES
+    expect(Object.keys(result.byScenario).sort()).toEqual(['case-alpha', 'case-beta'])
+    const summaries = Object.values(result.byProfile)
+    expect(summaries).toHaveLength(1) // AXIS names one harness and one model
+    expect(summaries[0]?.meanComposite).toBe(1) // both cases score 1: the fake backend echoes each answer
+    expect(summaries[0]?.model).toBe('test-model@2026-01-01')
+    // The fake backend's llm_call events were metered — the run is REAL, not a stub.
+    expect(result.integrity.verdict).toBe('real')
+    for (const r of result.records) expect(r.tokenUsage.input).toBeGreaterThan(0)
+  })
+
+  it('defaults to a FRESH run dir per invocation (no stale cell-cache reuse)', async () => {
+    const dirs: string[] = [] // run dir reported to the export hook, one entry per run
+    const b = board({
+      export: async (_result, ctx: LeaderboardRunContext) => {
+        dirs.push(ctx.runDir)
+      },
+    })
+    await b.run([...AXIS, '--cases', 'case-alpha'])
+    await b.run([...AXIS, '--cases', 'case-alpha']) // same board, identical argv, no --run-dir
+    expect(dirs).toHaveLength(2)
+    expect(dirs[0]).not.toBe(dirs[1])
+    for (const d of dirs) expect(d.startsWith(tmpdir())).toBe(true) // default dirs sit under the OS temp dir
+  })
+
+  it('honors an explicit --run-dir (the opt-in resume path)', async () => {
+    const runDir = mkdtempSync(join(tmpdir(), 'lb-explicit-')) // NOTE(review): never removed after the test
+    let seen: string | undefined
+    await board({
+      export: async (_r, ctx) => {
+        seen = ctx.runDir
+      },
+    }).run([...AXIS, '--cases', 'case-alpha', '--run-dir', runDir])
+    expect(seen).toBe(runDir) // the export hook received exactly the dir passed on the command line
+  })
+
+  it('subsets cases via --cases and rejects unknown ids', async () => {
+    const result = await board().run([...AXIS, '--cases', 'case-beta'])
+    expect(result.records).toHaveLength(1) // only the selected case produced a record
+    expect(Object.keys(result.byScenario)).toEqual(['case-beta'])
+
+    await expect(board().run([...AXIS, '--cases', 'nope'])).rejects.toThrow(/unknown case "nope"/)
+  })
+
+  it('stamps a snapshot onto bare model ids (RunRecord identity requirement)', async () => {
+    const result = await board().run([
+      '--backend',
+      'inproc',
+      '--harnesses',
+      'opencode',
+      '--models',
+      'test-model', // bare id: unlike AXIS, no `@snapshot` suffix
+    ])
+    expect(result.records[0]?.model).toBe('test-model@leaderboard') // the record carries an `@leaderboard` suffix
+  })
+
+  it('wraps score() as the campaign judge, carrying dimensions and notes', async () => {
+    const result = await board({
+      score: (output, c) => ({
+        composite: output.includes(c.answer) ? 0.5 : 0,
+        dimensions: { exactness: 1 },
+        notes: 'structured',
+      }), // NOTE(review): dimensions/notes are supplied but never asserted below — confirm coverage
+    }).run([...AXIS, '--cases', 'case-alpha'])
+    expect(Object.values(result.byProfile)[0]?.meanComposite).toBe(0.5)
+    const outcome = result.records[0]?.outcome as { searchScore?: number } | undefined // cast only to read searchScore
+    expect(outcome?.searchScore).toBe(0.5)
+  })
+
+  it('feeds each cell raw events + case through onCellEvents (the metric-capture seam)', async () => {
+    const seen: Array<{ id: string; types: string[] }> = []
+    await board({
+      onCellEvents: (events, c) => {
+        seen.push({ id: c.id, types: events.map((e) => (e as { type: string }).type) }) // keep only each event's type tag
+      },
+    }).run([...AXIS])
+    expect(seen.map((s) => s.id).sort()).toEqual(['case-alpha', 'case-beta']) // hook fired once per case
+    for (const s of seen) expect(s.types).toContain('llm_call')
+  })
+
+  it('parses spec.flags and surfaces every flag to the hooks via ctx.args', async () => {
+    let args: Record<string, string | undefined> = {}
+    await board({
+      flags: { split: { default: 'dev', description: 'dataset split' } },
+      setup: (ctx) => {
+        args = ctx.args // capture the parsed args for the assertions below
+      },
+    }).run([...AXIS, '--cases', 'case-alpha', '--split', 'holdout'])
+    expect(args.split).toBe('holdout') // the CLI value wins over the flag's 'dev' default
+    expect(args.backend).toBe('inproc') // flags not declared in spec.flags are exposed too
+    expect(args.harnesses).toBe('opencode')
+  })
+
+  it("fails loud on the default 'sandbox' backend with guidance to supply a real client", async () => {
+    await expect(
+      defineLeaderboard<FakeCase>({
+        name: 'no-backend',
+        cases: CASES,
+        prompt: (c) => c.id,
+        score: () => 0,
+      }).run(['--models', 'm@1']), // no `backends` in the spec and no --backend in argv
+    ).rejects.toThrow(/backends\.sandbox/) // the error message must mention `backends.sandbox`
+  })
+
+  it('toBenchmarkAdapter(): loadTasks/judge round-trip in the structural BenchmarkAdapter shape', async () => {
+    const adapter = board().toBenchmarkAdapter()
+    expect(adapter.name).toBe('fake-board')
+    await adapter.preflight() // must resolve here: the ids in CASES are unique
+
+    const tasks = await adapter.loadTasks()
+    expect(tasks.map((t) => t.id)).toEqual(['case-alpha', 'case-beta'])
+    expect(tasks[0]?.prompt).toContain('answer=ALPHA-42') // the task prompt comes from the board's prompt()
+
+    const pass = await adapter.judge(tasks[0] as { id: string; prompt: string }, 'final ALPHA-42')
+    expect(pass).toMatchObject({ resolved: true, score: 1 })
+    const fail = await adapter.judge(tasks[0] as { id: string; prompt: string }, 'wrong')
+    expect(fail).toMatchObject({ resolved: false, score: 0 })
+
+    const subset = await adapter.loadTasks({ ids: ['case-beta'] })
+    expect(subset.map((t) => t.id)).toEqual(['case-beta'])
+    expect(await adapter.goldArtifact(tasks[0] as { id: string; prompt: string })).toBeUndefined()
+
+    // preflight fails loud on duplicate ids — the cheap corpus-integrity check.
+    const dup = defineLeaderboard<FakeCase>({
+      name: 'dup',
+      cases: [CASES[0] as FakeCase, CASES[0] as FakeCase], // the same case twice, so ids collide
+      prompt: (c) => c.id,
+      score: () => 0,
+    }).toBenchmarkAdapter()
+    await expect(dup.preflight()).rejects.toThrow(/duplicate case id/)
+  })
+})