Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 52 additions & 36 deletions docs/api/primitive-catalog.md

Large diffs are not rendered by default.

689 changes: 689 additions & 0 deletions docs/api/runtime.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/canonical-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<!-- This doc is the JUDGMENT layer: the mental model (§1), the AgentProfile law (§1.5), and the anti-reinvention decision table (§2) — WHICH primitive to reach for and what NOT to build. The export INVENTORY (WHAT exists) and per-symbol signatures + `file:line` are GENERATED into `docs/api/` (TypeDoc + `scripts/gen-primitive-catalog.mjs`, do NOT hand-edit) — that is the mechanical reference: `docs/api/primitive-catalog.md` is the never-stale list of every primitive to reuse. The freshness gate (`pnpm docs:freshness`) FAILS CI if a version pin, a cited `file:line`, a decision-table symbol, or the generated catalog drifts from source — see `docs/MAINTAINING.md`. Keep this file the small, hand-curated spine; never re-list the inventory here — point at the catalog. -->

> **Version 0.83.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.97.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under.
> **Version 0.84.0.** The export inventory + per-symbol signatures live in the generated `docs/api/` reference: **`docs/api/primitive-catalog.md`** is the never-stale, grouped list of every primitive to reuse (own surface + the agent-eval judge / authenticity / verification / statistics / campaign / token-usage surfaces), with each one's import path and one-line summary read live from source; the per-module pages hold the full signatures. The pinned substrate is agent-eval `>=0.101.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`, plus environment-provider types) are owned by **`@tangle-network/agent-interface`** (peer `>=0.14.0 <1.0.0`) — the single source of truth. Substrate primitives are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package — the catalog's §2 shows exactly which subpath each lives under.
>
> **`./loops` is the runtime barrel** — `package.json` maps it to `src/runtime/index.ts`. Everything below labelled `/loops` is the recursive-atom + loop-kernel surface.
>
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-runtime",
"version": "0.83.0",
"version": "0.84.0",
"description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
"homepage": "https://github.com/tangle-network/agent-runtime#readme",
"repository": {
Expand Down Expand Up @@ -94,7 +94,7 @@
},
"devDependencies": {
"@biomejs/biome": "^2.4.15",
"@tangle-network/agent-eval": ">=0.100.0 <1.0.0",
"@tangle-network/agent-eval": "^0.103.1",
"@tangle-network/agent-interface": ">=0.14.0 <1.0.0",
"@tangle-network/sandbox": ">=0.8.0 <1.0.0",
"@types/node": "^25.9.3",
Expand Down Expand Up @@ -123,7 +123,7 @@
"license": "MIT",
"packageManager": "pnpm@10.28.0",
"peerDependencies": {
"@tangle-network/agent-eval": ">=0.97.0 <1.0.0",
"@tangle-network/agent-eval": ">=0.101.0 <1.0.0",
"@tangle-network/agent-interface": ">=0.14.0 <1.0.0",
"@tangle-network/sandbox": ">=0.8.0 <1.0.0",
"playwright": "^1.40.0"
Expand Down
53 changes: 10 additions & 43 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

182 changes: 182 additions & 0 deletions src/runtime/define-leaderboard.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import { mkdtempSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import type { SandboxEvent } from '@tangle-network/sandbox'
import { describe, expect, it } from 'vitest'
import { defineLeaderboard, type LeaderboardRunContext } from './define-leaderboard'
import { inProcessSandboxClient } from './in-process-sandbox-client'

/** Minimal case shape for these tests: `id` names the case and `answer` is the
 * string the prompt embeds and the scorer looks for in the output. */
interface FakeCase {
id: string
answer: string
}

// The two-case corpus every test board is built on. The ids are what `--cases`
// selects and what the tests expect as keys of `result.byScenario`; the answers
// are distinct so a cell can only score by echoing its own case's answer.
const CASES: FakeCase[] = [
{ id: 'case-alpha', answer: 'ALPHA-42' },
{ id: 'case-beta', answer: 'BETA-7' },
]

/**
 * Builds the offline sandbox client used as the `inproc` backend in these tests.
 *
 * For every prompt it extracts the token that follows `answer=` (or `'missing'`
 * when the prompt carries none) and replies with two events: a metered
 * `llm_call`, then a `result` whose final text echoes that answer. The
 * `llm_call` is emitted so the matrix integrity guard sees a real (non-stub)
 * backend.
 */
function fakeBackend() {
  return inProcessSandboxClient({
    onPrompt: (prompt): SandboxEvent[] => {
      const echoed = prompt.match(/answer=(\S+)/)?.[1] ?? 'missing'
      const metered: SandboxEvent = {
        type: 'llm_call',
        data: { tokensIn: 12, tokensOut: 6, costUsd: 0.002 },
      }
      const final: SandboxEvent = {
        type: 'result',
        data: { finalText: `final answer=${echoed}` },
      }
      return [metered, final]
    },
  })
}

/**
 * Creates the leaderboard under test from a fixed baseline spec.
 *
 * The baseline runs `CASES` against the `inproc` fake backend, prompts with the
 * case's answer embedded, and scores 1 when the output contains that answer
 * (0 otherwise).
 *
 * @param overrides Spec fields that replace the baseline ones; any key present
 *   here wins over the baseline value.
 * @returns Whatever `defineLeaderboard` returns for the merged spec.
 */
function board(overrides: Partial<Parameters<typeof defineLeaderboard<FakeCase>>[0]> = {}) {
  const defaults: Parameters<typeof defineLeaderboard<FakeCase>>[0] = {
    name: 'fake-board',
    cases: CASES,
    prompt: async (c) => `solve the task. answer=${c.answer}`,
    score: (output, c) => (output.includes(c.answer) ? 1 : 0),
    backends: { inproc: fakeBackend },
    export: async () => {}, // silence the default table print in tests
  }
  // Overrides are spread last so a test can swap out any baseline field.
  return defineLeaderboard<FakeCase>({ ...defaults, ...overrides })
}

// Shared CLI-style argv for `run()`: selects the `inproc` backend registered by
// `board()`, one harness, and one model id that already carries an `@` snapshot
// suffix. Tests spread it and append their own flags (e.g. `--cases`).
const AXIS = ['--backend', 'inproc', '--harnesses', 'opencode', '--models', 'test-model@2026-01-01']

// End-to-end suite for `defineLeaderboard`, run offline against the in-process
// fake backend. Covers: the full matrix run, run-dir selection, case subsetting,
// model-id stamping, structured scoring, the event and flag hooks, the
// missing-backend failure, and the `toBenchmarkAdapter()` view.
describe('defineLeaderboard', () => {
it('runs the matrix end-to-end offline and scores every (profile, case) cell', async () => {
const result = await board().run([...AXIS])

// Two cases x one (harness, model) pair: two records, one profile summary.
expect(result.records).toHaveLength(2)
expect(Object.keys(result.byScenario).sort()).toEqual(['case-alpha', 'case-beta'])
const summaries = Object.values(result.byProfile)
expect(summaries).toHaveLength(1)
// The fake backend echoes each case's answer, so every cell scores 1.
expect(summaries[0]?.meanComposite).toBe(1)
expect(summaries[0]?.model).toBe('test-model@2026-01-01')
// The fake backend's llm_call events were metered — the run is REAL, not a stub.
expect(result.integrity.verdict).toBe('real')
for (const r of result.records) expect(r.tokenUsage.input).toBeGreaterThan(0)
})

it('defaults to a FRESH run dir per invocation (no stale cell-cache reuse)', async () => {
const dirs: string[] = []
// The export hook is used here only to capture the run dir of each invocation.
const b = board({
export: async (_result, ctx: LeaderboardRunContext) => {
dirs.push(ctx.runDir)
},
})
// Same board, same args, no --run-dir: the two runs must not share a directory.
await b.run([...AXIS, '--cases', 'case-alpha'])
await b.run([...AXIS, '--cases', 'case-alpha'])
expect(dirs).toHaveLength(2)
expect(dirs[0]).not.toBe(dirs[1])
// Both default run dirs are expected to live under the OS temp directory.
for (const d of dirs) expect(d.startsWith(tmpdir())).toBe(true)
})

it('honors an explicit --run-dir (the opt-in resume path)', async () => {
// NOTE(review): this temp directory is not removed by the test.
const runDir = mkdtempSync(join(tmpdir(), 'lb-explicit-'))
let seen: string | undefined
await board({
export: async (_r, ctx) => {
seen = ctx.runDir
},
}).run([...AXIS, '--cases', 'case-alpha', '--run-dir', runDir])
// The hook must see exactly the directory passed on the command line.
expect(seen).toBe(runDir)
})

it('subsets cases via --cases and rejects unknown ids', async () => {
const result = await board().run([...AXIS, '--cases', 'case-beta'])
expect(result.records).toHaveLength(1)
expect(Object.keys(result.byScenario)).toEqual(['case-beta'])

// An id that is not in CASES must reject rather than run an empty matrix.
await expect(board().run([...AXIS, '--cases', 'nope'])).rejects.toThrow(/unknown case "nope"/)
})

it('stamps a snapshot onto bare model ids (RunRecord identity requirement)', async () => {
// Unlike AXIS, the model id here has no '@' suffix.
const result = await board().run([
'--backend',
'inproc',
'--harnesses',
'opencode',
'--models',
'test-model',
])
expect(result.records[0]?.model).toBe('test-model@leaderboard')
})

// NOTE(review): the title mentions dimensions and notes, but the assertions
// below only check the composite (via meanComposite and outcome.searchScore).
it('wraps score() as the campaign judge, carrying dimensions and notes', async () => {
const result = await board({
// Structured score object instead of the bare number the baseline returns.
score: (output, c) => ({
composite: output.includes(c.answer) ? 0.5 : 0,
dimensions: { exactness: 1 },
notes: 'structured',
}),
}).run([...AXIS, '--cases', 'case-alpha'])
expect(Object.values(result.byProfile)[0]?.meanComposite).toBe(0.5)
// `outcome` is cast to a minimal shape just to read `searchScore`.
const outcome = result.records[0]?.outcome as { searchScore?: number } | undefined
expect(outcome?.searchScore).toBe(0.5)
})

it('feeds each cell raw events + case through onCellEvents (the metric-capture seam)', async () => {
const seen: Array<{ id: string; types: string[] }> = []
await board({
// Record, per cell, the case id and the `type` of every event received.
onCellEvents: (events, c) => {
seen.push({ id: c.id, types: events.map((e) => (e as { type: string }).type) })
},
}).run([...AXIS])
// One callback per case, each including the fake backend's llm_call event.
expect(seen.map((s) => s.id).sort()).toEqual(['case-alpha', 'case-beta'])
for (const s of seen) expect(s.types).toContain('llm_call')
})

it('parses spec.flags and surfaces every flag to the hooks via ctx.args', async () => {
let args: Record<string, string | undefined> = {}
await board({
// `split` is a board-specific flag declared with a default of 'dev'.
flags: { split: { default: 'dev', description: 'dataset split' } },
setup: (ctx) => {
args = ctx.args
},
}).run([...AXIS, '--cases', 'case-alpha', '--split', 'holdout'])
// The command-line value wins over the declared default...
expect(args.split).toBe('holdout')
// ...and the flags supplied through AXIS are visible on ctx.args as well.
expect(args.backend).toBe('inproc')
expect(args.harnesses).toBe('opencode')
})

it("fails loud on the default 'sandbox' backend with guidance to supply a real client", async () => {
// No `backends` in the spec and no --backend flag: the run must reject with
// a message that mentions `backends.sandbox`.
await expect(
defineLeaderboard<FakeCase>({
name: 'no-backend',
cases: CASES,
prompt: (c) => c.id,
score: () => 0,
}).run(['--models', 'm@1']),
).rejects.toThrow(/backends\.sandbox/)
})

it('toBenchmarkAdapter(): loadTasks/judge round-trip in the structural BenchmarkAdapter shape', async () => {
const adapter = board().toBenchmarkAdapter()
expect(adapter.name).toBe('fake-board')
await adapter.preflight()

// Tasks come back in CASES order, with the prompt already rendered.
const tasks = await adapter.loadTasks()
expect(tasks.map((t) => t.id)).toEqual(['case-alpha', 'case-beta'])
expect(tasks[0]?.prompt).toContain('answer=ALPHA-42')

// `tasks[0]` is cast to the task shape; its presence is established by the
// id assertion above.
const pass = await adapter.judge(tasks[0] as { id: string; prompt: string }, 'final ALPHA-42')
expect(pass).toMatchObject({ resolved: true, score: 1 })
const fail = await adapter.judge(tasks[0] as { id: string; prompt: string }, 'wrong')
expect(fail).toMatchObject({ resolved: false, score: 0 })

// `ids` filters the task list; this board yields no gold artifact.
const subset = await adapter.loadTasks({ ids: ['case-beta'] })
expect(subset.map((t) => t.id)).toEqual(['case-beta'])
expect(await adapter.goldArtifact(tasks[0] as { id: string; prompt: string })).toBeUndefined()

// preflight fails loud on duplicate ids — the cheap corpus-integrity check.
const dup = defineLeaderboard<FakeCase>({
name: 'dup',
cases: [CASES[0] as FakeCase, CASES[0] as FakeCase],
prompt: (c) => c.id,
score: () => 0,
}).toBenchmarkAdapter()
await expect(dup.preflight()).rejects.toThrow(/duplicate case id/)
})
})
Loading
Loading