From 0e35e5f0b10c2c9db10094031a2ac92e59fff9f3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 23 Apr 2026 13:40:27 -0500
Subject: [PATCH 001/111] feat: agentic benchmark ingest + UI with offload-mode
 halo

Adds agentic_traces scenario end-to-end:
- Schema migrations for agentic scenario, availability, and KV offload mode
- DB ingest/ETL + query updates to carry scenario, offload_mode, and
  server/theoretical cache-hit rates through to the API layer
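- Frontend types, filters (GlobalFilterContext / InferenceContext /
  ChartControls), URL state, and tooltip rows for agentic-only fields
- ChartControls: "Scenario" selector replaces the sequence selector, plus a
  latency-percentile selector (median/p90/p99/p99.9, `i_pctl` URL param)
  shown only for agentic traces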
- ScatterGraph: subtle dashed halo on Pareto-frontier points that used
  KV offload so the tradeoff is visible at a glance
---
 packages/app/cypress/support/mock-data.ts     |   2 +
 .../app/src/app/api/unofficial-run/route.ts   |   2 +
 .../src/components/GlobalFilterContext.tsx    |  12 +-
 .../components/inference/InferenceContext.tsx |  15 ++-
 .../inference/hooks/useChartData.ts           |  34 +++--
 .../app/src/components/inference/types.ts     |  26 ++++
 .../components/inference/ui/ChartControls.tsx |  27 +++-
 .../components/inference/ui/ScatterGraph.tsx  |  21 +++
 .../inference/utils/tooltipUtils.ts           |  54 +++++++-
 .../app/src/components/ui/chart-selectors.tsx | 124 ++++++++++++++++++
 .../unofficial-run-provider.test.ts           |   2 +
 .../components/unofficial-run-provider.tsx    |   4 +-
 packages/app/src/lib/api.ts                   |  14 +-
 .../app/src/lib/benchmark-transform.test.ts   |   2 +
 packages/app/src/lib/benchmark-transform.ts   |  65 ++++++++-
 packages/app/src/lib/data-mappings.ts         |  72 +++++++++-
 packages/app/src/lib/url-state.ts             |   2 +
 packages/constants/src/models.ts              |  17 +++
 .../db/migrations/002_agentic_scenario.sql    |  30 +++++
 .../migrations/003_agentic_availability.sql   |  21 +++
 packages/db/migrations/004_offload_mode.sql   |  42 ++++++
 packages/db/src/etl/benchmark-ingest.ts       |  28 ++--
 packages/db/src/etl/benchmark-mapper.ts       |  45 ++++++-
 packages/db/src/ingest-ci-run.ts              |   6 +-
 packages/db/src/ingest-gcs-backup.ts          |   6 +-
 packages/db/src/ingest-supplemental.ts        |  14 +-
 packages/db/src/json-provider.ts              |   8 +-
 packages/db/src/queries/benchmarks.ts         |  13 +-
 packages/db/src/queries/workflow-info.ts      |  15 ++-
 29 files changed, 645 insertions(+), 78 deletions(-)
 create mode 100644 packages/db/migrations/002_agentic_scenario.sql
 create mode 100644 packages/db/migrations/003_agentic_availability.sql
 create mode 100644 packages/db/migrations/004_offload_mode.sql

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index e6720c0b..7a4f59a9 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,6 +189,8 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
+    selectedPercentile: 'median',
+    setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 79ac0665..dbfb9c33 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -49,6 +49,8 @@ export function normalizeArtifactRows(
       decode_num_workers: config.decodeNumWorkers,
       num_prefill_gpu: config.numPrefillGpu,
       num_decode_gpu: config.numDecodeGpu,
+      benchmark_type: params.benchmarkType,
+      offload_mode: params.offloadMode,
       isl: params.isl,
       osl: params.osl,
       conc: params.conc,
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 65f510cd..f603081a 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -11,7 +11,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import { useAvailability } from '@/hooks/api/use-availability';
 import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
@@ -172,11 +172,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   const availableSequences = useMemo(() => {
     if (!availabilityRows) return SEQUENCE_OPTIONS;
     const seqs = [
-      ...new Set(
-        modelRows
-          .map((r) => islOslToSequence(r.isl, r.osl))
-          .filter((s): s is Sequence => s !== null),
-      ),
+      ...new Set(modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null)),
     ];
     return seqs.length > 0 ? seqs : SEQUENCE_OPTIONS;
   }, [availabilityRows, modelRows]);
@@ -190,7 +186,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   // Precisions available for the selected model + sequence
   const availablePrecisions = useMemo(() => {
     if (!availabilityRows) return ['fp4'];
-    const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const precs = [...new Set(rows.map((r) => r.precision))].toSorted();
     return precs.length > 0 ? precs : ['fp4'];
   }, [availabilityRows, modelRows, effectiveSequence]);
@@ -205,7 +201,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   // Dates available for selected model + sequence + precisions
   const availableDates = useMemo(() => {
     if (!availabilityRows) return [];
-    const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision));
     if (rows.length === 0) {
       return [...new Set(seqRows.map((r) => r.date))].toSorted();
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 7fa416fd..6f45d8d7 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -11,7 +11,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import { FAVORITE_PRESETS, type FavoritePreset } from '@/components/favorites/favorite-presets';
 
@@ -110,6 +110,11 @@ export function InferenceProvider({
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || null,
   );
+  // Latency percentile applied to the chart x-axis for agentic scenarios.
+  // Values: 'median' | 'p90' | 'p99' | 'p99.9'. The selector is agentic-only.
+  const [selectedPercentile, setSelectedPercentile] = useState<string>(
+    () => getUrlParam('i_pctl') || 'median',
+  );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
   );
@@ -163,6 +168,7 @@ export function InferenceProvider({
     effectiveRunDate,
     isActive,
     latestDate,
+    selectedPercentile,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
@@ -176,7 +182,7 @@ export function InferenceProvider({
     if (!availabilityRows) return availableDates;
     const rows = availabilityRows.filter((r) => {
       if (!dbModelKeys.includes(r.model)) return false;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false;
+      if (rowToSequence(r) !== effectiveSequence) return false;
       if (!effectivePrecisions.includes(r.precision)) return false;
       if (!r.hardware) return false;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -201,7 +207,7 @@ export function InferenceProvider({
     const hwKeys = new Set<string>();
     for (const r of availabilityRows) {
       if (!dbModelKeys.includes(r.model)) continue;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue;
+      if (rowToSequence(r) !== effectiveSequence) continue;
       if (!effectivePrecisions.includes(r.precision)) continue;
       if (!r.hardware) continue;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -589,6 +595,7 @@ export function InferenceProvider({
   useUrlStateSync(
     {
       i_metric: selectedYAxisMetric,
+      i_pctl: selectedPercentile,
       i_gpus: selectedGPUs.join(','),
       i_dates: selectedDates.join(','),
       i_dstart: selectedDateRange.startDate,
@@ -783,6 +790,8 @@ export function InferenceProvider({
       workflowInfo,
       selectedYAxisMetric,
       setSelectedYAxisMetric: setSelectedYAxisMetricAndClear,
+      selectedPercentile,
+      setSelectedPercentile,
       selectedGPUs,
       setSelectedGPUs: setSelectedGPUsAndClear,
       availableGPUs,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 625e63ab..81ab0780 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -1,7 +1,7 @@
 import { useMemo, useRef } from 'react';
 
 import { useQueries } from '@tanstack/react-query';
-import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants';
+import { rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type {
@@ -15,7 +15,7 @@ import type {
 import { filterDataByCostLimit } from '@/components/inference/utils';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants';
-import { transformBenchmarkRows } from '@/lib/benchmark-transform';
+import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
 import type { Model, Sequence } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 
@@ -79,6 +79,7 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
+  selectedPercentile = 'median',
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
@@ -119,11 +120,13 @@ export function useChartData(
   // Merge main rows with comparison date rows.
   // Stamp each row with the *requested* date (not the actual DB date) so that
   // GPUGraph's activeDates filter (keyed by user-selected date) matches the points.
-  const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]);
+  //
+  // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via
+  // benchmark_type), so one filter covers every scenario.
   const rows = useMemo(() => {
-    if (!allRows || !sequenceIslOsl) return [];
-    const seqFilter = (r: { isl: number; osl: number }) =>
-      r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl;
+    if (!allRows) return [];
+    const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) =>
+      rowToSequence(r) === selectedSequence;
     const seqFiltered = allRows.filter(seqFilter);
 
     // For each (hw, framework, spec_method, disagg, precision) group, keep only
@@ -150,14 +153,14 @@ export function useChartData(
         .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })),
     );
     return [...mainRows, ...extraRows];
-  }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]);
+  }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]);
 
   // Transform filtered rows into chart data
   const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => {
     if (rows.length === 0)
       return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig };
-    return transformBenchmarkRows(rows);
-  }, [rows]);
+    return transformBenchmarkRows(rows, selectedPercentile);
+  }, [rows, selectedPercentile]);
 
   // Sort hardware config — stabilize reference when keys haven't changed.
   // Different sequences for the same model often have the same GPU configs,
@@ -192,8 +195,11 @@ export function useChartData(
       (chartDefinitions as ChartDefinition[]).map((chartDef) => {
         const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
 
-        // Determine dynamic x-axis
-        let xAxisField: keyof AggDataEntry = chartDef.x;
+        // Default x-axis = chart's natural latency metric, percentile-adjusted
+        // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic
+        // scenarios `withPercentile` is a no-op when percentile === 'median'.
+        const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry;
+        let xAxisField: keyof AggDataEntry = naturalX;
         let xAxisLabel = chartDef.x_label;
 
         const metricTitle =
@@ -232,8 +238,10 @@ export function useChartData(
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
         // so no roofline flip is needed for the e2e chart.
+        // Compare against `naturalX` (percentile-adjusted) — switching the
+        // percentile of the same logical metric is NOT a flip.
         const xAxisFlipped =
-          xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride);
+          xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride);
 
         const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition;
         const dynamicYLabel = chartDef[yLabelKey];
@@ -261,7 +269,7 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric],
+    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index a23707ba..53c8d84c 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -88,6 +88,29 @@ export interface AggDataEntry {
   actualDate?: string;
   /** URL to the GitHub Actions workflow run that produced this data point. */
   run_url?: string;
+  /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */
+  benchmark_type?: string;
+  /** ISL in tokens — null for agentic_traces. */
+  isl?: number | null;
+  /** OSL in tokens — null for agentic_traces. */
+  osl?: number | null;
+  // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ──
+  /** "on" | "off" — whether KV cache offload to CPU was enabled. */
+  offload_mode?: string;
+  /** Actual server-observed GPU prefix-cache hit rate (0..1). */
+  server_gpu_cache_hit_rate?: number;
+  /** Actual server-observed CPU prefix-cache hit rate (0..1). */
+  server_cpu_cache_hit_rate?: number;
+  /** Infinite-cache theoretical hit rate (0..1) computed from trace. */
+  theoretical_cache_hit_rate?: number;
+  /** Total requests attempted during the window. */
+  num_requests_total?: number;
+  /** Requests that completed successfully. */
+  num_requests_successful?: number;
+  /** Total prompt tokens served. */
+  total_prompt_tokens?: number;
+  /** Total generated (output) tokens. */
+  total_generation_tokens?: number;
 }
 
 /**
@@ -468,6 +491,9 @@ export interface InferenceChartContextType {
   workflowInfo: any;
   selectedYAxisMetric: string;
   setSelectedYAxisMetric: (metric: string) => void;
+  /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */
+  selectedPercentile: string;
+  setSelectedPercentile: (p: string) => void;
   selectedXAxisMetric: string | null;
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 5f8e7787..e4f55ad7 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -1,11 +1,14 @@
 'use client';
 
+import { useEffect, useState } from 'react';
+
 import { track } from '@/lib/analytics';
 
 import { useInference } from '@/components/inference/InferenceContext';
 import {
   ModelSelector,
-  SequenceSelector,
+  ScenarioSelector,
+  PercentileSelector,
   PrecisionSelector,
 } from '@/components/ui/chart-selectors';
 import { DateRangePicker } from '@/components/ui/date-range-picker';
@@ -23,7 +26,7 @@ import {
 import { TooltipProvider } from '@/components/ui/tooltip';
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type { ChartDefinition } from '@/components/inference/types';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model, type Percentile } from '@/lib/data-mappings';
 
 // Build Y-axis metric options from static chart config JSON — available immediately, no API wait
 const METRIC_GROUPS = [
@@ -78,6 +81,13 @@ interface ChartControlsProps {
 }
 
 export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) {
+  // The percentile selector is rendered conditionally on `selectedSequence`,
+  // which on the client is hydrated from URL params. SSR doesn't see the URL,
+  // so deferring the conditional until after mount keeps the initial DOM
+  // identical between server and client (avoids hydration warnings).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const {
     selectedModel,
     setSelectedModel,
@@ -87,6 +97,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
     setSelectedPrecisions,
     selectedYAxisMetric,
     setSelectedYAxisMetric,
+    selectedPercentile,
+    setSelectedPercentile,
     graphs,
     selectedGPUs,
     setSelectedGPUs,
@@ -203,12 +215,19 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
             availableModels={availableModels}
             data-testid="model-selector"
           />
-          <SequenceSelector
+          <ScenarioSelector
             value={selectedSequence}
             onChange={handleSequenceChange}
             availableSequences={availableSequences}
-            data-testid="sequence-selector"
+            data-testid="scenario-selector"
           />
+          {mounted && selectedSequence === Sequence.AgenticTraces && (
+            <PercentileSelector
+              value={selectedPercentile}
+              onChange={(p: Percentile) => setSelectedPercentile(p)}
+              data-testid="percentile-selector"
+            />
+          )}
           <PrecisionSelector
             value={selectedPrecisions}
             onChange={handlePrecisionChange}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 2e078f89..15bb60f0 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -1512,6 +1512,24 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
+        // Offload halo: dashed ring on frontier points that used KV offload
+        zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
+          const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`);
+          const showHalo = onFrontier && d.offload_mode === 'on';
+          d3.select(this)
+            .selectAll<SVGCircleElement, boolean>('.offload-halo')
+            .data(showHalo ? [true] : [])
+            .join('circle')
+            .attr('class', 'offload-halo')
+            .attr('r', POINT_SIZE + 4)
+            .attr('fill', 'none')
+            .attr('stroke', 'var(--foreground)')
+            .attr('stroke-width', 1.5)
+            .attr('stroke-dasharray', '3 2')
+            .attr('opacity', 0.9)
+            .attr('pointer-events', 'none');
+        });
+
         // Double-click to track/untrack
         zoomGroup
           .selectAll<SVGGElement, InferenceData>('.dot-group')
@@ -1567,6 +1585,9 @@ const ScatterGraph = React.memo(
         chartDefinition.chartType,
         xScaleConfig._isLog,
         yScaleConfig.type,
+        optimalPointKeys,
+        getCssColor,
+        resolveColor,
       ],
     );
 
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index e88e9930..7391225e 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -88,6 +88,51 @@ const runLinkHTML = (runUrl?: string) =>
 const tooltipLine = (label: string, value: string | number) =>
   `<div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;"><strong>${label}:</strong> ${value}</div>`;
 
+const formatPct = (v: number | undefined): string | null =>
+  v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
+
+/**
+ * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
+ * success, token totals. Returns an empty string for non-agentic rows.
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+  if (d.benchmark_type !== 'agentic_traces') return '';
+
+  const parts: string[] = [];
+  if (d.offload_mode) {
+    parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+  }
+
+  const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+  const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+  const theoHit = formatPct(d.theoretical_cache_hit_rate);
+  if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+  if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+  if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+
+  if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) {
+    const successPct =
+      d.num_requests_total > 0
+        ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)`
+        : '';
+    parts.push(
+      tooltipLine(
+        'Requests',
+        `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`,
+      ),
+    );
+  }
+
+  if (d.total_prompt_tokens !== undefined) {
+    parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+  }
+  if (d.total_generation_tokens !== undefined) {
+    parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+  }
+
+  return parts.join('');
+};
+
 /**
  * Generates HTML for the parallelism configuration section of a tooltip.
  * Falls back to GPU count for old data without parallelism fields.
@@ -177,9 +222,10 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
       ${
         isPinned
@@ -231,9 +277,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
     </div>
   `;
 };
@@ -292,9 +339,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
     </div>
   `;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 75e2f257..1c843e12 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -19,12 +19,16 @@ import {
   type Model,
   type Precision,
   type Sequence,
+  type Percentile,
+  PERCENTILE_OPTIONS,
   getModelCategory,
   getModelLabel,
+  getPercentileLabel,
   getPrecisionLabel,
   getSequenceCategory,
   getSequenceLabel,
   groupByCategory,
+  sequenceKind,
 } from '@/lib/data-mappings';
 
 function DeprecatedLabel({ reason }: { reason: string }) {
@@ -167,6 +171,126 @@ export function SequenceSelector({
   );
 }
 
+interface ScenarioSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Sequence) => void;
+  availableSequences: string[];
+  'data-testid'?: string;
+}
+
+/**
+ * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length",
+ * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL
+ * framing only applies to the fixed-seq subset).
+ */
+export function ScenarioSelector({
+  id = 'scenario-select',
+  value,
+  onChange,
+  availableSequences,
+  'data-testid': testId,
+}: ScenarioSelectorProps) {
+  const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq');
+  const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic');
+  const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence));
+
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Scenario"
+        tooltip="Benchmark scenario. Fixed Sequence Length runs use a defined input/output token count (ISL/OSL). Agentic Traces replay real agentic workloads with variable inputs/outputs."
+      />
+      <Select
+        value={value}
+        onValueChange={(v) => {
+          track('selector_scenario_changed', { scenario: v });
+          onChange(v as Sequence);
+        }}
+      >
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {fixedSeq.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Fixed Sequence Length</SelectLabel>
+              {fixedGroups.default.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+              {fixedGroups.deprecated.length > 0 && (
+                <>
+                  <DeprecatedLabel reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  {fixedGroups.deprecated.map((seq) => (
+                    <SelectItem key={seq} value={seq}>
+                      {getSequenceLabel(seq as Sequence)}
+                    </SelectItem>
+                  ))}
+                </>
+              )}
+            </SelectGroup>
+          )}
+          {agentic.map((seq) => (
+            <SelectItem key={seq} value={seq}>
+              {getSequenceLabel(seq as Sequence)}
+            </SelectItem>
+          ))}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
+interface PercentileSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Percentile) => void;
+  'data-testid'?: string;
+}
+
+/**
+ * Latency percentile selector for agentic-trace charts. The selected value
+ * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so
+ * picking p99 plots p99 e2e latency / interactivity instead of the median.
+ */
+export function PercentileSelector({
+  id = 'percentile-select',
+  value,
+  onChange,
+  'data-testid': testId,
+}: PercentileSelectorProps) {
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Latency Percentile"
+        tooltip="Percentile of the latency distribution used for the chart x-axis. Agentic runs carry median/p90/p99/p99.9 variants; switch percentiles to see tail-latency behavior."
+      />
+      <Select
+        value={value}
+        onValueChange={(v) => {
+          track('selector_percentile_changed', { percentile: v });
+          onChange(v as Percentile);
+        }}
+      >
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {PERCENTILE_OPTIONS.map((p) => (
+            <SelectItem key={p} value={p}>
+              {getPercentileLabel(p)}
+            </SelectItem>
+          ))}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
 interface PrecisionSelectorProps {
   id?: string;
   value: string[];
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index f4263d2c..05b522c5 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -29,6 +29,8 @@ function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 128,
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index 2dccdf7f..42530a51 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -12,7 +12,7 @@ import {
 
 import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types';
 import { UnofficialBanner } from '@/components/ui/unofficial-banner';
-import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import type { BenchmarkRow, EvalRow } from '@/lib/api';
 import { normalizeEvalHardwareKey } from '@/lib/chart-utils';
@@ -93,7 +93,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData
   const groups = new Map<string, BenchmarkRow[]>();
   for (const row of benchmarks) {
     const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? row.model;
-    const sequence = islOslToSequence(row.isl, row.osl);
+    const sequence = rowToSequence(row);
     if (!sequence) continue;
     const key = `${displayModel}_${sequence}`;
     if (!groups.has(key)) groups.set(key, []);
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 11ba4521..240251c3 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -23,9 +23,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   date: string;
@@ -140,13 +144,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index be76438e..6a6c97c8 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -23,6 +23,8 @@ function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 64,
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 107f0b12..69745da2 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -15,9 +15,39 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
 import { getHardwareConfig } from '@/lib/constants';
 import type { BenchmarkRow } from '@/lib/api';
 
+/**
+ * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl
+ * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here:
+ *   e2el   ≡ ttlt   (time-to-last-token == end-to-end latency)
+ *   tpot   ≡ itl    (time-per-output-token == inter-token-latency for single-output)
+ *   intvty ≡ 1/itl  (tok/s from the user's perspective; only derived when itl > 0)
+ * Covers every stat prefix incl. `p99.9`; returns only the gaps, never overrides `m`.
+ */
+function agenticAliases(m: Record<string, number>): Record<string, number> {
+  const out: Record<string, number> = {};
+  for (const prefix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
+    const itl = m[`${prefix}_itl`];
+    const ttlt = m[`${prefix}_ttlt`];
+    if (m[`${prefix}_e2el`] === undefined && ttlt !== undefined) out[`${prefix}_e2el`] = ttlt;
+    if (m[`${prefix}_tpot`] === undefined && itl !== undefined) out[`${prefix}_tpot`] = itl;
+    if (m[`${prefix}_intvty`] === undefined && itl !== undefined && itl > 0) {
+      out[`${prefix}_intvty`] = 1 / itl;
+    }
+  }
+  return out;
+}
+
 /** Convert a DB benchmark row to an AggDataEntry. */
 export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
-  const m = row.metrics;
+  const isAgentic = row.benchmark_type === 'agentic_traces';
+  const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics;
+  // Prefer the dedicated column (added in migration 004); fall back to the
+  // legacy stash inside `metrics` for any rows ingested before that column
+  // existed.
+  const rawMetrics = row.metrics as Record<string, unknown>;
+  const offloadMode =
+    row.offload_mode ??
+    (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
     hw: row.hardware,
     framework: row.framework,
@@ -68,6 +98,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     date: row.date,
     actualDate: (row as any).actualDate ?? row.date,
     run_url: row.run_url ?? undefined,
+    benchmark_type: row.benchmark_type,
+    isl: row.isl,
+    osl: row.osl,
+    offload_mode: offloadMode,
+    server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate,
+    server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate,
+    theoretical_cache_hit_rate: m.theoretical_cache_hit_rate,
+    num_requests_total: m.num_requests_total,
+    num_requests_successful: m.num_requests_successful,
+    total_prompt_tokens: m.total_prompt_tokens,
+    total_generation_tokens: m.total_generation_tokens,
   };
 }
 
@@ -77,13 +118,30 @@ interface PreparedEntry {
   date: string;
 }
 
+/**
+ * Swap the leading latency-percentile prefix of a chart x-axis key for
+ * `percentile` (e.g. `median_e2el` → `p99_e2el`). Keys without a known prefix pass through.
+ */
+export function withPercentile(key: string, percentile: string): string {
+  const swapped = `${percentile}_`;
+  return key.replace(/^(mean|median|p90|p99|p99\.9)_/, swapped);
+}
+
 /**
  * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
  * Returns one InferenceData[] per chart definition (e2e, interactivity).
  *
  * Converts rows to AggDataEntry once, then reuses for each chart definition.
+ *
+ * @param percentile Optional latency percentile for the chart x-axis
+ *   (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart
+ *   definition for the chosen percentile — only agentic rows carry the
+ *   full set (median/p90/p99/p99.9) so this mainly affects that scenario.
  */
-export function transformBenchmarkRows(rows: BenchmarkRow[]): {
+export function transformBenchmarkRows(
+  rows: BenchmarkRow[],
+  percentile = 'median',
+): {
   chartData: InferenceData[][];
   hardwareConfig: HardwareConfig;
 } {
@@ -109,13 +167,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): {
 
   // Phase 2: Build chart data per chart definition (reusing prepared entries)
   const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => {
+    const xKey = withPercentile(chartDef.x, percentile);
     const groupedByHw: Record<string, InferenceData[]> = {};
 
     for (const { entry, hwKey, date } of prepared) {
       const dataPoint = createChartDataPoint(
         date,
         entry,
-        chartDef.x as keyof AggDataEntry,
+        xKey as keyof AggDataEntry,
         chartDef.y as keyof AggDataEntry,
         hwKey,
       );
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 823b6823..8900f50e 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -102,17 +102,77 @@ export enum Sequence {
   OneK_OneK = '1k/1k',
   OneK_EightK = '1k/8k',
   EightK_OneK = '8k/1k',
+  AgenticTraces = 'agentic-traces',
 }
 
-const SEQUENCE_CONFIG: Record<Sequence, { label: string; compact: string; category: CategoryTag }> =
-  {
-    [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' },
-    [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' },
-    [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' },
-  };
+/**
+ * Top-level scenario kind: fixed-seq sequences share one selector group, agentic is its own.
+ */
+export type ScenarioKind = 'fixed-seq' | 'agentic';
+
+/** Classify a sequence: `AgenticTraces` → 'agentic'; every other member → 'fixed-seq'. */
+export function sequenceKind(seq: Sequence): ScenarioKind {
+  return seq === Sequence.AgenticTraces ? 'agentic' : 'fixed-seq';
+}
+
+const SEQUENCE_CONFIG: Record<
+  Sequence,
+  { label: string; compact: string; category: CategoryTag; kind: ScenarioKind }
+> = {
+  [Sequence.OneK_OneK]: {
+    label: '1K / 1K',
+    compact: '1k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.OneK_EightK]: {
+    label: '1K / 8K',
+    compact: '1k8k',
+    category: 'deprecated',
+    kind: 'fixed-seq',
+  },
+  [Sequence.EightK_OneK]: {
+    label: '8K / 1K',
+    compact: '8k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.AgenticTraces]: {
+    label: 'Agentic Traces',
+    compact: 'agentic',
+    category: 'default',
+    kind: 'agentic',
+  },
+};
 
 export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
+/**
+ * Latency percentile plotted on the chart x-axis for agentic traces, whose rows carry
+ * median/p90/p99/p99.9 variants of ttft, ttlt (=e2el) and itl (intvty derives from itl).
+ */
+export enum Percentile {
+  Median = 'median',
+  P90 = 'p90',
+  P99 = 'p99',
+  P99_9 = 'p99.9',
+}
+
+/** Display label per percentile; key order drives `PERCENTILE_OPTIONS`. */
+const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.Median]: { label: 'p50 (median)' },
+  [Percentile.P90]: { label: 'p90' },
+  [Percentile.P99]: { label: 'p99' },
+  [Percentile.P99_9]: { label: 'p99.9' },
+};
+
+export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
+
+/** Label for `p`, falling back to the raw value when it has no config entry. */
+export function getPercentileLabel(p: Percentile): string {
+  return PERCENTILE_CONFIG[p]?.label ?? p;
+}
+
 export const DEPRECATED_SEQUENCES: ReadonlySet<Sequence> = new Set(
   (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][])
     .filter(([, c]) => c.category === 'deprecated')
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 3947488f..fb2e9d70 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -22,6 +22,7 @@ const URL_STATE_KEYS = [
   'i_seq',
   'i_prec',
   'i_metric',
+  'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
   'i_scale',
@@ -61,6 +62,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_seq: '8k/1k',
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
+  i_pctl: 'median',
   i_xmetric: 'p99_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',
diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts
index 6d646f08..d9a3d2d1 100644
--- a/packages/constants/src/models.ts
+++ b/packages/constants/src/models.ts
@@ -53,3 +53,20 @@ export function islOslToSequence(isl: number, osl: number): string | null {
   };
   return map[`${isl}_${osl}`] ?? null;
 }
+
+/**
+ * Resolve the sequence (scenario) string for a benchmark/availability row.
+ * - `benchmark_type === 'agentic_traces'` always yields `'agentic-traces'`;
+ *   isl/osl are not consulted.
+ * - Any other type defers to `islOslToSequence`, which needs both isl and osl.
+ * Yields `null` when isl/osl is missing or the pair has no mapped sequence.
+ */
+export function rowToSequence(row: {
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+}): string | null {
+  const { benchmark_type: benchmarkType, isl, osl } = row;
+  if (benchmarkType === 'agentic_traces') return 'agentic-traces';
+  return isl !== null && osl !== null ? islOslToSequence(isl, osl) : null;
+}
diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql
new file mode 100644
index 00000000..c143914e
--- /dev/null
+++ b/packages/db/migrations/002_agentic_scenario.sql
@@ -0,0 +1,30 @@
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+--   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+--   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+  alter column isl drop not null,
+  alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+  drop constraint benchmark_results_isl_positive,
+  drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat NULL isl/osl as equal (NULLS NOT DISTINCT, PostgreSQL 15+)
+--    so agentic rows can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql
new file mode 100644
index 00000000..e96cbd50
--- /dev/null
+++ b/packages/db/migrations/003_agentic_availability.sql
@@ -0,0 +1,21 @@
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date. Existing rows are
+-- backfilled as 'single_turn' by the column default.
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint (PostgreSQL 15+):
+-- same uniqueness, but isl/osl may be NULL for agentic rows.
+
+alter table availability
+  drop constraint availability_pkey;
+
+alter table availability
+  alter column isl drop not null,
+  alter column osl drop not null,
+  add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+  add constraint availability_natural_key unique nulls not distinct
+    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql
new file mode 100644
index 00000000..24b617f1
--- /dev/null
+++ b/packages/db/migrations/004_offload_mode.sql
@@ -0,0 +1,42 @@
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+  add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value already living in metrics
+-- JSONB; a JSON null or empty value keeps 'off' instead of violating NOT NULL.
+update benchmark_results
+   set offload_mode = coalesce(nullif(metrics->>'offload_mode', ''), 'off')
+ where benchmark_type = 'agentic_traces'
+   and metrics ? 'offload_mode';
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild latest_benchmarks keyed like the table's unique key (minus run id).
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.benchmark_type, br.conc, br.isl, br.osl, br.offload_mode)
+  br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.benchmark_type, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, benchmark_type, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index 67173c64..ea802d3f 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows(
 
   // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears
   // more than once in a single batch. Deduplicate within the batch, keeping
-  // the last occurrence (last metrics for each unique config/isl/osl/conc).
+  // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode).
   const seen = new Map<string, BenchmarkParams & { configId: number }>();
-  for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r);
+  for (const r of rows) {
+    seen.set(
+      `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? ''}-${r.conc}-${r.offloadMode}`,
+      r,
+    );
+  }
   const deduped = [...seen.values()];
 
   const configIds = deduped.map((r) => r.configId);
+  const benchmarkTypes = deduped.map((r) => r.benchmarkType);
+  const offloadModes = deduped.map((r) => r.offloadMode);
   const isls = deduped.map((r) => r.isl);
   const osls = deduped.map((r) => r.osl);
   const concs = deduped.map((r) => r.conc);
@@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows(
 
   const result = await sql<{ inserted: boolean; id: number }[]>`
     insert into benchmark_results (
-      workflow_run_id, config_id, benchmark_type, date,
+      workflow_run_id, config_id, benchmark_type, offload_mode, date,
       isl, osl, conc, image, metrics
     )
     select
       ${workflowRunId},
       unnest(${sql.array(configIds)}::int[]),
-      'single_turn',
+      unnest(${sql.array(benchmarkTypes)}::text[]),
+      unnest(${sql.array(offloadModes)}::text[]),
       ${date}::date,
       unnest(${sql.array(isls)}::int[]),
       unnest(${sql.array(osls)}::int[]),
       unnest(${sql.array(concs)}::int[]),
       unnest(${sql.array(images)}),
       unnest(${sql.array(metricsJsons)}::jsonb[])
-    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc)
+    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
     do update set
       metrics = excluded.metrics,
       image = excluded.image
@@ -147,13 +155,14 @@ export async function bulkUpsertAvailability(
   sql: Sql,
   rows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[],
   date: string,
 ): Promise<void> {
@@ -162,7 +171,7 @@ export async function bulkUpsertAvailability(
   const seen = new Set<string>();
   const unique: typeof rows = [];
   for (const r of rows) {
-    const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`;
+    const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? ''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`;
     if (!seen.has(key)) {
       seen.add(key);
       unique.push(r);
@@ -170,7 +179,7 @@ export async function bulkUpsertAvailability(
   }
 
   await sql`
-    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date)
+    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date)
     select
       unnest(${sql.array(unique.map((r) => r.model))}::text[]),
       unnest(${sql.array(unique.map((r) => r.isl))}::int[]),
@@ -180,6 +189,7 @@ export async function bulkUpsertAvailability(
       unnest(${sql.array(unique.map((r) => r.framework))}::text[]),
       unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]),
       unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]),
+      unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]),
       ${date}::date
     on conflict do nothing
   `;
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 7d78e175..5b120843 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([
   'decode_num_workers',
   'num_prefill_gpu',
   'num_decode_gpu',
+  // agentic scenario
+  'scenario_type',
+  'users',
+  'offload_mode',
+  'num_requests_total',
+  'num_requests_successful',
 ]);
 
+/**
+ * `benchmark_type` values understood by the ingest.
+ * - `single_turn`    — fixed sequence-length runs (isl/osl set).
+ * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc).
+ */
+export type BenchmarkType = 'single_turn' | 'agentic_traces';
+
 /**
  * METRIC_KEYS from constants is the canonical set of known metric keys.
  * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set<string>();
 
 export interface BenchmarkParams {
   config: ConfigParams;
-  isl: number;
-  osl: number;
+  benchmarkType: BenchmarkType;
+  // Null for agentic_traces; present for single_turn.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */
+  offloadMode: string;
   image: string | null;
   metrics: Record<string, number>;
 }
@@ -114,10 +131,15 @@ export function mapBenchmarkRow(
     return null;
   }
 
-  const isl = parseInt2(row.isl) ?? islOslFallback?.isl;
-  const osl = parseInt2(row.osl) ?? islOslFallback?.osl;
-  const conc = parseInt2(row.conc);
-  if (!isl || !osl || !conc) {
+  // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants),
+  // no isl/osl, and `users` instead of `conc`. Everything else stays as-is.
+  const isAgentic = String(row.scenario_type ?? '').startsWith('agentic');
+  const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn';
+
+  const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
+  const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
+  const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc);
+  if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
@@ -182,6 +204,12 @@ export function mapBenchmarkRow(
     }
   }
 
+  // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it
+  // as a stringified metric so the frontend can expose it in tooltips.
+  if (isAgentic && typeof row.offload_mode === 'string') {
+    (metrics as Record<string, unknown>).offload_mode = row.offload_mode;
+  }
+
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
   const image = row.image ? String(row.image).replaceAll('#', '/') : null;
 
@@ -205,9 +233,14 @@ export function mapBenchmarkRow(
       numPrefillGpu,
       numDecodeGpu,
     },
+    benchmarkType,
     isl,
     osl,
     conc,
+    offloadMode:
+      typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+        ? row.offload_mode
+        : 'off',
     image,
     metrics,
   };
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 14c7b4d0..8cce43ca 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -248,13 +248,14 @@ async function main(): Promise<void> {
 
   const availRows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[] = [];
 
   let totalNewBmk = 0,
@@ -367,6 +368,7 @@ async function main(): Promise<void> {
               framework: r.config.framework,
               specMethod: r.config.specMethod,
               disagg: r.config.disagg,
+              benchmarkType: r.benchmarkType,
             });
           }
 
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index e20278d6..6dc604e9 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -596,13 +596,14 @@ async function main(): Promise<void> {
     // Upsert availability rows only for successfully resolved configs
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const r of allInserted) {
       availRows.push({
@@ -614,6 +615,7 @@ async function main(): Promise<void> {
         framework: r.config.framework,
         specMethod: r.config.specMethod,
         disagg: r.config.disagg,
+        benchmarkType: r.benchmarkType,
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts
index 1e494e9f..43aae047 100644
--- a/packages/db/src/ingest-supplemental.ts
+++ b/packages/db/src/ingest-supplemental.ts
@@ -219,8 +219,10 @@ async function ingestSupplementalBmk(
 
     const rows: {
       configId: number;
-      isl: number;
-      osl: number;
+      benchmarkType: 'single_turn' | 'agentic_traces';
+      offloadMode: string;
+      isl: number | null;
+      osl: number | null;
       conc: number;
       image: string | null;
       metrics: Record<string, number>;
@@ -271,6 +273,8 @@ async function ingestSupplementalBmk(
 
       rows.push({
         configId,
+        benchmarkType: 'single_turn',
+        offloadMode: 'off',
         isl: entry.isl,
         osl: entry.osl,
         conc: entry.conc,
@@ -294,13 +298,14 @@ async function ingestSupplementalBmk(
     // to `rows` are exactly the valid ones.
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const entry of entries) {
       const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined });
@@ -317,6 +322,7 @@ async function ingestSupplementalBmk(
         framework,
         specMethod,
         disagg,
+        benchmarkType: 'single_turn',
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 0d9373d3..f09a2686 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -290,6 +290,8 @@ function toBenchmarkRow(
     decode_num_workers: c.decode_num_workers,
     num_prefill_gpu: c.num_prefill_gpu,
     num_decode_gpu: c.num_decode_gpu,
+    benchmark_type: br.benchmark_type ?? 'single_turn',
+    offload_mode: (br as { offload_mode?: string }).offload_mode ?? 'off',
     isl: br.isl,
     osl: br.osl,
     conc: br.conc,
@@ -410,7 +412,11 @@ export function getAvailabilityData(): AvailabilityRow[] {
   for (const a of s.availability) {
     const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`;
     if (validKeys.has(key)) {
-      rows.push({ ...a, date: toDateString(a.date) });
+      rows.push({
+        ...a,
+        benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn',
+        date: toDateString(a.date),
+      });
     }
   }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 1c30b1fd..74e20380 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -18,9 +18,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces; numeric for single_turn fixed-seq runs.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   date: string;
@@ -68,6 +72,8 @@ export async function getLatestBenchmarks(
         c.decode_num_workers,
         c.num_prefill_gpu,
         c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
         br.isl,
         br.osl,
         br.conc,
@@ -106,6 +112,8 @@ export async function getLatestBenchmarks(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      lb.benchmark_type,
+      lb.offload_mode,
       lb.isl,
       lb.osl,
       lb.conc,
@@ -153,6 +161,7 @@ export async function getAllBenchmarksForHistory(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
       br.isl,
       br.osl,
       br.conc,
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts
index b4e4f255..d5e2d933 100644
--- a/packages/db/src/queries/workflow-info.ts
+++ b/packages/db/src/queries/workflow-info.ts
@@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise<DateC
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
-/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, date) combos for the availability API. */
+/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, benchmark_type, date) combos for the availability API. */
 export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRow[]> {
   const rows = await sql`
-    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text
+    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text
     FROM availability a
     WHERE EXISTS (
       SELECT 1
@@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRo
         AND c.hardware = a.hardware
         AND c.framework = a.framework
         AND c.precision = a.precision
-        AND br.isl = a.isl
-        AND br.osl = a.osl
+        AND br.isl IS NOT DISTINCT FROM a.isl
+        AND br.osl IS NOT DISTINCT FROM a.osl
+        AND br.benchmark_type = a.benchmark_type
         AND br.date = a.date
         AND br.error IS NULL
         AND wr.conclusion IS NOT NULL

From 9c43a762cdaf9edd0091ef9d3034d4a974071c6d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 30 Apr 2026 19:01:56 -0500
Subject: [PATCH 002/111] =?UTF-8?q?fix:=20agentic=20offload=20variants=20?=
 =?UTF-8?q?=E2=80=94=20render=20both=20halos=20+=20map=20renamed=20fields?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ScatterGraph: include `offload_mode` in `buildPointConfigId` so d3's data
  join keeps both `on` and `off` variants for the same (config, conc).
  Without it, the second variant collapsed onto the first key, so FP8
  offload-on points (and their halos) silently disappeared.
- benchmark-mapper: handle older artifacts that emit `users`/`offload_mode`
  AND newer ones that emit `conc`/`offloading` (with 'none' → 'off' mapping).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  |  4 +++
 packages/db/src/etl/benchmark-mapper.ts       | 27 ++++++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 15bb60f0..55a206ce 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -295,6 +295,10 @@ const ScatterGraph = React.memo(
     const buildPointConfigId = useCallback((point: InferenceData): string => {
       let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`;
       if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`;
+      // Agentic runs emit two rows per (config, conc) — one offload=on, one off.
+      // Without this suffix, d3's data join treats them as the same point and
+      // drops one variant (along with its halo).
+      if (point.offload_mode) key += `|offload-${point.offload_mode}`;
       return key;
     }, []);
 
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 5b120843..d842276e 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -138,12 +138,24 @@ export function mapBenchmarkRow(
 
   const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
   const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
-  const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc);
+  // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones.
+  const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc);
   if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
 
+  // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
+  // ('none' → 'off'; any other non-empty value → 'on').
+  const offloadModeRaw =
+    typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+      ? row.offload_mode
+      : typeof row.offloading === 'string' && row.offloading.length > 0
+        ? row.offloading === 'none'
+          ? 'off'
+          : 'on'
+        : 'off';
+
   const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
   const isMultinode = parseBool(row.is_multinode);
   const precision = normalizePrecision(String(row.precision ?? ''));
@@ -204,10 +216,10 @@ export function mapBenchmarkRow(
     }
   }
 
-  // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it
-  // as a stringified metric so the frontend can expose it in tooltips.
-  if (isAgentic && typeof row.offload_mode === 'string') {
-    (metrics as Record<string, unknown>).offload_mode = row.offload_mode;
+  // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
+  // — preserve as a stringified metric so the frontend can expose it in tooltips.
+  if (isAgentic) {
+    (metrics as Record<string, unknown>).offload_mode = offloadModeRaw;
   }
 
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
@@ -237,10 +249,7 @@ export function mapBenchmarkRow(
     isl,
     osl,
     conc,
-    offloadMode:
-      typeof row.offload_mode === 'string' && row.offload_mode.length > 0
-        ? row.offload_mode
-        : 'off',
+    offloadMode: offloadModeRaw,
     image,
     metrics,
   };

From 07ba10636dae87b5a819afa524d7c10322fae41b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 00:29:55 -0500
Subject: [PATCH 003/111] fix: render offload halo on every offload-on point,
 not just frontier

The halo's purpose is to surface KV-offload usage; restricting it to
Pareto-frontier-only points hid the indicator on most runs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/ScatterGraph.tsx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 55a206ce..61ac0983 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -1516,10 +1516,9 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
-        // Offload halo: dashed ring on frontier points that used KV offload
+        // Offload halo: dashed ring on every point that used KV offload (Pareto or not)
         zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
-          const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`);
-          const showHalo = onFrontier && d.offload_mode === 'on';
+          const showHalo = d.offload_mode === 'on';
           d3.select(this)
             .selectAll<SVGCircleElement, boolean>('.offload-halo')
             .data(showHalo ? [true] : [])

From 95e9dc77431adf5354ef0df36989816199624383 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 01:13:42 -0500
Subject: [PATCH 004/111] fix: strip runner-pool suffix (-p1, -p2, ...) from hw
 identifier

b300-p1 (and similar) artifacts were skipped during ingest: the runner-pool
suffix wasn't in the strip list, so the hw identifier didn't normalize to the
canonical b300 GPU key.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/normalizers.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index ad12a454..bd497f7a 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null {
     .replace(/-dgxc-slurm$/, '')
     .replace(/-dgxc$/, '')
     .replace(/-nb$/, '')
-    .replace(/-nv$/, '');
+    .replace(/-nv$/, '')
+    .replace(/-p\d+$/, ''); // strip runner-pool suffix (e.g. b300-p1 → b300)
   return GPU_KEYS.has(base) ? base : null;
 }
 

From 982106da5f4421983841304f0503b6467033852d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:25:33 -0500
Subject: [PATCH 005/111] feat: bold scatter labels with concurrency tag +
 collision avoidance

- Label text now includes `C=<conc>` alongside the GPU/parallelism tag
  (default `<tp> C=<conc>`, advanced `<getPointLabel> C=<conc>`)
- Bumped point-label font-weight to 700 so the labels read clearly against
  the chart fill
- Greedy collision-avoidance pass on render and zoom: tries placing each
  label above/below the point through 4 candidate dy offsets, hiding the
  label only when no slot is free

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 68 ++++++++++++++++++-
 .../src/lib/d3-chart/layers/scatter-points.ts |  1 +
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 61ac0983..3fbd8588 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -55,6 +55,63 @@ import {
   buildGradientColorMap,
 } from '@/components/inference/utils/paretoLabels';
 
+// Greedy label-collision avoidance: try positions above/below the point;
+// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom.
+function avoidLabelCollisions(
+  zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
+): void {
+  const labels: {
+    el: SVGTextElement;
+    cx: number;
+    cy: number;
+    w: number;
+    h: number;
+  }[] = [];
+  zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
+    const labelEl = this.querySelector<SVGTextElement>('.point-label');
+    if (!labelEl) return;
+    if ((this as SVGGElement).style.opacity === '0') return;
+    const transform = (this as SVGGElement).getAttribute('transform') ?? '';
+    const m = transform.match(/translate\(([^,]+),([^)]+)\)/);
+    if (!m) return;
+    const cx = parseFloat(m[1]);
+    const cy = parseFloat(m[2]);
+    labelEl.setAttribute('dy', '-8');
+    labelEl.style.opacity = '1';
+    const bbox = labelEl.getBBox();
+    labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height });
+  });
+  labels.sort((a, b) => a.cx - b.cx);
+  const placed: { left: number; right: number; top: number; bottom: number }[] = [];
+  const pad = 1;
+  const candidates = [-8, 14, -22, 28];
+  for (const lab of labels) {
+    let chosenDy: number | null = null;
+    let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
+    for (const dy of candidates) {
+      const top = lab.cy + dy - lab.h - pad;
+      const bottom = lab.cy + dy + pad;
+      const left = lab.cx - lab.w / 2 - pad;
+      const right = lab.cx + lab.w / 2 + pad;
+      const collides = placed.some(
+        (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
+      );
+      if (!collides) {
+        chosenDy = dy;
+        chosenBox = { left, right, top, bottom };
+        break;
+      }
+    }
+    if (chosenDy !== null && chosenBox) {
+      lab.el.setAttribute('dy', String(chosenDy));
+      lab.el.style.opacity = '1';
+      placed.push(chosenBox);
+    } else {
+      lab.el.style.opacity = '0';
+    }
+  }
+}
+
 // X-shape path for overlay (unofficial) data points
 const X_SIZE = 5;
 const X_HOVER_SIZE = 7;
@@ -603,6 +660,7 @@ const ScatterGraph = React.memo(
               d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any,
             );
           }
+          avoidLabelCollisions(ctx.layout.zoomGroup);
         },
       }),
       [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type],
@@ -1251,7 +1309,8 @@ const ScatterGraph = React.memo(
           getOpacity: (d) => (isPointVisible(d) ? 1 : 0),
           getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: hidePointLabels || showGradientLabels,
-          getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+          getLabelText: (d) =>
+            useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1353,8 +1412,11 @@ const ScatterGraph = React.memo(
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
+                  .attr('font-weight', '700')
                   .attr('pointer-events', 'none')
-                  .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp));
+                  .text(
+                    useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
+                  );
               });
 
               // Overlay tooltip handlers
@@ -1566,6 +1628,8 @@ const ScatterGraph = React.memo(
             });
           });
 
+        avoidLabelCollisions(zoomGroup);
+
         // Log tick formatting on initial render
         if (xScaleConfig._isLog) {
           const xScale = ctx.xScale as d3.ScaleLogarithmic<number, number>;
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 507654e1..9f2d2f38 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -72,6 +72,7 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
       .attr('text-anchor', 'middle')
       .attr('fill', config.foreground)
       .attr('font-size', '10px')
+      .attr('font-weight', '700')
       .attr('pointer-events', 'none')
       .text(config.getLabelText);
   }

From 9572b95e86de7cece1179b5f48dd29135765002b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:32:44 -0500
Subject: [PATCH 006/111] fix: stack multi-line point labels upward so they
 don't overlap the point
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tspans now ride above the text's `dy` anchor — the LAST line sits at the
anchor (just above the point) and earlier lines stack above it. Previously
the second tspan landed below the anchor and crashed into the marker.

Also widened collision candidates by label height so the flipped-below
position fully clears the point on multi-line labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 28 +++++++---
 .../src/lib/d3-chart/layers/scatter-points.ts | 52 +++++++++++++------
 2 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 3fbd8588..f8ce9b8f 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -84,8 +84,11 @@ function avoidLabelCollisions(
   labels.sort((a, b) => a.cx - b.cx);
   const placed: { left: number; right: number; top: number; bottom: number }[] = [];
   const pad = 1;
-  const candidates = [-8, 14, -22, 28];
   for (const lab of labels) {
+    // Candidates scale with the label's own height so multi-line labels don't
+    // overlap the point shape when flipped below.
+    const below = lab.h + 8;
+    const candidates = [-8, below, -8 - below - 4, 2 * below];
     let chosenDy: number | null = null;
     let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
     for (const dy of candidates) {
@@ -1310,7 +1313,7 @@ const ScatterGraph = React.memo(
           getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: hidePointLabels || showGradientLabels,
           getLabelText: (d) =>
-            useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
+            useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1403,7 +1406,14 @@ const ScatterGraph = React.memo(
               // Labels
               const showLabels = !hidePointLabels && !showGradientLabels;
               overlayPoints.each(function (d) {
-                d3.select(this)
+                const lines = showLabels
+                  ? (useAdvancedLabels
+                      ? `${getPointLabel(d)}\nC=${d.conc}`
+                      : `${d.tp}\nC=${d.conc}`
+                    ).split('\n')
+                  : [];
+                const text = d3
+                  .select(this)
                   .selectAll<SVGTextElement, boolean>('.overlay-label')
                   .data(showLabels ? [true] : [])
                   .join('text')
@@ -1413,10 +1423,14 @@ const ScatterGraph = React.memo(
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
                   .attr('font-weight', '700')
-                  .attr('pointer-events', 'none')
-                  .text(
-                    useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
-                  );
+                  .attr('pointer-events', 'none');
+                text
+                  .selectAll<SVGTSpanElement, string>('tspan')
+                  .data(lines)
+                  .join('tspan')
+                  .attr('x', 0)
+                  .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+                  .text((l) => l);
               });
 
               // Overlay tooltip handlers
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 9f2d2f38..13c588d8 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -63,18 +63,30 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
     applyNormalState(shape, d.precision);
   });
 
-  // Label (enter only)
+  // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
+  // we stack tspans UPWARD from the text's `dy` anchor so the LAST line sits
+  // at `dy` (just above the point) and earlier lines land above it. That way,
+  // the collision-avoidance pass only has to move the `<text>` element — the
+  // intra-stack offsets stay correct whether the label ends up above or below.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
-    entered
-      .append('text')
-      .attr('class', 'point-label')
-      .attr('dy', -8)
-      .attr('text-anchor', 'middle')
-      .attr('fill', config.foreground)
-      .attr('font-size', '10px')
-      .attr('font-weight', '700')
-      .attr('pointer-events', 'none')
-      .text(config.getLabelText);
+    const labelGetter = config.getLabelText;
+    entered.each(function (d) {
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .append('text')
+        .attr('class', 'point-label')
+        .attr('dy', -8)
+        .attr('text-anchor', 'middle')
+        .attr('fill', config.foreground!)
+        .attr('font-size', '10px')
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      lines.forEach((line, i) => {
+        const tspanDy = i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em';
+        text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line);
+      });
+    });
   }
 
   // Exit: remove stale points
@@ -103,9 +115,12 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
 
   // Update labels: use data join so labels are created/removed properly on toggle
   if (!config.hideLabels && config.getLabelText && config.foreground) {
+    const labelGetter = config.getLabelText;
     points.each(function (d) {
-      const g = d3.select(this);
-      g.selectAll<SVGTextElement, boolean>('.point-label')
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .selectAll<SVGTextElement, boolean>('.point-label')
         .data([true])
         .join('text')
         .attr('class', 'point-label')
@@ -113,8 +128,15 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
-        .attr('pointer-events', 'none')
-        .text(config.getLabelText!(d));
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      text
+        .selectAll<SVGTSpanElement, string>('tspan')
+        .data(lines)
+        .join('tspan')
+        .attr('x', 0)
+        .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+        .text((l) => l);
     });
   } else {
     points.selectAll('.point-label').remove();

From 37eecc6e28c10751ffc52c8a0d0588177e43d4d8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:38:39 -0500
Subject: [PATCH 007/111] fix: anchor multi-line labels via first tspan +
 tspan-aware collision pass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a `<text>` contains tspans, the parent's `dy` does not shift the bbox
cleanly — its (unused) y=0 origin still factors in, so the rendered text
ended up centered on the point. Move the absolute offset into the FIRST
tspan's `dy`; later tspans cascade by 1.1em.

Collision avoidance now drives the first tspan's `dy` and tries four
candidate baselines (primary above, primary below, secondary above,
secondary below), accounting for full label height when picking a
non-overlapping slot. Labels are still hidden as a last resort.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 72 +++++++++++++------
 .../src/lib/d3-chart/layers/scatter-points.ts | 25 ++++---
 2 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index f8ce9b8f..27d3680c 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -55,58 +55,88 @@ import {
   buildGradientColorMap,
 } from '@/components/inference/utils/paretoLabels';
 
-// Greedy label-collision avoidance: try positions above/below the point;
-// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom.
+// Greedy label-collision avoidance.
+// Each candidate is the y-position of the FIRST baseline (relative to point
+// center) which we apply via the first tspan's `dy` — later tspans cascade
+// down by 1.1em. We try above/below at primary and secondary offsets, and
+// hide the label if all four positions collide.
 function avoidLabelCollisions(
   zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
 ): void {
-  const labels: {
+  interface LabelInfo {
     el: SVGTextElement;
+    firstTspan: SVGTSpanElement;
     cx: number;
     cy: number;
     w: number;
-    h: number;
-  }[] = [];
+    nLines: number;
+    defaultFirstY: number;
+  }
+  const labels: LabelInfo[] = [];
+  const ASCENT = 9;
+  const DESCENT = 3;
+  const LINE_H = 11;
+
   zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
     const labelEl = this.querySelector<SVGTextElement>('.point-label');
     if (!labelEl) return;
     if ((this as SVGGElement).style.opacity === '0') return;
+    const tspans = labelEl.querySelectorAll<SVGTSpanElement>('tspan');
+    if (tspans.length === 0) return;
     const transform = (this as SVGGElement).getAttribute('transform') ?? '';
     const m = transform.match(/translate\(([^,]+),([^)]+)\)/);
     if (!m) return;
     const cx = parseFloat(m[1]);
     const cy = parseFloat(m[2]);
-    labelEl.setAttribute('dy', '-8');
+    const nLines = tspans.length;
+    const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point
+    // Reset to default before measuring so prior positioning doesn't bias bbox
+    tspans[0].setAttribute('dy', `${defaultFirstY}px`);
     labelEl.style.opacity = '1';
     const bbox = labelEl.getBBox();
-    labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height });
+    labels.push({
+      el: labelEl,
+      firstTspan: tspans[0],
+      cx,
+      cy,
+      w: bbox.width,
+      nLines,
+      defaultFirstY,
+    });
   });
+
   labels.sort((a, b) => a.cx - b.cx);
   const placed: { left: number; right: number; top: number; bottom: number }[] = [];
-  const pad = 1;
+  const pad = 2;
+
   for (const lab of labels) {
-    // Candidates scale with the label's own height so multi-line labels don't
-    // overlap the point shape when flipped below.
-    const below = lab.h + 8;
-    const candidates = [-8, below, -8 - below - 4, 2 * below];
-    let chosenDy: number | null = null;
+    const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT;
+    const aboveFirstY = lab.defaultFirstY;
+    const belowFirstY = 14; // first baseline 14px below point center
+    const candidates = [
+      aboveFirstY,
+      belowFirstY,
+      aboveFirstY - blockH - 2,
+      belowFirstY + blockH + 2,
+    ];
+    let chosenY: number | null = null;
     let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
-    for (const dy of candidates) {
-      const top = lab.cy + dy - lab.h - pad;
-      const bottom = lab.cy + dy + pad;
+    for (const firstY of candidates) {
+      const top = lab.cy + firstY - ASCENT - pad;
+      const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad;
       const left = lab.cx - lab.w / 2 - pad;
       const right = lab.cx + lab.w / 2 + pad;
       const collides = placed.some(
         (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
       );
       if (!collides) {
-        chosenDy = dy;
+        chosenY = firstY;
         chosenBox = { left, right, top, bottom };
         break;
       }
     }
-    if (chosenDy !== null && chosenBox) {
-      lab.el.setAttribute('dy', String(chosenDy));
+    if (chosenY !== null && chosenBox) {
+      lab.firstTspan.setAttribute('dy', `${chosenY}px`);
       lab.el.style.opacity = '1';
       placed.push(chosenBox);
     } else {
@@ -1418,18 +1448,18 @@ const ScatterGraph = React.memo(
                   .data(showLabels ? [true] : [])
                   .join('text')
                   .attr('class', 'overlay-label')
-                  .attr('dy', -10)
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
                   .attr('font-weight', '700')
                   .attr('pointer-events', 'none');
+                const firstDy = -(1 + (lines.length - 1) * 1.1);
                 text
                   .selectAll<SVGTSpanElement, string>('tspan')
                   .data(lines)
                   .join('tspan')
                   .attr('x', 0)
-                  .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+                  .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
                   .text((l) => l);
               });
 
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 13c588d8..71d1f050 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -64,10 +64,10 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   });
 
   // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
-  // we stack tspans UPWARD from the text's `dy` anchor so the LAST line sits
-  // at `dy` (just above the point) and earlier lines land above it. That way,
-  // the collision-avoidance pass only has to move the `<text>` element — the
-  // intra-stack offsets stay correct whether the label ends up above or below.
+  // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't
+  // pick up the text element's own (unused) y=0 origin. The first tspan is
+  // raised so the LAST line baseline lands ~8px above the point; subsequent
+  // tspans cascade down by 1.1em.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
     const labelGetter = config.getLabelText;
     entered.each(function (d) {
@@ -76,15 +76,18 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .select(this)
         .append('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
         .attr('font-weight', '700')
         .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
       lines.forEach((line, i) => {
-        const tspanDy = i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em';
-        text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line);
+        text
+          .append('tspan')
+          .attr('x', 0)
+          .attr('dy', i === 0 ? `${firstDy}em` : '1.1em')
+          .text(line);
       });
     });
   }
@@ -113,7 +116,9 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   // Update colors on existing shapes (handles hw color changes)
   points.select('.visible-shape').attr('fill', config.getColor as any);
 
-  // Update labels: use data join so labels are created/removed properly on toggle
+  // Update labels: use data join so labels are created/removed properly on toggle.
+  // Anchor the stack via the first tspan (NOT the text dy — that doesn't shift the
+  // bbox cleanly when there are tspan children).
   if (!config.hideLabels && config.getLabelText && config.foreground) {
     const labelGetter = config.getLabelText;
     points.each(function (d) {
@@ -124,18 +129,18 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .data([true])
         .join('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
         .attr('font-weight', '700')
         .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
       text
         .selectAll<SVGTSpanElement, string>('tspan')
         .data(lines)
         .join('tspan')
         .attr('x', 0)
-        .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+        .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
         .text((l) => l);
     });
   } else {

From f317377dfaea35f9cb5dc435ea177966aa17fbf8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 10:21:00 -0500
Subject: [PATCH 008/111] fix: dedupe artifacts by logical name + skip
 0-successful agg rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two complementary fixes for runs whose `results_bmk` aggregated artifact
ends up containing both a successful row and a failed-attempt row for the
same (config, conc, offload) — the failed row's null metrics were
overwriting the good row via ON CONFLICT DO UPDATE.

1. Artifact-level: strip the trailing `_<runner-pool>_<attempt>` suffix
   from each artifact name and group by the logical name, keeping only the
   most recent per group.

2. Row-level: skip rows with `num_requests_successful === 0` AND
   `num_requests_total > 0`. The aggregated artifact merges rows from all
   runners — including failed ones — so artifact-level dedup alone can't
   reach inside it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/benchmark-mapper.ts | 14 +++++++++++
 packages/db/src/etl/skip-tracker.ts     | 10 +++++++-
 packages/db/src/ingest-ci-run.ts        | 33 ++++++++++++++++++++-----
 packages/db/src/ingest-gcs-backup.ts    |  1 +
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index d842276e..1aff5ea9 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -145,6 +145,20 @@ export function mapBenchmarkRow(
     return null;
   }
 
+  // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
+  // every runner, including ones with 0 successful requests and null metrics.
+  // Without this skip, the empty row's nulls overwrite a good row via
+  // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
+  if (
+    typeof row.num_requests_successful === 'number' &&
+    row.num_requests_successful === 0 &&
+    typeof row.num_requests_total === 'number' &&
+    row.num_requests_total > 0
+  ) {
+    tracker.skips.failedRun++;
+    return null;
+  }
+
   // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
   // ('none' → 'off'; any other non-empty value → 'on').
   const offloadModeRaw =
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 6166ea44..588718dd 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,6 +8,7 @@ export interface Skips {
   unmappedModel: number;
   unmappedHw: number;
   noIslOsl: number;
+  failedRun: number;
   dbError: number;
 }
 
@@ -66,7 +67,14 @@ const MAX_DB_ERRORS = 10;
  * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
  */
 export function createSkipTracker(): SkipTracker {
-  const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 };
+  const skips: Skips = {
+    badZip: 0,
+    unmappedModel: 0,
+    unmappedHw: 0,
+    noIslOsl: 0,
+    failedRun: 0,
+    dbError: 0,
+  };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
   const unmappedPrecisions = new Set<string>();
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 8cce43ca..fb1fbbbc 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -101,15 +101,30 @@ if (isDownloadMode) {
     } catch {}
   }
 
-  const byName = new Map<string, (typeof allArtifacts)[0]>();
+  // Strip the trailing `_<runner-pool>_<attempt-digits>` token from each
+  // artifact name, then group by the resulting logical name and keep only
+  // the most recent per group. Without this, two artifacts produced on
+  // different runners for the same logical config (e.g. `…_h200-cw_00` and
+  // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty
+  // metrics can overwrite the good one via ON CONFLICT DO UPDATE.
+  //
+  // The runner pool name itself has no underscores (`h200-cw`,
+  // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip
+  // bounded — using `\w` here would over-match across earlier `_`
+  // separators and collapse different (conc, offload) variants into the
+  // same logical name.
+  const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/;
+  const byLogical = new Map<string, (typeof allArtifacts)[0]>();
   for (const a of allArtifacts) {
-    const existing = byName.get(a.name);
+    const key = a.name.replace(RUNNER_SUFFIX_RE, '');
+    const existing = byLogical.get(key);
     if (!existing || a.created_at > existing.created_at) {
-      byName.set(a.name, a);
+      byLogical.set(key, a);
     }
   }
 
-  for (const [name, artifact] of byName) {
+  for (const [, artifact] of byLogical) {
+    const name = artifact.name;
     console.log(`  ${name}`);
     const zipPath = path.join(artifactsDir, 'artifact.zip');
     execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
@@ -121,7 +136,7 @@ if (isDownloadMode) {
     fs.unlinkSync(zipPath);
   }
 
-  console.log(`\n  Downloaded ${byName.size} artifact(s)`);
+  console.log(`\n  Downloaded ${byLogical.size} artifact(s)`);
 
   // Fetch run attempt from API
   const attemptStr = execSync(
@@ -510,11 +525,17 @@ async function main(): Promise<void> {
 
   const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker;
   const totalSkips =
-    skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError;
+    skips.badZip +
+    skips.unmappedModel +
+    skips.unmappedHw +
+    skips.noIslOsl +
+    skips.failedRun +
+    skips.dbError;
   if (totalSkips > 0) {
     console.log(`\n  Skipped: ${totalSkips} rows`);
     const skipLines: [string, number][] = [
       ['no isl/osl (old format)', skips.noIslOsl],
+      ['failed run (0 successful)', skips.failedRun],
       ['unmapped model', skips.unmappedModel],
       ['unmapped hw', skips.unmappedHw],
       ['bad/empty zip', skips.badZip],
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 6dc604e9..d67f5164 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -434,6 +434,7 @@ async function mapWorkflowDir(
       unmappedModel: local.skips.unmappedModel,
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
+      failedRun: local.skips.failedRun,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),

From c2f66f62f5a1dedb6a87c7c5e58ca990b3cb0956 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 7 May 2026 08:41:26 -0500
Subject: [PATCH 009/111] feat: add AIPerf to FRAMEWORK_LABELS

Tag display name for the `aiperf` spec_method suffix used by the
alternate-harness runs ingested for the agentic minimax sweep.
Without this entry the legend shows 'AIPERF' from the default
toUpperCase fallback.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/constants/src/framework-aliases.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts
index cc5eb6b4..e23a93bc 100644
--- a/packages/constants/src/framework-aliases.ts
+++ b/packages/constants/src/framework-aliases.ts
@@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record<string, string> = {
     ]),
   ),
   mtp: 'MTP',
+  aiperf: 'AIPerf',
 };
 
 /**

From 024797a978a2a6e2954f66a963de3205b62a149e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 12 May 2026 15:02:07 -0500
Subject: [PATCH 010/111] fix(changelog): coerce ids to string when filtering
 changelog by run

bigint workflow_run_id sometimes deserializes as a number on the
frontend depending on the postgres adapter's behavior; strict ===
between a number and a string silently dropped every match, so the
changelog popover always reported "no changelog data available."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/GlobalFilterContext.tsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 08fc7094..11e56de7 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -87,7 +87,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record<string, RunInfo> {
   const runs: Record<string, RunInfo> = {};
   for (const run of data.runs) {
     const runId = String(run.github_run_id);
-    const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id);
+    const runChangelogs = data.changelogs.filter(
+      (c) => String(c.workflow_run_id) === String(run.github_run_id),
+    );
     runs[runId] = {
       runId,
       runDate: run.created_at,

From aa154193dfbc12535f25444cdf6fccc16a3e1382 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 12 May 2026 15:36:57 -0500
Subject: [PATCH 011/111] feat: default sequence to Agentic Traces when
 available

If the selected model has agentic_traces data, prefer that over the
default 8K/1K fixed-seq when the user hasn't explicitly chosen via URL.
effectiveSequence already falls back to availableSequences[0] for models
without agentic, so models with only fixed-seq data still render correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/GlobalFilterContext.tsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 11e56de7..7813d079 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -125,7 +125,9 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   const [selectedSequence, setSelectedSequence] = useState<Sequence>(() => {
     const urlSeq = getUrlParam('i_seq');
     if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence;
-    return Sequence.EightK_OneK;
+    // Prefer Agentic Traces by default when the selected model has it; the
+    // effectiveSequence fallback below handles models without agentic data.
+    return Sequence.AgenticTraces;
   });
 
   const [selectedPrecisions, setSelectedPrecisionsRaw] = useState<string[]>(() => {

From 099a33efcb53f5130dc40d715a0f4b86d6136a93 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:25:25 -0500
Subject: [PATCH 012/111] fix(agentic): respect percentile selector for
 input-throughput x axis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rowToAggDataEntry was only copying median/p99 metric variants — picking
p90/p99.9 in the percentile selector silently fell back to 0 and
collapsed every point into a vertical line at x=0. Copy the full
median/p90/p99/p99.9 set into AggDataEntry.

Hide the X-Axis Metric dropdown for agentic mode (it doubled up with the
percentile selector) and route the input-metric chart through
withPercentile so the selected percentile actually drives the plotted
field (e.g. picking median plots median_ttft) instead of always using the
hard-coded p99_ttft config default. Percentile options pared back to
median + p99.
---
 .../inference/hooks/useChartData.ts           | 46 +++++++++++++++++--
 .../app/src/components/inference/types.ts     | 10 ++++
 .../components/inference/ui/ChartControls.tsx |  3 +-
 packages/app/src/lib/benchmark-transform.ts   | 12 ++++-
 packages/app/src/lib/data-mappings.ts         |  8 +---
 packages/app/src/lib/energy-metrics.test.ts   | 10 ++++
 6 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 81ab0780..57e9a1c2 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -16,7 +16,7 @@ import { filterDataByCostLimit } from '@/components/inference/utils';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants';
 import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 
 /** Build deduplicated comparison dates, excluding the main run date. */
@@ -216,7 +216,14 @@ export function useChartData(
             ? 'P99 Time To First Token (s)'
             : 'Median Time To First Token (s)';
 
-        if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
+
+        if (
+          effectiveXMetric &&
+          chartDef.chartType === 'interactivity' &&
+          isInputMetric &&
+          !isAgentic
+        ) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) {
@@ -225,15 +232,40 @@ export function useChartData(
             xAxisLabel = isTtftOverride ? ttftLabel : chartDef.x_label;
           }
         } else if (chartDef.chartType === 'interactivity' && isInputMetric) {
+          // Agentic falls through here too — the manual X-axis dropdown is
+          // hidden in agentic mode (would double up with the percentile
+          // selector), so the config default + percentile post-processing
+          // below drives the x axis.
           const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition;
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
           xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label;
-        } else if (chartDef.chartType === 'e2e' && isTtftOverride) {
+        } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           xAxisLabel = ttftLabel;
         }
 
+        // Agentic: rewrite the resolved x metric to the chosen percentile,
+        // and relabel accordingly. naturalX is already percentile-adjusted,
+        // so the per-metric override path is the only one that actually
+        // changes here.
+        if (isAgentic) {
+          const adjusted = withPercentile(
+            xAxisField as string,
+            selectedPercentile,
+          ) as keyof AggDataEntry;
+          if (adjusted !== xAxisField) {
+            const pctlWord =
+              selectedPercentile === 'median'
+                ? 'Median'
+                : selectedPercentile === 'p99.9'
+                  ? 'P99.9'
+                  : selectedPercentile.toUpperCase();
+            xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+            xAxisField = adjusted;
+          }
+        }
+
         // The x-axis is "flipped" only when the good-direction reverses
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
@@ -269,7 +301,13 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile],
+    [
+      selectedYAxisMetric,
+      selectedXAxisMetric,
+      selectedE2eXAxisMetric,
+      selectedPercentile,
+      selectedSequence,
+    ],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index a2d9ef2e..cddeba54 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -50,23 +50,33 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p90_ttft: number;
   p99_ttft: number;
+  'p99.9_ttft': number;
   mean_tpot: number;
   mean_intvty: number;
   median_tpot: number;
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p90_tpot: number;
+  p90_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
+  'p99.9_tpot': number;
+  'p99.9_intvty': number;
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p90_itl: number;
   p99_itl: number;
+  'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p90_e2el: number;
   p99_e2el: number;
+  'p99.9_e2el': number;
   disagg: boolean;
   num_prefill_gpu: number;
   num_decode_gpu: number;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 6707bd9e..7b4fa08f 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -269,7 +269,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
           </div>
 
           {graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') &&
-            isInputMetric && (
+            isInputMetric &&
+            selectedSequence !== Sequence.AgenticTraces && (
               <div className="flex flex-col space-y-1.5 lg:col-span-1">
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 69745da2..eb62a18a 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -25,7 +25,7 @@ import type { BenchmarkRow } from '@/lib/api';
  */
 function agenticAliases(m: Record<string, number>): Record<string, number> {
   const out: Record<string, number> = {};
-  for (const suffix of ['mean', 'median', 'p90', 'p99']) {
+  for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
     const itl = m[`${suffix}_itl`];
     const ttlt = m[`${suffix}_ttlt`];
     if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
@@ -62,23 +62,33 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p90_ttft: m.p90_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
+    'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p90_tpot: m.p90_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
+    'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p90_intvty: m.p90_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
+    'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p90_itl: m.p90_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
+    'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p90_e2el: m.p90_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
+    'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     disagg: row.disagg,
     num_prefill_gpu: row.num_prefill_gpu,
     num_decode_gpu: row.num_decode_gpu,
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index f137875c..bf48c864 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,21 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which
- * slice to plot.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the
+ * two most commonly read slices (p50, p99) are surfaced in the UI.
  */
 export enum Percentile {
   Median = 'median',
-  P90 = 'p90',
   P99 = 'p99',
-  P99_9 = 'p99.9',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.Median]: { label: 'p50 (median)' },
-  [Percentile.P90]: { label: 'p90' },
   [Percentile.P99]: { label: 'p99' },
-  [Percentile.P99_9]: { label: 'p99.9' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 28cc1e36..54788585 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,23 +57,33 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p90_ttft: 0.7,
     p99_ttft: 0.8,
+    'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
     mean_intvty: 45,
     median_tpot: 0.02,
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p90_tpot: 0.025,
+    p90_intvty: 55,
     p99_tpot: 0.03,
     p99_intvty: 60,
+    'p99.9_tpot': 0.035,
+    'p99.9_intvty': 65,
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p90_itl: 0.013,
     p99_itl: 0.015,
+    'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p90_e2el: 5.5,
     p99_e2el: 6,
+    'p99.9_e2el': 6.5,
     disagg: false,
     num_prefill_gpu: 0,
     num_decode_gpu: 0,

From 50a06d1419c70ddd8d24b2c6545da44fe6be3a4d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:27:19 -0500
Subject: [PATCH 013/111] fix(agentic): default percentile to p99 and drop
 median option

---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/data-mappings.ts                      | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index b4ccb9ef..af2d364e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -122,7 +122,7 @@ export function InferenceProvider({
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
-    () => getUrlParam('i_pctl') || 'median',
+    () => getUrlParam('i_pctl') || 'p99',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index bf48c864..1b4f47c3 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,16 +186,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the
- * two most commonly read slices (p50, p99) are surfaced in the UI.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p99
+ * is surfaced in the UI.
  */
 export enum Percentile {
-  Median = 'median',
   P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
-  [Percentile.Median]: { label: 'p50 (median)' },
   [Percentile.P99]: { label: 'p99' },
 };
 

From 3c96e9137776d1c368a0acdfeee6e769d5733464 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:31:27 -0500
Subject: [PATCH 014/111] fix(agentic): keep only p90 as the percentile option

---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/data-mappings.ts                      | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 0ba14a21..accfdf9e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -136,7 +136,7 @@ export function InferenceProvider({
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
-    () => getUrlParam('i_pctl') || 'p99',
+    () => getUrlParam('i_pctl') || 'p90',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 0afb304a..83e6648a 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -191,12 +191,10 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
  */
 export enum Percentile {
   P90 = 'p90',
-  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
-  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];

From 642081af77c8165ac89a5177abbd6c0244dfb9c0 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:31:30 -0400
Subject: [PATCH 015/111] fix(agentic): default percentile to p90, surface only
 p90/p99

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts                | 2 +-
 .../app/src/components/inference/InferenceContext.tsx    | 2 +-
 .../app/src/components/inference/hooks/useChartData.ts   | 9 ++-------
 packages/app/src/components/ui/chart-selectors.tsx       | 2 +-
 packages/app/src/lib/data-mappings.ts                    | 6 ++++--
 packages/app/src/lib/url-state.ts                        | 2 +-
 6 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index f267dcc9..34b89aba 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,7 +189,7 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
-    selectedPercentile: 'median',
+    selectedPercentile: 'p90',
     setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index accfdf9e..36dc672d 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -134,7 +134,7 @@ export function InferenceProvider({
     () => getUrlParam('i_e2e_xmetric') || null,
   );
   // Latency percentile applied to the chart x-axis for agentic scenarios.
-  // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
+  // Values: 'p90' | 'p99'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
     () => getUrlParam('i_pctl') || 'p90',
   );
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index f2ef85ec..436fd662 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -83,7 +83,7 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
-  selectedPercentile = 'median',
+  selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
 ) {
@@ -261,12 +261,7 @@ export function useChartData(
             selectedPercentile,
           ) as keyof AggDataEntry;
           if (adjusted !== xAxisField) {
-            const pctlWord =
-              selectedPercentile === 'median'
-                ? 'Median'
-                : selectedPercentile === 'p99.9'
-                  ? 'P99.9'
-                  : selectedPercentile.toUpperCase();
+            const pctlWord = selectedPercentile.toUpperCase();
             xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
             xAxisField = adjusted;
           }
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index d2940de4..e30816fa 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -315,7 +315,7 @@ export function PercentileSelector({
       <LabelWithTooltip
         htmlFor={id}
         label="Latency Percentile"
-        tooltip="Percentile of the latency distribution used for the chart x-axis. Agentic runs carry median/p90/p99/p99.9 variants; switch percentiles to see tail-latency behavior."
+        tooltip="Percentile of the latency distribution used for the chart x-axis. Switch between p90 and p99 to see tail-latency behavior on agentic runs."
       />
       <Select
         value={value}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 83e6648a..0970f8d7 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,15 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); p90 and
- * p99 are surfaced in the UI.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
+ * and p99 are surfaced in the UI.
  */
 export enum Percentile {
   P90 = 'p90',
+  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
+  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 35ac2359..54ce43d9 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -67,7 +67,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_seq: '8k/1k',
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
-  i_pctl: 'median',
+  i_pctl: 'p90',
   i_xmetric: 'p99_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',

From 3f45f4df92e1990070bf5a58dd7753aa9a91baff Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:38:23 -0400
Subject: [PATCH 016/111] fix(agentic): drop p99 + median TTFT, p90 only across
 selectors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aligns the TTFT x-axis selectors with the percentile selector — only
p90 is offered everywhere. Default x-axis metric and chart config
input-throughput x are p90_ttft.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |  2 +-
 .../inference/hooks/useChartData.ts           | 10 +---
 .../inference/inference-chart-config.json     | 10 ++--
 .../inference/replay/buildReplayTimeline.ts   |  3 +-
 .../components/inference/ui/ChartControls.tsx |  7 +--
 .../components/inference/ui/ChartDisplay.tsx  | 19 ++-----
 .../src/components/inference/utils.test.ts    | 57 +++++++------------
 .../app/src/components/inference/utils.ts     |  3 +-
 .../app/src/components/ui/chart-selectors.tsx |  2 +-
 packages/app/src/lib/data-mappings.ts         |  4 +-
 packages/app/src/lib/url-state.ts             |  2 +-
 11 files changed, 42 insertions(+), 77 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 36dc672d..e88f57d8 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -128,7 +128,7 @@ export function InferenceProvider({
     () => getUrlParam('i_metric') || 'y_tpPerGpu',
   );
   const [selectedXAxisMetric, setSelectedXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_xmetric') || 'p99_ttft',
+    () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || null,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 436fd662..69222859 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -215,12 +215,8 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
-        const isTtftOverride =
-          effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft';
-        const ttftLabel =
-          effectiveXMetric === 'p99_ttft'
-            ? 'P99 Time To First Token (s)'
-            : 'Median Time To First Token (s)';
+        const isTtftOverride = effectiveXMetric === 'p90_ttft';
+        const ttftLabel = 'P90 Time To First Token (s)';
 
         const isAgentic = selectedSequence === Sequence.AgenticTraces;
 
@@ -340,7 +336,7 @@ export function useChartData(
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft';
+        const isTtftX = xAxisField === 'p90_ttft';
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json
index e26d237e..dcd91e60 100644
--- a/packages/app/src/components/inference/inference-chart-config.json
+++ b/packages/app/src/components/inference/inference-chart-config.json
@@ -13,9 +13,9 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_left",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
-    "y_inputTputPerGpu_heading": "vs. P99 Time To First Token",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
+    "y_inputTputPerGpu_heading": "vs. P90 Time To First Token",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
@@ -105,8 +105,8 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_right",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
index be076418..b0eb1446 100644
--- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts
+++ b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
@@ -82,8 +82,7 @@ function resolveXAxisField(
   const metricTitle =
     (chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || '';
   const isInputMetric = metricTitle.toLowerCase().includes('input');
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     return selectedXAxisMetric;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 7b4fa08f..ad222edc 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -275,11 +275,11 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
                   label="X-Axis Metric"
-                  tooltip="The latency metric displayed on the chart's X-axis. Options include P99 Time To First Token and Median Time To First Token."
+                  tooltip="The latency metric displayed on the chart's X-axis: P90 Time To First Token."
                 />
                 <Select
                   onValueChange={handleXAxisMetricChange}
-                  value={selectedXAxisMetric ?? 'p99_ttft'}
+                  value={selectedXAxisMetric ?? 'p90_ttft'}
                 >
                   <SelectTrigger
                     id="x-axis-select"
@@ -289,8 +289,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                     <SelectValue />
                   </SelectTrigger>
                   <SelectContent portalled={false}>
-                    <SelectItem value="p99_ttft">P99 TTFT</SelectItem>
-                    <SelectItem value="median_ttft">Median TTFT</SelectItem>
+                    <SelectItem value="p90_ttft">P90 TTFT</SelectItem>
                   </SelectContent>
                 </Select>
               </div>
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index f0e1692a..78df2c37 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -408,27 +408,20 @@ export default function ChartDisplay() {
                             if (
                               graph.chartDefinition.chartType === 'interactivity' &&
                               isInputMetric &&
-                              selectedXAxisMetric
+                              selectedXAxisMetric === 'p90_ttft'
                             ) {
-                              if (selectedXAxisMetric === 'p99_ttft') {
-                                return 'vs. P99 Time To First Token';
-                              } else if (selectedXAxisMetric === 'median_ttft') {
-                                return 'vs. Median Time To First Token';
-                              }
+                              return 'vs. P90 Time To First Token';
                             }
 
                             // For e2e chart: render clickable inline dropdown for x-axis
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p99_ttft'
-                                  ? 'P99 TTFT'
-                                  : selectedE2eXAxisMetric === 'median_ttft'
-                                    ? 'Median TTFT'
-                                    : 'End-to-end Latency';
+                                selectedE2eXAxisMetric === 'p90_ttft'
+                                  ? 'P90 TTFT'
+                                  : 'End-to-end Latency';
                               const xAxisOptions = [
                                 { value: null, label: 'End-to-end Latency' },
-                                { value: 'p99_ttft', label: 'P99 TTFT' },
-                                { value: 'median_ttft', label: 'Median TTFT' },
+                                { value: 'p90_ttft', label: 'P90 TTFT' },
                               ];
                               const zoomPrefix =
                                 selectedDateRange.startDate &&
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..589ba580 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -157,12 +157,12 @@ describe('processOverlayChartData', () => {
   });
 
   it('remaps x to config override for input metrics on interactivity chart', () => {
-    // inputTputPerGpu has x override to p99_ttft on interactivity chart
+    // inputTputPerGpu has x override to p90_ttft on interactivity chart
     const data = [
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_intvty: 50,
       } as any),
     ];
@@ -176,16 +176,11 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        median_ttft: 0.1,
+        p90_ttft: 0.1,
         median_intvty: 50,
       } as any),
     ];
-    const result = processOverlayChartData(
-      data,
-      'interactivity',
-      'y_inputTputPerGpu',
-      'median_ttft',
-    );
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.1);
   });
@@ -195,76 +190,62 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_e2el: 2.5,
       } as any),
     ];
     const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
     expect(result).toHaveLength(1);
-    // e2e uses median_e2el as x (from chart config default), not p99_ttft
+    // e2e uses median_e2el as x (from chart config default), not p90_ttft
     expect(result[0].x).toBe(2.5);
   });
 
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
-    const data = [
-      pt({
-        x: 100,
-        tpPerGpu: { y: 42, roof: false },
-        p99_ttft: 0.35,
-        median_e2el: 2.5,
-      } as any),
-    ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
-    expect(result).toHaveLength(1);
-    expect(result[0].x).toBe(0.35);
-  });
-
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
     const data = [
       pt({
         x: 100,
         tpPerGpu: { y: 42, roof: false },
-        median_ttft: 0.12,
+        p90_ttft: 0.12,
         median_e2el: 2.5,
       } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.12);
   });
 
   it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
     const data = [
-      pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
-      pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+      pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+      pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
 
   it('does not filter interactivity points by latency limit when x-axis is default', () => {
-    // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+    // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
     // chart's x-axis stays median_intvty for non-input metrics. The latency limit
     // (60) must NOT apply to median_intvty values.
     const data = [
       pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
       pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(2);
   });
 
   it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
-    // When an input metric IS selected and x-axis overrides to p99_ttft,
+    // When an input metric IS selected and x-axis overrides to p90_ttft,
     // the latency limit should apply.
     const data = [
-      pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
-      pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+      pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+      pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
-    // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+    // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..735007ab 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -88,8 +88,7 @@ export function processOverlayChartData(
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
   // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index e30816fa..19b4bfb0 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -315,7 +315,7 @@ export function PercentileSelector({
       <LabelWithTooltip
         htmlFor={id}
         label="Latency Percentile"
-        tooltip="Percentile of the latency distribution used for the chart x-axis. Switch between p90 and p99 to see tail-latency behavior on agentic runs."
+        tooltip="Percentile of the latency distribution used for the chart x-axis on agentic runs."
       />
       <Select
         value={value}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 0970f8d7..91f65a34 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -187,16 +187,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
  * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
- * and p99 are surfaced in the UI.
+ * is surfaced in the UI.
  */
 export enum Percentile {
   P90 = 'p90',
-  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
-  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 54ce43d9..b88c92b2 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -68,7 +68,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
   i_pctl: 'p90',
-  i_xmetric: 'p99_ttft',
+  i_xmetric: 'p90_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',
   i_gpus: '',

From 03c775ac9710b4a95d2d2c270adfcfe202219130 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:41:14 -0400
Subject: [PATCH 017/111] fix(agentic): honor e2e TTFT override in agentic mode
 too

The `!isAgentic` gate on the e2e TTFT override branch dropped the
user's `p90_ttft` pick in agentic mode, leaving the chart on the
default p90_e2el. The trailing withPercentile pass is idempotent
when xAxisField is already at the right percentile, so the gate is
unnecessary.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/hooks/useChartData.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 69222859..2a344cef 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -242,7 +242,7 @@ export function useChartData(
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
           xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label;
-        } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) {
+        } else if (chartDef.chartType === 'e2e' && isTtftOverride) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           xAxisLabel = ttftLabel;
         }

From 49f2b2780d71cdad7b4a52ae0fdab0e2b8013d09 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:45:19 -0400
Subject: [PATCH 018/111] fix(agentic): default e2e chart x-axis to p90 TTFT

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/url-state.ts                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index e88f57d8..c80afc2e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -131,7 +131,7 @@ export function InferenceProvider({
     () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_e2e_xmetric') || null,
+    () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
   );
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index b88c92b2..4a48a776 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -69,7 +69,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_metric: 'y_tpPerGpu',
   i_pctl: 'p90',
   i_xmetric: 'p90_ttft',
-  i_e2e_xmetric: '',
+  i_e2e_xmetric: 'p90_ttft',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 9e2c5322b0873ecd8ba8720d7e7e21961a7178dd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:47:22 -0500
Subject: [PATCH 019/111] fix(tooltip): cap data-point numeric values at 3
 decimal places

---
 .../inference/utils/tooltipUtils.ts           | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 4359fc44..3154070a 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -91,6 +91,14 @@ const tooltipLine = (label: string, value: string | number) =>
 const formatPct = (v: number | undefined): string | null =>
   v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
 
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */
+const fmt = (v: number): string => {
+  if (!Number.isFinite(v)) return String(v);
+  const rounded = parseFloat(v.toFixed(3));
+  if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
+  return String(rounded);
+};
+
 /**
  * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
  * success, token totals. Returns an empty string for non-agentic rows.
@@ -201,16 +209,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -218,7 +226,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -274,10 +282,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
         <strong>Date:</strong> ${d.actualDate ?? d.date}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${tooltipLine('Total GPUs', d.tp)}
       ${generateParallelismHTML(d)}
@@ -318,16 +326,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -335,7 +343,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }

From 50ed25fa95e36d2ad881a1f68aa70010a19f34de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:05:09 -0500
Subject: [PATCH 020/111] fix(agentic): relabel x-axis title for natural-x case
 too

---
 .../components/inference/hooks/useChartData.ts    | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 2a344cef..b14775b6 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -248,19 +248,16 @@ export function useChartData(
         }
 
         // Agentic: rewrite the resolved x metric to the chosen percentile,
-        // and relabel accordingly. naturalX is already percentile-adjusted,
-        // so the per-metric override path is the only one that actually
-        // changes here.
+        // and relabel accordingly. Both have to be updated unconditionally —
+        // xAxisField may already be percentile-adjusted (via naturalX) while
+        // xAxisLabel still carries the raw chartDef.x_label prefix.
         if (isAgentic) {
-          const adjusted = withPercentile(
+          xAxisField = withPercentile(
             xAxisField as string,
             selectedPercentile,
           ) as keyof AggDataEntry;
-          if (adjusted !== xAxisField) {
-            const pctlWord = selectedPercentile.toUpperCase();
-            xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
-            xAxisField = adjusted;
-          }
+          const pctlWord = selectedPercentile.toUpperCase();
+          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
         }
 
         // The x-axis is "flipped" only when the good-direction reverses

From e9d8e3f66143fcdce8709f4a55bd0f29889d7174 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:08:05 -0500
Subject: [PATCH 021/111] fix(agentic): include percentile word in chart
 heading

---
 .../app/src/components/inference/hooks/useChartData.ts |  9 +++++++++
 .../app/src/components/inference/ui/ChartDisplay.tsx   | 10 ++++------
 .../components/inference/ui/UnofficialChartDisplay.tsx |  4 +---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index b14775b6..0d13b8ca 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -251,6 +251,10 @@ export function useChartData(
         // and relabel accordingly. Both have to be updated unconditionally —
         // xAxisField may already be percentile-adjusted (via naturalX) while
         // xAxisLabel still carries the raw chartDef.x_label prefix.
+        // The chart heading ("vs. <latency>") is also rewritten to include
+        // the percentile so the title above the plot reflects what's drawn.
+        const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition;
+        let chartHeading = (chartDef[headingKey] as string) || chartDef.heading;
         if (isAgentic) {
           xAxisField = withPercentile(
             xAxisField as string,
@@ -258,6 +262,10 @@ export function useChartData(
           ) as keyof AggDataEntry;
           const pctlWord = selectedPercentile.toUpperCase();
           xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+          chartHeading = chartHeading.replace(
+            /^(vs\.\s+)(?:(Median|Mean|P90|P99(?:\.9)?)\s+)?/iu,
+            `$1${pctlWord} `,
+          );
         }
 
         // The x-axis is "flipped" only when the good-direction reverses
@@ -288,6 +296,7 @@ export function useChartData(
           chartDefinition: {
             ...chartDef,
             ...rooflineOverrides,
+            heading: chartHeading,
             x_label: xAxisLabel,
             y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel),
           },
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 78df2c37..35213a14 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -449,12 +449,10 @@ export default function ChartDisplay() {
                               );
                             }
 
-                            // Fall back to configured heading
-                            return (
-                              graph.chartDefinition[
-                                `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                              ] || graph.chartDefinition.heading
-                            );
+                            // Fall back to the heading baked into chartDefinition
+                            // by useChartData (already resolves per-metric overrides
+                            // and applies the agentic percentile rewrite).
+                            return graph.chartDefinition.heading;
                           })()}
                         </h2>
                         <p className="text-sm text-muted-foreground mb-2">
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index f9b1b3c8..73018483 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
                           `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
                         ]
                       }{' '}
-                      {graph.chartDefinition[
-                        `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                      ] || graph.chartDefinition.heading}
+                      {graph.chartDefinition.heading}
                     </h2>
                     <p className="text-sm text-muted-foreground mb-2">
                       {graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}

From 2046282eb3386bd0e7164b57a3f5dace9465e169 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:15:24 -0500
Subject: [PATCH 022/111] fix(agentic): include percentile in e2e chart heading
 dropdown

---
 .../src/components/inference/ui/ChartDisplay.tsx    | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 35213a14..e9021aed 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -40,6 +40,7 @@ import {
   getModelLabel,
   getPrecisionLabel,
   getSequenceLabel,
+  sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
@@ -152,6 +153,7 @@ export default function ChartDisplay() {
     activeHwTypes,
     activeDates,
     setSelectedE2eXAxisMetric,
+    selectedPercentile,
     compareGpuPair,
   } = useInference();
 
@@ -415,12 +417,15 @@ export default function ChartDisplay() {
 
                             // For e2e chart: render clickable inline dropdown for x-axis
                             if (graph.chartDefinition.chartType === 'e2e') {
+                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                              const pctlWord = selectedPercentile.toUpperCase();
+                              const e2elLabel = isAgentic
+                                ? `${pctlWord} End-to-end Latency`
+                                : 'End-to-end Latency';
                               const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p90_ttft'
-                                  ? 'P90 TTFT'
-                                  : 'End-to-end Latency';
+                                selectedE2eXAxisMetric === 'p90_ttft' ? 'P90 TTFT' : e2elLabel;
                               const xAxisOptions = [
-                                { value: null, label: 'End-to-end Latency' },
+                                { value: null, label: e2elLabel },
                                 { value: 'p90_ttft', label: 'P90 TTFT' },
                               ];
                               const zoomPrefix =

From 9957f19e630c14fbfadb411725ba1736d58a83e1 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 18:53:56 -0500
Subject: [PATCH 023/111] feat(agentic): per-point trace_replay storage +
 detail page POC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Persist aiperf's profile_export.jsonl and server_metrics_export.{csv,json}
per agentic benchmark point in a new agentic_trace_replay sibling table
(migration 006), and add a follow-up column for the gzipped time-series JSON
(migration 007). The ingest hook walks the agentic_<suffix> sibling artifact
and captures all three files (~6 MB gzipped per point).

New /inference/agentic/[id] detail page renders:
- ISL / OSL histograms with p50/p75/p90/p95 guide lines
- KV cache utilization over time (raw scatter + 50-sample rolling avg)
- Request queue depth (running / waiting / total, smoothed)
- Prefix cache hit rate per interval (raw scatter + smoothed)
- Total + decode throughput with cumulative running-avg overlay
- Cumulative prompt token source breakdown (stacked area)

SiblingNav at the top renders the SKU label (e.g. "B200 · DeepSeek V4 Pro
· FP4 · vLLM") with chips for every (TP, conc, offload) variant in the
same workflow run so users can jump between sibling points.

Tooltip changes:
- portal to document.body + position:fixed so the tooltip can escape
  parent stacking contexts (backdrop-filter on the chart Card)
- clamp positioning to keep the tooltip inside the chart area
- "View charts →" button on pinned agentic points navigates to the
  detail page

Also ignores .claude/worktrees/ from oxlint so parallel agent worktrees
don't trip the pre-commit hook.
---
 .eslintignore                                 |   3 +
 .../inference/agentic/[id]/page.tsx           |  17 +
 .../app/src/app/api/unofficial-run/route.ts   |   4 +
 .../app/api/v1/benchmark-siblings/route.ts    |  38 +++
 .../src/app/api/v1/trace-histograms/route.ts  |  60 ++++
 .../app/api/v1/trace-server-metrics/route.ts  |  40 +++
 .../agentic-point/agentic-point-detail.tsx    | 308 +++++++++++++++++
 .../inference/agentic-point/distribution.tsx  | 140 ++++++++
 .../inference/agentic-point/sibling-nav.tsx   | 118 +++++++
 .../agentic-point/time-series-chart.tsx       | 311 ++++++++++++++++++
 .../app/src/components/inference/types.ts     |   2 +
 .../components/inference/ui/ScatterGraph.tsx  | 225 +++++++++----
 .../inference/utils/tooltipUtils.ts           |  34 +-
 .../src/components/ui/d3-chart-wrapper.tsx    |  53 ++-
 .../unofficial-run-provider.test.ts           |   1 +
 .../src/hooks/api/use-benchmark-siblings.ts   |  46 +++
 .../app/src/hooks/api/use-trace-histograms.ts |  39 +++
 .../src/hooks/api/use-trace-server-metrics.ts |  70 ++++
 packages/app/src/lib/api.ts                   |   2 +
 .../app/src/lib/benchmark-transform.test.ts   |   1 +
 packages/app/src/lib/benchmark-transform.ts   |   2 +
 .../app/src/lib/compare-pair-defaults.test.ts |   1 +
 .../src/lib/d3-chart/layers/scatter-points.ts |  30 +-
 .../migrations/006_agentic_trace_replay.sql   |  34 ++
 .../007_agentic_trace_server_metrics_json.sql |  17 +
 packages/db/src/etl/skip-tracker.test.ts      |   1 +
 packages/db/src/etl/skip-tracker.ts           |   3 +
 packages/db/src/etl/trace-replay-ingest.ts    |  83 +++++
 packages/db/src/ingest-ci-run.ts              |  90 +++++
 packages/db/src/ingest-gcs-backup.ts          |   2 +
 packages/db/src/json-provider.ts              |   1 +
 packages/db/src/queries/benchmark-siblings.ts | 132 ++++++++
 packages/db/src/queries/benchmarks.ts         |   9 +
 packages/db/src/queries/trace-histograms.ts   |  82 +++++
 .../db/src/queries/trace-server-metrics.ts    | 275 ++++++++++++++++
 35 files changed, 2196 insertions(+), 78 deletions(-)
 create mode 100644 .eslintignore
 create mode 100644 packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
 create mode 100644 packages/app/src/app/api/v1/benchmark-siblings/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-histograms/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-server-metrics/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/distribution.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/sibling-nav.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/time-series-chart.tsx
 create mode 100644 packages/app/src/hooks/api/use-benchmark-siblings.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-histograms.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-server-metrics.ts
 create mode 100644 packages/db/migrations/006_agentic_trace_replay.sql
 create mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql
 create mode 100644 packages/db/src/etl/trace-replay-ingest.ts
 create mode 100644 packages/db/src/queries/benchmark-siblings.ts
 create mode 100644 packages/db/src/queries/trace-histograms.ts
 create mode 100644 packages/db/src/queries/trace-server-metrics.ts

diff --git a/.eslintignore b/.eslintignore
new file mode 100644
index 00000000..513a873e
--- /dev/null
+++ b/.eslintignore
@@ -0,0 +1,3 @@
+# Stale agent worktrees produced by parallel Claude Code sessions — they
+# hold their own branches and are linted as part of their own runs.
+.claude/worktrees/
diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
new file mode 100644
index 00000000..77f29805
--- /dev/null
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -0,0 +1,17 @@
+import type { Metadata } from 'next';
+
+import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
+
+export const metadata: Metadata = {
+  title: 'Agentic trace detail | InferenceX',
+  robots: { index: false },
+};
+
+export default async function AgenticPointDetailPage({
+  params,
+}: {
+  params: Promise<{ id: string }>;
+}) {
+  const { id } = await params;
+  return <AgenticPointDetail id={Number(id)} />;
+}
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 7578e897..3d2d0da7 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -33,6 +33,10 @@ export function normalizeArtifactRows(
     if (!params) continue;
     const { config } = params;
     results.push({
+      // Synthetic id — overlay rows aren't persisted, so trace_replay lookups
+      // (keyed on benchmark_results.id) will always miss, which is the
+      // intended behaviour: overlays never have stored trace_replay blobs.
+      id: 0,
       hardware: config.hardware,
       framework: config.framework,
       model: config.model,
diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
new file mode 100644
index 00000000..14c1d461
--- /dev/null
+++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
@@ -0,0 +1,38 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getBenchmarkSiblings,
+  type BenchmarkSiblings,
+} from '@semianalysisai/inferencex-db/queries/benchmark-siblings';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedSiblings = cachedQuery(
+  (id: number): Promise<BenchmarkSiblings | null> => getBenchmarkSiblings(getDb(), id),
+  'benchmark-siblings',
+);
+
+/**
+ * GET /api/v1/benchmark-siblings?id=N (N = positive integer benchmark_result id; else 400)
+ *
+ * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the
+ * benchmark_result + all sibling rows that share that SKU within the same
+ * workflow_run. Used by the agentic detail page to render a navigator.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedSiblings(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching benchmark siblings:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
new file mode 100644
index 00000000..fd7572a8
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -0,0 +1,60 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceHistograms,
+  type TraceHistogramMap,
+} from '@semianalysisai/inferencex-db/queries/trace-histograms';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceHistograms = cachedQuery(
+  (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
+  'trace-histograms',
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/trace-histograms?ids=1,2,3
+ *
+ * Returns per-request ISL/OSL arrays parsed from the stored aiperf
+ * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`.
+ * Ids without a trace_replay blob are omitted; tokens that are not positive integers are dropped.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    // Sort the cache key so the same set of ids in any order hits the same entry.
+    const sorted = ids.toSorted((a, b) => a - b);
+    const histograms = await getCachedTraceHistograms(sorted);
+    return cachedJson(histograms);
+  } catch (error) {
+    console.error('Error fetching trace histograms:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
new file mode 100644
index 00000000..7346a3e8
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceServerMetrics,
+  type TraceServerMetrics,
+} from '@semianalysisai/inferencex-db/queries/trace-server-metrics';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceServerMetrics = cachedQuery(
+  (id: number): Promise<TraceServerMetrics | null> => getTraceServerMetrics(getDb(), id),
+  'trace-server-metrics',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-server-metrics?id=N (N = positive integer benchmark_result id; else 400)
+ *
+ * Returns parsed time-series for the agentic detail view: KV cache usage,
+ * prefix cache hit rate per interval, queue depth, and per-source prompt
+ * token rates. Times are in seconds from benchmark start. 404 if the point
+ * has no stored server_metrics_export.json blob.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedTraceServerMetrics(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching trace server metrics:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
new file mode 100644
index 00000000..3cd274ba
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -0,0 +1,308 @@
+'use client';
+
+import Link from 'next/link';
+import { useRouter } from 'next/navigation';
+import { ArrowLeft } from 'lucide-react';
+
+import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import {
+  useTraceServerMetrics,
+  type PointMeta,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '@/hooks/api/use-trace-server-metrics';
+import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+
+import { Distribution } from './distribution';
+import { SiblingNav } from './sibling-nav';
+import {
+  StackedAreaChart,
+  TimeSeriesChart,
+  cumulativeAverage,
+  rollingAverage,
+  sumSeries,
+} from './time-series-chart';
+
+interface Props {
+  id: number;
+}
+
+const fmtPct = (v: number | null | undefined): string =>
+  typeof v === 'number' && !Number.isNaN(v) ? `${(v * 100).toFixed(2)}%` : '—';
+
+function MetaLine({ label, value }: { label: string; value: React.ReactNode }) {
+  return (
+    <div className="flex flex-col gap-0.5">
+      <span className="text-xs uppercase tracking-wide text-muted-foreground">{label}</span>
+      <span className="text-sm font-medium text-foreground">{value}</span>
+    </div>
+  );
+}
+
+function PointSummary({ meta }: { meta: PointMeta }) {
+  return (
+    <div className="mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-2">
+        <p className="text-sm text-muted-foreground">
+          Selected point
+          {meta.disagg ? ' · disagg' : ''}
+          {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''}
+        </p>
+        {meta.run_url && (
+          <a
+            href={meta.run_url}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="text-xs text-muted-foreground hover:text-foreground underline"
+          >
+            GitHub Actions run →
+          </a>
+        )}
+      </div>
+      <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-6 gap-3">
+        <MetaLine label="Offload" value={(meta.offload_mode ?? 'off').toUpperCase()} />
+        <MetaLine label="Concurrency" value={meta.conc} />
+        <MetaLine label="GPU cache hit" value={fmtPct(meta.server_gpu_cache_hit_rate)} />
+        <MetaLine label="CPU cache hit" value={fmtPct(meta.server_cpu_cache_hit_rate)} />
+        {meta.isl !== null && <MetaLine label="ISL" value={meta.isl} />}
+        {meta.osl !== null && <MetaLine label="OSL" value={meta.osl} />}
+      </div>
+    </div>
+  );
+}
+
+function ChartCard({ title, children }: { title: string; children: React.ReactNode }) {
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+      <h2 className="text-sm font-semibold text-foreground mb-3">{title}</h2>
+      {children}
+    </div>
+  );
+}
+
+export function AgenticPointDetail({ id }: Props) {
+  const router = useRouter();
+  const histQuery = useTraceHistograms([id], true);
+  const metricsQuery = useTraceServerMetrics(id, true);
+  const siblingsQuery = useBenchmarkSiblings(id);
+  const hist = histQuery.data?.[id];
+  const metrics = metricsQuery.data;
+  const siblingsData = siblingsQuery.data;
+  // Metrics cards pulse only while the request is in flight; a 404 or error settles on "No data".
+  const metricsFallback = metricsQuery.isLoading ? <Skeleton /> : <Empty />;
+  return (
+    <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
+      <div className="flex items-center gap-2">
+        <button
+          type="button"
+          onClick={() => router.back()}
+          className="inline-flex items-center gap-1 text-sm text-muted-foreground hover:text-foreground"
+        >
+          <ArrowLeft className="size-4" /> Back
+        </button>
+        <span className="text-sm text-muted-foreground">·</span>
+        <Link href="/inference" className="text-sm text-muted-foreground hover:text-foreground">
+          Inference chart
+        </Link>
+      </div>
+
+      {siblingsData ? (
+        <SiblingNav sku={siblingsData.sku} siblings={siblingsData.siblings} />
+      ) : siblingsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading SKU navigator…</div>
+      ) : null}
+
+      {metrics ? (
+        <PointSummary meta={metrics.meta} />
+      ) : metricsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading point metadata…</div>
+      ) : null}
+
+      {metricsQuery.isError && (
+        <div className="rounded-lg border border-destructive/40 bg-destructive/10 p-4 text-sm text-destructive">
+          Failed to load trace data for benchmark point #{id}.
+        </div>
+      )}
+      {metricsQuery.data === null && !metricsQuery.isLoading && (
+        <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+          No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf
+          time-series capture, or its source artifacts have expired on GitHub.
+        </div>
+      )}
+
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+        <ChartCard title="Input sequence length distribution">
+          {hist ? (
+            <Distribution values={hist.isl} unit="tokens" />
+          ) : histQuery.isLoading ? (
+            <Skeleton />
+          ) : (
+            <Empty />
+          )}
+        </ChartCard>
+        <ChartCard title="Output sequence length distribution">
+          {hist ? (
+            <Distribution values={hist.osl} unit="tokens" />
+          ) : histQuery.isLoading ? (
+            <Skeleton />
+          ) : (
+            <Empty />
+          )}
+        </ChartCard>
+
+        <ChartCard title="KV cache utilization over time">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'GPU KV cache (avg n=50)',
+                  data: rollingAverage(metrics.kvCacheUsage, 50),
+                  rawData: metrics.kvCacheUsage,
+                  color: '#3b82f6',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yMax={1}
+              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+              yAxisLabel="KV cache (%)"
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Request queue depth">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'Running (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.running,
+                    })),
+                    50,
+                  ),
+                  color: '#22c55e',
+                  strokeWidth: 2,
+                },
+                {
+                  name: 'Waiting (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.waiting,
+                    })),
+                    50,
+                  ),
+                  color: '#ef4444',
+                  strokeWidth: 2,
+                },
+                {
+                  name: 'Total (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.total,
+                    })),
+                    50,
+                  ),
+                  color: '#3b82f6',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yAxisLabel="Requests"
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Prefix cache hit rate per interval">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'GPU (HBM, avg n=50)',
+                  data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                  rawData: metrics.prefixCacheHitRate,
+                  color: '#a855f7',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yMax={1}
+              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+              yAxisLabel="Hit rate (%)"
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Throughput (total & decode)">
+          {metrics ? (
+            (() => {
+              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(total, 50),
+                      color: '#3b82f6',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Decode (avg n=50)',
+                      data: rollingAverage(metrics.decodeTps, 50),
+                      color: '#f97316',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Total running avg',
+                      data: cumulativeAverage(total),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens / sec"
+                />
+              );
+            })()
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+
+        <ChartCard title="Cumulative prompt token source breakdown">
+          {metrics ? (
+            <StackedAreaChart
+              sourceSeries={metrics.promptTokensBySource}
+              durationS={metrics.durationS}
+            />
+          ) : (
+            metricsFallback
+          )}
+        </ChartCard>
+      </div>
+    </div>
+  );
+}
+
+function Skeleton() {
+  return <div className="h-[260px] rounded-md bg-muted/30 animate-pulse" />;
+}
+
+function Empty() {
+  return (
+    <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+  );
+}
+
+// Re-export type for use by sub-components
+export type { TimeSeriesPoint, QueueDepthPoint };
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
new file mode 100644
index 00000000..c9a563fe
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -0,0 +1,140 @@
+'use client';
+
+import { useMemo, useRef } from 'react';
+
+/**
+ * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
+ * detail-page card — fills its container width via `viewBox` + 100% width.
+ */
+export function Distribution({
+  values,
+  unit,
+  height = 260,
+}: {
+  values: readonly number[];
+  unit: string;
+  height?: number;
+}) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const svgParts = useMemo(() => {
+    if (values.length === 0) return { bars: '', guides: '', legend: '', axis: '', yTicks: '' };
+    const sorted = [...values].toSorted((a, b) => a - b);
+    const min = sorted[0]!;
+    const max = sorted.at(-1)!;
+    const range = Math.max(1e-9, max - min);
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+
+    // Square-root rule (ceil(sqrt(n)) bins), clamped to [15, 50] so bars stay visible.
+    const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
+    const counts: number[] = Array.from({ length: nBins }, () => 0);
+    for (const v of values) {
+      const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+      counts[i]!++;
+    }
+    const maxCount = Math.max(...counts, 1);
+    const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+    const barW = innerW / nBins;
+
+    const fmt = (n: number) =>
+      n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
+
+    const quantile = (q: number): number => {
+      const pos = (sorted.length - 1) * q;
+      const lo = Math.floor(pos);
+      const hi = Math.ceil(pos);
+      return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+    };
+
+    const bars = counts
+      .map((c, i) => {
+        const h = (c / maxCount) * innerH;
+        const x = PAD.left + i * barW;
+        const y = PAD.top + (innerH - h);
+        return `<rect x="${x.toFixed(2)}" y="${y.toFixed(2)}" width="${Math.max(0, barW - 1).toFixed(2)}" height="${h.toFixed(2)}" fill="currentColor" opacity="0.55" />`;
+      })
+      .join('');
+
+    const GUIDES = [
+      { label: 'p50', q: 0.5, color: '#3b82f6' },
+      { label: 'p75', q: 0.75, color: '#22c55e' },
+      { label: 'p90', q: 0.9, color: '#f59e0b' },
+      { label: 'p95', q: 0.95, color: '#ef4444' },
+    ] as const;
+    const guides = GUIDES.map(({ q, color }) => {
+      const v = quantile(q);
+      const x = xScale(v);
+      return `<line x1="${x.toFixed(2)}" x2="${x.toFixed(2)}" y1="${PAD.top}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" opacity="0.95" />`;
+    }).join('');
+
+    // 4-tick x-axis: min, ~33%, ~66%, max
+    const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+    const axisY = PAD.top + innerH + 14;
+    const axisLine = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${(PAD.top + innerH).toFixed(2)}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
+    const xLabels = xTickVals
+      .map((v, i) => {
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${xScale(v).toFixed(2)}" y="${axisY}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmt(v)}</text>`;
+      })
+      .join('');
+    const axisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">value (${unit})</text>`;
+
+    // 5-tick y-axis: counts at 0, 25, 50, 75 and 100% of the fullest bin (maxCount)
+    const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
+    const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${PAD.left}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.4" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${fmt(v)}</text></g>`;
+      })
+      .join('');
+    const yAxisLabel = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">count</text>`;
+
+    const chipY = H - 8;
+    const chipW = innerW / GUIDES.length;
+    const legend = GUIDES.map(({ label: ql, q, color }, i) => {
+      const v = quantile(q);
+      const x = PAD.left + i * chipW;
+      return `
+      <line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" />
+      <text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${ql} ${fmt(v)}</text>`;
+    }).join('');
+
+    return {
+      bars,
+      guides,
+      legend,
+      axis: axisLine + xLabels + axisTitle + yAxisLabel,
+      yTicks,
+    };
+  }, [values, unit, H]);
+
+  const ref = useRef<HTMLDivElement | null>(null);
+
+  if (values.length === 0) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <div ref={ref} className="w-full">
+      <div className="mb-2 text-xs text-muted-foreground">
+        {values.length.toLocaleString()} requests · range {Math.round(Math.min(...values))}–
+        {Math.round(Math.max(...values))} {unit}
+      </div>
+      <svg
+        viewBox={`0 0 ${W} ${H}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+        dangerouslySetInnerHTML={{
+          __html:
+            svgParts.bars + svgParts.guides + svgParts.axis + svgParts.yTicks + svgParts.legend,
+        }}
+      />
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
new file mode 100644
index 00000000..776c8ba2
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -0,0 +1,118 @@
+'use client';
+
+import { useRouter } from 'next/navigation';
+import { ChevronLeft, ChevronRight } from 'lucide-react';
+
+import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+
+const HW_LABELS: Record<string, string> = {
+  b200: 'B200', // keys are lowercase hardware ids, values are the names shown in the page header
+  b300: 'B300',
+  gb200: 'GB200',
+  gb300: 'GB300',
+  h100: 'H100',
+  h200: 'H200',
+  mi300x: 'MI300X',
+  mi325x: 'MI325X',
+  mi355x: 'MI355X',
+};
+
+const MODEL_LABELS: Record<string, string> = {
+  dsr1: 'DeepSeek R1', // keys are model ids, values are the names shown in the page header
+  dsv4: 'DeepSeek V4 Pro',
+  glm5: 'GLM-5',
+  'glm5.1': 'GLM-5.1',
+  gptoss120b: 'gpt-oss 120B',
+  kimik2: 'Kimi K2',
+  'kimik2.5': 'Kimi K2.5',
+  'kimik2.6': 'Kimi K2.6',
+  llama70b: 'Llama 3.3 70B',
+  'minimaxm2.5': 'MiniMax M2.5',
+  'minimaxm2.7': 'MiniMax M2.7',
+  'qwen3.5': 'Qwen 3.5',
+};
+
+function hwLabel(hw: string) {
+  return HW_LABELS[hw] ?? hw.toUpperCase(); // ids missing from HW_LABELS are shown upper-cased
+}
+function modelLabel(m: string) {
+  return MODEL_LABELS[m] ?? m; // ids missing from MODEL_LABELS are shown unchanged
+}
+function frameworkLabel(fw: string) {
+  if (fw === 'vllm') return 'vLLM'; // exact-match ids get a fixed display name
+  if (fw === 'sglang') return 'SGLang';
+  if (fw === 'trt') return 'TRT';
+  if (fw === 'mori-sglang') return 'Mori-SGLang';
+  if (fw.startsWith('dynamo-')) return `Dynamo ${fw.slice('dynamo-'.length).toUpperCase()}`; // e.g. dynamo-trt → "Dynamo TRT"
+  return fw; // anything else is shown as-is
+}
+
+/** Chip text: GPU layout (`xP+yD` when disaggregated, else `TPn[EPm]`), concurrency, offload flag. */
+function chipLabel(s: BenchmarkSibling): string {
+  const ep = s.decode_ep > 1 ? `EP${s.decode_ep}` : '';
+  const gpus = s.disagg ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D` : `TP${s.decode_tp}${ep}`;
+  const segments = [gpus, `c=${s.conc}`];
+  if (s.offload_mode === 'on') segments.push('off=ON');
+  return segments.join(' • ');
+}
+
+/** Run header plus a prev / chip-per-sibling / next strip; non-current controls push `/inference/agentic/<id>`. */
+export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) {
+  const router = useRouter();
+  const goTo = (id: BenchmarkSibling['id']) => router.push(`/inference/agentic/${id}`);
+  const here = siblings.findIndex((s) => s.is_current);
+  // Neighbours are null at either end of the list and when nothing is flagged current (-1).
+  const prev = here > 0 ? siblings[here - 1] : null;
+  const next = here >= 0 && here + 1 < siblings.length ? siblings[here + 1] : null;
+  const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`;
+
+  return (
+    <div className="border-b border-border/40 pb-4 mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-3">
+        <h1 className="text-2xl font-semibold text-foreground">{skuLabel}</h1>
+        <span className="text-xs text-muted-foreground">
+          {siblings.length} point{siblings.length === 1 ? '' : 's'} in this run · {sku.date}
+        </span>
+      </div>
+      <div className="flex items-center gap-2 flex-wrap">
+        <button
+          type="button"
+          disabled={!prev}
+          onClick={() => prev && goTo(prev.id)}
+          className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+          aria-label="Previous point"
+        >
+          <ChevronLeft className="size-3.5" /> prev
+        </button>
+        <div className="flex items-center gap-1 flex-wrap">
+          {siblings.map((s) => {
+            const tone = s.is_current
+              ? 'border-primary bg-primary text-primary-foreground font-medium'
+              : 'border-border/40 text-foreground hover:bg-accent';
+            const dim = s.has_trace ? '' : 'opacity-60';
+            return (
+              <button
+                key={s.id}
+                type="button"
+                onClick={() => !s.is_current && goTo(s.id)}
+                className={`px-2 py-1 rounded-md text-xs border transition-colors ${tone} ${dim}`}
+                title={s.has_trace ? undefined : 'No stored trace data'}
+              >
+                {chipLabel(s)}
+              </button>
+            );
+          })}
+        </div>
+        <button
+          type="button"
+          disabled={!next}
+          onClick={() => next && goTo(next.id)}
+          className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+          aria-label="Next point"
+        >
+          next <ChevronRight className="size-3.5" />
+        </button>
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
new file mode 100644
index 00000000..bc081b4e
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -0,0 +1,311 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+
+interface Series {
+  name: string; // text shown next to this series' swatch in the legend
+  /** The line to draw (caller pre-smooths if desired). */
+  data: TimeSeriesPoint[];
+  /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */
+  rawData?: TimeSeriesPoint[];
+  color: string; // used for the line stroke, the raw-dot fill and the legend swatch
+  /** Override default stroke width (1.8 for the line, 2 for its legend swatch). Use higher values for emphasis lines. */
+  strokeWidth?: number;
+}
+
+interface TimeSeriesChartProps {
+  series: Series[]; // drawn in array order; legend chips follow the same order
+  durationS: number; // x-axis spans 0..max(durationS, 1); tick labels are formatted as seconds / minutes
+  yMax?: number; // top of the y-axis; when omitted, the largest `data` value across all series
+  yFmt?: (v: number) => string; // y tick label formatter (the chart defaults it to fmtInt)
+  yAxisLabel?: string; // rotated y-axis title; nothing is drawn when empty or omitted
+  height?: number; // viewBox height (the chart defaults it to 260; viewBox width is fixed at 720)
+}
+
+/** Centered moving mean: point i averages the samples within floor(windowSize / 2) of it. */
+export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
+  // Nothing to smooth: hand back the caller's array untouched (same reference).
+  if (data.length === 0 || windowSize <= 1) return data;
+  const radius = Math.floor(windowSize / 2);
+  return data.map((point, idx) => {
+    // The window is clipped at both ends, so edge points average fewer samples.
+    const lo = Math.max(0, idx - radius);
+    const hi = Math.min(data.length, idx + radius + 1);
+    let total = 0;
+    let count = 0;
+    for (let k = lo; k < hi; k++) {
+      total += data[k]!.value;
+      count++;
+    }
+    return { t: point.t, value: count > 0 ? total / count : 0 };
+  });
+}
+
+/**
+ * Running mean of every sample seen so far: output i is the average of
+ * values 0..i, paired with the timestamp of sample i. Intended for
+ * "average over the whole run" overlays such as the throughput chart's.
+ */
+export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  // Empty input is returned as-is (same array reference).
+  if (data.length === 0) return data;
+  let runningTotal = 0;
+  return data.map((point, idx) => {
+    runningTotal += point.value;
+    return { t: point.t, value: runningTotal / (idx + 1) };
+  });
+}
+
+/** Position-wise sum of two series; keeps `a`'s timestamps and drops any unmatched tail. */
+export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  const shared = Math.min(a.length, b.length);
+  return Array.from({ length: shared }, (_, idx) => {
+    const left = a[idx]!;
+    const right = b[idx]!;
+    return { t: left.t, value: left.value + right.value };
+  });
+}
+
+/** Whole-number tick label; rounds before the 10,000 check so 9999.5 renders "10,000", not "10000". */
+const fmtInt = (n: number) =>
+  Math.round(n) >= 10000 ? Math.round(n).toLocaleString('en-US') : String(Math.round(n));
+/** Duration label (`Ns` or `Mm Ss`); rounds once up front so 59.6 gives "1m 0s", never "60s" or "1m 60s". */
+const fmtSeconds = (s: number) => {
+  const total = Math.round(s);
+  if (total < 60) return `${total}s`;
+  return `${Math.floor(total / 60)}m ${total % 60}s`;
+};
+
+/** SVG line chart over 0..durationS: optional faint raw dots, one path per series, ticks and a legend. */
+export function TimeSeriesChart({
+  series,
+  durationS,
+  yMax: yMaxOpt,
+  yFmt = fmtInt,
+  yAxisLabel,
+  height = 260,
+}: TimeSeriesChartProps) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const inner = useMemo(() => {
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1); // at least 1, so xScale never divides by zero
+    // Folded rather than `Math.max(...values)`: spreading a huge series can overflow the call stack.
+    const yMax =
+      yMaxOpt ?? series.reduce((m, s) => s.data.reduce((a, d) => Math.max(a, d.value), m), 1e-9);
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
+
+    const subsample = (arr: TimeSeriesPoint[]) => {
+      if (arr.length === 0) return arr;
+      const stride = Math.max(1, Math.floor(arr.length / innerW)); // thins long series to under ~2 samples per viewBox unit
+      return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+    };
+
+    // Two passes so every raw dot is emitted before (and so painted beneath) every line.
+    const dotsLayer = series
+      .filter((s) => s.rawData && s.rawData.length > 0)
+      .map((s) =>
+        subsample(s.rawData!)
+          .map((d) => {
+            const x = xScale(d.t);
+            const y = yScale(d.value);
+            return `<circle cx="${x.toFixed(2)}" cy="${y.toFixed(2)}" r="1.5" fill="${s.color}" opacity="0.2" />`;
+          })
+          .join(''),
+      )
+      .join('');
+
+    const lineLayer = series
+      .map((s) => {
+        if (s.data.length === 0) return '';
+        const sampled = subsample(s.data);
+        const pts = sampled.map((d) => [xScale(d.t), yScale(d.value)] as [number, number]);
+        const path = pts
+          .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+          .join(' ');
+        return `<path d="${path}" fill="none" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 1.8}" />`;
+      })
+      .join('');
+
+    // X-axis: 5 ticks at 0..xMax
+    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+    const axisY = PAD.top + innerH;
+    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
+      .map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
+      })
+      .join('')}`;
+    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
+
+    // Y-axis: 5 ticks at 0..yMax
+    const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${yFmt(v)}</text></g>`;
+      })
+      .join('');
+    const yAxisTitle = yAxisLabel
+      ? `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">${yAxisLabel}</text>`
+      : '';
+
+    // Legend at the bottom of the SVG. NOTE(review): name/color/yAxisLabel are interpolated unescaped — assumes trusted, markup-free values.
+    const chipY = H - 8;
+    const chipW = innerW / Math.max(1, series.length);
+    const legend = series
+      .map((s, i) => {
+        const x = PAD.left + i * chipW;
+        return `<line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 2}" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${s.name}</text>`;
+      })
+      .join('');
+
+    return dotsLayer + lineLayer + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
+  }, [series, durationS, yMaxOpt, yFmt, yAxisLabel, H]);
+
+  if (series.every((s) => s.data.length === 0)) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <svg
+      viewBox={`0 0 ${W} ${H}`}
+      preserveAspectRatio="xMidYMid meet"
+      className="w-full h-auto text-foreground"
+      dangerouslySetInnerHTML={{ __html: inner }}
+    />
+  );
+}
+
+/** Stacked-area chart: each token source's share of the cumulative total, plotted over time. */
+export function StackedAreaChart({
+  sourceSeries,
+  durationS,
+  height = 260,
+}: {
+  sourceSeries: Record<string, TimeSeriesPoint[]>;
+  durationS: number;
+  height?: number;
+}) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const inner = useMemo(() => {
+    const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); // sources with no samples are skipped
+    if (entries.length === 0) return '';
+    const tValues = entries[0]![1].map((p) => p.t); // time axis comes from the first non-empty source — assumes all sources share it (TODO confirm)
+    const cum: Record<string, number[]> = {}; // per-source running totals
+    for (const [name, arr] of entries) {
+      let acc = 0;
+      cum[name] = arr.map((p) => {
+        acc += p.value;
+        return acc;
+      });
+    }
+    const shares: Record<string, number[]> = {}; // per-source fraction of the summed running totals at each index
+    for (const name of Object.keys(cum)) shares[name] = [];
+    for (let i = 0; i < tValues.length; i++) {
+      const total = entries.reduce((s, [name]) => s + (cum[name]?.[i] ?? 0), 0); // a source shorter than tValues contributes 0
+      for (const [name] of entries) {
+        shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0); // share is 0 while the total is still 0
+      }
+    }
+
+    const colors: Record<string, string> = {
+      local_compute: '#f97316',
+      local_cache_hit: '#3b82f6',
+      external_kv_transfer: '#22c55e',
+      miss: '#f97316', // NOTE(review): same hex as local_compute — confirm the two never appear together
+    };
+    const labelFor: Record<string, string> = {
+      local_compute: 'Prefill',
+      local_cache_hit: 'HBM Cache Hit',
+      external_kv_transfer: 'Offload Cache Hit',
+      miss: 'Miss',
+    };
+
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v) * innerH; // v is a 0..1 share, so 1 maps to the top of the plot
+
+    const stackOrder = Object.keys(shares);
+    const lower: number[] = Array.from({ length: tValues.length }, () => 0); // running baseline of the stack
+    const layers = stackOrder.map((name) => {
+      const upper = shares[name]!.map((v, i) => lower[i]! + v);
+      const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+      const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]); // walked right-to-left below to close the polygon
+      const d = `${top
+        .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+        .join(' ')} ${[...bottom]
+        .toReversed()
+        .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
+        .join(' ')} Z`;
+      const color = colors[name] ?? '#6b7280'; // grey for source names not in the table
+      const path = `<path d="${d}" fill="${color}" opacity="0.75" />`;
+      for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; // this layer's top becomes the next layer's baseline
+      return { name, color, path };
+    });
+
+    const paths = layers.map((l) => l.path).join('');
+
+    // X-axis
+    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+    const axisY = PAD.top + innerH;
+    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
+      .map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
+      })
+      .join('')}`;
+    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
+
+    // Y-axis 0..100%
+    const yTickVals = [0, 0.25, 0.5, 0.75, 1];
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${(v * 100).toFixed(0)}%</text></g>`;
+      })
+      .join('');
+    const yAxisTitle = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">% of prefill tokens</text>`;
+
+    const chipY = H - 8;
+    const chipW = innerW / Math.max(1, layers.length);
+    const legend = layers
+      .map((l, i) => {
+        const x = PAD.left + i * chipW;
+        return `<rect x="${(x + 2).toFixed(2)}" y="${chipY - 9}" width="12" height="8" fill="${l.color}" opacity="0.75" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${labelFor[l.name] ?? l.name}</text>`;
+      })
+      .join('');
+
+    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
+  }, [sourceSeries, durationS, H]);
+
+  if (Object.values(sourceSeries).every((v) => v.length === 0)) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <svg
+      viewBox={`0 0 ${W} ${H}`}
+      preserveAspectRatio="xMidYMid meet"
+      className="w-full h-auto text-foreground"
+      dangerouslySetInnerHTML={{ __html: inner }}
+    />
+  );
+}
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index f848e0e4..7a39bbd1 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -36,6 +36,8 @@ import type { Model, Sequence } from '@/lib/data-mappings';
  * @property {number} p99_e2el - 99th percentile of End-to-End Latency.
  */
 export interface AggDataEntry {
+  /** Stable per-point id from benchmark_results — for trace_replay lookups. */
+  id?: number;
   hw: string;
   mtp?: string;
   hwKey: string;
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 98562fb9..fdcf8952 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -6,6 +6,8 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
+import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import { useRouter } from 'next/navigation';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import { computeToggle } from '@/hooks/useTogglableSet';
@@ -348,6 +350,10 @@ const ScatterGraph = React.memo(
     );
 
     const rooflines = useMemo(() => {
+      // Frontier scope is (hw, precision, date) — points from different dates
+      // can never share a frontier (a May 15 point can't dominate a May 17 plot).
+      // The legend grouping is still by (hw, precision); we just split the
+      // pareto compute per date and re-merge into the legend bucket.
       const result: Record<string, InferenceData[]> = {};
       const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
       const dir = chartDefinition[rooflineKey] as
@@ -356,17 +362,31 @@ const ScatterGraph = React.memo(
         | 'lower_left'
         | 'lower_right'
         | undefined;
-      for (const hw of Object.keys(groupedData)) {
-        const front =
-          dir === 'upper_right'
-            ? paretoFrontUpperRight(groupedData[hw])
-            : dir === 'upper_left'
-              ? paretoFrontUpperLeft(groupedData[hw])
-              : dir === 'lower_left'
-                ? paretoFrontLowerLeft(groupedData[hw])
-                : paretoFrontLowerRight(groupedData[hw]);
-        front.sort((a, b) => a.x - b.x);
-        result[hw] = front;
+      const frontierFn =
+        dir === 'upper_right'
+          ? paretoFrontUpperRight
+          : dir === 'upper_left'
+            ? paretoFrontUpperLeft
+            : dir === 'lower_left'
+              ? paretoFrontLowerLeft
+              : paretoFrontLowerRight;
+      for (const hwKey of Object.keys(groupedData)) {
+        const byDate = new Map<string, InferenceData[]>();
+        for (const p of groupedData[hwKey]) {
+          const d = p.date;
+          let bucket = byDate.get(d);
+          if (!bucket) {
+            bucket = [];
+            byDate.set(d, bucket);
+          }
+          bucket.push(p);
+        }
+        const combined: InferenceData[] = [];
+        for (const datePoints of byDate.values()) {
+          combined.push(...frontierFn(datePoints));
+        }
+        combined.sort((a, b) => a.x - b.x);
+        result[hwKey] = combined;
       }
       return result;
     }, [groupedData, selectedYAxisMetric, chartDefinition]);
@@ -374,7 +394,7 @@ const ScatterGraph = React.memo(
     const optimalPointKeys = useMemo(() => {
       const keys = new Set<string>();
       Object.values(rooflines).forEach((pts) =>
-        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)),
+        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}_${p.date}-${p.x}-${p.y}`)),
       );
       return keys;
     }, [rooflines]);
@@ -477,6 +497,18 @@ const ScatterGraph = React.memo(
     // All official points for rendering (unfiltered — visibility via opacity)
     const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
 
+    // Trace-replay histograms (ISL / OSL distributions) for agentic points.
+    // Pre-fetch the whole visible set so tooltip render stays synchronous.
+    const agenticIds = useMemo(() => {
+      const ids: number[] = [];
+      for (const p of pointsData) {
+        if (p.benchmark_type === 'agentic_traces' && typeof p.id === 'number') ids.push(p.id);
+      }
+      return ids;
+    }, [pointsData]);
+    const { data: traceHistograms } = useTraceHistograms(agenticIds);
+    const router = useRouter();
+
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
       const globalLabelColorMap = new Map<string, string>();
@@ -516,7 +548,9 @@ const ScatterGraph = React.memo(
     const visiblePoints = useMemo(() => {
       let pts = filteredData;
       if (hideNonOptimal) {
-        pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`));
+        pts = pts.filter((d) =>
+          optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`),
+        );
       }
       return processedOverlayData.length > 0 ? [...pts, ...processedOverlayData] : pts;
     }, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]);
@@ -601,7 +635,8 @@ const ScatterGraph = React.memo(
       (d: InferenceData) =>
         effectiveActiveHwTypes.has(d.hwKey as string) &&
         selectedPrecisions.includes(d.precision) &&
-        (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)),
+        (!hideNonOptimal ||
+          optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`)),
       [effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys],
     );
 
@@ -739,6 +774,8 @@ const ScatterGraph = React.memo(
             hardwareConfig,
             isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
             runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+            traceHistogram:
+              typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined,
           }),
         getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
         getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -754,26 +791,43 @@ const ScatterGraph = React.memo(
           ),
         onPointClick: (d: InferenceData) => {
           track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y });
-          // Attach track-over-time button handler in the tooltip
           const tooltipEl = chartRef.current?.getTooltipElement();
-          if (tooltipEl) {
-            const btn = tooltipEl.querySelector('[data-action="track-over-time"]');
-            if (btn) {
-              btn.addEventListener('click', (btnEvent) => {
-                btnEvent.stopPropagation();
-                const configId = buildPointConfigId(d);
-                if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
-                else addTrackedConfig(d, chartDefinition.chartType);
-                chartRef.current?.dismissTooltip();
-                chartRef.current?.hideTooltip();
-                track('latency_point_tracked_via_tooltip', {
-                  hwKey: String(d.hwKey),
-                  tp: d.tp,
-                  conc: d.conc,
-                  precision: d.precision,
-                });
+          if (!tooltipEl) return;
+
+          // ── Summary-page actions ──────────────────────────────────────────
+          const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]');
+          if (trackBtn) {
+            trackBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              const configId = buildPointConfigId(d);
+              if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
+              else addTrackedConfig(d, chartDefinition.chartType);
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              track('latency_point_tracked_via_tooltip', {
+                hwKey: String(d.hwKey),
+                tp: d.tp,
+                conc: d.conc,
+                precision: d.precision,
               });
-            }
+            });
+          }
+
+          // ── "View charts" → navigate to dedicated detail page ────────────
+          const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+          if (viewBtn && typeof d.id === 'number') {
+            const pointId = d.id;
+            viewBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              track('latency_view_charts_opened', {
+                id: pointId,
+                hwKey: String(d.hwKey),
+                conc: d.conc,
+              });
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              router.push(`/inference/agentic/${pointId}`);
+            });
           }
         },
         attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0)
@@ -788,6 +842,11 @@ const ScatterGraph = React.memo(
         removeTrackedConfig,
         chartDefinition.chartType,
         selectedPrecisions,
+        // Tooltip content closure reads traceHistograms to decide whether to
+        // show the "View charts" button — rebuild config when the histogram
+        // fetch resolves so the button appears for points that have data.
+        traceHistograms,
+        router,
       ],
     );
 
@@ -838,35 +897,64 @@ const ScatterGraph = React.memo(
             const precision = key.split('_').pop()!;
             const visible =
               effectiveActiveHwTypes.has(hw) && selectedPrecisions.includes(precision);
-            let stroke = getCssColor(resolveColor(hw));
-
-            if (showGradientLabels) {
-              const pointLabels = allPointLabelsByKey[key];
-              if (pointLabels) {
-                const stops = computeGradientStops(pointLabels, xScale);
-                if (stops) {
-                  const gid = `roofline-gradient-${chartId}-${key}`;
-                  activeGradientIds.add(gid);
-                  let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
-                  if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
-                  gradient
-                    .attr('gradientUnits', 'userSpaceOnUse')
-                    .attr('x1', xScale(pts[0].x))
-                    .attr('y1', 0)
-                    .attr('x2', xScale(pts.at(-1)!.x))
-                    .attr('y2', 0);
-                  gradient
-                    .selectAll('stop')
-                    .data(stops)
-                    .join('stop')
-                    .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
-                    .attr('stop-color', (s) => s.color);
-                  stroke = `url(#${gid})`;
-                }
+            const baseStroke = getCssColor(resolveColor(hw));
+
+            // Split into per-date sub-paths so the line never crosses dates.
+            // (When only one date is present the loop runs once with the full set.)
+            const byDate = new Map<string, InferenceData[]>();
+            for (const p of pts) {
+              let bucket = byDate.get(p.date);
+              if (!bucket) {
+                bucket = [];
+                byDate.set(p.date, bucket);
               }
+              bucket.push(p);
             }
+            const singleDate = byDate.size === 1;
+
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length <= 1) continue;
+              const entryKey = singleDate ? key : `${key}__${date}`;
+              let stroke = baseStroke;
+
+              // Gradient labels only apply in the single-date case; mapping the
+              // (key-wide) ParetoPointLabel array onto per-date sub-segments is
+              // ambiguous and the comparison-date overlay is a rare combo.
+              if (singleDate && showGradientLabels) {
+                const pointLabels = allPointLabelsByKey[key];
+                if (pointLabels) {
+                  const stops = computeGradientStops(pointLabels, xScale);
+                  if (stops) {
+                    const gid = `roofline-gradient-${chartId}-${entryKey}`;
+                    activeGradientIds.add(gid);
+                    let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
+                    if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
+                    gradient
+                      .attr('gradientUnits', 'userSpaceOnUse')
+                      .attr('x1', xScale(datePoints[0].x))
+                      .attr('y1', 0)
+                      .attr('x2', xScale(datePoints.at(-1)!.x))
+                      .attr('y2', 0);
+                    gradient
+                      .selectAll('stop')
+                      .data(stops)
+                      .join('stop')
+                      .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
+                      .attr('stop-color', (s) => s.color);
+                    stroke = `url(#${gid})`;
+                  }
+                }
+              }
 
-            entries.push({ key, hw, precision, points: pts, stroke, visible });
+              entries.push({
+                key: entryKey,
+                hw,
+                precision,
+                points: datePoints,
+                stroke,
+                visible,
+              });
+            }
           });
 
           // Remove stale gradients
@@ -1271,11 +1359,26 @@ const ScatterGraph = React.memo(
             .y((d) => newYScale(d.y))
             .curve(d3.curveMonotoneX);
 
-          // Update roofline paths
+          // Update roofline paths — must split per-date so the zoom redraw
+          // matches the per-date sub-paths created in the initial render.
           Object.entries(rooflines).forEach(([key, pts]) => {
             if (pts.length < 2) return;
-            const sel = zoomGroup.select<SVGPathElement>(`.roofline-${key}`);
-            if (!sel.empty()) sel.attr('d', lineGen(pts) as string);
+            const byDate = new Map<string, InferenceData[]>();
+            for (const p of pts) {
+              let bucket = byDate.get(p.date);
+              if (!bucket) {
+                bucket = [];
+                byDate.set(p.date, bucket);
+              }
+              bucket.push(p);
+            }
+            const singleDate = byDate.size === 1;
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length < 2) continue;
+              const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`;
+              const sel = zoomGroup.select<SVGPathElement>(`.${CSS.escape(cls)}`);
+              if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string);
+            }
           });
 
           // Update gradient coordinates
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 3154070a..ccc371f9 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -19,6 +19,13 @@ export interface TooltipConfig {
   isTracked?: boolean;
   /** URL to the GitHub Actions workflow run */
   runUrl?: string;
+  /**
+   * Per-request ISL/OSL arrays for agentic points, sourced from the stored
+   * aiperf `profile_export.jsonl`. Used to detect whether the point has any
+   * trace data (so the "View charts" button can appear); the actual
+   * distributions are rendered on the detail page, not inline.
+   */
+  traceHistogram?: { isl: number[]; osl: number[] } | undefined;
 }
 
 export interface OverlayTooltipConfig extends TooltipConfig {
@@ -138,9 +145,24 @@ const generateAgenticHTML = (d: InferenceData): string => {
     parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
   }
 
+  // Histograms + time-series live on the dedicated detail page now; the
+  // "View charts" button (rendered by the wrapper when pinned + has trace
+  // data) takes the user there.
+
   return parts.join('');
 };
 
+/** Markup for the "View charts" button. Produces an empty string unless the
+ *  tooltip is pinned and the point has stored trace data. */
+const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string =>
+  isPinned && hasTraceData
+    ? `<button data-action="view-charts" style="
+    margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
+    border: 1px solid var(--border); border-radius: 6px; cursor: pointer;
+    background: var(--accent); color: var(--accent-foreground);
+  ">View charts &rarr;</button>`
+    : '';
+
 const shortenSha = (image: string) => image.replaceAll(/(sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…');
 
 const imageTooltipLine = (image: string) =>
@@ -191,7 +213,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
  * @returns HTML string for the tooltip content
  */
 export const generateTooltipContent = (config: TooltipConfig): string => {
-  const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+  const {
+    data: d,
+    isPinned,
+    xLabel,
+    yLabel,
+    selectedYAxisMetric,
+    hardwareConfig,
+    runUrl,
+    traceHistogram,
+  } = config;
 
   return `
     <div style="background: var(--popover); border: 1px solid var(--border); border-radius: 8px; padding: 12px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1); user-select: ${isPinned ? 'text' : 'none'};">
@@ -240,6 +271,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
+      ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx
index 0392ac10..44013b1b 100644
--- a/packages/app/src/components/ui/d3-chart-wrapper.tsx
+++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx
@@ -1,6 +1,41 @@
 'use client';
 
-import React from 'react';
+import React, { useEffect, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+/**
+ * Tooltip host <div> for the d3 layer. After the first client effect it is
+ * portalled to document.body so it escapes parent stacking contexts (e.g. the
+ * chart Card's backdrop-filter creates one, trapping z-index); until then it
+ * renders inline. The d3 layer writes viewport coordinates into left/top.
+ */
+function PortalTooltip({
+  tooltipRef,
+  pinned,
+}: {
+  tooltipRef: React.RefObject<HTMLDivElement | null>;
+  pinned: boolean;
+}) {
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []); // flips once, after the first client commit
+  const node = (
+    <div
+      ref={tooltipRef}
+      data-chart-tooltip
+      style={{
+        position: 'fixed',
+        left: 0,
+        top: 0,
+        opacity: pinned ? 1 : 0,
+        pointerEvents: pinned ? 'auto' : 'none',
+        display: pinned ? 'block' : 'none',
+        zIndex: 9999,
+      }}
+    />
+  );
+  if (!mounted || typeof document === 'undefined') return node; // inline fallback
+  return createPortal(node, document.body);
+}
 
 export interface D3ChartWrapperProps {
   chartId: string;
@@ -72,17 +107,11 @@ export function D3ChartWrapper({
                 }
               }}
             />
-            <div
-              ref={tooltipRef}
-              data-chart-tooltip
-              style={{
-                position: 'absolute',
-                opacity: pinnedPoint ? 1 : 0,
-                pointerEvents: pinnedPoint ? 'auto' : 'none',
-                display: pinnedPoint ? 'block' : 'none',
-                zIndex: 50,
-              }}
-            />
+            {/* Tooltip is portalled to <body> with position:fixed so it can
+                rise above sibling chart cards' stacking contexts. The d3 layer
+                writes viewport-coords into style.left/top — see
+                computeTooltipPosition. */}
+            <PortalTooltip tooltipRef={tooltipRef} pinned={Boolean(pinnedPoint)} />
             {noDataOverlay}
           </div>
           <p className="no-export text-xs text-muted-foreground text-center mt-2">{instructions}</p>
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index aa0f6c43..3c24d32b 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r
 /** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */
 function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'sglang',
     model: 'dsr1',
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
new file mode 100644
index 00000000..1ea90c0d
--- /dev/null
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -0,0 +1,46 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  offload_mode: string | null; // "on" | "off" | null
+  decode_tp: number;
+  decode_ep: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  is_current: boolean; // true when this row is the point that was requested
+  has_trace: boolean; // whether the row has a stored trace_replay blob
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  github_run_id: number;
+  date: string;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export function useBenchmarkSiblings(id: number | null) {
+  return useQuery({
+    queryKey: ['benchmark-siblings', id] as const,
+    queryFn: async ({ signal }) => {
+      const res = await fetch(`/api/v1/benchmark-siblings?id=${id}`, { signal });
+      if (res.status === 404) return null; // 404 resolves to null instead of erroring
+      if (!res.ok) throw new Error(`benchmark-siblings ${res.status}`);
+      return (await res.json()) as BenchmarkSiblings; // body is cast, not validated
+    },
+    enabled: id !== null && id > 0, // stays idle until a positive id is supplied
+    staleTime: 5 * 60 * 1000, // 5 minutes
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts
new file mode 100644
index 00000000..db4220d2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-histograms.ts
@@ -0,0 +1,39 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TraceHistogramPoint {
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+async function fetchTraceHistograms(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<TraceHistogramMap> {
+  if (ids.length === 0) return {}; // nothing requested: skip the network round trip
+  const res = await fetch(`/api/v1/trace-histograms?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`trace-histograms ${res.status}`); // surface non-2xx to the caller
+  return (await res.json()) as TraceHistogramMap; // body is cast, not validated
+}
+
+/**
+ * Load per-request ISL/OSL arrays for a set of benchmark_results.id values.
+ * Ids without a stored trace_replay blob are silently omitted from the response.
+ * Ids are de-duplicated and sorted ascending first, so the cache key (their
+ * comma-joined form) is the same for any permutation of the same set. No
+ * request is made when the set is empty or `enabled` is false.
+ */
+export function useTraceHistograms(ids: number[], enabled = true) {
+  const uniqueIds = [...new Set(ids)].toSorted((a, b) => a - b);
+  const cacheKey = uniqueIds.join(',');
+  return useQuery({
+    queryKey: ['trace-histograms', cacheKey] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceHistograms(uniqueIds, signal),
+    enabled: enabled && uniqueIds.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
new file mode 100644
index 00000000..8418aa4f
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -0,0 +1,70 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  run_url: string | null;
+  server_gpu_cache_hit_rate: number | null;
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  meta: PointMeta;
+  startNs: number;
+  endNs: number;
+  durationS: number;
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+async function fetchTraceServerMetrics(
+  id: number,
+  signal?: AbortSignal,
+): Promise<TraceServerMetrics | null> {
+  const res = await fetch(`/api/v1/trace-server-metrics?id=${id}`, { signal });
+  if (res.status === 404) return null; // 404 is reported as null rather than thrown
+  if (!res.ok) throw new Error(`trace-server-metrics ${res.status}`); // any other non-2xx
+  return (await res.json()) as TraceServerMetrics; // body is cast, not validated
+}
+
+/**
+ * On-demand server-metric time-series for one agentic point. Idle unless
+ * `enabled` is true and `id` is truthy; a falsy `id` resolves to null.
+ */
+export function useTraceServerMetrics(id: number | null, enabled = false) {
+  const shouldRun = enabled && Boolean(id);
+  return useQuery({
+    queryKey: ['trace-server-metrics', id] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      id ? fetchTraceServerMetrics(id, signal) : Promise.resolve(null),
+    enabled: shouldRun,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 435f7629..98587c2f 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -6,6 +6,8 @@
 import type { SubmissionsResponse } from './submissions-types';
 
 export interface BenchmarkRow {
+  /** Stable per-point id from benchmark_results; used to look up trace histograms. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 6a6c97c8..fcbca681 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -6,6 +6,7 @@ import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform
 
 function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'trt',
     model: 'dsr1',
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index eb62a18a..c5bdd6ed 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -49,6 +49,8 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     row.offload_mode ??
     (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
+    // Coerce: Postgres bigint comes through the SQL client as a string.
+    id: typeof row.id === 'number' ? row.id : Number(row.id),
     hw: row.hardware,
     framework: row.framework,
     model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts
index 3b49dfbc..da81ca0e 100644
--- a/packages/app/src/lib/compare-pair-defaults.test.ts
+++ b/packages/app/src/lib/compare-pair-defaults.test.ts
@@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults';
 
 function makeRow(overrides: Partial<BenchmarkRow>): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h100',
     framework: 'sglang',
     model: 'dsr1',
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 4fa19fe8..421ac69b 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -289,7 +289,21 @@ export function attachScatterTooltipHandlers<
     });
 }
 
-/** Compute tooltip left/top, flipping when it would overflow the chart container. */
+/**
+ * Compute tooltip left/top **in viewport coordinates** so the tooltip can be
+ * rendered via portal with `position: fixed`. Callers still pass cursor coords
+ * relative to `container` (matching `d3.pointer(event, container)`).
+ *
+ * Why viewport coords: the chart cards use `backdrop-filter`, which creates
+ * a stacking context. A tooltip painted inside the upper card's stacking
+ * context cannot rise above the lower card's stacking context regardless of
+ * its z-index. Portalling to document.body + `position: fixed` sidesteps the
+ * whole problem; we just need the coordinates in viewport space.
+ *
+ * Strategy: pick preferred side (right/below cursor), flip if it overflows the
+ * container, then clamp to container bounds. Tall tooltips that don't fit get
+ * clamped to the container edges.
+ */
 export function computeTooltipPosition(
   mx: number,
   my: number,
@@ -308,13 +322,21 @@ export function computeTooltipPosition(
   // Force reflow so we get real dimensions
   const tw = node.getBoundingClientRect().width || node.offsetWidth;
   const th = node.getBoundingClientRect().height || node.offsetHeight;
+  const rect = container.getBoundingClientRect();
   const cw = container.clientWidth;
   const ch = container.clientHeight;
+  const EDGE_PAD = 4;
+
+  // Prefer right of cursor; flip to left if no room.
+  let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw;
+  left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left));
 
-  const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset;
-  const top = my + offset + th > ch ? my - offset - th : my + offset;
+  // Prefer below cursor; flip above if no room.
+  let top = my + offset + th <= ch ? my + offset : my - offset - th;
+  top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
 
-  return { left, top };
+  // Convert container-local coords → viewport coords for `position: fixed`.
+  return { left: left + rect.left, top: top + rect.top };
 }
 
 /** Update scatter point positions on zoom. */
diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql
new file mode 100644
index 00000000..398bc725
--- /dev/null
+++ b/packages/db/migrations/006_agentic_trace_replay.sql
@@ -0,0 +1,34 @@
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_<suffix>` artifact:
+--   - profile_export.jsonl         (~2 MB raw, per-request data)
+--   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+  id                                bigserial   primary key,
+  -- gzip(profile_export.jsonl); null when the artifact had no profile export
+  profile_export_jsonl_gz           bytea,
+  profile_export_uncompressed_size  bigint, -- byte length before gzip
+  -- raw (uncompressed) csv bytes; null when the artifact had no metrics csv
+  server_metrics_csv                bytea,
+  server_metrics_csv_size           bigint, -- byte length of the csv
+  created_at                        timestamptz not null default now()
+);
+
+alter table benchmark_results
+  add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+  on benchmark_results (trace_replay_id)
+  where trace_replay_id is not null;
diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
new file mode 100644
index 00000000..ba7bd095
--- /dev/null
+++ b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
@@ -0,0 +1,17 @@
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+  add column server_metrics_json_gz bytea,
+  add column server_metrics_json_uncompressed_size bigint;
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
     expect(tracker.skips.unmappedHw).toBe(0);
     expect(tracker.skips.noIslOsl).toBe(0);
     expect(tracker.skips.dbError).toBe(0);
+    expect(tracker.skips.traceReplayMissing).toBe(0);
   });
 
   it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 588718dd..401d197c 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -10,6 +10,8 @@ export interface Skips {
   noIslOsl: number;
   failedRun: number;
   dbError: number;
+  /** Agentic point whose sibling `agentic_<suffix>` artifact had no trace_replay files. */
+  traceReplayMissing: number;
 }
 
 export interface SkipSnapshot {
@@ -74,6 +76,7 @@ export function createSkipTracker(): SkipTracker {
     noIslOsl: 0,
     failedRun: 0,
     dbError: 0,
+    traceReplayMissing: 0,
   };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..8c6d92b6
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,83 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl` plus
+ * `server_metrics_export.{csv,json}`) into `agentic_trace_replay` and link the
+ * new row to each provided benchmark_results row via `trace_replay_id`.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */
+
+import { gzipSync } from 'node:zlib';
+
+import type postgres from 'postgres';
+
+type Sql = ReturnType<typeof postgres>;
+
+/**
+ * Persist the per-point trace files and link them to `benchmarkResultIds`.
+ *
+ * The blob insert and the `trace_replay_id` link run as ONE SQL statement, so
+ * a failure between the two writes cannot leave an unreferenced blob behind.
+ *
+ * @param sql                 Active `postgres` connection.
+ * @param benchmarkResultIds  DB ids of the benchmark_results rows produced by
+ *                            the same `bmk_agentic_<suffix>` artifact whose
+ *                            sibling `agentic_<suffix>` directory holds these
+ *                            trace files. Already-linked rows are untouched.
+ * @param profileExportJsonl  Raw bytes of `profile_export.jsonl`, or null.
+ *                            Gzipped before storage.
+ * @param serverMetricsCsv    Raw bytes of `server_metrics_export.csv`, or null.
+ *                            Stored as-is.
+ * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` (the
+ *                            per-scrape time-series), or null. Optional;
+ *                            gzipped before storage.
+ */
+export async function insertTraceReplay(
+  sql: Sql,
+  benchmarkResultIds: number[],
+  profileExportJsonl: Buffer | null,
+  serverMetricsCsv: Buffer | null,
+  serverMetricsJson: Buffer | null = null,
+): Promise<void> {
+  if (benchmarkResultIds.length === 0) return;
+  if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
+
+  // Pre-check before gzipping: when every row is already linked there is
+  // nothing to do, and a re-ingest must not insert a duplicate blob.
+  const unlinked = await sql<{ id: number }[]>`
+    select id from benchmark_results
+    where id = any(${sql.array(benchmarkResultIds)}::bigint[])
+      and trace_replay_id is null
+  `;
+  if (unlinked.length === 0) return;
+
+  const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
+  const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
+  const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
+  const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
+  const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+
+  // Data-modifying CTE: the blob insert and the link commit (or fail) together.
+  await sql`
+    with blob as (
+      insert into agentic_trace_replay (
+        profile_export_jsonl_gz,
+        profile_export_uncompressed_size,
+        server_metrics_csv,
+        server_metrics_csv_size,
+        server_metrics_json_gz,
+        server_metrics_json_uncompressed_size
+      )
+      values (
+        ${profileGz}, ${profileSize}, ${serverMetricsCsv},
+        ${csvSize}, ${metricsJsonGz}, ${metricsJsonSize}
+      )
+      returning id
+    )
+    update benchmark_results
+    set trace_replay_id = blob.id
+    from blob
+    where benchmark_results.id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+  `;
+}
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 35183789..eeb55313 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -45,6 +45,7 @@ import {
   bulkUpsertAvailability,
   insertServerLog,
 } from './etl/benchmark-ingest';
+import { insertTraceReplay } from './etl/trace-replay-ingest';
 import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
 import { ingestEvalRow } from './etl/eval-ingest';
 import { mapEvalSamples } from './etl/eval-samples-mapper';
@@ -209,6 +210,14 @@ const ARTIFACT_NAMES = {
   changelog: 'changelog-metadata',
 } as const;
 
+/**
+ * Reduce an artifact directory name to its bare suffix by removing a leading
+ * `bmk_` and then a leading `agentic_`, so `bmk_agentic_<suffix>` and its
+ * sibling `agentic_<suffix>` both map to the same key.
+ */
+const stripBmkAndAgenticPrefix = (s: string): string =>
+  s.replace(/^(?:bmk_)?(?:agentic_)?/u, '');
+
 function readJson(filePath: string): unknown {
   try {
     return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -327,6 +336,7 @@ async function main(): Promise<void> {
   let totalSamples = 0;
   let totalSampleFiles = 0;
   let totalChangelogs = 0;
+  let totalTraceReplayLinked = 0;
 
   // ── Check for evals-only flag in changelog ────────────────────────────
   const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -381,6 +391,56 @@ async function main(): Promise<void> {
       console.log(`  Found ${serverLogPaths.size} server log artifact(s)`);
     }
 
+    // Sibling aiperf artifacts: each `bmk_agentic_<suffix>` is paired with an
+    // `agentic_<suffix>` dir holding `profile_export.jsonl` and
+    // `server_metrics_export.csv`. The harness emits these under either a
+    // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current).
+    // Older non-aiperf agentic runs don't ship this sibling. Key on the bare
+    // suffix so both names map to the same Map entry.
+    const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+    const traceReplayPaths = new Map<
+      string,
+      {
+        profileJsonl: string | null;
+        serverMetricsCsv: string | null;
+        serverMetricsJson: string | null;
+      }
+    >();
+    if (fs.existsSync(artifactsDir)) {
+      for (const d of fs.readdirSync(artifactsDir)) {
+        if (!d.startsWith('agentic_')) continue;
+        let profile: string | null = null;
+        let metrics: string | null = null;
+        let metricsJson: string | null = null;
+        for (const sub of TRACE_SUBDIRS) {
+          const dir = path.join(artifactsDir, d, sub);
+          if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue;
+          if (!profile) {
+            const p = path.join(dir, 'profile_export.jsonl');
+            if (fs.existsSync(p)) profile = p;
+          }
+          if (!metrics) {
+            const m = path.join(dir, 'server_metrics_export.csv');
+            if (fs.existsSync(m)) metrics = m;
+          }
+          if (!metricsJson) {
+            const j = path.join(dir, 'server_metrics_export.json');
+            if (fs.existsSync(j)) metricsJson = j;
+          }
+        }
+        if (!profile && !metrics && !metricsJson) continue;
+        const suffix = stripBmkAndAgenticPrefix(d);
+        traceReplayPaths.set(suffix, {
+          profileJsonl: profile,
+          serverMetricsCsv: metrics,
+          serverMetricsJson: metricsJson,
+        });
+      }
+    }
+    if (traceReplayPaths.size > 0) {
+      console.log(`  Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
+    }
+
     const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
     console.log(`  Found ${allBmkFiles.length} benchmark JSON file(s)`);
 
@@ -448,12 +508,42 @@ async function main(): Promise<void> {
               }
             }
           }
+
+          // Trace-replay sibling lookup for agentic points only. The aiperf
+          // harness emits `agentic_<suffix>/{aiperf_artifacts,trace_replay}/...`
+          // next to the `bmk_agentic_<suffix>` artifact we just ingested.
+          if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
+            const suffix = stripBmkAndAgenticPrefix(parentDir);
+            const trace = traceReplayPaths.get(suffix);
+            if (trace) {
+              try {
+                const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
+                const metrics = trace.serverMetricsCsv
+                  ? fs.readFileSync(trace.serverMetricsCsv)
+                  : null;
+                const metricsJson = trace.serverMetricsJson
+                  ? fs.readFileSync(trace.serverMetricsJson)
+                  : null;
+                await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson);
+                totalTraceReplayLinked += insertedIds.length;
+              } catch (error: any) {
+                tracker.recordDbError(`trace_replay for ${suffix}`, error);
+              }
+            } else {
+              tracker.skips.traceReplayMissing++;
+            }
+          }
         } catch (error: any) {
           tracker.recordDbError(path.basename(file), error);
         }
       }
     }
     console.log(`  Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
+    if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {
+      console.log(
+        `  Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`,
+      );
+    }
 
     if (availRows.length > 0) {
       try {
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 6857f817..b4a6fb95 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -458,6 +458,8 @@ async function mapWorkflowDir(
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
       failedRun: local.skips.failedRun,
+      // GCS backup doesn't ingest aiperf trace files; counter stays 0.
+      traceReplayMissing: local.skips.traceReplayMissing,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 19527f22..785d82c4 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -273,6 +273,7 @@ function toBenchmarkRow(
   metrics?: Record<string, number>,
 ): BenchmarkRow {
   return {
+    id: br.id,
     hardware: c.hardware,
     framework: c.framework,
     model: c.model,
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
new file mode 100644
index 00000000..245a1170
--- /dev/null
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -0,0 +1,132 @@
+/**
+ * Find all benchmark_results that share the same SKU (hardware + framework +
+ * model + precision + spec_method + benchmark_type + workflow_run) as the
+ * given point; `disagg` is returned per row, not filtered on. Used by the detail
+ * page to render a "switch between concs / parallelisms" navigator within a run.
+ */
+
+import type { DbClient } from '../connection.js';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  /** "on" | "off" | null. */
+  offload_mode: string | null;
+  decode_tp: number;
+  decode_ep: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  /** True if this row IS the point passed in. */
+  is_current: boolean;
+  /** Whether the row has a stored trace_replay blob (for navigation hint). */
+  has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  /** GitHub Actions run id of the workflow_run, so the page header can hint at provenance. */
+  github_run_id: number;
+  date: string;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export async function getBenchmarkSiblings(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<BenchmarkSiblings | null> {
+  // Step 1: resolve the SKU defining fields for the requested point.
+  const seed = (await sql`
+    select
+      c.hardware, c.framework, c.model, c.precision, c.spec_method,
+      br.benchmark_type, br.workflow_run_id, br.date::text,
+      wr.github_run_id
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as {
+    hardware: string;
+    framework: string;
+    model: string;
+    precision: string;
+    spec_method: string;
+    benchmark_type: string;
+    workflow_run_id: number;
+    date: string;
+    github_run_id: number;
+  }[];
+  const root = seed[0];
+  if (!root) return null;
+
+  // Step 2: pull every sibling row sharing the SKU within the same workflow_run.
+  const rows = (await sql`
+    select
+      br.id, br.conc, br.offload_mode,
+      c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg,
+      (br.trace_replay_id is not null) as has_trace
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    where br.workflow_run_id = ${root.workflow_run_id}
+      and br.benchmark_type = ${root.benchmark_type}
+      and c.hardware = ${root.hardware}
+      and c.framework = ${root.framework}
+      and c.model = ${root.model}
+      and c.precision = ${root.precision}
+      and c.spec_method = ${root.spec_method}
+    order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc
+  `) as unknown as {
+    id: number;
+    conc: number;
+    offload_mode: string | null;
+    decode_tp: number;
+    decode_ep: number;
+    prefill_tp: number;
+    prefill_ep: number;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    disagg: boolean;
+    has_trace: boolean;
+  }[];
+
+  const siblings: BenchmarkSibling[] = rows.map((r) => ({
+    id: Number(r.id),
+    conc: r.conc,
+    offload_mode: r.offload_mode,
+    decode_tp: r.decode_tp,
+    decode_ep: r.decode_ep,
+    prefill_tp: r.prefill_tp,
+    prefill_ep: r.prefill_ep,
+    num_prefill_gpu: r.num_prefill_gpu,
+    num_decode_gpu: r.num_decode_gpu,
+    disagg: r.disagg,
+    is_current: Number(r.id) === benchmarkResultId,
+    has_trace: r.has_trace,
+  }));
+
+  return {
+    sku: {
+      hardware: root.hardware,
+      framework: root.framework,
+      model: root.model,
+      precision: root.precision,
+      spec_method: root.spec_method,
+      benchmark_type: root.benchmark_type,
+      github_run_id: Number(root.github_run_id),
+      date: root.date,
+    },
+    siblings,
+  };
+}
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 74e20380..36bb0e65 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -1,6 +1,13 @@
 import type { DbClient } from '../connection.js';
 
 export interface BenchmarkRow {
+  /**
+   * Stable per-point id from benchmark_results. Used by the frontend to look
+   * up associated detail blobs (e.g. trace_replay histograms).
+   * Typed as number in TS although the column is a Postgres bigint — ids above
+   * Number.MAX_SAFE_INTEGER would lose precision; in practice ids stay well below it.
+   */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -55,6 +62,7 @@ export async function getLatestBenchmarks(
     const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`;
     const rows = await sql`
       SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+        br.id,
         c.hardware,
         c.framework,
         c.model,
@@ -95,6 +103,7 @@ export async function getLatestBenchmarks(
   // No date filter: use materialized view for instant lookups
   const rows = await sql`
     SELECT
+      lb.id,
       c.hardware,
       c.framework,
       c.model,
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
new file mode 100644
index 00000000..c243afd8
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -0,0 +1,82 @@
+/**
+ * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl`
+ * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller
+ * passes the set of `benchmark_results.id`s it wants and receives one entry
+ * per id that actually has a trace_replay blob (others are silently skipped).
+ *
+ * The JSONL has one JSON object per request with the shape:
+ *   { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... } }
+ *
+ * Returns raw arrays rather than pre-binned histograms — payload stays tiny
+ * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin
+ * however it wants.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface TraceHistogramPoint {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+export async function getTraceHistograms(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceHistogramMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.profile_export_jsonl_gz as blob
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+      and atr.profile_export_jsonl_gz is not null
+  `) as { benchmark_result_id: number; blob: Buffer }[];
+
+  const result: TraceHistogramMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const isl: number[] = [];
+      const osl: number[] = [];
+      for (const line of jsonl.split('\n')) {
+        if (!line) continue;
+        let rec: { metrics?: Record<string, { value?: number } | number> };
+        try {
+          rec = JSON.parse(line);
+        } catch {
+          continue;
+        }
+        const m = rec.metrics ?? {};
+        const islVal = readMetric(m['input_sequence_length']);
+        const oslVal = readMetric(m['output_sequence_length']);
+        if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal);
+        if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal);
+      }
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        isl,
+        osl,
+      };
+    } catch {
+      // Drop malformed blobs silently — caller treats missing ids as "no data".
+    }
+  }
+  return result;
+}
+
+function readMetric(v: { value?: number } | number | undefined): number | undefined {
+  if (v === undefined || v === null) return undefined;
+  if (typeof v === 'number') return v;
+  return v.value;
+}
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
new file mode 100644
index 00000000..822ae633
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -0,0 +1,275 @@
+/**
+ * Parse aiperf's `server_metrics_export.json` blob (gzipped in
+ * `agentic_trace_replay.server_metrics_json_gz`) and return a slim, chart-ready
+ * time-series for one benchmark point.
+ *
+ * The raw JSON has shape:
+ *   metrics: {
+ *     "<metric_name>": {
+ *       series: [
+ *         {
+ *           labels: { ... },
+ *           stats: { ... summary ... },
+ *           timeslices: [
+ *             { start_ns, end_ns, avg, min, max }            // gauges
+ *             { start_ns, end_ns, total, rate }              // counters
+ *           ]
+ *         }
+ *       ]
+ *     }
+ *   }
+ *
+ * Timeslices are ~1 Hz windows. The benchmark window can be tens of minutes
+ * (1800+ windows). We return them as `[{ t, ...}]` arrays with `t` measured
+ * in seconds from the benchmark start so the frontend doesn't need to
+ * shuffle bigint nanoseconds around.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+interface GaugeSlice {
+  start_ns: number;
+  end_ns: number;
+  avg?: number;
+  min?: number;
+  max?: number;
+}
+
+interface CounterSlice {
+  start_ns: number;
+  end_ns: number;
+  total?: number;
+  rate?: number;
+}
+
+interface Series {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+  stats?: Record<string, unknown>;
+  timeslices?: (GaugeSlice & CounterSlice)[];
+}
+
+interface MetricsJson {
+  metrics?: Record<string, { type?: string; description?: string; series?: Series[] }>;
+}
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  /** Precomputed running + waiting; the frontend could derive it too. */
+  total: number;
+}
+
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  /** GitHub Actions run URL for jumping to the source. */
+  run_url: string | null;
+  /** Cumulative end-of-run cache-hit number the dashboard already shows. */
+  server_gpu_cache_hit_rate: number | null;
+  /** Cumulative end-of-run CPU offload cache-hit. */
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  /** Point context — hardware, model, conc, etc. for the page header. */
+  meta: PointMeta;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */
+  kvCacheUsage: TimeSeriesPoint[];
+  /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). */
+  prefixCacheHitRate: TimeSeriesPoint[];
+  /** Request queue depth: running, waiting, total per scrape. */
+  queueDepth: QueueDepthPoint[];
+  /**
+   * Per-source prompt-token counts over time (counter rate per scrape).
+   * Keyed by the value of the `source` label (typically `local_cache_hit`,
+   * `external_cache_hit`, `miss`, etc.). Plot as stacked area.
+   */
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. */
+  prefillTps: TimeSeriesPoint[];
+  /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
+  decodeTps: TimeSeriesPoint[];
+}
+
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      atr.server_metrics_json_gz as blob,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as ({ blob: Buffer | null } & PointMeta)[];
+  const row = rows[0];
+  if (!row) return null;
+  const blob = row.blob;
+  if (!blob) return null;
+  const pointMeta: PointMeta = {
+    id: Number(row.id),
+    hardware: row.hardware,
+    framework: row.framework,
+    model: row.model,
+    precision: row.precision,
+    spec_method: row.spec_method,
+    disagg: row.disagg,
+    conc: row.conc,
+    offload_mode: row.offload_mode,
+    isl: row.isl,
+    osl: row.osl,
+    benchmark_type: row.benchmark_type,
+    date: row.date,
+    run_url: row.run_url,
+    server_gpu_cache_hit_rate:
+      row.server_gpu_cache_hit_rate === null ? null : Number(row.server_gpu_cache_hit_rate),
+    server_cpu_cache_hit_rate:
+      row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
+  };
+
+  const parsed = JSON.parse(gunzipSync(blob).toString('utf8')) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+
+  const firstSeries = (name: string): Series | undefined => {
+    const s = metrics[name]?.series;
+    return s && s.length > 0 ? s[0] : undefined;
+  };
+
+  // Compute the timing reference across all series: earliest start_ns, latest end_ns.
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1)
+  const kvCacheUsage: TimeSeriesPoint[] = [];
+  const kvSeries =
+    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number') {
+      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
+    }
+  }
+
+  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
+  // `rate` is already per-window delta; we just divide.
+  const hitsTs = firstSeries('vllm:prefix_cache_hits')?.timeslices ?? [];
+  const qsTs = firstSeries('vllm:prefix_cache_queries')?.timeslices ?? [];
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  const minLen = Math.min(hitsTs.length, qsTs.length);
+  for (let i = 0; i < minLen; i++) {
+    const h = hitsTs[i]!;
+    const q = qsTs[i]!;
+    if (typeof q.rate === 'number' && q.rate > 0 && typeof h.rate === 'number') {
+      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
+    }
+  }
+
+  // Queue depth: pair running + waiting by index.
+  const runTs = firstSeries('vllm:num_requests_running')?.timeslices ?? [];
+  const waitTs = firstSeries('vllm:num_requests_waiting')?.timeslices ?? [];
+  const queueDepth: QueueDepthPoint[] = [];
+  const qlen = Math.min(runTs.length, waitTs.length);
+  for (let i = 0; i < qlen; i++) {
+    const r = runTs[i]!;
+    const w = waitTs[i]!;
+    const running = typeof r.avg === 'number' ? r.avg : 0;
+    const waiting = typeof w.avg === 'number' ? w.avg : 0;
+    queueDepth.push({
+      t: tOf(r.start_ns),
+      running,
+      waiting,
+      total: running + waiting,
+    });
+  }
+
+  // Throughput: extract counter `rate` (already per-second delta from aiperf).
+  const counterRateSeries = (name: string): TimeSeriesPoint[] => {
+    const s = firstSeries(name);
+    if (!s) return [];
+    const out: TimeSeriesPoint[] = [];
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.rate === 'number') out.push({ t: tOf(ts.start_ns), value: ts.rate });
+    }
+    return out;
+  };
+  const prefillTps = counterRateSeries('vllm:prompt_tokens');
+  const decodeTps = counterRateSeries('vllm:generation_tokens');
+
+  // Per-source prompt tokens — emit one TS array per source label.
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    const arr: TimeSeriesPoint[] = [];
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number') {
+        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  return {
+    meta: pointMeta,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}

From 0067bfcd72d0f57242a418e5acc1cef604135554 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 19:01:49 -0500
Subject: [PATCH 024/111] feat(agentic): hover crosshair + expand-to-dialog on
 detail charts

Refactor every chart on /inference/agentic/[id] from innerHTML string SVG
to JSX SVG so we can attach mouse handlers. New shared ChartHover overlay
renders a vertical crosshair following the cursor and a floating tooltip
listing series values at that x:
- TimeSeriesChart: linearly interpolated value per series, timestamp title
- Distribution: bin range + count + cumulative percentile under cursor
- StackedAreaChart: per-source % share at the nearest timeslice

Each chart card now has a maximize button that opens the same chart in
a Dialog at 1300x520 (vs 720x260 inline), preserving hover and all data
labels. Charts accept width/height props so they re-render appropriately
in either size.
---
 .../agentic-point/agentic-point-detail.tsx    | 334 +++++------
 .../inference/agentic-point/chart-hover.tsx   | 148 +++++
 .../inference/agentic-point/distribution.tsx  | 298 ++++++----
 .../agentic-point/expandable-chart.tsx        |  46 ++
 .../agentic-point/time-series-chart.tsx       | 525 ++++++++++++------
 5 files changed, 922 insertions(+), 429 deletions(-)
 create mode 100644 packages/app/src/components/inference/agentic-point/chart-hover.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/expandable-chart.tsx

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 3cd274ba..ee58332d 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -14,6 +14,7 @@ import {
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
 
 import { Distribution } from './distribution';
+import { ExpandableChart } from './expandable-chart';
 import { SiblingNav } from './sibling-nav';
 import {
   StackedAreaChart,
@@ -71,14 +72,11 @@ function PointSummary({ meta }: { meta: PointMeta }) {
   );
 }
 
-function ChartCard({ title, children }: { title: string; children: React.ReactNode }) {
-  return (
-    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
-      <h2 className="text-sm font-semibold text-foreground mb-3">{title}</h2>
-      {children}
-    </div>
-  );
-}
+/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */
+const CHART_SIZES = {
+  inline: { width: 720, height: 260 },
+  expanded: { width: 1300, height: 520 },
+};
 
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
@@ -131,164 +129,178 @@ export function AgenticPointDetail({ id }: Props) {
       )}
 
       <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-        <ChartCard title="Input sequence length distribution">
-          {hist ? (
-            <Distribution values={hist.isl} unit="tokens" />
-          ) : histQuery.isLoading ? (
-            <Skeleton />
-          ) : (
-            <Empty />
-          )}
-        </ChartCard>
-        <ChartCard title="Output sequence length distribution">
-          {hist ? (
-            <Distribution values={hist.osl} unit="tokens" />
-          ) : histQuery.isLoading ? (
-            <Skeleton />
-          ) : (
-            <Empty />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Input sequence length distribution"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
+            return histQuery.isLoading ? <Skeleton /> : <Empty />;
+          }}
+        />
+        <ExpandableChart
+          title="Output sequence length distribution"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
+            return histQuery.isLoading ? <Skeleton /> : <Empty />;
+          }}
+        />
 
-        <ChartCard title="KV cache utilization over time">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'GPU KV cache (avg n=50)',
-                  data: rollingAverage(metrics.kvCacheUsage, 50),
-                  rawData: metrics.kvCacheUsage,
-                  color: '#3b82f6',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yMax={1}
-              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-              yAxisLabel="KV cache (%)"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="KV cache utilization over time"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'GPU KV cache (avg n=50)',
+                    data: rollingAverage(metrics.kvCacheUsage, 50),
+                    rawData: metrics.kvCacheUsage,
+                    color: '#3b82f6',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yMax={1}
+                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                yAxisLabel="KV cache (%)"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Request queue depth">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'Running (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.running,
-                    })),
-                    50,
-                  ),
-                  color: '#22c55e',
-                  strokeWidth: 2,
-                },
-                {
-                  name: 'Waiting (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.waiting,
-                    })),
-                    50,
-                  ),
-                  color: '#ef4444',
-                  strokeWidth: 2,
-                },
-                {
-                  name: 'Total (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.total,
-                    })),
-                    50,
-                  ),
-                  color: '#3b82f6',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yAxisLabel="Requests"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Request queue depth"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'Running (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.running,
+                      })),
+                      50,
+                    ),
+                    color: '#22c55e',
+                    strokeWidth: 2,
+                  },
+                  {
+                    name: 'Waiting (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.waiting,
+                      })),
+                      50,
+                    ),
+                    color: '#ef4444',
+                    strokeWidth: 2,
+                  },
+                  {
+                    name: 'Total (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.total,
+                      })),
+                      50,
+                    ),
+                    color: '#3b82f6',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yAxisLabel="Requests"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Prefix cache hit rate per interval">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'GPU (HBM, avg n=50)',
-                  data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                  rawData: metrics.prefixCacheHitRate,
-                  color: '#a855f7',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yMax={1}
-              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-              yAxisLabel="Hit rate (%)"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Prefix cache hit rate per interval"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'GPU (HBM, avg n=50)',
+                    data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                    rawData: metrics.prefixCacheHitRate,
+                    color: '#a855f7',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yMax={1}
+                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                yAxisLabel="Hit rate (%)"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Throughput (total & decode)">
-          {metrics ? (
-            (() => {
-              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'Total (avg n=50)',
-                      data: rollingAverage(total, 50),
-                      color: '#3b82f6',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Decode (avg n=50)',
-                      data: rollingAverage(metrics.decodeTps, 50),
-                      color: '#f97316',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Total running avg',
-                      data: cumulativeAverage(total),
-                      color: '#ef4444',
-                      strokeWidth: 3,
-                    },
-                  ]}
-                  durationS={metrics.durationS}
-                  yAxisLabel="Tokens / sec"
-                />
-              );
-            })()
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Throughput (total & decode)"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'Total (avg n=50)',
+                    data: rollingAverage(total, 50),
+                    color: '#3b82f6',
+                    strokeWidth: 1.6,
+                  },
+                  {
+                    name: 'Decode (avg n=50)',
+                    data: rollingAverage(metrics.decodeTps, 50),
+                    color: '#f97316',
+                    strokeWidth: 1.6,
+                  },
+                  {
+                    name: 'Total running avg',
+                    data: cumulativeAverage(total),
+                    color: '#ef4444',
+                    strokeWidth: 3,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yAxisLabel="Tokens / sec"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Cumulative prompt token source breakdown">
-          {metrics ? (
-            <StackedAreaChart
-              sourceSeries={metrics.promptTokensBySource}
-              durationS={metrics.durationS}
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Cumulative prompt token source breakdown"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <StackedAreaChart
+                sourceSeries={metrics.promptTokensBySource}
+                durationS={metrics.durationS}
+                {...size}
+              />
+            );
+          }}
+        />
       </div>
     </div>
   );
diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
new file mode 100644
index 00000000..24270122
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
@@ -0,0 +1,148 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+
+/** One row of the hover tooltip: a color swatch, a label, and a pre-formatted value. */
+export interface HoverItem {
+  /** CSS color of the swatch rendered next to the label. */
+  color: string;
+  label: string;
+  value: string;
+  /** Optional faint secondary line (e.g. timestamp under main values). */
+  hint?: string;
+}
+
+interface ChartHoverProps {
+  /** Plot-area insets in viewBox units; pass the same padding the chart draws with. */
+  pad: { top: number; right: number; bottom: number; left: number };
+  /** SVG viewBox dimensions used to render the chart. */
+  width: number;
+  height: number;
+  /**
+   * Called with the cursor's normalized x, clamped to [0..1] across the plot area.
+   * Return `null` to hide the crosshair and tooltip (e.g. no data under the cursor).
+   */
+  resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null;
+  children: ReactNode;
+}
+
+/**
+ * Wrap a chart's <svg> render to add mouse-driven crosshair + tooltip.
+ *
+ * The chart owner renders its bars / lines / axes via `children`; this wrapper
+ * draws them inside its own <svg>, then layers a transparent <rect> over the plot
+ * area to capture mouse events, a dashed vertical line at the cursor's x, and a
+ * tooltip pinned near the top of the chart beside that line (`HoverTooltip`
+ * decides which side). The tooltip is omitted when `resolve` yields no items.
+ */
+export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) {
+  // `xPx`: cursor x in viewBox units (crosshair); `fraction`: clamped position in the plot area.
+  const [hover, setHover] = useState<{
+    xPx: number;
+    fraction: number;
+    items: HoverItem[];
+    title?: string;
+  } | null>(null);
+
+  const innerW = width - pad.left - pad.right;
+  const innerH = height - pad.top - pad.bottom;
+
+  const onMove = (e: React.MouseEvent<SVGRectElement>) => {
+    const svg = e.currentTarget.ownerSVGElement;
+    if (!svg) return;
+    const rect = svg.getBoundingClientRect();
+    // Convert the client x coordinate → SVG viewBox units.
+    const sx = ((e.clientX - rect.left) * width) / rect.width;
+    const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW));
+    const resolved = resolve(fraction);
+    if (!resolved) {
+      setHover(null);
+      return;
+    }
+    setHover({ xPx: sx, fraction, items: resolved.items, title: resolved.title });
+  };
+
+  const onLeave = () => setHover(null);
+
+  return (
+    <div className="relative w-full">
+      <svg
+        viewBox={`0 0 ${width} ${height}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+      >
+        {children}
+        {hover && (
+          <line
+            x1={hover.xPx}
+            x2={hover.xPx}
+            y1={pad.top}
+            y2={pad.top + innerH}
+            stroke="currentColor"
+            strokeWidth={1}
+            strokeDasharray="3 3"
+            opacity={0.4}
+            pointerEvents="none"
+          />
+        )}
+        <rect
+          x={pad.left}
+          y={pad.top}
+          width={innerW}
+          height={innerH}
+          fill="transparent"
+          onMouseMove={onMove}
+          onMouseLeave={onLeave}
+        />
+      </svg>
+      {hover && hover.items.length > 0 && (
+        <HoverTooltip
+          xFraction={hover.fraction}
+          containerWidth={width}
+          padLeft={pad.left}
+          innerW={innerW}
+          title={hover.title}
+          items={hover.items}
+        />
+      )}
+    </div>
+  );
+}
+
+/** Top-pinned tooltip beside the crosshair; an item's `hint` renders faintly under its row. */
+function HoverTooltip(props: {
+  xFraction: number;
+  containerWidth: number;
+  padLeft: number;
+  innerW: number;
+  title?: string;
+  items: HoverItem[];
+}) {
+  const { xFraction, containerWidth, padLeft, innerW, title, items } = props;
+  // Offsets are % of the width so they track the scaled SVG; past 55% the box flips to the left.
+  const xPx = padLeft + xFraction * innerW;
+  const onRight = xPx < containerWidth * 0.55;
+  const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto';
+  const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`;
+  return (
+    <div
+      className="pointer-events-none absolute top-2 z-10 rounded-md border border-border bg-popover px-2 py-1.5 text-xs shadow-md"
+      style={{ left, right, marginLeft: onRight ? 8 : 0, marginRight: onRight ? 0 : 8 }}
+    >
+      {title && <div className="font-medium text-foreground mb-1">{title}</div>}
+      {items.map((it, i) => (
+        <div key={i} className="leading-tight">
+          <div className="flex items-center gap-1.5">
+            <span
+              className="inline-block w-2 h-2 rounded-sm"
+              style={{ backgroundColor: it.color }}
+            />
+            <span className="text-muted-foreground">{it.label}</span>
+            <span className="ml-auto font-medium text-foreground tabular-nums">{it.value}</span>
+          </div>
+          {it.hint && <div className="text-muted-foreground/70">{it.hint}</div>}
+        </div>
+      ))}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
index c9a563fe..685b73f3 100644
--- a/packages/app/src/components/inference/agentic-point/distribution.tsx
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -1,140 +1,242 @@
 'use client';
 
-import { useMemo, useRef } from 'react';
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+
+const fmtNum = (n: number) =>
+  n < 10000 ? `${Math.round(n)}` : Math.round(n).toLocaleString('en-US');
 
 /**
  * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
  * detail-page card — fills its container width via `viewBox` + 100% width.
+ * Hover shows the bin range + count + cumulative percentile.
  */
 export function Distribution({
   values,
   unit,
+  width = 720,
   height = 260,
 }: {
   values: readonly number[];
   unit: string;
+  width?: number;
   height?: number;
 }) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const svgParts = useMemo(() => {
-    if (values.length === 0) return { bars: '', guides: '', legend: '', axis: '', yTicks: '' };
+  const computed = useMemo(() => {
+    if (values.length === 0) return null;
     const sorted = [...values].toSorted((a, b) => a - b);
     const min = sorted[0]!;
     const max = sorted.at(-1)!;
     const range = Math.max(1e-9, max - min);
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
-
-    // Sturges-ish, scaled with sample size, capped so bars stay visible.
     const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
     const counts: number[] = Array.from({ length: nBins }, () => 0);
     for (const v of values) {
       const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
       counts[i]!++;
     }
-    const maxCount = Math.max(...counts, 1);
-    const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
-    const barW = innerW / nBins;
-
-    const fmt = (n: number) =>
-      n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
-
-    const quantile = (q: number): number => {
-      const pos = (sorted.length - 1) * q;
-      const lo = Math.floor(pos);
-      const hi = Math.ceil(pos);
-      return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
-    };
-
-    const bars = counts
-      .map((c, i) => {
-        const h = (c / maxCount) * innerH;
-        const x = PAD.left + i * barW;
-        const y = PAD.top + (innerH - h);
-        return `<rect x="${x.toFixed(2)}" y="${y.toFixed(2)}" width="${Math.max(0, barW - 1).toFixed(2)}" height="${h.toFixed(2)}" fill="currentColor" opacity="0.55" />`;
-      })
-      .join('');
-
-    const GUIDES = [
-      { label: 'p50', q: 0.5, color: '#3b82f6' },
-      { label: 'p75', q: 0.75, color: '#22c55e' },
-      { label: 'p90', q: 0.9, color: '#f59e0b' },
-      { label: 'p95', q: 0.95, color: '#ef4444' },
-    ] as const;
-    const guides = GUIDES.map(({ q, color }) => {
-      const v = quantile(q);
-      const x = xScale(v);
-      return `<line x1="${x.toFixed(2)}" x2="${x.toFixed(2)}" y1="${PAD.top}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" opacity="0.95" />`;
-    }).join('');
-
-    // 4-tick x-axis: min, ~33%, ~66%, max
-    const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
-    const axisY = PAD.top + innerH + 14;
-    const axisLine = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${(PAD.top + innerH).toFixed(2)}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
-    const xLabels = xTickVals
-      .map((v, i) => {
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${xScale(v).toFixed(2)}" y="${axisY}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmt(v)}</text>`;
-      })
-      .join('');
-    const axisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">value (${unit})</text>`;
-
-    // 5-tick y-axis
-    const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
-    const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${PAD.left}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.4" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${fmt(v)}</text></g>`;
-      })
-      .join('');
-    const yAxisLabel = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">count</text>`;
-
-    const chipY = H - 8;
-    const chipW = innerW / GUIDES.length;
-    const legend = GUIDES.map(({ label: ql, q, color }, i) => {
-      const v = quantile(q);
-      const x = PAD.left + i * chipW;
-      return `
-      <line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" />
-      <text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${ql} ${fmt(v)}</text>`;
-    }).join('');
-
-    return {
-      bars,
-      guides,
-      legend,
-      axis: axisLine + xLabels + axisTitle + yAxisLabel,
-      yTicks,
-    };
-  }, [values, unit, H]);
-
-  const ref = useRef<HTMLDivElement | null>(null);
-
-  if (values.length === 0) {
+    return { sorted, min, max, range, innerW, innerH, nBins, counts };
+  }, [values, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+
+  if (!computed) {
     return (
       <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
     );
   }
+  const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed;
+  const maxCount = Math.max(...counts, 1);
+  const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+  const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+  const barW = innerW / nBins;
+
+  const fmt = fmtNum;
+
+  const quantile = (q: number): number => {
+    const pos = (sorted.length - 1) * q;
+    const lo = Math.floor(pos);
+    const hi = Math.ceil(pos);
+    return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+  };
+
+  const GUIDES = [
+    { label: 'p50', q: 0.5, color: '#3b82f6' },
+    { label: 'p75', q: 0.75, color: '#22c55e' },
+    { label: 'p90', q: 0.9, color: '#f59e0b' },
+    { label: 'p95', q: 0.95, color: '#ef4444' },
+  ] as const;
+
+  // Hover: report the bin range under the cursor, its count, and the cumulative
+  // share of all samples that fall at or below the bin's right edge.
+  const resolve = (fraction: number) => {
+    const v = min + fraction * range;
+    const binIdx = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+    const binLo = min + (binIdx * range) / nBins;
+    const binHi = min + ((binIdx + 1) * range) / nBins;
+    const count = counts[binIdx] ?? 0;
+    // Cumulative % at the bin's right edge.
+    let cumCount = 0;
+    for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 0;
+    const cumPct = (cumCount / values.length) * 100;
+    const items: HoverItem[] = [
+      { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` },
+      { color: 'currentColor', label: 'Count', value: count.toLocaleString() },
+      { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` },
+    ];
+    return { items };
+  };
+
+  const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
 
   return (
-    <div ref={ref} className="w-full">
+    <div className="w-full">
       <div className="mb-2 text-xs text-muted-foreground">
-        {values.length.toLocaleString()} requests · range {Math.round(Math.min(...values))}–
-        {Math.round(Math.max(...values))} {unit}
+        {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit}
       </div>
-      <svg
-        viewBox={`0 0 ${W} ${H}`}
-        preserveAspectRatio="xMidYMid meet"
-        className="w-full h-auto text-foreground"
-        dangerouslySetInnerHTML={{
-          __html:
-            svgParts.bars + svgParts.guides + svgParts.axis + svgParts.yTicks + svgParts.legend,
-        }}
-      />
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis gridlines + labels */}
+        {yTickVals.map((v, i) => {
+          const y = yScale(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left - 4}
+                x2={PAD.left}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* Bars */}
+        {counts.map((c, i) => {
+          const h = (c / maxCount) * innerH;
+          const x = PAD.left + i * barW;
+          const y = PAD.top + (innerH - h);
+          return (
+            <rect
+              key={i}
+              x={x}
+              y={y}
+              width={Math.max(0, barW - 1)}
+              height={h}
+              fill="currentColor"
+              opacity={0.55}
+            />
+          );
+        })}
+
+        {/* Percentile guide lines */}
+        {GUIDES.map(({ q, color }) => {
+          const v = quantile(q);
+          const x = xScale(v);
+          return (
+            <line
+              key={q}
+              x1={x}
+              x2={x}
+              y1={PAD.top}
+              y2={PAD.top + innerH}
+              stroke={color}
+              strokeWidth={2}
+              strokeDasharray="5 3"
+              opacity={0.95}
+            />
+          );
+        })}
+
+        {/* X axis */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.2}
+        />
+        {xTickVals.map((v, i) => {
+          const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+          return (
+            <text
+              key={`x${i}`}
+              x={xScale(v)}
+              y={PAD.top + innerH + 14}
+              fontSize={11}
+              fill="currentColor"
+              opacity={0.7}
+              textAnchor={anchor}
+            >
+              {fmt(v)}
+            </text>
+          );
+        })}
+        <text
+          x={W / 2}
+          y={H - 22}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+        >
+          value ({unit})
+        </text>
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          count
+        </text>
+
+        {/* Percentile legend chips */}
+        {(() => {
+          const chipY = H - 8;
+          const chipW = innerW / GUIDES.length;
+          return GUIDES.map(({ label: ql, q, color }, i) => {
+            const v = quantile(q);
+            const x = PAD.left + i * chipW;
+            return (
+              <g key={ql}>
+                <line
+                  x1={x + 2}
+                  x2={x + 14}
+                  y1={chipY - 4}
+                  y2={chipY - 4}
+                  stroke={color}
+                  strokeWidth={2}
+                  strokeDasharray="5 3"
+                />
+                <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                  {ql} {fmt(v)}
+                </text>
+              </g>
+            );
+          });
+        })()}
+      </ChartHover>
     </div>
   );
 }
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
new file mode 100644
index 00000000..7c8e4538
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -0,0 +1,46 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+import { Maximize2 } from 'lucide-react';
+
+import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog';
+
+/**
+ * Card wrapper giving a chart a title bar and an expand button. `render(false)`
+ * produces the inline chart; the button opens a large dialog whose body is
+ * `render(true)`, letting the chart choose bigger dimensions when expanded.
+ */
+export function ExpandableChart({
+  title,
+  render,
+}: {
+  title: string;
+  render: (expanded: boolean) => ReactNode;
+}) {
+  const [isOpen, setIsOpen] = useState(false);
+  const expand = () => setIsOpen(true);
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+      <div className="flex items-start justify-between mb-3 gap-2">
+        <h2 className="text-sm font-semibold text-foreground">{title}</h2>
+        <button
+          type="button"
+          className="text-muted-foreground hover:text-foreground transition-colors"
+          aria-label="Expand chart"
+          onClick={expand}
+        >
+          <Maximize2 className="size-4" />
+        </button>
+      </div>
+      {render(false)}
+      <Dialog open={isOpen} onOpenChange={setIsOpen}>
+        <DialogContent className="max-w-[min(96vw,1400px)] w-[min(96vw,1400px)]">
+          <DialogHeader>
+            <DialogTitle>{title}</DialogTitle>
+          </DialogHeader>
+          <div className="w-full">{render(true)}</div>
+        </DialogContent>
+      </Dialog>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index bc081b4e..cd10aff7 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -4,6 +4,8 @@ import { useMemo } from 'react';
 
 import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
 
+import { ChartHover, type HoverItem } from './chart-hover';
+
 interface Series {
   name: string;
   /** The line to draw (caller pre-smooths if desired). */
@@ -21,6 +23,7 @@ interface TimeSeriesChartProps {
   yMax?: number;
   yFmt?: (v: number) => string;
   yAxisLabel?: string;
+  width?: number;
   height?: number;
 }
 
@@ -43,10 +46,7 @@ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): Tim
   return out;
 }
 
-/**
- * Expanding-window cumulative mean from index 0..i. Useful for "running
- * average over the entire run" lines (red overlay in the throughput chart).
- */
+/** Expanding-window cumulative mean from index 0..i. */
 export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   if (data.length === 0) return data;
   const out: TimeSeriesPoint[] = Array.from({ length: data.length });
@@ -68,7 +68,7 @@ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSerie
   return out;
 }
 
-const fmtInt = (n: number) =>
+const fmtIntDefault = (n: number) =>
   n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
 
 const fmtSeconds = (s: number) => {
@@ -78,97 +78,72 @@ const fmtSeconds = (s: number) => {
   return `${m}m ${rem}s`;
 };
 
+/** Value of a time-sorted series at `t`: clamped at both ends, linear in between. */
+function interpAt(data: TimeSeriesPoint[], t: number): number | null {
+  if (data.length === 0) return null;
+  const [first, last] = [data[0]!, data.at(-1)!];
+  if (t <= first.t) return first.value;
+  if (t >= last.t) return last.value;
+  // Bisect until `left`/`right` are the neighbouring samples that bracket `t`.
+  let left = 0;
+  let right = data.length - 1;
+  while (right - left > 1) {
+    const mid = Math.floor((left + right) / 2);
+    if (data[mid]!.t <= t) left = mid;
+    else right = mid;
+  }
+  const prev = data[left]!;
+  const next = data[right]!;
+  if (next.t === prev.t) return prev.value;
+  return prev.value + (next.value - prev.value) * ((t - prev.t) / (next.t - prev.t));
+}
+
 export function TimeSeriesChart({
   series,
   durationS,
   yMax: yMaxOpt,
-  yFmt = fmtInt,
+  yFmt = fmtIntDefault,
   yAxisLabel,
+  width = 720,
   height = 260,
 }: TimeSeriesChartProps) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const inner = useMemo(() => {
+  const layout = useMemo(() => {
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
     const xMax = Math.max(durationS, 1);
     const yMax = yMaxOpt ?? Math.max(1e-9, ...series.flatMap((s) => s.data.map((d) => d.value)));
     const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
     const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
-
-    const subsample = (arr: TimeSeriesPoint[]) => {
-      if (arr.length === 0) return arr;
-      const stride = Math.max(1, Math.floor(arr.length / innerW));
-      return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
-    };
-
-    // Layered render: raw scatter (back) → lines (front). Iterate twice so
-    // emphasis lines (high strokeWidth) draw over everything else.
-    const dotsLayer = series
-      .filter((s) => s.rawData && s.rawData.length > 0)
-      .map((s) =>
-        subsample(s.rawData!)
-          .map((d) => {
-            const x = xScale(d.t);
-            const y = yScale(d.value);
-            return `<circle cx="${x.toFixed(2)}" cy="${y.toFixed(2)}" r="1.5" fill="${s.color}" opacity="0.2" />`;
-          })
-          .join(''),
-      )
-      .join('');
-
-    const lineLayer = series
-      .map((s) => {
-        if (s.data.length === 0) return '';
-        const sampled = subsample(s.data);
-        const pts = sampled.map((d) => [xScale(d.t), yScale(d.value)] as [number, number]);
-        const path = pts
-          .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
-          .join(' ');
-        return `<path d="${path}" fill="none" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 1.8}" />`;
-      })
-      .join('');
-
-    const paths = dotsLayer + lineLayer;
-
-    // X-axis: 5 ticks at 0..xMax
-    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
-    const axisY = PAD.top + innerH;
-    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
-      .map((v, i) => {
-        const x = xScale(v);
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
-      })
-      .join('')}`;
-    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
-
-    // Y-axis: 5 ticks at 0..yMax
-    const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${yFmt(v)}</text></g>`;
-      })
-      .join('');
-    const yAxisTitle = yAxisLabel
-      ? `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">${yAxisLabel}</text>`
-      : '';
-
-    // Legend at the bottom of the SVG
-    const chipY = H - 8;
-    const chipW = innerW / Math.max(1, series.length);
-    const legend = series
-      .map((s, i) => {
-        const x = PAD.left + i * chipW;
-        return `<line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 2}" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${s.name}</text>`;
-      })
-      .join('');
-
-    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
-  }, [series, durationS, yMaxOpt, yFmt, yAxisLabel, H]);
+    return { innerW, innerH, xMax, yMax, xScale, yScale };
+  }, [series, durationS, yMaxOpt, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+
+  const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
+
+  const subsample = (arr: TimeSeriesPoint[]) => {
+    if (arr.length === 0) return arr;
+    const stride = Math.max(1, Math.floor(arr.length / innerW));
+    return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+  };
+
+  // Pre-format axis ticks.
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    const items: HoverItem[] = [];
+    for (const s of series) {
+      const v = interpAt(s.data, t);
+      if (v === null || !Number.isFinite(v)) continue;
+      items.push({ color: s.color, label: s.name, value: yFmt(v) });
+    }
+    if (items.length === 0) return null;
+    return { items, title: fmtSeconds(t) };
+  };
 
   if (series.every((s) => s.data.length === 0)) {
     return (
@@ -177,12 +152,146 @@ export function TimeSeriesChart({
   }
 
   return (
-    <svg
-      viewBox={`0 0 ${W} ${H}`}
-      preserveAspectRatio="xMidYMid meet"
-      className="w-full h-auto text-foreground"
-      dangerouslySetInnerHTML={{ __html: inner }}
-    />
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {/* y-axis gridlines + labels */}
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {yFmt(v)}
+            </text>
+          </g>
+        );
+      })}
+
+      {/* Raw scatter underlay */}
+      {series
+        .filter((s) => s.rawData && s.rawData.length > 0)
+        .map((s, si) =>
+          subsample(s.rawData!).map((d, i) => (
+            <circle
+              key={`r${si}-${i}`}
+              cx={xScale(d.t)}
+              cy={yScale(d.value)}
+              r={1.5}
+              fill={s.color}
+              opacity={0.2}
+            />
+          )),
+        )}
+
+      {/* Lines */}
+      {series.map((s, si) => {
+        if (s.data.length === 0) return null;
+        const sampled = subsample(s.data);
+        const path = sampled
+          .map(
+            (d, i) =>
+              `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`,
+          )
+          .join(' ');
+        return (
+          <path
+            key={`l${si}`}
+            d={path}
+            fill="none"
+            stroke={s.color}
+            strokeWidth={s.strokeWidth ?? 1.8}
+          />
+        );
+      })}
+
+      {/* X-axis */}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+
+      {yAxisLabel && (
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          {yAxisLabel}
+        </text>
+      )}
+
+      {/* Legend */}
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, series.length);
+        return series.map((s, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <line
+                x1={x + 2}
+                x2={x + 14}
+                y1={chipY - 4}
+                y2={chipY - 4}
+                stroke={s.color}
+                strokeWidth={s.strokeWidth ?? 2}
+              />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {s.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
   );
 }
 
@@ -190,19 +299,21 @@ export function TimeSeriesChart({
 export function StackedAreaChart({
   sourceSeries,
   durationS,
+  width = 720,
   height = 260,
 }: {
   sourceSeries: Record<string, TimeSeriesPoint[]>;
   durationS: number;
+  width?: number;
   height?: number;
 }) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const inner = useMemo(() => {
+  const computed = useMemo(() => {
     const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
-    if (entries.length === 0) return '';
+    if (entries.length === 0) return null;
     const tValues = entries[0]![1].map((p) => p.t);
     const cum: Record<string, number[]> = {};
     for (const [name, arr] of entries) {
@@ -220,92 +331,166 @@ export function StackedAreaChart({
         shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0);
       }
     }
-
-    const colors: Record<string, string> = {
-      local_compute: '#f97316',
-      local_cache_hit: '#3b82f6',
-      external_kv_transfer: '#22c55e',
-      miss: '#f97316',
-    };
-    const labelFor: Record<string, string> = {
-      local_compute: 'Prefill',
-      local_cache_hit: 'HBM Cache Hit',
-      external_kv_transfer: 'Offload Cache Hit',
-      miss: 'Miss',
-    };
-
-    const innerW = W - PAD.left - PAD.right;
-    const innerH = H - PAD.top - PAD.bottom;
-    const xMax = Math.max(durationS, 1);
-    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
-    const yScale = (v: number) => PAD.top + (1 - v) * innerH;
-
-    const stackOrder = Object.keys(shares);
-    const lower: number[] = Array.from({ length: tValues.length }, () => 0);
-    const layers = stackOrder.map((name) => {
-      const upper = shares[name]!.map((v, i) => lower[i]! + v);
-      const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
-      const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
-      const d = `${top
-        .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
-        .join(' ')} ${[...bottom]
-        .toReversed()
-        .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
-        .join(' ')} Z`;
-      const color = colors[name] ?? '#6b7280';
-      const path = `<path d="${d}" fill="${color}" opacity="0.75" />`;
-      for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
-      return { name, color, path };
-    });
-
-    const paths = layers.map((l) => l.path).join('');
-
-    // X-axis
-    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
-    const axisY = PAD.top + innerH;
-    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
-      .map((v, i) => {
-        const x = xScale(v);
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
-      })
-      .join('')}`;
-    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
-
-    // Y-axis 0..100%
-    const yTickVals = [0, 0.25, 0.5, 0.75, 1];
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${(v * 100).toFixed(0)}%</text></g>`;
-      })
-      .join('');
-    const yAxisTitle = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">% of prefill tokens</text>`;
-
-    const chipY = H - 8;
-    const chipW = innerW / Math.max(1, layers.length);
-    const legend = layers
-      .map((l, i) => {
-        const x = PAD.left + i * chipW;
-        return `<rect x="${(x + 2).toFixed(2)}" y="${chipY - 9}" width="12" height="8" fill="${l.color}" opacity="0.75" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${labelFor[l.name] ?? l.name}</text>`;
-      })
-      .join('');
-
-    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
-  }, [sourceSeries, durationS, H]);
-
-  if (Object.values(sourceSeries).every((v) => v.length === 0)) {
+    return { tValues, shares };
+  }, [sourceSeries]);
+
+  const colors: Record<string, string> = {
+    local_compute: '#f97316',
+    local_cache_hit: '#3b82f6',
+    external_kv_transfer: '#22c55e',
+    miss: '#f97316',
+  };
+  const labelFor: Record<string, string> = {
+    local_compute: 'Prefill',
+    local_cache_hit: 'HBM Cache Hit',
+    external_kv_transfer: 'Offload Cache Hit',
+    miss: 'Miss',
+  };
+
+  if (!computed) {
     return (
       <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
     );
   }
+  const { tValues, shares } = computed;
+
+  const innerW = W - PAD.left - PAD.right;
+  const innerH = H - PAD.top - PAD.bottom;
+  const xMax = Math.max(durationS, 1);
+  const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+  const yScale = (v: number) => PAD.top + (1 - v) * innerH;
+
+  const stackOrder = Object.keys(shares);
+  const lower: number[] = Array.from({ length: tValues.length }, () => 0);
+  const layers = stackOrder.map((name) => {
+    const upper = shares[name]!.map((v, i) => lower[i]! + v);
+    const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const d = `${top
+      .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} ${[...bottom]
+      .toReversed()
+      .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} Z`;
+    const color = colors[name] ?? '#6b7280';
+    for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
+    return { name, color, d };
+  });
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    // Find the closest tValue index.
+    let idx = 0;
+    let bestDist = Infinity;
+    for (let i = 0; i < tValues.length; i++) {
+      const d = Math.abs(tValues[i]! - t);
+      if (d < bestDist) {
+        bestDist = d;
+        idx = i;
+      }
+    }
+    const items: HoverItem[] = stackOrder.map((name) => ({
+      color: colors[name] ?? '#6b7280',
+      label: labelFor[name] ?? name,
+      value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
+    }));
+    return { items, title: fmtSeconds(t) };
+  };
+
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = [0, 0.25, 0.5, 0.75, 1];
 
   return (
-    <svg
-      viewBox={`0 0 ${W} ${H}`}
-      preserveAspectRatio="xMidYMid meet"
-      className="w-full h-auto text-foreground"
-      dangerouslySetInnerHTML={{ __html: inner }}
-    />
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {(v * 100).toFixed(0)}%
+            </text>
+          </g>
+        );
+      })}
+      {layers.map((l, i) => (
+        <path key={i} d={l.d} fill={l.color} opacity={0.75} />
+      ))}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+      <text
+        x={10}
+        y={H / 2}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+        transform={`rotate(-90 10 ${H / 2})`}
+      >
+        % of prefill tokens
+      </text>
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, layers.length);
+        return layers.map((l, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <rect x={x + 2} y={chipY - 9} width={12} height={8} fill={l.color} opacity={0.75} />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {labelFor[l.name] ?? l.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
   );
 }

From 1d502ac198495147ef579140121a3e49a9f4349f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 20:09:55 -0500
Subject: [PATCH 025/111] feat(inference): one chart with TTFT / E2E /
 Interactivity x-axis picker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the always-rendered pair of charts (interactivity + e2e) with a
single chart whose x-axis is chosen by big pill-shaped buttons above the
card. Three options: TTFT (e2e chart with x = p90_ttft), E2E Latency
(e2e chart with x = median_e2el / p90_e2el), Interactivity (interactivity
chart). The inline E2E dropdown is removed — the buttons replace it.

Mode is persisted to ?i_xmode= and defaults by scenario kind:
  agentic   → TTFT
  fixed-seq → Interactivity

Initial state is SSR-stable: it is the ?i_xmode= value when one is present
and valid, otherwise the constant 'ttft' (never a kind-derived default). A
post-mount effect then snaps to the kind default if no URL value was
provided, and the same effect re-snaps on subsequent sequence-kind
switches. The mode setter also keeps selectedE2eXAxisMetric aligned so the
existing useChartData pipeline resolves the right x-axis for the e2e chart
variant.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts     |   2 +
 .../components/inference/InferenceContext.tsx |  45 ++++-
 .../app/src/components/inference/types.ts     |   9 +
 .../components/inference/ui/ChartDisplay.tsx  | 162 ++++++++----------
 packages/app/src/lib/url-state.ts             |   2 +
 5 files changed, 130 insertions(+), 90 deletions(-)

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index 34b89aba..2d3c982f 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -195,6 +195,8 @@ export function createMockInferenceContext(
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
+    selectedXAxisMode: 'interactivity',
+    setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
     isLegendExpanded: true,
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index c80afc2e..00ea316c 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -43,7 +43,7 @@ import {
 import { useUrlState } from '@/hooks/useUrlState';
 import { buildAvailabilityHwKey } from '@/lib/chart-utils';
 import { getHardwareConfig, getModelSortIndex, isKnownGpu, TABLEAU_10 } from '@/lib/constants';
-import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING } from '@/lib/data-mappings';
+import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING, sequenceKind } from '@/lib/data-mappings';
 import {
   MtpEngineConflictToast,
   type MtpEngineConflictDetail,
@@ -133,6 +133,26 @@ export function InferenceProvider({
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
   );
+  // Selected chart variant. Initialize from URL only — SSR cannot read URL, so
+  // computing a kind-based default here would diverge between server and client
+  // and cause a hydration mismatch. The scenario-kind default is applied in a
+  // post-mount effect below (and a ref tracks whether the user has overridden).
+  const urlXMode = (() => {
+    const v = getUrlParam('i_xmode');
+    return v === 'ttft' || v === 'e2e' || v === 'interactivity' ? v : null;
+  })();
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<'ttft' | 'e2e' | 'interactivity'>(
+    urlXMode ?? 'ttft',
+  );
+  const xAxisModeFromUrlRef = useRef(urlXMode !== null);
+  // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
+  // existing useChartData pipeline keys off that flag for the e2e chart's x-axis.
+  const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
+    xAxisModeFromUrlRef.current = true;
+    setSelectedXAxisMode(mode);
+    if (mode === 'ttft') setSelectedE2eXAxisMetric('p90_ttft');
+    else if (mode === 'e2e') setSelectedE2eXAxisMetric(null);
+  }, []);
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
@@ -325,6 +345,24 @@ export function InferenceProvider({
     setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev));
   }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]);
 
+  // Reconcile the x-axis mode with the scenario kind:
+  //  - On mount with no `i_xmode` URL param: snap to the kind's natural default
+  //    (agentic → ttft, fixed → interactivity). The state itself was initialized
+  //    to a SSR-stable constant so server and client render the same DOM; this
+  //    effect fixes it up after hydration.
+  //  - When the user later switches sequence kinds: snap to the new kind's
+  //    natural default (the prior selection was for a different kind, so it
+  //    doesn't carry over).
+  const lastSeqKindRef = useRef<ReturnType<typeof sequenceKind> | null>(null);
+  useEffect(() => {
+    const kind = sequenceKind(effectiveSequence);
+    const isInitialMount = lastSeqKindRef.current === null;
+    if (!isInitialMount && lastSeqKindRef.current === kind) return;
+    lastSeqKindRef.current = kind;
+    if (isInitialMount && xAxisModeFromUrlRef.current) return;
+    handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
+  }, [effectiveSequence, handleSetXAxisMode]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
@@ -785,6 +823,7 @@ export function InferenceProvider({
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
+      i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
       i_advlabel: useAdvancedLabels ? '1' : '',
@@ -798,6 +837,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
@@ -968,6 +1008,8 @@ export function InferenceProvider({
       setSelectedXAxisMetric,
       selectedE2eXAxisMetric,
       setSelectedE2eXAxisMetric,
+      selectedXAxisMode,
+      setSelectedXAxisMode: handleSetXAxisMode,
       scaleType,
       setScaleType,
       loading,
@@ -1041,6 +1083,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 7a39bbd1..3bbee596 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -532,6 +532,15 @@ export interface InferenceChartContextType {
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
+  /**
+   * Which chart variant the user wants to see — the inference card shows one chart
+   * at a time, picked by the big TTFT / E2E Latency / Interactivity buttons.
+   * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
+   * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
+   * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   */
+  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity';
+  setSelectedXAxisMode: (mode: 'ttft' | 'e2e' | 'interactivity') => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   setIsLegendExpanded: (metric: boolean) => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index e9021aed..f0611274 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -2,7 +2,7 @@
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
 import { useMemo, useRef, useState } from 'react';
-import { BarChart3, ChevronDown, Table2, X } from 'lucide-react';
+import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import { useInference } from '@/components/inference/InferenceContext';
@@ -30,7 +30,6 @@ import {
   DialogHeader,
   DialogTitle,
 } from '@/components/ui/dialog';
-import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
 import { Skeleton } from '@/components/ui/skeleton';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import {
@@ -60,54 +59,25 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra
 });
 import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 
-/** Controlled popover dropdown for the e2e chart x-axis toggle. */
-function E2eXAxisDropdown({
-  xAxisLabel,
-  xAxisOptions,
-  selectedValue,
-  onSelect,
-}: {
-  xAxisLabel: string;
-  xAxisOptions: { value: string | null; label: string }[];
-  selectedValue: string | null;
-  onSelect: (value: string | null) => void;
-}) {
-  const [open, setOpen] = useState(false);
-  return (
-    <Popover open={open} onOpenChange={setOpen}>
-      <PopoverTrigger asChild>
-        <button
-          className="inline-flex items-center gap-1 hover:opacity-70 transition-opacity cursor-pointer"
-          onClick={(e) => e.stopPropagation()}
-        >
-          vs. {xAxisLabel}
-          <ChevronDown className="no-export size-3.5 shrink-0 opacity-60" />
-        </button>
-      </PopoverTrigger>
-      <PopoverContent className="w-48 p-1" align="start">
-        {xAxisOptions.map((opt) => (
-          <button
-            key={opt.label}
-            className={`w-full text-left px-3 py-1.5 text-sm rounded hover:bg-accent transition-colors ${
-              (opt.value === null && !selectedValue) || opt.value === selectedValue
-                ? 'font-medium'
-                : ''
-            }`}
-            onClick={() => {
-              onSelect(opt.value);
-              setOpen(false);
-            }}
-          >
-            {opt.label}
-          </button>
-        ))}
-      </PopoverContent>
-    </Popover>
-  );
-}
-
 type InferenceViewMode = 'chart' | 'table';
 
+/**
+ * The three chart variants the user can choose with the big buttons above the
+ * chart card. Each maps to one entry in `inference-chart-config.json` plus a
+ * forced x-axis override for the E2E chartType.
+ */
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity';
+
+interface XAxisModeButton {
+  value: XAxisMode;
+  label: string;
+}
+const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
+  { value: 'ttft', label: 'TTFT' },
+  { value: 'e2e', label: 'E2E Latency' },
+  { value: 'interactivity', label: 'Interactivity' },
+];
+
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
   {
     value: 'chart',
@@ -152,9 +122,10 @@ export default function ChartDisplay() {
     logScale,
     activeHwTypes,
     activeDates,
-    setSelectedE2eXAxisMetric,
     selectedPercentile,
     compareGpuPair,
+    selectedXAxisMode,
+    setSelectedXAxisMode,
   } = useInference();
 
   const {
@@ -329,17 +300,26 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
+  // Show one chart at a time, picked by the TTFT / E2E / Interactivity buttons.
+  // Both 'ttft' and 'e2e' modes render the e2e chart (the x-axis swap is handled
+  // upstream by `selectedE2eXAxisMetric`, which `setSelectedXAxisMode` keeps in sync).
+  const visibleGraphs = useMemo(() => {
+    const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
+    const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
+    return filtered.length > 0 ? filtered : effectiveGraphs;
+  }, [effectiveGraphs, selectedXAxisMode]);
+
   const displayGraphs = isFirstLoad
-    ? Array.from({ length: 2 }).map((_, index) => (
-        <Card key={`skeleton-${index}`}>
+    ? [
+        <Card key="skeleton-0">
           <Skeleton className="h-7 w-2/4 mb-1" />
           <Skeleton className="h-5 w-3/4 mb-2" />
           <Skeleton className="h-[600px] w-full" />
-        </Card>
-      ))
-    : effectiveGraphs.length === 0
+        </Card>,
+      ]
+    : visibleGraphs.length === 0
       ? []
-      : effectiveGraphs.map((graph, graphIndex) => {
+      : visibleGraphs.map((graph, graphIndex) => {
           const isTimelineMode = Boolean(
             selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
           );
@@ -415,43 +395,17 @@ export default function ChartDisplay() {
                               return 'vs. P90 Time To First Token';
                             }
 
-                            // For e2e chart: render clickable inline dropdown for x-axis
+                            // For e2e chart: heading is driven by the TTFT / E2E button
+                            // selection above the card, so the inline dropdown is gone.
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               const pctlWord = selectedPercentile.toUpperCase();
-                              const e2elLabel = isAgentic
-                                ? `${pctlWord} End-to-end Latency`
-                                : 'End-to-end Latency';
-                              const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p90_ttft' ? 'P90 TTFT' : e2elLabel;
-                              const xAxisOptions = [
-                                { value: null, label: e2elLabel },
-                                { value: 'p90_ttft', label: 'P90 TTFT' },
-                              ];
-                              const zoomPrefix =
-                                selectedDateRange.startDate &&
-                                selectedDateRange.endDate &&
-                                selectedGPUs.length > 0
-                                  ? 'gpu_timeseries'
-                                  : 'latency';
-                              return (
-                                <E2eXAxisDropdown
-                                  xAxisLabel={xAxisLabel}
-                                  xAxisOptions={xAxisOptions}
-                                  selectedValue={selectedE2eXAxisMetric}
-                                  onSelect={(value) => {
-                                    setSelectedE2eXAxisMetric(value);
-                                    track('latency_x_axis_metric_selected', {
-                                      metric: value ?? 'median_e2el',
-                                    });
-                                    window.dispatchEvent(
-                                      new CustomEvent(
-                                        `${zoomPrefix}_zoom_reset_chart-${graphIndex}`,
-                                      ),
-                                    );
-                                  }}
-                                />
-                              );
+                              if (selectedE2eXAxisMetric === 'p90_ttft') {
+                                return 'vs. P90 Time To First Token';
+                              }
+                              return isAgentic
+                                ? `vs. ${pctlWord} End-to-end Latency`
+                                : 'vs. End-to-end Latency';
                             }
 
                             // Fall back to the heading baked into chartDefinition
@@ -636,6 +590,36 @@ export default function ChartDisplay() {
           <CustomPowers loading={loading} />
         </section>
       )}
+      <section
+        className="flex flex-wrap justify-center gap-3 sm:gap-4"
+        role="tablist"
+        aria-label="Chart x-axis metric"
+        data-testid="x-axis-mode-buttons"
+      >
+        {X_AXIS_MODE_BUTTONS.map(({ value, label }) => {
+          const isActive = selectedXAxisMode === value;
+          return (
+            <button
+              key={value}
+              type="button"
+              role="tab"
+              aria-selected={isActive}
+              data-testid={`x-axis-mode-${value}`}
+              onClick={() => {
+                setSelectedXAxisMode(value);
+                track('latency_x_axis_mode_selected', { mode: value });
+              }}
+              className={`min-w-[160px] flex-1 sm:flex-initial rounded-full border-2 px-6 py-3 text-base font-semibold transition-colors ${
+                isActive
+                  ? 'border-primary bg-primary text-primary-foreground shadow-sm'
+                  : 'border-border bg-card text-foreground hover:border-primary/60 hover:bg-accent'
+              }`}
+            >
+              {label}
+            </button>
+          );
+        })}
+      </section>
       <div className="flex flex-col gap-4">{displayGraphs}</div>
 
       {/* Performance Over Time — Modal Drill-Down */}
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 4a48a776..73cbe0b7 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -25,6 +25,7 @@ const URL_STATE_KEYS = [
   'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
+  'i_xmode',
   'i_scale',
   'i_gpus',
   'i_dates',
@@ -70,6 +71,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_pctl: 'p90',
   i_xmetric: 'p90_ttft',
   i_e2e_xmetric: 'p90_ttft',
+  i_xmode: '',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 965c8622a36f02a6762388728c855da3ff2aa530 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 20:15:42 -0500
Subject: [PATCH 026/111] fix(inference): TTFT/E2E pick metric by sequence kind
 + add P75 option

Two related fixes for the x-axis-mode picker:

1. Fixed-seq has no p90_ttft / p90_e2el in the metrics JSONB (only
   median/p99). The TTFT button was hardcoded to p90_ttft, so the chart
   went blank on fixed-seq scenarios. Reconcile selectedE2eXAxisMetric in
   a reactive effect that picks median_ttft for fixed-seq and the user's
   selected percentile for agentic. useChartData's TTFT override now
   matches any *_ttft metric and derives its label from the actual
   percentile, instead of hardcoding "P90".

2. Add P75 to the agentic latency percentile selector. Update
   withPercentile + the label/heading regexes to handle p75 and p95.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx | 23 +++++++++++++++++--
 .../inference/hooks/useChartData.ts           | 16 +++++++++----
 .../components/inference/ui/ChartDisplay.tsx  | 10 +++++---
 packages/app/src/lib/benchmark-transform.ts   |  2 +-
 packages/app/src/lib/data-mappings.ts         |  8 ++++---
 5 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 00ea316c..74bdb28b 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -150,8 +150,9 @@ export function InferenceProvider({
   const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
     xAxisModeFromUrlRef.current = true;
     setSelectedXAxisMode(mode);
-    if (mode === 'ttft') setSelectedE2eXAxisMetric('p90_ttft');
-    else if (mode === 'e2e') setSelectedE2eXAxisMetric(null);
+    // The e2e chart's x-axis metric is reconciled in a separate effect below,
+    // because it depends on sequence kind (fixed-seq has no p90_* metrics) and
+    // the agentic percentile, both of which can change independently.
   }, []);
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
@@ -363,6 +364,24 @@ export function InferenceProvider({
     handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
   }, [effectiveSequence, handleSetXAxisMode]);
 
+  // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
+  // agentic percentile changes. For fixed-seq the JSONB only carries
+  // median_* / p99_* (no p90_*), so the TTFT button there has to point at
+  // median_ttft — otherwise the chart goes blank. For agentic, we point at
+  // the user's chosen percentile so the dropdown actually drives the axis.
+  useEffect(() => {
+    const isAgentic = sequenceKind(effectiveSequence) === 'agentic';
+    if (selectedXAxisMode === 'ttft') {
+      setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft');
+    } else if (selectedXAxisMode === 'e2e') {
+      // null = use the chart-config natural x (median_e2el), which useChartData
+      // rewrites to <pctl>_e2el for agentic via withPercentile().
+      setSelectedE2eXAxisMetric(null);
+    }
+    // 'interactivity' mode renders the interactivity chart, which keys off
+    // selectedXAxisMetric (not the e2e one), so nothing to do here.
+  }, [selectedXAxisMode, effectiveSequence, selectedPercentile]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 0d13b8ca..ffa6a8a7 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -215,8 +215,16 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
-        const isTtftOverride = effectiveXMetric === 'p90_ttft';
-        const ttftLabel = 'P90 Time To First Token (s)';
+        // The TTFT override is now any *_ttft metric (not just p90_ttft) — the
+        // x-axis-mode picker reconciles the percentile prefix based on sequence
+        // kind (fixed-seq → median, agentic → user-picked percentile).
+        const isTtftOverride =
+          typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft');
+        const ttftPctl = isTtftOverride
+          ? (effectiveXMetric as string).replace(/_ttft$/u, '')
+          : 'p90';
+        const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase();
+        const ttftLabel = `${ttftPctlWord} Time To First Token (s)`;
 
         const isAgentic = selectedSequence === Sequence.AgenticTraces;
 
@@ -261,9 +269,9 @@ export function useChartData(
             selectedPercentile,
           ) as keyof AggDataEntry;
           const pctlWord = selectedPercentile.toUpperCase();
-          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord);
           chartHeading = chartHeading.replace(
-            /^(vs\.\s+)(?:(Median|Mean|P90|P99(?:\.9)?)\s+)?/iu,
+            /^(vs\.\s+)(?:(Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu,
             `$1${pctlWord} `,
           );
         }
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index f0611274..ca7f9cd7 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -397,12 +397,16 @@ export default function ChartDisplay() {
 
                             // For e2e chart: heading is driven by the TTFT / E2E button
                             // selection above the card, so the inline dropdown is gone.
+                            // The metric carries the percentile prefix (e.g. p90_ttft,
+                            // median_ttft for fixed-seq, p75_ttft for agentic+p75).
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              const pctlWord = selectedPercentile.toUpperCase();
-                              if (selectedE2eXAxisMetric === 'p90_ttft') {
-                                return 'vs. P90 Time To First Token';
+                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
+                                return `vs. ${word} Time To First Token`;
                               }
+                              const pctlWord = selectedPercentile.toUpperCase();
                               return isAgentic
                                 ? `vs. ${pctlWord} End-to-end Latency`
                                 : 'vs. End-to-end Latency';
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index c5bdd6ed..ba26a978 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -136,7 +136,7 @@ interface PreparedEntry {
  * percentile prefix; leaves everything else alone.
  */
 export function withPercentile(key: string, percentile: string): string {
-  return key.replace(/^(mean|median|p90|p99|p99\.9)_/, `${percentile}_`);
+  return key.replace(/^(mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
 }
 
 /**
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 91f65a34..c18266ba 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -185,15 +185,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
 /**
  * Percentile of the latency distribution used for the chart x-axis when
- * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
- * is surfaced in the UI.
+ * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9
+ * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl);
+ * p75 and p90 are surfaced in the UI.
  */
 export enum Percentile {
+  P75 = 'p75',
   P90 = 'p90',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.P75]: { label: 'p75' },
   [Percentile.P90]: { label: 'p90' },
 };
 

From e4d97f29bb3ff3a973a7b84113dc61278f70abf8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:18:17 -0500
Subject: [PATCH 027/111] feat(metrics): wire P75/P95 through frontend +
 register new aiperf keys

The P75 percentile option I just added was broken: rowToAggDataEntry
only copied median/mean/p90/p99/p99.9 from the metrics JSONB, so the
chart looked up entry.p75_ttft which didn't exist and points fell to 0.

- Add p75_*/p95_* fields for ttft/tpot/itl/e2el/intvty to AggDataEntry
  and rowToAggDataEntry so the existing percentile pipeline can resolve them.
- Update the energy-metrics test fixture for the new required fields.
- Register all new aiperf metric keys (p75/p95 latencies, qps stats,
  per-request token-count distribution, run totals, server cache hit rates,
  total/input/output tput_tps) in METRIC_KEYS so the ingest auto-capture
  warning stops firing on the next agentic run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/inference/types.ts     | 10 +++
 packages/app/src/lib/benchmark-transform.ts   | 10 +++
 packages/app/src/lib/energy-metrics.test.ts   | 10 +++
 packages/constants/src/metric-keys.ts         | 66 ++++++++++++++++++-
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 3bbee596..0a9908e3 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -52,7 +52,9 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p75_ttft: number;
   p90_ttft: number;
+  p95_ttft: number;
   p99_ttft: number;
   'p99.9_ttft': number;
   mean_tpot: number;
@@ -61,8 +63,12 @@ export interface AggDataEntry {
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p75_tpot: number;
+  p75_intvty: number;
   p90_tpot: number;
   p90_intvty: number;
+  p95_tpot: number;
+  p95_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
   'p99.9_tpot': number;
@@ -70,13 +76,17 @@ export interface AggDataEntry {
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p75_itl: number;
   p90_itl: number;
+  p95_itl: number;
   p99_itl: number;
   'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p75_e2el: number;
   p90_e2el: number;
+  p95_e2el: number;
   p99_e2el: number;
   'p99.9_e2el': number;
   disagg: boolean;
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index ba26a978..3594750c 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -64,31 +64,41 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p75_ttft: m.p75_ttft ?? 0,
     p90_ttft: m.p90_ttft ?? 0,
+    p95_ttft: m.p95_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
     'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p75_tpot: m.p75_tpot ?? 0,
     p90_tpot: m.p90_tpot ?? 0,
+    p95_tpot: m.p95_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
     'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p75_intvty: m.p75_intvty ?? 0,
     p90_intvty: m.p90_intvty ?? 0,
+    p95_intvty: m.p95_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
     'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p75_itl: m.p75_itl ?? 0,
     p90_itl: m.p90_itl ?? 0,
+    p95_itl: m.p95_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
     'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p75_e2el: m.p75_e2el ?? 0,
     p90_e2el: m.p90_e2el ?? 0,
+    p95_e2el: m.p95_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
     'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     disagg: row.disagg,
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 54788585..2f5844c1 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,7 +57,9 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p75_ttft: 0.65,
     p90_ttft: 0.7,
+    p95_ttft: 0.75,
     p99_ttft: 0.8,
     'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
@@ -66,8 +68,12 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p75_tpot: 0.022,
+    p75_intvty: 50,
     p90_tpot: 0.025,
     p90_intvty: 55,
+    p95_tpot: 0.028,
+    p95_intvty: 58,
     p99_tpot: 0.03,
     p99_intvty: 60,
     'p99.9_tpot': 0.035,
@@ -75,13 +81,17 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p75_itl: 0.012,
     p90_itl: 0.013,
+    p95_itl: 0.014,
     p99_itl: 0.015,
     'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p75_e2el: 5.2,
     p90_e2el: 5.5,
+    p95_e2el: 5.8,
     p99_e2el: 6,
     'p99.9_e2el': 6.5,
     disagg: false,
diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index cf2c4d0b..70e50f96 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -1,46 +1,110 @@
 /**
  * Canonical set of metric keys stored in the benchmark_results.metrics JSONB column.
  *
- * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU.
+ * Latency values (ttft/tpot/itl/e2el) are in seconds; intvty is a rate derived from itl,
+ * not a duration. Throughput is tokens/sec — `_per_gpu` is per-GPU, `_tps` is the deployment total.
+ *
+ * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency,
+ * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs
+ * carry median/mean/p99/std for latency only.
  */
 export const METRIC_KEYS = new Set([
   // throughput (tokens/sec/GPU)
   'tput_per_gpu',
   'output_tput_per_gpu',
   'input_tput_per_gpu',
+  // throughput (tokens/sec, deployment total) — agentic aiperf reports both
+  'total_tput_tps',
+  'output_tput_tps',
+  'input_tput_tps',
   // TTFT — time to first token
   'median_ttft',
   'mean_ttft',
+  'p75_ttft',
   'p90_ttft',
+  'p95_ttft',
   'p99_ttft',
   'p99.9_ttft',
   'std_ttft',
   // TPOT — time per output token
   'median_tpot',
   'mean_tpot',
+  'p75_tpot',
   'p90_tpot',
+  'p95_tpot',
   'p99_tpot',
   'p99.9_tpot',
   'std_tpot',
   // ITL — inter-token latency
   'median_itl',
   'mean_itl',
+  'p75_itl',
   'p90_itl',
+  'p95_itl',
   'p99_itl',
   'p99.9_itl',
   'std_itl',
   // E2EL — end-to-end latency
   'median_e2el',
   'mean_e2el',
+  'p75_e2el',
   'p90_e2el',
+  'p95_e2el',
   'p99_e2el',
   'p99.9_e2el',
   'std_e2el',
   // interactivity
   'median_intvty',
   'mean_intvty',
+  'p75_intvty',
   'p90_intvty',
+  'p95_intvty',
   'p99_intvty',
   'p99.9_intvty',
   'std_intvty',
+  // QPS — queries per second (agentic aiperf)
+  'median_qps',
+  'mean_qps',
+  'p75_qps',
+  'p90_qps',
+  'p95_qps',
+  'p99_qps',
+  'p99.9_qps',
+  'std_qps',
+  // per-request input token count distribution
+  'median_input_tokens',
+  'mean_input_tokens',
+  'p75_input_tokens',
+  'p90_input_tokens',
+  'p95_input_tokens',
+  'p99_input_tokens',
+  'p99.9_input_tokens',
+  'std_input_tokens',
+  // per-request output token count distribution — actual served
+  'median_output_tokens_actual',
+  'mean_output_tokens_actual',
+  'p75_output_tokens_actual',
+  'p90_output_tokens_actual',
+  'p95_output_tokens_actual',
+  'p99_output_tokens_actual',
+  'p99.9_output_tokens_actual',
+  'std_output_tokens_actual',
+  // per-request output token count distribution — expected from trace
+  'median_output_tokens_expected',
+  'mean_output_tokens_expected',
+  'p75_output_tokens_expected',
+  'p90_output_tokens_expected',
+  'p95_output_tokens_expected',
+  'p99_output_tokens_expected',
+  'p99.9_output_tokens_expected',
+  'std_output_tokens_expected',
+  // run totals (agentic aiperf)
+  'duration_seconds',
+  'total_requests_completed',
+  'total_prompt_tokens',
+  'total_generation_tokens',
+  // server prefix-cache observability (agentic aiperf)
+  'server_gpu_cache_hit_rate',
+  'server_cpu_cache_hit_rate',
+  'theoretical_cache_hit_rate',
 ]);

From a7a135401f18ad2c24f6c87b25a1a255826309db Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:20:54 -0500
Subject: [PATCH 028/111] fix(inference): don't drop agentic TTFT points over
 60s as outliers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

useChartData hardcoded a 60s latency-limit filter when xAxisField was
'p90_ttft' — meant to suppress fixed-seq overload outliers (conc=2048
rows that compress the rest of the chart to the left). For agentic
runs, TTFTs > 60s are normal (long prompts, multi-turn) so the filter
hid legitimate data points (e.g. only 7/12 visible for the latest B200
DSV4 ingest).

- Skip the latency-limit filter for agentic scenarios in both
  useChartData and processOverlayChartData.
- Broaden the TTFT-override detection from `=== 'p90_ttft'` to any
  `*_ttft` so the new median/p75/p99 percentile picks behave the same.
- Pass isAgentic into processOverlayChartData from ChartDisplay so the
  unofficial-run overlay path matches the official one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/hooks/useChartData.ts   | 10 +++++++---
 .../src/components/inference/ui/ChartDisplay.tsx |  1 +
 packages/app/src/components/inference/utils.ts   | 16 +++++++++++++---
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index ffa6a8a7..2557b0d8 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -350,7 +350,8 @@ export function useChartData(
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p90_ttft';
+        const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
@@ -365,11 +366,14 @@ export function useChartData(
                   roof,
                 };
               })
-              // When TTFT is on the x-axis, apply the latency limit to filter overload outliers
-              // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left)
+              // When TTFT is on the x-axis, apply the latency limit to filter
+              // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that
+              // compress all real data to the far left). Skip for agentic — long
+              // TTFTs there reflect real workloads (multi-turn, big prompts).
               .filter(
                 (d) =>
                   !isTtftX ||
+                  isAgentic ||
                   !chartDefinition.y_latency_limit ||
                   d.x <= chartDefinition.y_latency_limit,
               )
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index ca7f9cd7..12f9f5de 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -183,6 +183,7 @@ export default function ChartDisplay() {
         chartType,
         selectedYAxisMetric,
         effectiveXMetric,
+        { isAgentic: sequenceKind(selectedSequence) === 'agentic' },
       );
 
       let overlayPoints = processed;
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 735007ab..4876c614 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -75,11 +75,13 @@ export function processOverlayChartData(
   chartType: 'e2e' | 'interactivity',
   selectedYAxisMetric: string,
   selectedXAxisMetric: string | null,
+  options?: { isAgentic?: boolean },
 ): InferenceData[] {
   const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
   if (!chartDef) return [];
 
   const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+  const isAgentic = options?.isAgentic === true;
 
   // Resolve x-axis field (must match useChartData logic)
   const metricTitle =
@@ -87,8 +89,11 @@ export function processOverlayChartData(
   const isInputMetric = metricTitle.toLowerCase().includes('input');
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
-  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
-  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
+  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+  // Match any *_ttft metric — the x-axis-mode picker can now select any
+  // percentile (median/p75/p90/p99) depending on sequence kind.
+  const isTtftOverride =
+    typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
@@ -108,7 +113,12 @@ export function processOverlayChartData(
     })
     .filter(
       (d) =>
-        xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+        // Skip the latency limit for the natural x-axis or for agentic
+        // (long TTFTs are normal there, not overload outliers).
+        xAxisField === chartDef.x ||
+        isAgentic ||
+        !chartDef.y_latency_limit ||
+        d.x <= chartDef.y_latency_limit,
     );
 
   return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);

From 07194de6e5df1ca75d1f35085d178a2dc2625493 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:42:53 -0500
Subject: [PATCH 029/111] fix(trace-histograms): chunk DB query + blob-cache to
 escape size caps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Loading trace histograms for ~30+ agentic points failed with HTTP 500
because the Neon serverless HTTP driver caps responses at 64 MB, and
each compressed profile_export.jsonl blob is ~1-2 MB — the JOIN
returned all matching blobs in one round-trip and blew the cap. With no
histogram data, the "View charts" button never appears on the tooltip,
so users couldn't open the per-point detail page after the latest run.

- Chunk getTraceHistograms to 12 IDs per query so each round-trip stays
  well under the 64 MB cap. Total payload still merged into one map.
- Switch the route's cachedQuery to blobOnly so the larger JSON
  response doesn't hit the Next.js unstable_cache 2 MB limit either.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/trace-histograms/route.ts  |  5 +++
 packages/db/src/queries/trace-histograms.ts   | 31 +++++++++++++------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
index fd7572a8..7a959a65 100644
--- a/packages/app/src/app/api/v1/trace-histograms/route.ts
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -10,9 +10,14 @@ import { cachedJson, cachedQuery } from '@/lib/api-cache';
 
 export const dynamic = 'force-dynamic';
 
+// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB
+// unstable_cache limit (each point carries one int per request, ~500-1000+
+// requests for agentic), which manifests as a 500 from the route. Blob
+// storage lets us cache the larger response without losing the warm-cache hit.
 const getCachedTraceHistograms = cachedQuery(
   (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
   'trace-histograms',
+  { blobOnly: true },
 );
 
 const MAX_IDS_PER_REQUEST = 200;
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
index c243afd8..20ebc0d5 100644
--- a/packages/db/src/queries/trace-histograms.ts
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -27,21 +27,34 @@ export interface TraceHistogramPoint {
 
 export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
 
+/**
+ * Cap the number of blobs we pull in a single Neon HTTP query — the serverless
+ * driver returns 507 ("response is too large, max 64 MB") if the combined gzip
+ * payload exceeds that. Each profile_export.jsonl blob can be ~1-2 MB
+ * compressed, so we stay well below the cap at 12.
+ */
+const QUERY_CHUNK_SIZE = 12;
+
 export async function getTraceHistograms(
   sql: DbClient,
   benchmarkResultIds: number[],
 ): Promise<TraceHistogramMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  const rows = (await sql`
-    select
-      br.id as benchmark_result_id,
-      atr.profile_export_jsonl_gz as blob
-    from benchmark_results br
-    join agentic_trace_replay atr on atr.id = br.trace_replay_id
-    where br.id = any(${benchmarkResultIds}::bigint[])
-      and atr.profile_export_jsonl_gz is not null
-  `) as { benchmark_result_id: number; blob: Buffer }[];
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
 
   const result: TraceHistogramMap = {};
   for (const row of rows) {

From a1e594b34a8faa181af01e6c8449498eafa7e086 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:48:54 -0500
Subject: [PATCH 030/111] feat(inference): run selector actually filters chart
 data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two workflow runs land on the same date (e.g. re-ingesting a
config), the run picker's "Run 1/2" ↔ "Run 2/2" had no effect on the
chart — benchmarks API returned DISTINCT ON (config, conc, isl, osl)
ordered by date with no run tiebreaker, so Postgres arbitrarily picked
one row per config and both picker selections produced identical data.

Plumb runId through the request path:
- getLatestBenchmarks gets an optional runId branch that strictly
  scopes to one workflow_run (filter wr.github_run_id = $runId).
- /api/v1/benchmarks accepts ?runId=…, forwarded into the cached query
  so each run has its own blob-cache entry.
- fetchBenchmarks → benchmarkQueryOptions → useBenchmarks pass the
  runId through; React Query keys it for separate caches per run.
- useChartData accepts selectedRunId and forwards it.
- InferenceProvider only passes runId when the current date has >1
  runs — single-run dates keep the existing latest-per-config logic
  so configs from earlier dates remain visible.

Verified in the dashboard: switching Run 1/2 ↔ Run 2/2 fires distinct
requests with the correct runId and the chart re-renders per-run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/benchmarks/route.test.ts   | 24 +++++++++-
 .../app/src/app/api/v1/benchmarks/route.ts    |  7 +--
 .../components/inference/InferenceContext.tsx |  9 ++++
 .../inference/hooks/useChartData.ts           | 11 ++++-
 .../app/src/hooks/api/use-benchmarks.test.ts  | 21 +++++++-
 packages/app/src/hooks/api/use-benchmarks.ts  | 10 ++--
 packages/app/src/lib/api.ts                   |  3 ++
 packages/db/src/queries/benchmarks.ts         | 48 +++++++++++++++++++
 8 files changed, 122 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts
index 780f775e..92d5f326 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.test.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.test.ts
@@ -59,6 +59,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       undefined,
       undefined,
+      undefined,
     );
   });
 
@@ -72,6 +73,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       '2026-03-01',
       undefined,
+      undefined,
     );
   });
 
@@ -82,7 +84,27 @@ describe('GET /api/v1/benchmarks', () => {
       req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&exact=true'),
     );
     expect(res.status).toBe(200);
-    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith('mock-sql', ['dsr1'], '2026-03-01', true);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      '2026-03-01',
+      true,
+      undefined,
+    );
+  });
+
+  it('passes runId param to query when provided', async () => {
+    mockGetLatestBenchmarks.mockResolvedValueOnce([]);
+
+    const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=26194160120'));
+    expect(res.status).toBe(200);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      undefined,
+      undefined,
+      '26194160120',
+    );
   });
 
   it('returns 500 when query throws', async () => {
diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts
index c79f1aa7..c4037208 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.ts
@@ -11,10 +11,10 @@ import { loadFixture } from '@/lib/test-fixtures';
 export const dynamic = 'force-dynamic';
 
 const getCachedBenchmarks = cachedQuery(
-  (dbModelKeys: string[], date?: string, exact?: boolean) => {
+  (dbModelKeys: string[], date?: string, exact?: boolean, runId?: string) => {
     if (JSON_MODE)
       return Promise.resolve(jsonProvider.getLatestBenchmarks(dbModelKeys, date, exact));
-    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact);
+    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact, runId);
   },
   'benchmarks',
   { blobOnly: true },
@@ -25,6 +25,7 @@ export async function GET(request: NextRequest) {
   const model = params.get('model') ?? '';
   const date = params.get('date') ?? undefined;
   const exact = params.get('exact') === 'true';
+  const runId = params.get('runId') ?? undefined;
   const dbModelKeys = DISPLAY_MODEL_TO_DB[model];
   if (!dbModelKeys || dbModelKeys.length === 0) {
     return NextResponse.json({ error: 'Unknown model' }, { status: 400 });
@@ -32,7 +33,7 @@ export async function GET(request: NextRequest) {
   if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks'));
 
   try {
-    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined);
+    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
     return cachedJson(rows);
   } catch (error) {
     console.error('Error fetching benchmarks:', error);
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 74bdb28b..edf0974e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -214,6 +214,14 @@ export function InferenceProvider({
   // ── Data fetching (gated by isActive) ──────────────────────────────────────
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
+  // Run-selector scoping: only constrain benchmark data to a specific run when
+  // the current date has >1 runs (ambiguous case). When there's one run per
+  // date, the picker is informational and the SQL's latest-per-config logic
+  // already returns that run's data — passing runId would needlessly narrow
+  // the cross-date config view.
+  const multipleRunsOnDate = availableRuns && Object.keys(availableRuns).length > 1;
+  const benchmarkRunId = multipleRunsOnDate && selectedRunId ? String(selectedRunId) : undefined;
+
   const {
     graphs,
     loading: chartDataLoading,
@@ -236,6 +244,7 @@ export function InferenceProvider({
     latestDate,
     selectedPercentile,
     compareGpuPair ?? null,
+    benchmarkRunId,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 2557b0d8..328750f0 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -86,10 +86,19 @@ export function useChartData(
   selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
+  /**
+   * GitHub run id (g_runid) from the run picker. When set, the benchmarks API
+   * scopes results to that workflow run instead of returning the latest per
+   * config — disambiguates when two runs land on the same date.
+   */
+  selectedRunId?: string,
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
   // materialized view instead of firing a redundant second fetch with identical data.
+  // When a specific run is selected, we always go through the runId branch and the
+  // date is effectively ignored — keep queryDate set so React Query still has a
+  // distinct cache key per date if the user navigates back to "latest".
   const queryDate =
     selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
       ? ''
@@ -99,7 +108,7 @@ export function useChartData(
     data: allRows,
     isLoading: queryLoading,
     error: queryError,
-  } = useBenchmarks(selectedModel, queryDate, enabled);
+  } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts
index 7329896d..c4f49130 100644
--- a/packages/app/src/hooks/api/use-benchmarks.test.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.test.ts
@@ -5,12 +5,29 @@ import { benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 describe('benchmarkQueryOptions', () => {
   it('builds query key from model and date', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01');
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest', '']);
   });
 
   it('builds exact query key when exact=true', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true);
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', '']);
+  });
+
+  it('includes runId in query key when provided', () => {
+    const opts = benchmarkQueryOptions(
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      true,
+      false,
+      '26194160120',
+    );
+    expect(opts.queryKey).toEqual([
+      'benchmarks',
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      'latest',
+      '26194160120',
+    ]);
   });
 
   it('produces distinct keys for different models', () => {
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index 6da1568e..8fd1f4e9 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -8,14 +8,16 @@ export function benchmarkQueryOptions(
   date: string,
   enabled = true,
   exact?: boolean,
+  runId?: string,
 ) {
   return {
-    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest'] as const,
-    queryFn: ({ signal }: { signal: AbortSignal }) => fetchBenchmarks(model, date, exact, signal),
+    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? ''] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      fetchBenchmarks(model, date, exact, signal, runId),
     enabled: enabled && Boolean(model),
   };
 }
 
-export function useBenchmarks(model: string, date?: string, enabled = true) {
-  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled));
+export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
+  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
 }
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 98587c2f..31cf906a 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -121,10 +121,13 @@ export function fetchBenchmarks(
   date?: string,
   exact?: boolean,
   signal?: AbortSignal,
+  /** Optional github_run_id to scope to a specific workflow run. */
+  runId?: string,
 ) {
   const params = new URLSearchParams({ model });
   if (date) params.set('date', date);
   if (exact) params.set('exact', 'true');
+  if (runId) params.set('runId', runId);
   return fetchJson<BenchmarkRow[]>(`/api/v1/benchmarks?${params}`, signal);
 }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 36bb0e65..2291dc0c 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -53,8 +53,56 @@ export async function getLatestBenchmarks(
   modelKey: string | string[],
   date?: string,
   exact?: boolean,
+  /**
+   * If set, filter to a specific GitHub Actions workflow run.
+   * Bypasses the "latest per config" logic — when two runs landed on the same
+   * date and the user picked one in the run selector, this scopes the chart
+   * data to that run only. Value matches the URL param `g_runid` (a
+   * stringified github_run_id, not the DB id).
+   */
+  runId?: string,
 ): Promise<BenchmarkRow[]> {
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
+  if (runId) {
+    const rows = await sql`
+      SELECT
+        br.id,
+        c.hardware,
+        c.framework,
+        c.model,
+        c.precision,
+        c.spec_method,
+        c.disagg,
+        c.is_multinode,
+        c.prefill_tp,
+        c.prefill_ep,
+        c.prefill_dp_attention,
+        c.prefill_num_workers,
+        c.decode_tp,
+        c.decode_ep,
+        c.decode_dp_attention,
+        c.decode_num_workers,
+        c.num_prefill_gpu,
+        c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
+        br.isl,
+        br.osl,
+        br.conc,
+        br.image,
+        br.metrics,
+        br.date::text,
+        CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url
+      FROM benchmark_results br
+      JOIN configs c ON c.id = br.config_id
+      JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+      WHERE c.model = ANY(${modelKeys})
+        AND br.error IS NULL
+        AND wr.github_run_id = ${runId}::bigint
+      ORDER BY br.config_id, br.conc, br.isl, br.osl
+    `;
+    return rows as unknown as BenchmarkRow[];
+  }
   if (date) {
     // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest)
     // exact=true: only return data from this exact date (for GPU comparison)

From b0d228abeb344aa2ced0e2c5ab2ac43e0128a17e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 00:11:34 -0500
Subject: [PATCH 031/111] feat(inference): Session Time + Prefill TPS x-axis
 (live from trace blobs)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two new agentic-only chart variants per
https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa, computed
live from the stored aiperf profile_export.jsonl blobs (no backfill needed):

- Session Time: mean across sessions of Σ per-turn request_latency,
  rescaled by mean_load / session_load. The summed-latency definition
  inherently strips inter-turn tool/thinking gaps (only GPU active time
  contributes).
- Prefill TPS / user: per turn ISL / TTFT, P90 across the session's turns,
  mean across sessions. Captures worst-turn prefill responsiveness.

The buttons only show on agentic scenarios (gated by a mounted flag to
keep SSR identical to the first client render). Roofline corners match the
expected Pareto direction: Session Time sweeps bottom-left → top-right;
Prefill TPS sweeps top-left → bottom-right.

Plumbing:
- New `getDerivedAgenticMetrics(sql, ids)` in packages/db chunks JSONL
  blob loads to 6 per query so we stay under Neon's 64 MB cap. Includes
  5-case unit suite for the math.
- New `/api/v1/derived-agentic-metrics` route + `useDerivedAgenticMetrics`
  hook, mirroring trace-histograms (blob-cached).
- ChartDisplay fetches derived metrics for visible agentic point IDs and
  overrides scatter data.x + chart heading + axis label + roofline corner.

Two side-effects fixed along the way:
- Hydration mismatch from URL-driven initial state: x-axis-mode now seeds
  from a fixed default and applies the URL value post-mount.
- The run-selector scoping regression where DSR1 (no model-matching
  changelog on its date) tried to fetch with a runId from a different
  model's run and got zero rows. Only pass runId when there are >1 runs
  whose CHANGELOG explicitly mentions the current model + precision.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts     |   2 +-
 .../api/v1/derived-agentic-metrics/route.ts   |  68 ++++++
 .../components/inference/InferenceContext.tsx |  93 ++++++--
 .../app/src/components/inference/types.ts     |  10 +-
 .../components/inference/ui/ChartDisplay.tsx  | 114 +++++++--
 .../hooks/api/use-derived-agentic-metrics.ts  |  41 ++++
 .../queries/derived-agentic-metrics.test.ts   |  96 ++++++++
 .../db/src/queries/derived-agentic-metrics.ts | 224 ++++++++++++++++++
 8 files changed, 612 insertions(+), 36 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
 create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.test.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.ts

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index 2d3c982f..152e3f98 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -195,7 +195,7 @@ export function createMockInferenceContext(
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
-    selectedXAxisMode: 'interactivity',
+    selectedXAxisMode: 'interactivity' as const,
     setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
new file mode 100644
index 00000000..e5f6e0b2
--- /dev/null
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -0,0 +1,68 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getDerivedAgenticMetrics,
+  type DerivedAgenticMetricMap,
+} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: the response is one entry per id with two numbers, but the
+// derivation work parses thousands of JSONL records per blob — cache the
+// computed result so a chart-refresh hits the warm path.
+const getCachedDerivedAgenticMetrics = cachedQuery(
+  (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
+  'derived-agentic-metrics',
+  { blobOnly: true },
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/derived-agentic-metrics?ids=1,2,3
+ *
+ * Returns per-id derived metrics computed live from the stored aiperf
+ * profile_export.jsonl blobs:
+ *  - normalized_session_time_s: mean across sessions of session e2e time
+ *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
+ *  - mean_p90_prefill_tps_per_user: mean across sessions of P90 (over the
+ *    session's turns) prefill TPS/user (ISL / TTFT).
+ * Entries in `ids` that are not positive safe integers are ignored. Ids
+ * without a trace_replay blob or with unparseable records are omitted.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        // Only whole, positive ids are meaningful: drop fractions, NaN and ±Infinity.
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const result = await getCachedDerivedAgenticMetrics(ids.toSorted((a, b) => a - b));
+    return cachedJson(result);
+  } catch (error) {
+    console.error('Error fetching derived agentic metrics:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index edf0974e..2e5a245f 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -137,17 +137,32 @@ export function InferenceProvider({
   // computing a kind-based default here would diverge between server and client
   // and cause a hydration mismatch. The scenario-kind default is applied in a
   // post-mount effect below (and a ref tracks whether the user has overridden).
-  const urlXMode = (() => {
+  type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  const VALID_X_MODES: XAxisMode[] = [
+    'ttft',
+    'e2e',
+    'interactivity',
+    'session-time',
+    'prefill-tps',
+  ];
+  // SSR has no URL access, so seed with a fixed default and apply the URL
+  // value (if any) in a post-mount effect — keeps server + client first render
+  // identical and avoids "didn't match" hydration warnings when the URL holds
+  // a non-default mode.
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<XAxisMode>('ttft');
+  const xAxisModeFromUrlRef = useRef(false);
+  useEffect(() => {
+    if (xAxisModeFromUrlRef.current) return;
     const v = getUrlParam('i_xmode');
-    return v === 'ttft' || v === 'e2e' || v === 'interactivity' ? v : null;
-  })();
-  const [selectedXAxisMode, setSelectedXAxisMode] = useState<'ttft' | 'e2e' | 'interactivity'>(
-    urlXMode ?? 'ttft',
-  );
-  const xAxisModeFromUrlRef = useRef(urlXMode !== null);
+    if (v && (VALID_X_MODES as string[]).includes(v)) {
+      xAxisModeFromUrlRef.current = true;
+      setSelectedXAxisMode(v as XAxisMode);
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
   // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
   // existing useChartData pipeline keys off that flag for the e2e chart's x-axis.
-  const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
+  const handleSetXAxisMode = useCallback((mode: XAxisMode) => {
     xAxisModeFromUrlRef.current = true;
     setSelectedXAxisMode(mode);
     // The e2e chart's x-axis metric is reconciled in a separate effect below,
@@ -215,12 +230,37 @@ export function InferenceProvider({
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
   // Run-selector scoping: only constrain benchmark data to a specific run when
-  // the current date has >1 runs (ambiguous case). When there's one run per
-  // date, the picker is informational and the SQL's latest-per-config logic
-  // already returns that run's data — passing runId would needlessly narrow
-  // the cross-date config view.
-  const multipleRunsOnDate = availableRuns && Object.keys(availableRuns).length > 1;
-  const benchmarkRunId = multipleRunsOnDate && selectedRunId ? String(selectedRunId) : undefined;
+  // there's actually a disambiguation to make for the CURRENT model. The
+  // raw `availableRuns` is across ALL models on the date, so the picker may
+  // auto-select a run that produced nothing for the current model — passing
+  // that runId would return zero rows and hide the chart entirely.
+  // Compute the set of runs whose CHANGELOG explicitly mentions this model +
+  // precision. We can't reuse `filterRunsByModel` here because it has a
+  // fallback that returns all runs when nothing matches (so the picker still
+  // renders) — which would make us pass a runId that produced no rows for
+  // the current model, hiding the chart.
+  const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING)
+    .filter(([, model]) => model === selectedModel)
+    .map(([prefix]) => prefix);
+  const runIdsWithModelChangelog: string[] = [];
+  if (availableRuns) {
+    for (const [runId, runInfo] of Object.entries(availableRuns)) {
+      if (!runInfo.changelog) continue;
+      const matches = runInfo.changelog.entries.some((entry) =>
+        entry.config_keys.some((key) => {
+          const parts = key.split('-');
+          return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!);
+        }),
+      );
+      if (matches) runIdsWithModelChangelog.push(runId);
+    }
+  }
+  const benchmarkRunId =
+    selectedRunId &&
+    runIdsWithModelChangelog.length > 1 &&
+    runIdsWithModelChangelog.includes(selectedRunId)
+      ? String(selectedRunId)
+      : undefined;
 
   const {
     graphs,
@@ -367,11 +407,30 @@ export function InferenceProvider({
   useEffect(() => {
     const kind = sequenceKind(effectiveSequence);
     const isInitialMount = lastSeqKindRef.current === null;
-    if (!isInitialMount && lastSeqKindRef.current === kind) return;
+    const isAgenticOnlyMode =
+      selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps';
+    // On a stale render where kind hasn't changed, bail unless the current
+    // mode is agentic-only and we just landed on a fixed-seq scenario — in
+    // that case force the snap so the chart doesn't try to plot trace-derived
+    // metrics against rows that have no trace_replay.
+    if (!isInitialMount && lastSeqKindRef.current === kind) {
+      if (kind === 'fixed-seq' && isAgenticOnlyMode) {
+        handleSetXAxisMode('interactivity');
+      }
+      return;
+    }
     lastSeqKindRef.current = kind;
-    if (isInitialMount && xAxisModeFromUrlRef.current) return;
+    if (
+      isInitialMount &&
+      xAxisModeFromUrlRef.current &&
+      !(kind === 'fixed-seq' && isAgenticOnlyMode)
+    ) {
+      // URL-restored agentic-only mode on a fixed-seq sequence makes no sense
+      // — fall through to the default snap below.
+      return;
+    }
     handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
-  }, [effectiveSequence, handleSetXAxisMode]);
+  }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
 
   // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
   // agentic percentile changes. For fixed-seq the JSONB only carries
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 0a9908e3..bedded40 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -544,13 +544,17 @@ export interface InferenceChartContextType {
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
   /**
    * Which chart variant the user wants to see — the inference card shows one chart
-   * at a time, picked by the big TTFT / E2E Latency / Interactivity buttons.
+   * at a time, picked by the big buttons above the chart.
    * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
    * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
    * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   * - 'session-time'  → agentic-only; x = mean-normalized session time (live-computed from trace blobs)
+   * - 'prefill-tps'   → agentic-only; x = mean of P90 prefill TPS/user per session
    */
-  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity';
-  setSelectedXAxisMode: (mode: 'ttft' | 'e2e' | 'interactivity') => void;
+  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  setSelectedXAxisMode: (
+    mode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
+  ) => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   setIsLegendExpanded: (metric: boolean) => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 12f9f5de..63953b30 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,7 +1,7 @@
 'use client';
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
-import { useMemo, useRef, useState } from 'react';
+import { useEffect, useMemo, useRef, useState } from 'react';
 import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
@@ -42,6 +42,7 @@ import {
   sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
+import { useDerivedAgenticMetrics } from '@/hooks/api/use-derived-agentic-metrics';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
 import { hardwareKeyMatchesAnyBase } from '@/lib/constants';
 
@@ -62,20 +63,25 @@ import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 type InferenceViewMode = 'chart' | 'table';
 
 /**
- * The three chart variants the user can choose with the big buttons above the
- * chart card. Each maps to one entry in `inference-chart-config.json` plus a
- * forced x-axis override for the E2E chartType.
+ * The chart variants the user can choose with the big buttons above the chart
+ * card. The first three map to entries in `inference-chart-config.json` plus a
+ * forced x-axis override for the E2E chartType; the last two are agentic-only
+ * derived metrics computed live from the stored trace_replay blobs.
  */
-type XAxisMode = 'ttft' | 'e2e' | 'interactivity';
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
 
 interface XAxisModeButton {
   value: XAxisMode;
   label: string;
+  /** When true, the button is only shown on agentic scenarios. */
+  agenticOnly?: boolean;
 }
 const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
   { value: 'ttft', label: 'TTFT' },
   { value: 'e2e', label: 'E2E Latency' },
   { value: 'interactivity', label: 'Interactivity' },
+  { value: 'session-time', label: 'Session Time', agenticOnly: true },
+  { value: 'prefill-tps', label: 'Prefill TPS / user', agenticOnly: true },
 ];
 
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
@@ -134,6 +140,13 @@ export default function ChartDisplay() {
     totalDatesQueried,
   } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates);
 
+  // SSR has no URL access and `selectedSequence` defaults to agentic on the
+  // server even when the URL says fixed-seq — so any conditional rendering
+  // that keys off `sequenceKind(selectedSequence)` would diverge between
+  // server and client first render. Defer agentic-only UI until after mount.
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const [viewModes, setViewModes] = useState<Record<number, InferenceViewMode>>({});
   const replayHandlesRef = useRef<Record<number, ReplayLauncherHandle | null>>({});
   const getViewMode = (index: number): InferenceViewMode => viewModes[index] ?? 'chart';
@@ -301,15 +314,74 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
-  // Show one chart at a time, picked by the TTFT / E2E / Interactivity buttons.
-  // Both 'ttft' and 'e2e' modes render the e2e chart (the x-axis swap is handled
-  // upstream by `selectedE2eXAxisMetric`, which `setSelectedXAxisMode` keeps in sync).
+  // Show one chart at a time, picked by the buttons above the chart.
+  //  - 'interactivity' renders the interactivity chartType.
+  //  - 'ttft' / 'e2e' render the e2e chartType (x swap via selectedE2eXAxisMetric).
+  //  - 'session-time' / 'prefill-tps' render the e2e chartType too; the x-axis
+  //    is overridden below from live-computed derived metrics.
   const visibleGraphs = useMemo(() => {
     const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
     const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
     return filtered.length > 0 ? filtered : effectiveGraphs;
   }, [effectiveGraphs, selectedXAxisMode]);
 
+  // Derived-metric path: fetch live-computed values from the trace_replay blobs
+  // and override scatter data.x. Only fires for the two agentic-only modes.
+  const useDerived =
+    sequenceKind(selectedSequence) === 'agentic' &&
+    (selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps');
+  const derivedTargetIds = useMemo(() => {
+    if (!useDerived) return [] as number[];
+    const ids = new Set<number>();
+    for (const g of visibleGraphs) {
+      for (const d of g.data) {
+        if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') {
+          ids.add(d.id);
+        }
+      }
+    }
+    return [...ids];
+  }, [useDerived, visibleGraphs]);
+  const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
+  const derivedMetrics = derivedQuery.data;
+
+  const renderableGraphs = useMemo(() => {
+    if (!useDerived) return visibleGraphs;
+    if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] }));
+    const isSession = selectedXAxisMode === 'session-time';
+    const xLabel = isSession
+      ? 'Mean Normalized Session Time (s)'
+      : 'Mean P90 Prefill TPS per user (tok/s)';
+    // Roofline corner = which corner the curve sweeps from / toward, matching
+    // existing chart-config convention:
+    //  - session-time: as concurrency rises, session time AND throughput both
+    //    grow → curve goes bottom-left → top-right → upper_right.
+    //  - prefill-tps:  as concurrency rises, per-user prefill TPS falls while
+    //    total throughput rises → curve goes top-left → bottom-right →
+    //    upper_left.
+    const rooflineCorner = isSession ? 'upper_right' : 'upper_left';
+    return visibleGraphs.map((g) => {
+      const overriddenChartDef = {
+        ...g.chartDefinition,
+        x_label: xLabel,
+        // y_latency_limit was meant to suppress fixed-seq overload outliers on
+        // the TTFT axis — irrelevant for these derived axes.
+        y_latency_limit: undefined,
+        [`${selectedYAxisMetric}_roofline` as keyof typeof g.chartDefinition]: rooflineCorner,
+      };
+      const data = g.data
+        .map((d) => {
+          if (typeof d.id !== 'number') return null;
+          const m = derivedMetrics[d.id];
+          const v = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          if (v === null || v === undefined || !Number.isFinite(v)) return null;
+          return { ...d, x: v };
+        })
+        .filter((d): d is NonNullable<typeof d> => d !== null);
+      return { ...g, chartDefinition: overriddenChartDef, data };
+    });
+  }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
+
   const displayGraphs = isFirstLoad
     ? [
         <Card key="skeleton-0">
@@ -318,9 +390,9 @@ export default function ChartDisplay() {
           <Skeleton className="h-[600px] w-full" />
         </Card>,
       ]
-    : visibleGraphs.length === 0
+    : renderableGraphs.length === 0
       ? []
-      : visibleGraphs.map((graph, graphIndex) => {
+      : renderableGraphs.map((graph, graphIndex) => {
           const isTimelineMode = Boolean(
             selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
           );
@@ -396,11 +468,16 @@ export default function ChartDisplay() {
                               return 'vs. P90 Time To First Token';
                             }
 
-                            // For e2e chart: heading is driven by the TTFT / E2E button
-                            // selection above the card, so the inline dropdown is gone.
-                            // The metric carries the percentile prefix (e.g. p90_ttft,
-                            // median_ttft for fixed-seq, p75_ttft for agentic+p75).
+                            // For e2e chart: heading is driven by the buttons above the
+                            // card. Derived-metric modes win first; otherwise the metric
+                            // carries the percentile prefix (e.g. p90_ttft, median_ttft).
                             if (graph.chartDefinition.chartType === 'e2e') {
+                              if (selectedXAxisMode === 'session-time') {
+                                return 'vs. Mean Normalized Session Time';
+                              }
+                              if (selectedXAxisMode === 'prefill-tps') {
+                                return 'vs. Mean P90 Prefill TPS / user';
+                              }
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
                                 const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
@@ -601,7 +678,14 @@ export default function ChartDisplay() {
         aria-label="Chart x-axis metric"
         data-testid="x-axis-mode-buttons"
       >
-        {X_AXIS_MODE_BUTTONS.map(({ value, label }) => {
+        {X_AXIS_MODE_BUTTONS.filter(({ agenticOnly }) => {
+          if (!agenticOnly) return true;
+          // Before client mount, conditionalize on the server-default kind
+          // (agentic) so SSR + first client render produce identical DOM. After
+          // mount, hide the agentic-only buttons on fixed-seq sequences.
+          if (!mounted) return true;
+          return sequenceKind(selectedSequence) === 'agentic';
+        }).map(({ value, label }) => {
           const isActive = selectedXAxisMode === value;
           return (
             <button
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
new file mode 100644
index 00000000..108312ee
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -0,0 +1,41 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface DerivedAgenticMetric {
+  id: number;
+  /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
+   *  by mean_load / session_load. Null when the JSONL had no usable records. */
+  normalized_session_time_s: number | null;
+  /** Mean across sessions of (P90 over turns of ISL/TTFT). Null when no
+   *  prefill rates could be computed. */
+  mean_p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+async function fetchDerivedAgenticMetrics(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<DerivedAgenticMetricMap> {
+  if (ids.length === 0) return {}; // nothing requested: skip the network call
+  const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`); // non-2xx → rejection
+  return (await res.json()) as DerivedAgenticMetricMap; // cast only; body is not validated
+}
+
+/**
+ * Fetch per-id derived agentic metrics (session time + p90 prefill TPS/user)
+ * computed live from the stored aiperf profile_export.jsonl. Used to drive
+ * the "Session Time" and "Prefill TPS/user" chart variants. Ids without a
+ * trace_replay blob (older or non-aiperf agentic runs) are silently omitted
+ * from the response. NOTE(review): all ids go out in one request, while the
+ * route answers 400 above 200 ids — confirm callers stay under the cap.
+ */
+export function useDerivedAgenticMetrics(ids: number[], enabled = true) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b); // dedupe + sort: stable key
+  return useQuery({
+    queryKey: ['derived-agentic-metrics', sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchDerivedAgenticMetrics(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0, // never fire with an empty id list
+    staleTime: 5 * 60 * 1000, // 5 minutes
+  });
+}
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..795be28a
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -0,0 +1,96 @@
+import { describe, expect, it } from 'vitest';
+
+import { computeDerivedFromBlob } from './derived-agentic-metrics.js';
+
+/** Build one aiperf JSONL record for the synthetic fixture. */
+function rec(
+  conversation_id: string,
+  turn_index: number,
+  fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number },
+): string {
+  return JSON.stringify({
+    metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' },
+    metrics: {
+      request_latency: { value: fields.latency_ms, unit: 'ms' },
+      time_to_first_token: { value: fields.ttft_ms, unit: 'ms' },
+      input_sequence_length: { value: fields.isl, unit: 'tokens' },
+      output_sequence_length: { value: fields.osl, unit: 'tokens' },
+    },
+  });
+}
+
+describe('computeDerivedFromBlob', () => {
+  it('returns nulls when no usable records', () => {
+    const out = computeDerivedFromBlob('');
+    expect(out.normalized_session_time_s).toBeNull();
+    expect(out.mean_p90_prefill_tps_per_user).toBeNull();
+  });
+
+  it('rescales single-session time and computes P90 prefill', () => {
+    // One session, two turns. load = (100+50) + (200+50) = 400.
+    // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s.
+    const jsonl = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → P90 within session = 200.
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('rescales times across sessions with unequal load', () => {
+    // s1: 1 turn, load = 100, T = 1s
+    // s2: 1 turn, load = 300, T = 3s
+    // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2
+    // Mean T̃ = 2.0
+    const jsonl = [
+      rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(2, 6);
+  });
+
+  it('drops records missing required fields and skips non-profiling phase', () => {
+    const lines = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      // missing TTFT — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' },
+        metrics: {
+          request_latency: { value: 1000, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      // warmup phase — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' },
+        metrics: {
+          request_latency: { value: 9999, unit: 'ms' },
+          time_to_first_token: { value: 9999, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+    ];
+    const out = computeDerivedFromBlob(lines.join('\n'));
+    expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('p90 across turns: 10-turn session picks the right rank', () => {
+    // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910.
+    const turns = Array.from({ length: 10 }, (_, i) =>
+      rec('s1', i, {
+        isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens
+        osl: 10,
+        ttft_ms: 1000, // 1 second → rates: 100..1000 tps
+        latency_ms: 1500,
+      }),
+    );
+    const out = computeDerivedFromBlob(turns.join('\n'));
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+  });
+});
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
new file mode 100644
index 00000000..14f3adcf
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -0,0 +1,224 @@
+/**
+ * Live-computed per-point metrics derived from the stored aiperf
+ * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB
+ * because they require grouping by `conversation_id` and aggregating per
+ * session — work that is cheap for one agentic point but adds up across
+ * many, so it is only worth doing for the points actually being plotted.
+ *
+ * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal
+ *   (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of
+ *   per-turn `request_latency` per session (inter-turn tool/thinking gaps are
+ *   inherently excluded since we only sum per-turn request latency, not wallclock).
+ *   Each session's time is rescaled by `mean_load / session_load`, where load
+ *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
+ *
+ * - mean_p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Per session: P90 across its
+ *   turns. Across sessions: arithmetic mean. Captures the worst-turn prefill
+ *   responsiveness from the end-user perspective.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface DerivedAgenticMetric {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Mean normalized session time in seconds. */
+  normalized_session_time_s: number | null;
+  /** Mean across sessions of (P90 prefill tps/user across the session's turns). */
+  mean_p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+/**
+ * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless
+ * HTTP driver caps responses at 64 MB — chunk to stay well under.
+ */
+const QUERY_CHUNK_SIZE = 6;
+
+interface RecordMetrics {
+  request_latency?: { value?: number; unit?: string } | number;
+  time_to_first_token?: { value?: number; unit?: string } | number;
+  input_sequence_length?: { value?: number } | number;
+  output_sequence_length?: { value?: number } | number;
+}
+
+interface RecordMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  benchmark_phase?: string;
+}
+
+interface ProfileRecord {
+  metadata?: RecordMetadata;
+  metrics?: RecordMetrics;
+}
+
+interface TurnFields {
+  request_latency_ms: number;
+  ttft_ms: number;
+  isl: number;
+  osl: number;
+}
+
+/** Pull a finite numeric metric out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  // Unwrap the envelope when present; a bare value goes through the same finiteness
+  // gate, because JSON.parse turns an overflowing literal such as 1e999 into Infinity.
+  const inner: unknown =
+    v !== null && typeof v === 'object' && 'value' in v ? (v as { value?: unknown }).value : v;
+  // Anything that is not a finite number (missing, string, NaN, ±Infinity) reads as absent.
+  return typeof inner === 'number' && Number.isFinite(inner) ? inner : undefined;
+}
+
+function extractTurn(rec: ProfileRecord): TurnFields | null {
+  const m = rec.metrics ?? {};
+  const rl = readNum(m.request_latency);
+  const tt = readNum(m.time_to_first_token);
+  const isl = readNum(m.input_sequence_length);
+  const osl = readNum(m.output_sequence_length);
+  if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null;
+  if (rl <= 0 || tt <= 0 || isl <= 0) return null;
+  return { request_latency_ms: rl, ttft_ms: tt, isl, osl };
+}
+
+/** Linear-interpolated percentile (matches numpy's default linear method). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+  if (xs.length === 0) return Number.NaN;
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/**
+ * Parse one point's JSONL and return the two derived metrics. Both fields are
+ * `null` if the blob has no usable records.
+ */
+export function computeDerivedFromBlob(jsonl: string): {
+  normalized_session_time_s: number | null;
+  mean_p90_prefill_tps_per_user: number | null;
+} {
+  // Group records by conversation_id, filter to the profiling phase.
+  const bySession = new Map<string, TurnFields[]>();
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const sid = rec.metadata?.conversation_id;
+    if (!sid) continue;
+    const turn = extractTurn(rec);
+    if (!turn) continue;
+    let list = bySession.get(sid);
+    if (!list) {
+      list = [];
+      bySession.set(sid, list);
+    }
+    list.push(turn);
+  }
+  if (bySession.size === 0) {
+    return { normalized_session_time_s: null, mean_p90_prefill_tps_per_user: null };
+  }
+
+  // Per-session aggregates.
+  const sessionTimesS: number[] = [];
+  const sessionLoads: number[] = [];
+  const sessionP90Prefill: number[] = [];
+  for (const turns of bySession.values()) {
+    let timeMs = 0;
+    let load = 0;
+    const prefillRates: number[] = [];
+    for (const t of turns) {
+      timeMs += t.request_latency_ms;
+      load += t.isl + t.osl;
+      const ttftSec = t.ttft_ms / 1000;
+      if (ttftSec > 0) prefillRates.push(t.isl / ttftSec);
+    }
+    if (load > 0) {
+      sessionTimesS.push(timeMs / 1000);
+      sessionLoads.push(load);
+    }
+    if (prefillRates.length > 0) {
+      prefillRates.sort((a, b) => a - b);
+      sessionP90Prefill.push(quantile(prefillRates, 0.9));
+    }
+  }
+
+  // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+  let normalized: number | null = null;
+  if (sessionTimesS.length > 0) {
+    const meanLoad = meanOf(sessionLoads);
+    if (meanLoad > 0) {
+      const scaled: number[] = [];
+      for (let i = 0; i < sessionTimesS.length; i++) {
+        const ti = sessionTimesS[i]!;
+        const li = sessionLoads[i]!;
+        if (li > 0) scaled.push(ti * (meanLoad / li));
+      }
+      normalized = scaled.length > 0 ? meanOf(scaled) : null;
+    }
+  }
+
+  const prefill = sessionP90Prefill.length > 0 ? meanOf(sessionP90Prefill) : null;
+
+  return {
+    normalized_session_time_s: normalized,
+    mean_p90_prefill_tps_per_user: prefill,
+  };
+}
+
+export async function getDerivedAgenticMetrics(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<DerivedAgenticMetricMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
+
+  const result: DerivedAgenticMetricMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const { normalized_session_time_s, mean_p90_prefill_tps_per_user } =
+        computeDerivedFromBlob(jsonl);
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        normalized_session_time_s,
+        mean_p90_prefill_tps_per_user,
+      };
+    } catch {
+      // Skip malformed blobs silently — frontend treats missing ids as "no data".
+    }
+  }
+  return result;
+}

From 8af1f5cd42f6d423ded91c04310345a09343fa34 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:20:29 -0400
Subject: [PATCH 032/111] fix(inference): show Mean Normalized Session Time in
 minutes

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/ChartDisplay.tsx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 63953b30..6be524b4 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -350,7 +350,7 @@ export default function ChartDisplay() {
     if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] }));
     const isSession = selectedXAxisMode === 'session-time';
     const xLabel = isSession
-      ? 'Mean Normalized Session Time (s)'
+      ? 'Mean Normalized Session Time (min)'
       : 'Mean P90 Prefill TPS per user (tok/s)';
     // Roofline corner = which corner the curve sweeps from / toward, matching
     // existing chart-config convention:
@@ -373,8 +373,9 @@ export default function ChartDisplay() {
         .map((d) => {
           if (typeof d.id !== 'number') return null;
           const m = derivedMetrics[d.id];
-          const v = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
-          if (v === null || v === undefined || !Number.isFinite(v)) return null;
+          const raw = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
+          const v = isSession ? raw / 60 : raw;
           return { ...d, x: v };
         })
         .filter((d): d is NonNullable<typeof d> => d !== null);

From be34e97dd07ca02de674be04c312f62f779cc95a Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:23:34 -0400
Subject: [PATCH 033/111] fix(inference): use global P90 of per-turn prefill
 TPS/user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the per-session P90 + cross-session mean sandwich; pool every turn
into one array and take a single P90 so the tail isn't dampened. Field
renamed mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user across
DB query, API, frontend hook, and chart labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/v1/derived-agentic-metrics/route.ts   |  4 +-
 .../components/inference/ui/ChartDisplay.tsx  |  6 +--
 .../hooks/api/use-derived-agentic-metrics.ts  |  6 +--
 .../queries/derived-agentic-metrics.test.ts   | 10 ++---
 .../db/src/queries/derived-agentic-metrics.ts | 41 +++++++++----------
 5 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index e5f6e0b2..c45173e5 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -28,8 +28,8 @@ const MAX_IDS_PER_REQUEST = 200;
  * profile_export.jsonl blobs:
  *  - normalized_session_time_s: mean across sessions of session e2e time
  *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
- *  - mean_p90_prefill_tps_per_user: mean across sessions of P90 (over the
- *    session's turns) prefill TPS/user (ISL / TTFT).
+ *  - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT)
+ *    across every turn in every session.
  *
  * Ids without a trace_replay blob or with unparseable records are omitted.
  */
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 6be524b4..bd3064d0 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -351,7 +351,7 @@ export default function ChartDisplay() {
     const isSession = selectedXAxisMode === 'session-time';
     const xLabel = isSession
       ? 'Mean Normalized Session Time (min)'
-      : 'Mean P90 Prefill TPS per user (tok/s)';
+      : 'P90 Prefill TPS per user (tok/s)';
     // Roofline corner = which corner the curve sweeps from / toward, matching
     // existing chart-config convention:
     //  - session-time: as concurrency rises, session time AND throughput both
@@ -373,7 +373,7 @@ export default function ChartDisplay() {
         .map((d) => {
           if (typeof d.id !== 'number') return null;
           const m = derivedMetrics[d.id];
-          const raw = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          const raw = isSession ? m?.normalized_session_time_s : m?.p90_prefill_tps_per_user;
           if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
           const v = isSession ? raw / 60 : raw;
           return { ...d, x: v };
@@ -477,7 +477,7 @@ export default function ChartDisplay() {
                                 return 'vs. Mean Normalized Session Time';
                               }
                               if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. Mean P90 Prefill TPS / user';
+                                return 'vs. P90 Prefill TPS / user';
                               }
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
index 108312ee..6bc7ae5e 100644
--- a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -5,9 +5,9 @@ export interface DerivedAgenticMetric {
   /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
    *  by mean_load / session_load. Null when the JSONL had no usable records. */
   normalized_session_time_s: number | null;
-  /** Mean across sessions of (P90 over turns of ISL/TTFT). Null when no
-   *  prefill rates could be computed. */
-  mean_p90_prefill_tps_per_user: number | null;
+  /** P90 of per-turn ISL/TTFT across every turn in every session.
+   *  Null when no prefill rates could be computed. */
+  p90_prefill_tps_per_user: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
index 795be28a..321434be 100644
--- a/packages/db/src/queries/derived-agentic-metrics.test.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -23,7 +23,7 @@ describe('computeDerivedFromBlob', () => {
   it('returns nulls when no usable records', () => {
     const out = computeDerivedFromBlob('');
     expect(out.normalized_session_time_s).toBeNull();
-    expect(out.mean_p90_prefill_tps_per_user).toBeNull();
+    expect(out.p90_prefill_tps_per_user).toBeNull();
   });
 
   it('rescales single-session time and computes P90 prefill', () => {
@@ -35,8 +35,8 @@ describe('computeDerivedFromBlob', () => {
     ].join('\n');
     const out = computeDerivedFromBlob(jsonl);
     expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
-    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → P90 within session = 200.
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200.
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
   });
 
   it('rescales times across sessions with unequal load', () => {
@@ -77,7 +77,7 @@ describe('computeDerivedFromBlob', () => {
     ];
     const out = computeDerivedFromBlob(lines.join('\n'));
     expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
   });
 
   it('p90 across turns: 10-turn session picks the right rank', () => {
@@ -91,6 +91,6 @@ describe('computeDerivedFromBlob', () => {
       }),
     );
     const out = computeDerivedFromBlob(turns.join('\n'));
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6);
   });
 });
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index 14f3adcf..ac6fd38d 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -12,10 +12,10 @@
  *   Each session's time is rescaled by `mean_load / session_load`, where load
  *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
  *
- * - mean_p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
- *   Per turn: prefill_tps = ISL / TTFT_seconds. Per session: P90 across its
- *   turns. Across sessions: arithmetic mean. Captures the worst-turn prefill
- *   responsiveness from the end-user perspective.
+ * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn
+ *   in every session — the per-session percentile + cross-session mean
+ *   sandwich was discarded because it just dampens tail behavior.
  */
 
 import { gunzipSync } from 'node:zlib';
@@ -27,8 +27,8 @@ export interface DerivedAgenticMetric {
   id: number;
   /** Mean normalized session time in seconds. */
   normalized_session_time_s: number | null;
-  /** Mean across sessions of (P90 prefill tps/user across the session's turns). */
-  mean_p90_prefill_tps_per_user: number | null;
+  /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */
+  p90_prefill_tps_per_user: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
@@ -109,7 +109,7 @@ function meanOf(xs: number[]): number {
  */
 export function computeDerivedFromBlob(jsonl: string): {
   normalized_session_time_s: number | null;
-  mean_p90_prefill_tps_per_user: number | null;
+  p90_prefill_tps_per_user: number | null;
 } {
   // Group records by conversation_id, filter to the profiling phase.
   const bySession = new Map<string, TurnFields[]>();
@@ -134,31 +134,27 @@ export function computeDerivedFromBlob(jsonl: string): {
     list.push(turn);
   }
   if (bySession.size === 0) {
-    return { normalized_session_time_s: null, mean_p90_prefill_tps_per_user: null };
+    return { normalized_session_time_s: null, p90_prefill_tps_per_user: null };
   }
 
-  // Per-session aggregates.
+  // Per-session aggregates for session time; per-turn prefill rates pool into
+  // a single global array so the percentile sees the full distribution.
   const sessionTimesS: number[] = [];
   const sessionLoads: number[] = [];
-  const sessionP90Prefill: number[] = [];
+  const allPrefillRates: number[] = [];
   for (const turns of bySession.values()) {
     let timeMs = 0;
     let load = 0;
-    const prefillRates: number[] = [];
     for (const t of turns) {
       timeMs += t.request_latency_ms;
       load += t.isl + t.osl;
       const ttftSec = t.ttft_ms / 1000;
-      if (ttftSec > 0) prefillRates.push(t.isl / ttftSec);
+      if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec);
     }
     if (load > 0) {
       sessionTimesS.push(timeMs / 1000);
       sessionLoads.push(load);
     }
-    if (prefillRates.length > 0) {
-      prefillRates.sort((a, b) => a - b);
-      sessionP90Prefill.push(quantile(prefillRates, 0.9));
-    }
   }
 
   // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
@@ -176,11 +172,15 @@ export function computeDerivedFromBlob(jsonl: string): {
     }
   }
 
-  const prefill = sessionP90Prefill.length > 0 ? meanOf(sessionP90Prefill) : null;
+  let prefill: number | null = null;
+  if (allPrefillRates.length > 0) {
+    allPrefillRates.sort((a, b) => a - b);
+    prefill = quantile(allPrefillRates, 0.9);
+  }
 
   return {
     normalized_session_time_s: normalized,
-    mean_p90_prefill_tps_per_user: prefill,
+    p90_prefill_tps_per_user: prefill,
   };
 }
 
@@ -209,12 +209,11 @@ export async function getDerivedAgenticMetrics(
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');
-      const { normalized_session_time_s, mean_p90_prefill_tps_per_user } =
-        computeDerivedFromBlob(jsonl);
+      const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl);
       result[Number(row.benchmark_result_id)] = {
         id: Number(row.benchmark_result_id),
         normalized_session_time_s,
-        mean_p90_prefill_tps_per_user,
+        p90_prefill_tps_per_user,
       };
     } catch {
       // Skip malformed blobs silently — frontend treats missing ids as "no data".

From c774c005f7c2dfc1fa451e293df5d6456ba5be71 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:29:27 -0400
Subject: [PATCH 034/111] fix(inference): no-data flash on session-time /
 prefill-tps modes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two root causes for "No data available" when flipping to these modes:

1. Stale blob-cache: the v1 cache key still holds responses with the
   pre-rename `mean_p90_prefill_tps_per_user` field. The frontend's new
   `p90_prefill_tps_per_user` lookup misses → every row filters out.
   Bump the cache key to `derived-agentic-metrics-v2` to force a refresh.

2. Loading flicker: while the derived-metrics fetch is in flight we were
   passing empty `data: []` to ScatterGraph, which surfaces the misleading
   "change your filters" empty-state. Gate skeleton rendering on the
   derived query's pending/fetching state instead.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/v1/derived-agentic-metrics/route.ts   |   5 +-
 .../components/inference/ui/ChartDisplay.tsx  | 439 +++++++++---------
 2 files changed, 230 insertions(+), 214 deletions(-)

diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index c45173e5..6ce7c017 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -13,9 +13,12 @@ export const dynamic = 'force-dynamic';
 // blobOnly: the response is one entry per id with two numbers, but the
 // derivation work parses thousands of JSONL records per blob — cache the
 // computed result so a chart-refresh hits the warm path.
+// Bumped to v2 when mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user.
+// Stale v1 cache entries return undefined for the new field and silently
+// blank the chart with "No data available".
 const getCachedDerivedAgenticMetrics = cachedQuery(
   (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
-  'derived-agentic-metrics',
+  'derived-agentic-metrics-v2',
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index bd3064d0..fd6cd9c1 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -344,6 +344,14 @@ export default function ChartDisplay() {
   }, [useDerived, visibleGraphs]);
   const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
   const derivedMetrics = derivedQuery.data;
+  // Show skeleton (not "No data available") while the derived-metrics query
+  // is in flight. Without this gate, every flip to session-time / prefill-tps
+  // briefly blanks the chart and surfaces a misleading empty-state.
+  const isDerivedLoading =
+    useDerived &&
+    derivedTargetIds.length > 0 &&
+    (derivedQuery.isPending || derivedQuery.isFetching) &&
+    !derivedMetrics;
 
   const renderableGraphs = useMemo(() => {
     if (!useDerived) return visibleGraphs;
@@ -383,191 +391,181 @@ export default function ChartDisplay() {
     });
   }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
 
-  const displayGraphs = isFirstLoad
-    ? [
-        <Card key="skeleton-0">
-          <Skeleton className="h-7 w-2/4 mb-1" />
-          <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-[600px] w-full" />
-        </Card>,
-      ]
-    : renderableGraphs.length === 0
-      ? []
-      : renderableGraphs.map((graph, graphIndex) => {
-          const isTimelineMode = Boolean(
-            selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
-          );
-          const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
-          return (
-            <section key={graphIndex} className="pt-8 md:pt-0">
-              <figure data-testid="chart-figure" className="relative rounded-lg">
-                <ChartButtons
-                  chartId={`chart-${graphIndex}`}
-                  analyticsPrefix={
-                    isTimelineMode
-                      ? 'gpu_timeseries'
-                      : graph.chartDefinition.chartType === 'e2e'
-                        ? 'latency'
-                        : 'interactivity'
-                  }
-                  leadingControls={
-                    <SegmentedToggle
-                      value={getViewMode(graphIndex)}
-                      options={VIEW_MODE_OPTIONS}
-                      onValueChange={(v) => handleViewModeChange(graphIndex, v)}
-                      ariaLabel="View mode"
-                      testId={`inference-view-toggle-${graphIndex}`}
-                    />
-                  }
-                  hideImageExport={getViewMode(graphIndex) === 'table'}
-                  setIsLegendExpanded={setIsLegendExpanded}
-                  exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
-                  onExportMp4={
-                    replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
-                  }
-                  onExportCsv={() => {
-                    const visibleData = graph.data.filter((d) =>
+  const displayGraphs =
+    isFirstLoad || isDerivedLoading
+      ? [
+          <Card key="skeleton-0">
+            <Skeleton className="h-7 w-2/4 mb-1" />
+            <Skeleton className="h-5 w-3/4 mb-2" />
+            <Skeleton className="h-[600px] w-full" />
+          </Card>,
+        ]
+      : renderableGraphs.length === 0
+        ? []
+        : renderableGraphs.map((graph, graphIndex) => {
+            const isTimelineMode = Boolean(
+              selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+            );
+            const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+            return (
+              <section key={graphIndex} className="pt-8 md:pt-0">
+                <figure data-testid="chart-figure" className="relative rounded-lg">
+                  <ChartButtons
+                    chartId={`chart-${graphIndex}`}
+                    analyticsPrefix={
                       isTimelineMode
-                        ? activeDates.has(`${d.date}_${d.hwKey}`)
-                        : activeHwTypes.has(d.hwKey as string) &&
-                          selectedPrecisions.includes(d.precision),
-                    );
-                    const { headers, rows } = inferenceChartToCsv(
-                      visibleData,
-                      graph.model,
-                      graph.sequence,
-                    );
-                    exportToCsv(
-                      `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
-                      headers,
-                      rows,
-                    );
-                  }}
-                />
-                <Card>
-                  {(() => {
-                    const chartCaption = (
-                      <>
-                        <h2 className="text-lg font-semibold">
-                          {
-                            graph.chartDefinition[
-                              `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                            ]
-                          }{' '}
-                          {(() => {
-                            // For Input metrics with dynamic x-axis, use dynamic heading
-                            const metricTitle =
-                              (graph.chartDefinition[
+                        ? 'gpu_timeseries'
+                        : graph.chartDefinition.chartType === 'e2e'
+                          ? 'latency'
+                          : 'interactivity'
+                    }
+                    leadingControls={
+                      <SegmentedToggle
+                        value={getViewMode(graphIndex)}
+                        options={VIEW_MODE_OPTIONS}
+                        onValueChange={(v) => handleViewModeChange(graphIndex, v)}
+                        ariaLabel="View mode"
+                        testId={`inference-view-toggle-${graphIndex}`}
+                      />
+                    }
+                    hideImageExport={getViewMode(graphIndex) === 'table'}
+                    setIsLegendExpanded={setIsLegendExpanded}
+                    exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+                    onExportMp4={
+                      replayAvailable
+                        ? () => replayHandlesRef.current[graphIndex]?.open()
+                        : undefined
+                    }
+                    onExportCsv={() => {
+                      const visibleData = graph.data.filter((d) =>
+                        isTimelineMode
+                          ? activeDates.has(`${d.date}_${d.hwKey}`)
+                          : activeHwTypes.has(d.hwKey as string) &&
+                            selectedPrecisions.includes(d.precision),
+                      );
+                      const { headers, rows } = inferenceChartToCsv(
+                        visibleData,
+                        graph.model,
+                        graph.sequence,
+                      );
+                      exportToCsv(
+                        `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+                        headers,
+                        rows,
+                      );
+                    }}
+                  />
+                  <Card>
+                    {(() => {
+                      const chartCaption = (
+                        <>
+                          <h2 className="text-lg font-semibold">
+                            {
+                              graph.chartDefinition[
                                 `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                              ] as string) || '';
-                            const isInputMetric = metricTitle.toLowerCase().includes('input');
-                            if (
-                              graph.chartDefinition.chartType === 'interactivity' &&
-                              isInputMetric &&
-                              selectedXAxisMetric === 'p90_ttft'
-                            ) {
-                              return 'vs. P90 Time To First Token';
-                            }
-
-                            // For e2e chart: heading is driven by the buttons above the
-                            // card. Derived-metric modes win first; otherwise the metric
-                            // carries the percentile prefix (e.g. p90_ttft, median_ttft).
-                            if (graph.chartDefinition.chartType === 'e2e') {
-                              if (selectedXAxisMode === 'session-time') {
-                                return 'vs. Mean Normalized Session Time';
+                              ]
+                            }{' '}
+                            {(() => {
+                              // For Input metrics with dynamic x-axis, use dynamic heading
+                              const metricTitle =
+                                (graph.chartDefinition[
+                                  `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                                ] as string) || '';
+                              const isInputMetric = metricTitle.toLowerCase().includes('input');
+                              if (
+                                graph.chartDefinition.chartType === 'interactivity' &&
+                                isInputMetric &&
+                                selectedXAxisMetric === 'p90_ttft'
+                              ) {
+                                return 'vs. P90 Time To First Token';
                               }
-                              if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. P90 Prefill TPS / user';
-                              }
-                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
-                                const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
-                                const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
-                                return `vs. ${word} Time To First Token`;
+
+                              // For e2e chart: heading is driven by the buttons above the
+                              // card. Derived-metric modes win first; otherwise the metric
+                              // carries the percentile prefix (e.g. p90_ttft, median_ttft).
+                              if (graph.chartDefinition.chartType === 'e2e') {
+                                if (selectedXAxisMode === 'session-time') {
+                                  return 'vs. Mean Normalized Session Time';
+                                }
+                                if (selectedXAxisMode === 'prefill-tps') {
+                                  return 'vs. P90 Prefill TPS / user';
+                                }
+                                const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                                if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                  const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                  const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
+                                  return `vs. ${word} Time To First Token`;
+                                }
+                                const pctlWord = selectedPercentile.toUpperCase();
+                                return isAgentic
+                                  ? `vs. ${pctlWord} End-to-end Latency`
+                                  : 'vs. End-to-end Latency';
                               }
-                              const pctlWord = selectedPercentile.toUpperCase();
-                              return isAgentic
-                                ? `vs. ${pctlWord} End-to-end Latency`
-                                : 'vs. End-to-end Latency';
-                            }
 
-                            // Fall back to the heading baked into chartDefinition
-                            // by useChartData (already resolves per-metric overrides
-                            // and applies the agentic percentile rewrite).
-                            return graph.chartDefinition.heading;
-                          })()}
-                        </h2>
-                        <p className="text-sm text-muted-foreground mb-2">
-                          {getModelLabel(graph.model as Model)} •{' '}
-                          {selectedPrecisions
-                            .map((prec) => getPrecisionLabel(prec as Precision))
-                            .join(', ')}{' '}
-                          • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
-                          {isUnofficialRun
-                            ? 'Source: UNOFFICIAL'
-                            : 'Source: SemiAnalysis InferenceX™'}
-                          {selectedRunDate && (
-                            <>
-                              {' '}
-                              • Updated:{' '}
-                              {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
-                                'en-US',
-                                {
-                                  year: 'numeric',
-                                  month: '2-digit',
-                                  day: '2-digit',
-                                  timeZone: 'UTC',
-                                },
-                              )}
-                            </>
-                          )}
-                        </p>
-                        <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
-                        <UnofficialDomainNotice />
-                      </>
-                    );
-
-                    if (getViewMode(graphIndex) === 'table') {
-                      const overlay =
-                        graph.chartDefinition.chartType === 'e2e'
-                          ? overlayDataByChartType.e2e
-                          : overlayDataByChartType.interactivity;
-                      const overlayRows = (overlay?.data ?? []).filter((p) =>
-                        selectedPrecisions.includes(p.precision),
-                      );
-                      return (
-                        <>
-                          {chartCaption}
-                          <InferenceTable
-                            data={
-                              overlayRows.length > 0 ? [...graph.data, ...overlayRows] : graph.data
-                            }
-                            chartDefinition={graph.chartDefinition}
-                            selectedYAxisMetric={selectedYAxisMetric}
-                          />
+                              // Fall back to the heading baked into chartDefinition
+                              // by useChartData (already resolves per-metric overrides
+                              // and applies the agentic percentile rewrite).
+                              return graph.chartDefinition.heading;
+                            })()}
+                          </h2>
+                          <p className="text-sm text-muted-foreground mb-2">
+                            {getModelLabel(graph.model as Model)} •{' '}
+                            {selectedPrecisions
+                              .map((prec) => getPrecisionLabel(prec as Precision))
+                              .join(', ')}{' '}
+                            • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+                            {isUnofficialRun
+                              ? 'Source: UNOFFICIAL'
+                              : 'Source: SemiAnalysis InferenceX™'}
+                            {selectedRunDate && (
+                              <>
+                                {' '}
+                                • Updated:{' '}
+                                {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+                                  'en-US',
+                                  {
+                                    year: 'numeric',
+                                    month: '2-digit',
+                                    day: '2-digit',
+                                    timeZone: 'UTC',
+                                  },
+                                )}
+                              </>
+                            )}
+                          </p>
+                          <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          <UnofficialDomainNotice />
                         </>
                       );
-                    }
 
-                    return selectedDateRange.startDate &&
-                      selectedDateRange.endDate &&
-                      selectedGPUs.length > 0 ? (
-                      <GPUGraph
-                        chartId={`chart-${graphIndex}`}
-                        modelLabel={graph.model}
-                        data={graph.data}
-                        xLabel={graph.chartDefinition.x_label}
-                        yLabel={`${
-                          graph.chartDefinition[
-                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                          ]
-                        }`}
-                        chartDefinition={graph.chartDefinition}
-                        caption={chartCaption}
-                      />
-                    ) : (
-                      <div className="relative">
-                        <ScatterGraph
+                      if (getViewMode(graphIndex) === 'table') {
+                        const overlay =
+                          graph.chartDefinition.chartType === 'e2e'
+                            ? overlayDataByChartType.e2e
+                            : overlayDataByChartType.interactivity;
+                        const overlayRows = (overlay?.data ?? []).filter((p) =>
+                          selectedPrecisions.includes(p.precision),
+                        );
+                        return (
+                          <>
+                            {chartCaption}
+                            <InferenceTable
+                              data={
+                                overlayRows.length > 0
+                                  ? [...graph.data, ...overlayRows]
+                                  : graph.data
+                              }
+                              chartDefinition={graph.chartDefinition}
+                              selectedYAxisMetric={selectedYAxisMetric}
+                            />
+                          </>
+                        );
+                      }
+
+                      return selectedDateRange.startDate &&
+                        selectedDateRange.endDate &&
+                        selectedGPUs.length > 0 ? (
+                        <GPUGraph
                           chartId={`chart-${graphIndex}`}
                           modelLabel={graph.model}
                           data={graph.data}
@@ -579,43 +577,58 @@ export default function ChartDisplay() {
                           }`}
                           chartDefinition={graph.chartDefinition}
                           caption={chartCaption}
-                          overlayData={
-                            graph.chartDefinition.chartType === 'e2e'
-                              ? (overlayDataByChartType.e2e ?? undefined)
-                              : (overlayDataByChartType.interactivity ?? undefined)
-                          }
                         />
-                        {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
-                            <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
-                              <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range to view GPU comparison
-                              </p>
-                            </div>
-                          )}
-                      </div>
-                    );
-                  })()}
-                  {replayAvailable && (
-                    <ReplayLauncher
-                      ref={(handle) => {
-                        replayHandlesRef.current[graphIndex] = handle;
-                      }}
-                      parentChartId={`chart-${graphIndex}`}
-                      chartDefinition={graph.chartDefinition}
-                      yLabel={`${
-                        graph.chartDefinition[
-                          `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                        ]
-                      }`}
-                      xLabel={graph.chartDefinition.x_label}
-                    />
-                  )}
-                </Card>
-              </figure>
-            </section>
-          );
-        });
+                      ) : (
+                        <div className="relative">
+                          <ScatterGraph
+                            chartId={`chart-${graphIndex}`}
+                            modelLabel={graph.model}
+                            data={graph.data}
+                            xLabel={graph.chartDefinition.x_label}
+                            yLabel={`${
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                              ]
+                            }`}
+                            chartDefinition={graph.chartDefinition}
+                            caption={chartCaption}
+                            overlayData={
+                              graph.chartDefinition.chartType === 'e2e'
+                                ? (overlayDataByChartType.e2e ?? undefined)
+                                : (overlayDataByChartType.interactivity ?? undefined)
+                            }
+                          />
+                          {selectedGPUs.length > 0 &&
+                            (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
+                              <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
+                                <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
+                                  Select a date range to view GPU comparison
+                                </p>
+                              </div>
+                            )}
+                        </div>
+                      );
+                    })()}
+                    {replayAvailable && (
+                      <ReplayLauncher
+                        ref={(handle) => {
+                          replayHandlesRef.current[graphIndex] = handle;
+                        }}
+                        parentChartId={`chart-${graphIndex}`}
+                        chartDefinition={graph.chartDefinition}
+                        yLabel={`${
+                          graph.chartDefinition[
+                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                          ]
+                        }`}
+                        xLabel={graph.chartDefinition.x_label}
+                      />
+                    )}
+                  </Card>
+                </figure>
+              </section>
+            );
+          });
 
   return (
     <div data-testid="inference-chart-display" className="flex flex-col gap-4">

From d5dbda773ef653d715cb1d0634c2b70cc94a826f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 15:40:32 -0500
Subject: [PATCH 035/111] feat(agentic-detail): aggregates-across-configs view

Adds a 'Per-point / Aggregates across configs' toggle near the SKU header
on /inference/agentic/[id]. The aggregates view replaces the per-point
charts with four multi-line charts (ISL, OSL, KV cache util, prefix cache
hit rate) showing how mean/P50/P75/P90/P99 vary across every sibling
config in the SKU. X-axis is sibling labels matching SiblingNav chips
(parallelism + concurrency); each percentile gets its own colored line.

Plumbing:
- `getAgenticAggregates(sql, ids)` in packages/db parses both the
  profile_export.jsonl (per-request ISL/OSL) and the server_metrics_json
  (KV cache util + prefix hit rate time-series) per id, then computes five
  statistics (mean plus P50/P75/P90/P99). A 6-case unit suite covers
  percentile math, JSONL parsing, and the prefix-hit derivation.
- /api/v1/agentic-aggregates blob-cached like trace-histograms.
- New `useAgenticAggregates` hook + new AggregateChart component (multi-
  line with hover + ExpandableChart parity).

Memory + transport handling:
- Each row pulls TWO compressed blobs and `server_metrics_json_gz` can be
  up to ~17 MB compressed per high-conc row. Chunked query at size 2
  keeps each Neon HTTP response under the 64 MB cap and limits Node heap
  to ~one chunk's worth of decompressed JSON at a time (parallel chunks
  OOM'd on a 12-sibling SKU).
- Slow path runs ~20s on a 12-sibling SKU; cached afterwards (blobOnly).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/api/v1/agentic-aggregates/route.ts    |  64 +++
 .../agentic-point/agentic-point-detail.tsx    | 479 ++++++++++++------
 .../agentic-point/aggregate-chart.tsx         | 230 +++++++++
 .../inference/agentic-point/sibling-nav.tsx   |   2 +-
 .../src/hooks/api/use-agentic-aggregates.ts   |  45 ++
 .../db/src/queries/agentic-aggregates.test.ts | 113 +++++
 packages/db/src/queries/agentic-aggregates.ts | 255 ++++++++++
 7 files changed, 1020 insertions(+), 168 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/agentic-aggregates/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
 create mode 100644 packages/app/src/hooks/api/use-agentic-aggregates.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.test.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.ts

diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
new file mode 100644
index 00000000..63cb2dc0
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -0,0 +1,64 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getAgenticAggregates,
+  type AgenticAggregateMap,
+} from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: response stays small (a few numbers per id), but generating it
+// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the
+// "Aggregates" toggle stays snappy.
+const getCachedAgenticAggregates = cachedQuery(
+  (ids: number[]): Promise<AgenticAggregateMap> => getAgenticAggregates(getDb(), ids),
+  'agentic-aggregates',
+  { blobOnly: true },
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/agentic-aggregates?ids=1,2,3
+ *
+ * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization, and
+ * prefix cache hit rate, computed live from the stored aiperf profile_export.jsonl
+ * + server_metrics_json blobs; ids without a trace_replay blob (or with no usable
+ * samples) get nulls. `ids` tokens that are not positive safe integers are ignored.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const sorted = ids.toSorted((a, b) => a - b);
+    const result = await getCachedAgenticAggregates(sorted);
+    return cachedJson(result);
+  } catch (error) {
+    console.error('Error fetching agentic aggregates:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index ee58332d..a5bca4e0 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -2,8 +2,10 @@
 
 import Link from 'next/link';
 import { useRouter } from 'next/navigation';
+import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
+import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -12,10 +14,12 @@ import {
   type TimeSeriesPoint,
 } from '@/hooks/api/use-trace-server-metrics';
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
 
+import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
-import { SiblingNav } from './sibling-nav';
+import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
   TimeSeriesChart,
@@ -78,6 +82,28 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
+type DetailView = 'point' | 'aggregates';
+const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
+  { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
+];
+
+/**
+ * Shape one sibling's statistics for AggregateChart. A missing `pct` (null or
+ * undefined) yields an empty `values` map instead of placeholder numbers.
+ */
+function toAggPoint(
+  sibling: { id: number; label: string },
+  pct: { mean: number; p50: number; p75: number; p90: number; p99: number } | null | undefined,
+): AggregatePoint {
+  const { id, label } = sibling;
+  if (!pct) {
+    return { id, label, values: {} };
+  }
+  const { mean, p50, p75, p90, p99 } = pct;
+  return { id, label, values: { mean, p50, p75, p90, p99 } };
+}
+
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
   const histQuery = useTraceHistograms([id], true);
@@ -88,6 +114,13 @@ export function AgenticPointDetail({ id }: Props) {
   const metrics = metricsQuery.data;
   const siblingsData = siblingsQuery.data;
 
+  const [view, setView] = useState<DetailView>('point');
+  // Fetch aggregates only when the aggregates view is active. Uses the full
+  // sibling set (across parallelism + concurrency configs) so each chart
+  // shows how the metric varies across the SKU.
+  const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
+  const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
       <div className="flex items-center gap-2">
@@ -128,180 +161,292 @@ export function AgenticPointDetail({ id }: Props) {
         </div>
       )}
 
-      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-        <ExpandableChart
-          title="Input sequence length distribution"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
-            return histQuery.isLoading ? <Skeleton /> : <Empty />;
-          }}
-        />
-        <ExpandableChart
-          title="Output sequence length distribution"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
-            return histQuery.isLoading ? <Skeleton /> : <Empty />;
-          }}
+      <div className="flex items-center justify-between gap-3">
+        <SegmentedToggle
+          value={view}
+          options={VIEW_OPTIONS}
+          onValueChange={setView}
+          ariaLabel="Detail view"
+          testId="detail-view-toggle"
+          buttonClassName="px-3 py-1.5 text-sm"
         />
+        {view === 'aggregates' && (
+          <span className="text-xs text-muted-foreground">
+            {siblingIds.length} configs in SKU
+            {aggregatesQuery.isLoading ? ' · loading…' : ''}
+          </span>
+        )}
+      </div>
 
-        <ExpandableChart
-          title="KV cache utilization over time"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'GPU KV cache (avg n=50)',
-                    data: rollingAverage(metrics.kvCacheUsage, 50),
-                    rawData: metrics.kvCacheUsage,
-                    color: '#3b82f6',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yMax={1}
-                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                yAxisLabel="KV cache (%)"
-                {...size}
-              />
-            );
-          }}
+      {view === 'aggregates' ? (
+        <AggregatesGrid
+          siblings={siblingsData?.siblings ?? []}
+          aggregates={aggregatesQuery.data}
+          isLoading={aggregatesQuery.isLoading}
         />
+      ) : (
+        <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+          <ExpandableChart
+            title="Input sequence length distribution"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
+              return histQuery.isLoading ? <Skeleton /> : <Empty />;
+            }}
+          />
+          <ExpandableChart
+            title="Output sequence length distribution"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
+              return histQuery.isLoading ? <Skeleton /> : <Empty />;
+            }}
+          />
 
-        <ExpandableChart
-          title="Request queue depth"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'Running (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.running,
-                      })),
-                      50,
-                    ),
-                    color: '#22c55e',
-                    strokeWidth: 2,
-                  },
-                  {
-                    name: 'Waiting (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.waiting,
-                      })),
-                      50,
-                    ),
-                    color: '#ef4444',
-                    strokeWidth: 2,
-                  },
-                  {
-                    name: 'Total (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.total,
-                      })),
-                      50,
-                    ),
-                    color: '#3b82f6',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yAxisLabel="Requests"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="KV cache utilization over time"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'GPU KV cache (avg n=50)',
+                      data: rollingAverage(metrics.kvCacheUsage, 50),
+                      rawData: metrics.kvCacheUsage,
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yMax={1}
+                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                  yAxisLabel="KV cache (%)"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Prefix cache hit rate per interval"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'GPU (HBM, avg n=50)',
-                    data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                    rawData: metrics.prefixCacheHitRate,
-                    color: '#a855f7',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yMax={1}
-                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                yAxisLabel="Hit rate (%)"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Request queue depth"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Running (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.running,
+                        })),
+                        50,
+                      ),
+                      color: '#22c55e',
+                      strokeWidth: 2,
+                    },
+                    {
+                      name: 'Waiting (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.waiting,
+                        })),
+                        50,
+                      ),
+                      color: '#ef4444',
+                      strokeWidth: 2,
+                    },
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.total,
+                        })),
+                        50,
+                      ),
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Requests"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Throughput (total & decode)"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'Total (avg n=50)',
-                    data: rollingAverage(total, 50),
-                    color: '#3b82f6',
-                    strokeWidth: 1.6,
-                  },
-                  {
-                    name: 'Decode (avg n=50)',
-                    data: rollingAverage(metrics.decodeTps, 50),
-                    color: '#f97316',
-                    strokeWidth: 1.6,
-                  },
-                  {
-                    name: 'Total running avg',
-                    data: cumulativeAverage(total),
-                    color: '#ef4444',
-                    strokeWidth: 3,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yAxisLabel="Tokens / sec"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Prefix cache hit rate per interval"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'GPU (HBM, avg n=50)',
+                      data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                      rawData: metrics.prefixCacheHitRate,
+                      color: '#a855f7',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yMax={1}
+                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                  yAxisLabel="Hit rate (%)"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Cumulative prompt token source breakdown"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <StackedAreaChart
-                sourceSeries={metrics.promptTokensBySource}
-                durationS={metrics.durationS}
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Throughput (total & decode)"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(total, 50),
+                      color: '#3b82f6',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Decode (avg n=50)',
+                      data: rollingAverage(metrics.decodeTps, 50),
+                      color: '#f97316',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Total running avg',
+                      data: cumulativeAverage(total),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens / sec"
+                  {...size}
+                />
+              );
+            }}
+          />
+
+          <ExpandableChart
+            title="Cumulative prompt token source breakdown"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <StackedAreaChart
+                  sourceSeries={metrics.promptTokensBySource}
+                  durationS={metrics.durationS}
+                  {...size}
+                />
+              );
+            }}
+          />
+        </div>
+      )}
+    </div>
+  );
+}
+
+function AggregatesGrid({
+  siblings,
+  aggregates,
+  isLoading,
+}: {
+  siblings: {
+    id: number;
+    conc: number;
+    decode_tp: number;
+    decode_ep: number;
+    disagg: boolean;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    offload_mode?: string | null;
+  }[];
+  aggregates: AgenticAggregateMap | undefined;
+  isLoading: boolean;
+}) {
+  if (siblings.length === 0) { // nothing to chart without siblings — show a hint instead
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        SKU sibling list not loaded yet — open a point to populate.
       </div>
+    );
+  }
+  if (isLoading && !aggregates) { // placeholder only while no data exists yet; otherwise charts render below
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        Computing aggregates across {siblings.length} configs… (parsing trace blobs)
+      </div>
+    );
+  }
+  const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s as any) })); // NOTE(review): `as any` skips the type check against chipLabel's BenchmarkSibling param
+  const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl)); // ids absent from `aggregates` yield points with empty values
+  const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl));
+  const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil));
+  const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate));
+  return (
+    <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+      <ExpandableChart
+        title="ISL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={islPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="OSL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={oslPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="KV cache utilization (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={kvPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="Prefix cache hit rate (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={prefixPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
     </div>
   );
 }
diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
new file mode 100644
index 00000000..446677ad
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -0,0 +1,230 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+
+export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99'; // keys of AggregatePoint.values
+
+interface PercentileLine {
+  key: PercentileKey; // which entry of AggregatePoint.values this line plots
+  /** Display label in legend / tooltip. */
+  label: string;
+  color: string; // used for the line segments, markers, legend swatch, and tooltip row
+}
+
+const PERCENTILE_LINES: PercentileLine[] = [ // array order = legend, tooltip-row, and draw order
+  { key: 'mean', label: 'Mean', color: '#ef4444' },
+  { key: 'p50', label: 'P50', color: '#3b82f6' },
+  { key: 'p75', label: 'P75', color: '#22c55e' },
+  { key: 'p90', label: 'P90', color: '#f59e0b' },
+  { key: 'p99', label: 'P99', color: '#a855f7' },
+];
+
+export interface AggregatePoint {
+  /** Sibling label rendered on the x-axis and as the hover tooltip title (e.g. "TP8 • c=8"). */
+  label: string;
+  /** Per-percentile value; missing percentiles are dropped from the plot. */
+  values: Partial<Record<PercentileKey, number>>;
+  /** Sibling id — purely informational; AggregateChart does not read it (the tooltip title is `label`). */
+  id?: number;
+}
+
+/**
+ * Multi-line chart for the "Aggregates across configs" view: one evenly
+ * spaced x-position per sibling config, one line per percentile. Missing or
+ * non-finite values break the line at that config; empty `points` shows "No data".
+ */
+export function AggregateChart({
+  points,
+  unit,
+  yMax,
+  yFmt,
+  width = 720,
+  height = 320,
+}: {
+  points: readonly AggregatePoint[];
+  unit: string; // shown verbatim in the legend row ("units: …")
+  /** Optional fixed y-axis upper bound (e.g. 1 for percentages). */
+  yMax?: number;
+  /** Optional value formatter (e.g. percentage → "30%"). */
+  yFmt?: (v: number) => string;
+  width?: number;
+  height?: number;
+}) {
+  const W = width;
+  const H = height;
+  const PAD = { top: 16, right: 16, bottom: 90, left: 64 }; // large bottom margin — presumably room for the rotated x labels
+  const fmt = (v: number) => // yFmt if given; else grouped integer for >= 10000, 2 decimals below 10, 0 decimals otherwise
+    yFmt
+      ? yFmt(v)
+      : v >= 10000
+        ? new Intl.NumberFormat('en-US').format(Math.round(v))
+        : v.toFixed(v < 10 ? 2 : 0);
+
+  const computed = useMemo(() => {
+    if (points.length === 0) return null;
+    let yMaxComputed = 0;
+    for (const p of points) {
+      for (const line of PERCENTILE_LINES) {
+        const v = p.values[line.key];
+        if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v;
+      }
+    }
+    const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05); // explicit yMax wins; else 5% headroom, or 1 when no positive value exists
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    return { yTop, innerW, innerH };
+  }, [points, W, H, PAD.left, PAD.right, PAD.top, PAD.bottom, yMax]);
+
+  if (!computed) {
+    return (
+      <div className="grid place-items-center text-xs text-muted-foreground" style={{ height: H }}>
+        No data
+      </div>
+    );
+  }
+  const { yTop, innerW, innerH } = computed;
+
+  // X positions: evenly spaced across the inner width (a single point is centered).
+  const xOf = (i: number) =>
+    points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW;
+  const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH; // 0 → baseline, yTop → top edge
+
+  // 5 y-axis ticks evenly between 0 and yTop.
+  const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4);
+
+  // Resolve hover: snap to nearest sibling index and emit all percentiles
+  // that have data at that x.
+  const resolve = (fraction: number) => {
+    const idx = Math.round(fraction * (points.length - 1));
+    const p = points[Math.max(0, Math.min(points.length - 1, idx))]; // clamp so out-of-range fractions hit the first/last point
+    if (!p) return null;
+    const items: HoverItem[] = [];
+    for (const line of PERCENTILE_LINES) {
+      const v = p.values[line.key];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      items.push({ color: line.color, label: line.label, value: fmt(v) });
+    }
+    return { items, title: p.label };
+  };
+
+  return (
+    <div className="w-full">
+      <div className="mb-2 flex flex-wrap items-center gap-x-3 gap-y-1 text-xs">
+        {PERCENTILE_LINES.map((line) => (
+          <div key={line.key} className="flex items-center gap-1.5">
+            <span className="inline-block w-3 h-0.5" style={{ backgroundColor: line.color }} />
+            <span className="text-muted-foreground">{line.label}</span>
+          </div>
+        ))}
+        <span className="ml-auto text-muted-foreground">
+          {points.length} configs · units: {unit}
+        </span>
+      </div>
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis ticks + gridlines */}
+        {yTicks.map((v, i) => {
+          const y = yOf(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left}
+                x2={PAD.left + innerW}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.08}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X-axis tick labels — one per sibling, rotated 30° to fit. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          return (
+            <g key={`x${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={PAD.top + innerH}
+                y2={PAD.top + innerH + 4}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={x}
+                y={PAD.top + innerH + 8}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor="end"
+                transform={`rotate(-30 ${x} ${PAD.top + innerH + 8})`}
+              >
+                {p.label}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X axis baseline */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.25}
+        />
+
+        {/* Percentile polylines + markers */}
+        {PERCENTILE_LINES.map((line) => {
+          const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
+          const markers: { x: number; y: number }[] = [];
+          let prev: { x: number; y: number } | null = null; // last plotted point; null after a gap so no segment bridges it
+          for (let i = 0; i < points.length; i++) {
+            const v = points[i]!.values[line.key];
+            if (typeof v !== 'number' || !Number.isFinite(v)) {
+              prev = null;
+              continue;
+            }
+            const x = xOf(i);
+            const y = yOf(v);
+            markers.push({ x, y });
+            if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
+            prev = { x, y };
+          }
+          return (
+            <g key={line.key}>
+              {segments.map((s, j) => (
+                <line
+                  key={`s${j}`}
+                  x1={s.x1}
+                  y1={s.y1}
+                  x2={s.x2}
+                  y2={s.y2}
+                  stroke={line.color}
+                  strokeWidth={1.5}
+                />
+              ))}
+              {markers.map((m, j) => (
+                <circle key={`m${j}`} cx={m.x} cy={m.y} r={3} fill={line.color} />
+              ))}
+            </g>
+          );
+        })}
+      </ChartHover>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index 776c8ba2..aa727fdc 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -48,7 +48,7 @@ function frameworkLabel(fw: string) {
 }
 
 /** Short label for a sibling chip: parallelism + concurrency. */
-function chipLabel(s: BenchmarkSibling): string {
+export function chipLabel(s: BenchmarkSibling): string {
   const parallel = s.disagg
     ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D`
     : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`;
diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts
new file mode 100644
index 00000000..4ca25ee2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts
@@ -0,0 +1,45 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface MetricPercentiles { // one metric's summary as parsed from the agentic-aggregates response
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  n: number; // sample count behind the percentiles (per the db-side declaration of this shape)
+}
+
+export interface AgenticAggregate { // all four metric summaries for one benchmark id
+  id: number;
+  isl: MetricPercentiles | null; // null = no data for this metric
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>; // keyed by sibling id (read as `aggregates?.[s.id]`)
+
+async function fetchAgenticAggregates(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<AgenticAggregateMap> {
+  if (!ids.length) return {}; // no ids → resolve empty without a request
+  const response = await fetch(`/api/v1/agentic-aggregates?ids=${ids.join(',')}`, { signal });
+  if (response.ok) return (await response.json()) as AgenticAggregateMap;
+  throw new Error(`agentic-aggregates ${response.status}`); // non-2xx → reject with the status code
+}
+
+/**
+ * Fetch per-id mean/p50/p75/p90/p99 summaries (ISL, OSL, KV cache utilization,
+ * prefix cache hit rate). Ids are de-duplicated and sorted so the cache key
+ * ignores order and repeats; idle while `enabled` is false or `ids` is empty.
+ */
+export function useAgenticAggregates(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).sort((a, b) => a - b);
+  return useQuery({
+    queryKey: ['agentic-aggregates', uniqueIds.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchAgenticAggregates(uniqueIds, signal),
+    enabled: enabled && uniqueIds.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
new file mode 100644
index 00000000..2a0305bf
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, it } from 'vitest';
+
+import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates.js';
+
+describe('percentilesOf', () => {
+  it('returns null for empty input', () => {
+    expect(percentilesOf([])).toBeNull();
+    expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull(); // all-non-finite input counts as empty too
+  });
+
+  it('computes percentiles for a simple integer range', () => {
+    // 1..100, evenly spaced — linear quantile is straightforward.
+    const xs = Array.from({ length: 100 }, (_, i) => i + 1);
+    const p = percentilesOf(xs);
+    expect(p).not.toBeNull();
+    expect(p!.n).toBe(100);
+    expect(p!.mean).toBeCloseTo(50.5, 6);
+    expect(p!.p50).toBeCloseTo(50.5, 6);
+    // Position = q * (n - 1): p75 → 74.25 → sorted[74] + 0.25 * (sorted[75] - sorted[74]) = 75.25.
+    expect(p!.p75).toBeCloseTo(75.25, 6);
+    expect(p!.p90).toBeCloseTo(90.1, 6);
+    expect(p!.p99).toBeCloseTo(99.01, 6);
+  });
+
+  it('filters out non-finite values before computing', () => {
+    const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]);
+    expect(p?.n).toBe(4); // only 1, 2, 3, 4 survive the filter
+    expect(p?.mean).toBeCloseTo(2.5, 6);
+  });
+});
+
+describe('extractIslOsl', () => {
+  it('reads input/output sequence length from profiling records', () => {
+    const lines = [ // one JSON record per line, joined with '\n' below (JSONL input)
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 200, unit: 'tokens' },
+          output_sequence_length: { value: 75, unit: 'tokens' },
+        },
+      }),
+      // warmup record — should be ignored
+      JSON.stringify({
+        metadata: { benchmark_phase: 'warmup' },
+        metrics: {
+          input_sequence_length: { value: 9999, unit: 'tokens' },
+          output_sequence_length: { value: 9999, unit: 'tokens' },
+        },
+      }),
+    ];
+    const { isl, osl } = extractIslOsl(lines.join('\n'));
+    expect(isl).toEqual([100, 200]); // input order kept; the warmup record's 9999 is absent
+    expect(osl).toEqual([50, 75]);
+  });
+});
+
+describe('extractServerMetricSamples', () => {
+  it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:kv_cache_usage_perc': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, end_ns: 1, avg: 0.1 },
+                { start_ns: 1, end_ns: 2, avg: 0.5 },
+                { start_ns: 2, end_ns: 3, avg: 0.9 },
+              ],
+            },
+          ],
+        },
+        'vllm:gpu_prefix_cache_hits': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 80 },
+                { start_ns: 1, rate: 50 },
+                { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0
+              ],
+            },
+          ],
+        },
+        'vllm:gpu_prefix_cache_queries': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 100 }, // hit rate = 0.8
+                { start_ns: 1, rate: 100 }, // hit rate = 0.5
+                { start_ns: 2, rate: 0 },
+              ],
+            },
+          ],
+        },
+      },
+    });
+    const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+    expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]); // the `avg` of each timeslice, in order
+    expect(prefixCacheHitRate).toEqual([0.8, 0.5]); // hits.rate / queries.rate; the zero-query slice is dropped
+  });
+
+  it('returns empty arrays when the JSON lacks the expected metric series', () => {
+    const out = extractServerMetricSamples(JSON.stringify({ metrics: {} }));
+    expect(out.kvCacheUtil).toEqual([]);
+    expect(out.prefixCacheHitRate).toEqual([]);
+  });
+});
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
new file mode 100644
index 00000000..49ae6900
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -0,0 +1,255 @@
+/**
+ * Per-id aggregate stats for the "Aggregates across configs" view on the
+ * agentic detail page. Each id contributes one summary number per metric per
+ * percentile so the frontend can plot how each metric varies across the
+ * SKU's parallelism + concurrency configs.
+ *
+ * Sources:
+ *  - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase)
+ *  - `server_metrics_json` → time-series of KV cache utilization +
+ *     prefix-cache hit rate per scrape interval
+ *
+ * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing
+ * or has no usable samples — frontend treats those as "no data".
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface MetricPercentiles {
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  /** Sample count used to compute the percentiles. */
+  n: number;
+}
+
+export interface AgenticAggregate {
+  id: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>;
+
+/**
+ * Each row pulls TWO compressed blobs (profile_export + server_metrics).
+ * `server_metrics_json_gz` can be up to ~17 MB compressed for high-conc
+ * runs, so even 3 rows can clear Neon's 64 MB cap. Stay conservative at 2.
+ * Chunks are issued in parallel below, so the wall-clock impact is small.
+ */
+const QUERY_CHUNK_SIZE = 2;
+
+/** Linear-interpolated percentile (matches numpy default). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/** Compute the percentile bundle for an array of samples; null if empty. */
+export function percentilesOf(samples: number[]): MetricPercentiles | null {
+  const clean = samples.filter((v) => Number.isFinite(v));
+  if (clean.length === 0) return null;
+  const sorted = [...clean].toSorted((a, b) => a - b);
+  return {
+    mean: meanOf(sorted),
+    p50: quantile(sorted, 0.5),
+    p75: quantile(sorted, 0.75),
+    p90: quantile(sorted, 0.9),
+    p99: quantile(sorted, 0.99),
+    n: sorted.length,
+  };
+}
+
+/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return v;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+interface ProfileRecord {
+  metadata?: { benchmark_phase?: string };
+  metrics?: {
+    input_sequence_length?: { value?: number } | number;
+    output_sequence_length?: { value?: number } | number;
+  };
+}
+
+/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */
+export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
+  const isl: number[] = [];
+  const osl: number[] = [];
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const m = rec.metrics ?? {};
+    const i = readNum(m.input_sequence_length);
+    const o = readNum(m.output_sequence_length);
+    if (typeof i === 'number') isl.push(i);
+    if (typeof o === 'number') osl.push(o);
+  }
+  return { isl, osl };
+}
+
+interface TimeSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+  count?: number;
+  sum?: number;
+}
+interface Series {
+  labels?: Record<string, string>;
+  timeslices?: TimeSlice[];
+}
+interface MetricMeta {
+  series?: Series[];
+}
+interface MetricsJson {
+  metrics?: Record<string, MetricMeta>;
+}
+
+/**
+ * Parse the server_metrics_json → time-series arrays for KV cache util and
+ * prefix cache hit rate (per-interval, computed from the prometheus
+ * counters the same way trace-server-metrics does it).
+ */
+export function extractServerMetricSamples(json: string): {
+  kvCacheUtil: number[];
+  prefixCacheHitRate: number[];
+} {
+  const parsed = JSON.parse(json) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+  const firstSeries = (name: string): Series | undefined => {
+    const s = metrics[name]?.series;
+    return s && s.length > 0 ? s[0] : undefined;
+  };
+
+  // KV cache util — gauge in [0, 1].
+  const kvSeries =
+    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
+  const kvCacheUtil: number[] = [];
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number') kvCacheUtil.push(ts.avg);
+  }
+
+  // Prefix cache hit rate per interval = hits.rate / queries.rate.
+  // Matches the derivation in queries/trace-server-metrics.ts.
+  const prefixCacheHitRate: number[] = [];
+  const hitsSeries = firstSeries('vllm:gpu_prefix_cache_hits');
+  const queriesSeries = firstSeries('vllm:gpu_prefix_cache_queries');
+  if (hitsSeries && queriesSeries) {
+    const qByStart = new Map<number, TimeSlice>();
+    for (const q of queriesSeries.timeslices ?? []) {
+      if (typeof q.start_ns === 'number') qByStart.set(q.start_ns, q);
+    }
+    for (const h of hitsSeries.timeslices ?? []) {
+      if (typeof h.start_ns !== 'number' || typeof h.rate !== 'number') continue;
+      const q = qByStart.get(h.start_ns);
+      if (!q || typeof q.rate !== 'number' || q.rate === 0) continue;
+      prefixCacheHitRate.push(h.rate / q.rate);
+    }
+  }
+
+  return { kvCacheUtil, prefixCacheHitRate };
+}
+
+export async function getAgenticAggregates(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<AgenticAggregateMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  // Serial chunks so we never have more than ~`QUERY_CHUNK_SIZE` blobs in
+  // memory at once. Some `server_metrics` blobs decompress to >100 MB; running
+  // all chunks in parallel OOMs the Node process. The aggregator is fronted by
+  // a blob cache (`blobOnly: true`), so the slow path runs at most once per
+  // sibling set.
+  const result: AgenticAggregateMap = {};
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob,
+        atr.server_metrics_json_gz as server_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as {
+      benchmark_result_id: number;
+      profile_blob: Buffer | null;
+      server_blob: Buffer | null;
+    }[];
+    for (const row of chunkRows) {
+      processRow(row, result);
+    }
+  }
+  return result;
+}
+
+function processRow(
+  row: { benchmark_result_id: number; profile_blob: Buffer | null; server_blob: Buffer | null },
+  result: AgenticAggregateMap,
+): void {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+
+  if (row.profile_blob) {
+    try {
+      const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+    } catch {
+      // ignore malformed blob
+    }
+  }
+  if (row.server_blob) {
+    try {
+      const json = gunzipSync(row.server_blob).toString('utf8');
+      const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+      kvPct = percentilesOf(kvCacheUtil);
+      prefixPct = percentilesOf(prefixCacheHitRate);
+    } catch {
+      // ignore malformed blob
+    }
+  }
+
+  result[Number(row.benchmark_result_id)] = {
+    id: Number(row.benchmark_result_id),
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+  };
+}

From 41ef33b21e6a34430be20e812e6eedbd7b8f90cf Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 16:13:49 -0500
Subject: [PATCH 036/111] fix(agentic-aggregates): metric name + stream-parse
 oversized blobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two issues left the Aggregates view mostly empty for the worst-case rows:

1. Prefix cache hit rate was null for EVERY row because the parser looked
   up `vllm:gpu_prefix_cache_*` but the actual metric names are
   `vllm:prefix_cache_*` (no `gpu_` prefix). Add fallback so both spellings
   work.

2. KV cache util + prefix cache hit rate were null for high-conc TP+EP
   rows. Their server_metrics_json decompresses past Node's max string
   length (0x1fffffe8 / 512 MB) because vllm dumps cache_config_info into
   every scrape interval, repeated thousands of times. `gunzipSync().toString()`
   threw ERR_STRING_TOO_LONG and the silent catch left both metrics null.

   Added stream-json fallback: pipe Buffer → gunzip → JSON parser →
   pick('metrics') → streamObject; only the metric keys we care about land
   in memory. Avoids ever materializing the 500+ MB JSON string. The
   fast path stays — sync gunzip + JSON.parse is used unless it throws.

Also split the DB fetch into two passes (profile blobs in batches of 8,
server blobs one at a time) so the server query response stays under
Neon's 64 MB HTTP cap on rows where the compressed server blob alone is
~17 MB and Neon's bytea-over-HTTP encoding inflates it ~1.6×.

Chart redesign: AggregateChart now draws a vertical bar per sibling
spanning the percentile range, with colored ticks at each percentile and
a diamond at the mean. Horizontal connecting lines per percentile remain
as a faint backdrop so the reader can still follow trends across configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/aggregate-chart.tsx         |  72 ++++++-
 packages/db/package.json                      |   5 +-
 .../db/src/queries/agentic-aggregates.test.ts |   4 +-
 packages/db/src/queries/agentic-aggregates.ts | 197 ++++++++++++------
 pnpm-lock.yaml                                |  36 ++++
 5 files changed, 242 insertions(+), 72 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
index 446677ad..55ac8061 100644
--- a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -188,10 +188,10 @@ export function AggregateChart({
           opacity={0.25}
         />
 
-        {/* Percentile polylines + markers */}
+        {/* Horizontal connecting lines per percentile — faint backdrop so the
+            eye can follow how each percentile changes across configs. */}
         {PERCENTILE_LINES.map((line) => {
           const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
-          const markers: { x: number; y: number }[] = [];
           let prev: { x: number; y: number } | null = null;
           for (let i = 0; i < points.length; i++) {
             const v = points[i]!.values[line.key];
@@ -201,12 +201,11 @@ export function AggregateChart({
             }
             const x = xOf(i);
             const y = yOf(v);
-            markers.push({ x, y });
             if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
             prev = { x, y };
           }
           return (
-            <g key={line.key}>
+            <g key={`hline-${line.key}`} opacity={0.35}>
               {segments.map((s, j) => (
                 <line
                   key={`s${j}`}
@@ -215,12 +214,69 @@ export function AggregateChart({
                   x2={s.x2}
                   y2={s.y2}
                   stroke={line.color}
-                  strokeWidth={1.5}
+                  strokeWidth={1}
                 />
               ))}
-              {markers.map((m, j) => (
-                <circle key={`m${j}`} cx={m.x} cy={m.y} r={3} fill={line.color} />
-              ))}
+            </g>
+          );
+        })}
+
+        {/* Per-sibling vertical bar spanning the percentile range, with a
+            colored tick at each percentile level. Mean rendered as a small
+            diamond to distinguish from the percentile ticks. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          // Collect percentile values present for this sibling.
+          const present = PERCENTILE_LINES.filter(
+            (line) =>
+              typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!),
+          ).map((line) => ({ ...line, value: p.values[line.key]! }));
+          if (present.length === 0) return null;
+          // Only the *percentile* values define the bar extent; mean might be
+          // outside the percentile span on weird distributions.
+          const pctlOnly = present.filter((p2) => p2.key !== 'mean');
+          const bandValues = pctlOnly.length > 0 ? pctlOnly : present;
+          const bandYs = bandValues.map((b) => yOf(b.value));
+          const yLo = Math.min(...bandYs);
+          const yHi = Math.max(...bandYs);
+          return (
+            <g key={`bar-${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={yLo}
+                y2={yHi}
+                stroke="currentColor"
+                strokeWidth={1}
+                opacity={0.35}
+              />
+              {present.map((b) => {
+                const ty = yOf(b.value);
+                if (b.key === 'mean') {
+                  // Diamond marker for mean.
+                  const s = 4;
+                  return (
+                    <polygon
+                      key={`m-${b.key}`}
+                      points={`${x},${ty - s} ${x + s},${ty} ${x},${ty + s} ${x - s},${ty}`}
+                      fill={b.color}
+                      stroke={b.color}
+                    />
+                  );
+                }
+                // Horizontal tick at each percentile.
+                return (
+                  <line
+                    key={`tk-${b.key}`}
+                    x1={x - 6}
+                    x2={x + 6}
+                    y1={ty}
+                    y2={ty}
+                    stroke={b.color}
+                    strokeWidth={2.5}
+                  />
+                );
+              })}
             </g>
           );
         })}
diff --git a/packages/db/package.json b/packages/db/package.json
index c849ea26..d7caf34d 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -30,11 +30,14 @@
     "@neondatabase/serverless": "^1.1.0",
     "@noble/ciphers": "^2.2.0",
     "@semianalysisai/inferencex-constants": "workspace:*",
-    "postgres": "^3.4.9"
+    "postgres": "^3.4.9",
+    "stream-chain": "^3.4.0",
+    "stream-json": "^2.1.0"
   },
   "devDependencies": {
     "@types/adm-zip": "^0.5.8",
     "@types/node": "^25.7.0",
+    "@types/stream-json": "^1.7.8",
     "@vitest/coverage-v8": "^4.1.6",
     "adm-zip": "^0.5.17",
     "dotenv-cli": "^11.0.0",
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
index 2a0305bf..8c712323 100644
--- a/packages/db/src/queries/agentic-aggregates.test.ts
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -76,7 +76,7 @@ describe('extractServerMetricSamples', () => {
             },
           ],
         },
-        'vllm:gpu_prefix_cache_hits': {
+        'vllm:prefix_cache_hits': {
           series: [
             {
               timeslices: [
@@ -87,7 +87,7 @@ describe('extractServerMetricSamples', () => {
             },
           ],
         },
-        'vllm:gpu_prefix_cache_queries': {
+        'vllm:prefix_cache_queries': {
           series: [
             {
               timeslices: [
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 49ae6900..22ec7b28 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -13,7 +13,14 @@
  * or has no usable samples — frontend treats those as "no data".
  */
 
-import { gunzipSync } from 'node:zlib';
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
 
@@ -38,12 +45,15 @@ export interface AgenticAggregate {
 export type AgenticAggregateMap = Record<number, AgenticAggregate>;
 
 /**
- * Each row pulls TWO compressed blobs (profile_export + server_metrics).
- * `server_metrics_json_gz` can be up to ~17 MB compressed for high-conc
- * runs, so even 3 rows can clear Neon's 64 MB cap. Stay conservative at 2.
- * Chunks are issued in parallel below, so the wall-clock impact is small.
+ * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per
+ * round-trip. `server_metrics_json_gz` is much bigger (~17 MB compressed
+ * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire
+ * size, so two of those = ~50 MB and three already trips the 64 MB cap).
+ * We fetch the two blob types in separate queries with different chunk
+ * sizes.
  */
-const QUERY_CHUNK_SIZE = 2;
+const PROFILE_CHUNK_SIZE = 8;
+const SERVER_CHUNK_SIZE = 1;
 
 /** Linear-interpolated percentile (matches numpy default). */
 function quantile(sortedAsc: number[], q: number): number {
@@ -162,9 +172,14 @@ export function extractServerMetricSamples(json: string): {
 
   // Prefix cache hit rate per interval = hits.rate / queries.rate.
   // Matches the derivation in queries/trace-server-metrics.ts.
+  // Metric names: vllm exposes these as `vllm:prefix_cache_*` (no `gpu_`
+  // prefix); falls back to the legacy `gpu_`-prefixed names in case a
+  // blob still carries the older spelling.
   const prefixCacheHitRate: number[] = [];
-  const hitsSeries = firstSeries('vllm:gpu_prefix_cache_hits');
-  const queriesSeries = firstSeries('vllm:gpu_prefix_cache_queries');
+  const hitsSeries =
+    firstSeries('vllm:prefix_cache_hits') ?? firstSeries('vllm:gpu_prefix_cache_hits');
+  const queriesSeries =
+    firstSeries('vllm:prefix_cache_queries') ?? firstSeries('vllm:gpu_prefix_cache_queries');
   if (hitsSeries && queriesSeries) {
     const qByStart = new Map<number, TimeSlice>();
     for (const q of queriesSeries.timeslices ?? []) {
@@ -181,75 +196,135 @@ export function extractServerMetricSamples(json: string): {
   return { kvCacheUtil, prefixCacheHitRate };
 }
 
+/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc', // older fallback name
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics
+ * we need. Avoids the Node 512 MB string cap that JSON.parse hits on
+ * server_metrics blobs from high-conc TP+EP runs (which can decompress to
+ * >500 MB because vllm dumps `cache_config_info` every scrape interval).
+ *
+ * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') →
+ * StreamObject (one metric per chunk) → keep only the keys we care about.
+ *
+ * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the
+ * synchronous fast path so callers can use either interchangeably.
+ */
+async function streamExtractServerMetricSamples(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  const collected: Record<string, MetricMeta> = {};
+  // stream-json composes its stages via stream-chain: `pick`/`streamObject`
+  // each return a Transform when called, and `chain([...])` wires them
+  // together behind the gunzip + parser stages. The chain's event API is
+  // typed loosely and doesn't line up cleanly with node:stream's typings,
+  // so the `.on(...)` calls below cast to any. It works at runtime.
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: MetricMeta };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
 export async function getAgenticAggregates(
   sql: DbClient,
   benchmarkResultIds: number[],
 ): Promise<AgenticAggregateMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  // Serial chunks so we never have more than ~`QUERY_CHUNK_SIZE` blobs in
-  // memory at once. Some `server_metrics` blobs decompress to >100 MB; running
-  // all chunks in parallel OOMs the Node process. The aggregator is fronted by
-  // a blob cache (`blobOnly: true`), so the slow path runs at most once per
-  // sibling set.
   const result: AgenticAggregateMap = {};
-  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
-    const chunkRows = (await sql`
+  // ── Pass 1: profile_export blobs (cheap; large batches). ────────────────
+  for (let i = 0; i < benchmarkResultIds.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + PROFILE_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; profile_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (row.profile_blob) {
+        try {
+          const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+          const { isl, osl } = extractIslOsl(jsonl);
+          result[id].isl = percentilesOf(isl);
+          result[id].osl = percentilesOf(osl);
+        } catch {
+          // ignore malformed blob
+        }
+      }
+    }
+  }
+  // ── Pass 2: server_metrics blobs (huge; one at a time). ────────────────
+  // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
+  // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
+  // path runs at most once per sibling set.
+  for (let i = 0; i < benchmarkResultIds.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + SERVER_CHUNK_SIZE);
+    const rows = (await sql`
       select
         br.id as benchmark_result_id,
-        atr.profile_export_jsonl_gz as profile_blob,
         atr.server_metrics_json_gz as server_blob
       from benchmark_results br
       join agentic_trace_replay atr on atr.id = br.trace_replay_id
       where br.id = any(${chunk}::bigint[])
-    `) as {
-      benchmark_result_id: number;
-      profile_blob: Buffer | null;
-      server_blob: Buffer | null;
-    }[];
-    for (const row of chunkRows) {
-      processRow(row, result);
+    `) as { benchmark_result_id: number; server_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (!row.server_blob) continue;
+      let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+      try {
+        const json = gunzipSync(row.server_blob).toString('utf8');
+        parsed = extractServerMetricSamples(json);
+      } catch (error) {
+        // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose
+        // server_metrics_json decompresses past Node's max string length.
+        // Stream-parse to extract just the metric subtrees we care about.
+        const code = error && (error as NodeJS.ErrnoException).code;
+        const msg = error instanceof Error ? error.message : String(error);
+        if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+          try {
+            parsed = await streamExtractServerMetricSamples(row.server_blob);
+          } catch {
+            // stream fallback failed too — leave nulls
+          }
+        }
+      }
+      if (parsed) {
+        result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil);
+        result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate);
+      }
     }
   }
   return result;
 }
 
-function processRow(
-  row: { benchmark_result_id: number; profile_blob: Buffer | null; server_blob: Buffer | null },
-  result: AgenticAggregateMap,
-): void {
-  let islPct: MetricPercentiles | null = null;
-  let oslPct: MetricPercentiles | null = null;
-  let kvPct: MetricPercentiles | null = null;
-  let prefixPct: MetricPercentiles | null = null;
-
-  if (row.profile_blob) {
-    try {
-      const jsonl = gunzipSync(row.profile_blob).toString('utf8');
-      const { isl, osl } = extractIslOsl(jsonl);
-      islPct = percentilesOf(isl);
-      oslPct = percentilesOf(osl);
-    } catch {
-      // ignore malformed blob
-    }
-  }
-  if (row.server_blob) {
-    try {
-      const json = gunzipSync(row.server_blob).toString('utf8');
-      const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
-      kvPct = percentilesOf(kvCacheUtil);
-      prefixPct = percentilesOf(prefixCacheHitRate);
-    } catch {
-      // ignore malformed blob
-    }
-  }
-
-  result[Number(row.benchmark_result_id)] = {
-    id: Number(row.benchmark_result_id),
-    isl: islPct,
-    osl: oslPct,
-    kvCacheUtil: kvPct,
-    prefixCacheHitRate: prefixPct,
-  };
+function blankAggregate(id: number): AgenticAggregate {
+  return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 14505e57..717ffc5c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -249,6 +249,12 @@ importers:
       postgres:
         specifier: ^3.4.9
         version: 3.4.9
+      stream-chain:
+        specifier: ^3.4.0
+        version: 3.6.3
+      stream-json:
+        specifier: ^2.1.0
+        version: 2.1.0
     devDependencies:
       '@types/adm-zip':
         specifier: ^0.5.8
@@ -256,6 +262,9 @@ importers:
       '@types/node':
         specifier: ^25.7.0
         version: 25.7.0
+      '@types/stream-json':
+        specifier: ^1.7.8
+        version: 1.7.8
       '@vitest/coverage-v8':
         specifier: ^4.1.6
         version: 4.1.6(vitest@4.1.6)
@@ -2334,6 +2343,12 @@ packages:
   '@types/stats.js@0.17.4':
     resolution: {integrity: sha512-jIBvWWShCvlBqBNIZt0KAshWpvSjhkwkEu4ZUcASoAvhmrgAUI2t1dXrjSL4xXVLB4FznPrIsX3nKXFl/Dt4vA==}
 
+  '@types/stream-chain@2.1.0':
+    resolution: {integrity: sha512-guDyAl6s/CAzXUOWpGK2bHvdiopLIwpGu8v10+lb9hnQOyo4oj/ZUQFOvqFjKGsE3wJP1fpIesCcMvbXuWsqOg==}
+
+  '@types/stream-json@1.7.8':
+    resolution: {integrity: sha512-MU1OB1eFLcYWd1LjwKXrxdoPtXSRzRmAnnxs4Js/ayB5O/NvHraWwuOaqMWIebpYwM6khFlsJOHEhI9xK/ab4Q==}
+
   '@types/three@0.184.1':
     resolution: {integrity: sha512-6q4VdiqVsrTRqmk62/BnlcAvIrnDM0zf2ZDVKI5kZiniWrSaOHaQzmbp+BNzoggc/8tgW412pL//wZIxu2PPTA==}
 
@@ -5074,9 +5089,15 @@ packages:
     resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==}
     engines: {node: '>= 0.4'}
 
+  stream-chain@3.6.3:
+    resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==}
+
   stream-combiner@0.2.2:
     resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==}
 
+  stream-json@2.1.0:
+    resolution: {integrity: sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==}
+
   string-width@4.2.3:
     resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
     engines: {node: '>=8'}
@@ -7392,6 +7413,15 @@ snapshots:
 
   '@types/stats.js@0.17.4': {}
 
+  '@types/stream-chain@2.1.0':
+    dependencies:
+      '@types/node': 25.7.0
+
+  '@types/stream-json@1.7.8':
+    dependencies:
+      '@types/node': 25.7.0
+      '@types/stream-chain': 2.1.0
+
   '@types/three@0.184.1':
     dependencies:
       '@dimforge/rapier3d-compat': 0.12.0
@@ -10752,11 +10782,17 @@ snapshots:
       es-errors: 1.3.0
       internal-slot: 1.1.0
 
+  stream-chain@3.6.3: {}
+
   stream-combiner@0.2.2:
     dependencies:
       duplexer: 0.1.2
       through: 2.3.8
 
+  stream-json@2.1.0:
+    dependencies:
+      stream-chain: 3.6.3
+
   string-width@4.2.3:
     dependencies:
       emoji-regex: 8.0.0

From 1cedd240e95b52789690919cc4b13600920d842f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:04:02 -0500
Subject: [PATCH 037/111] feat(agentic-aggregates): pre-compute stats at ingest
 time

Detail page was decompressing + parsing every trace_replay blob on each
request, sometimes hitting Node's 512 MB string cap on high-conc TP+EP
server_metrics_json. Pre-compute the percentile + derived bundles into
a versioned `aggregate_stats` JSONB column, mirroring the pattern Alec
suggested. APIs read the column first and only fall back to the slow
blob-parse path for rows the backfill hasn't drained.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../008_agentic_aggregate_stats.sql           |  18 +++
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-aggregate-stats.ts   | 150 ++++++++++++++++++
 .../src/etl/compute-aggregate-stats.test.ts   | 123 ++++++++++++++
 .../db/src/etl/compute-aggregate-stats.ts     | 147 +++++++++++++++++
 packages/db/src/etl/trace-replay-ingest.ts    |  17 +-
 packages/db/src/queries/agentic-aggregates.ts |  77 ++++++++-
 .../db/src/queries/derived-agentic-metrics.ts |  47 +++++-
 8 files changed, 569 insertions(+), 11 deletions(-)
 create mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql
 create mode 100644 packages/db/src/backfill-aggregate-stats.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.test.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.ts

diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql
new file mode 100644
index 00000000..d55533b9
--- /dev/null
+++ b/packages/db/migrations/008_agentic_aggregate_stats.sql
@@ -0,0 +1,18 @@
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+  add column aggregate_stats jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index d7caf34d..f3f92311 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,6 +19,7 @@
     "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+    "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
new file mode 100644
index 00000000..8dd42dce
--- /dev/null
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -0,0 +1,150 @@
+/**
+ * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it
+ * or were computed by an older `STATS_VERSION`.
+ *
+ * The ingest path now computes stats inline, but existing rows (and rows
+ * whose computation logic has since changed) still need this pass. Run after
+ * applying migration 008 and any time `STATS_VERSION` bumps.
+ *
+ * Strategy:
+ *   - Stream rows one at a time (server_metrics_json_gz can be hundreds of
+ *     MB decompressed for TP+EP / high-conc points — keeping one in memory
+ *     at a time avoids OOM).
+ *   - Skip rows whose stored `aggregate_stats.version` already matches.
+ *   - Recompute via the same `computeAggregateStats()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats
+ *     [--limit N]   only process the first N candidate rows (useful for
+ *                   smoke-tests on a fresh deploy)
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+/** Parse `--force` / `--limit N`; N must be a positive integer (0 would read as "no limit"). */
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      limit = Number(process.argv[++i] ?? '');
+      if (!Number.isInteger(limit) || limit <= 0) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-aggregate-stats ===');
+  console.log(`  STATS_VERSION = ${STATS_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Find candidates: rows missing stats, or whose stored version is stale.
+  // A bare (->>'version')::int <> N yields NULL, not true, when the key is
+  // missing, so the row would be skipped; coalesce to -1 to count it as stale.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where aggregate_stats is null
+           or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION}
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      // Fetch one row at a time — the json_gz blob is the heavy field.
+      const [row] = await sql<
+        { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[]
+      >`
+        select profile_export_jsonl_gz, server_metrics_json_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+
+      const stats = await computeAggregateStats({
+        profileBlob: row.profile_export_jsonl_gz,
+        serverBlob: row.server_metrics_json_gz,
+      });
+
+      await sql`
+        update agentic_trace_replay
+        set aggregate_stats = ${sql.json(structuredClone(stats) as unknown as Parameters<typeof sql.json>[0])}
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-aggregate-stats failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
new file mode 100644
index 00000000..de0009de
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -0,0 +1,123 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js';
+
+/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
+function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
+  const lines = requests.map((r, i) =>
+    JSON.stringify({
+      metadata: {
+        benchmark_phase: 'profiling',
+        conversation_id: `conv-${i}`,
+        turn_index: 0,
+      },
+      metrics: {
+        input_sequence_length: { value: r.isl, unit: 'tokens' },
+        output_sequence_length: { value: r.osl, unit: 'tokens' },
+        request_latency: { value: r.rl ?? 1000, unit: 'ms' },
+        time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' },
+      },
+    }),
+  );
+  return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */
+function makeServerBlob() {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1, avg: 0.2 },
+              { start_ns: 1, end_ns: 2, avg: 0.5 },
+              { start_ns: 2, end_ns: 3, avg: 0.8 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeAggregateStats', () => {
+  it('returns the current STATS_VERSION in the bundle', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+
+  it('leaves every metric null when both blobs are null', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+  });
+
+  it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
+    const profileBlob = makeProfileBlob([
+      { isl: 100, osl: 50, rl: 1000, ttft: 100 },
+      { isl: 200, osl: 75, rl: 2000, ttft: 200 },
+      { isl: 300, osl: 100, rl: 3000, ttft: 300 },
+    ]);
+    const stats = await computeAggregateStats({ profileBlob, serverBlob: null });
+
+    expect(stats.isl?.n).toBe(3);
+    expect(stats.isl?.mean).toBeCloseTo(200, 6);
+    expect(stats.osl?.n).toBe(3);
+    expect(stats.osl?.mean).toBeCloseTo(75, 6);
+
+    // Server-side metrics still null when there's no server blob.
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+
+    // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000.
+    expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6);
+    // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+    //   loads = [150, 275, 400], mean_load = 275
+    //   scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
+    //   mean ≈ 1.9653
+    expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+  });
+
+  it('computes KV util + prefix hit rate from the server blob alone', async () => {
+    const stats = await computeAggregateStats({
+      profileBlob: null,
+      serverBlob: makeServerBlob(),
+    });
+    expect(stats.kvCacheUtil?.n).toBe(3);
+    expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6);
+    expect(stats.prefixCacheHitRate?.n).toBe(1);
+    expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6);
+
+    // Profile-derived metrics absent.
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+  });
+
+  it('tolerates a malformed profile blob by leaving its metrics null', async () => {
+    // A random non-gzip buffer triggers a gunzip error — code path swallows it.
+    const garbage = Buffer.from('not-gzip-data');
+    const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+    // Version still set so the row is considered "computed".
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
new file mode 100644
index 00000000..a422cfec
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -0,0 +1,147 @@
+/**
+ * Pre-compute the per-row aggregate stats for an `agentic_trace_replay`
+ * blob pair. The output lands in the `aggregate_stats` JSONB column so the
+ * detail page can serve the "Aggregates across configs" view and the
+ * derived chart x-axis modes from a single SQL row read, instead of
+ * parsing the raw blobs on demand.
+ *
+ * Shape is intentionally versioned — bump `STATS_VERSION` whenever the
+ * computation changes so the backfill script knows which rows to recompute.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics.js';
+import {
+  STATS_VERSION,
+  extractIslOsl,
+  extractServerMetricSamples,
+  percentilesOf,
+  type MetricPercentiles,
+} from '../queries/agentic-aggregates.js';
+
+export { STATS_VERSION };
+
+export interface AggregateStats {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */
+  normalizedSessionTimeS: number | null;
+  /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
+  p90PrefillTpsPerUser: number | null;
+}
+
+/** Metric subtrees we extract via stream-parse on oversized server blobs. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits',
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect just the metric
+ * subtrees we care about. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows.
+ */
+async function streamExtractServer(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: Record<string, unknown> = {};
+  const pipelineStream = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipelineStream as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: unknown };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipelineStream as any).on('end', resolve);
+    (pipelineStream as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+/**
+ * Compute the full versioned stats bundle from a (profile, server-metrics)
+ * blob pair. Either blob may be null (e.g. only the server file existed) —
+ * the corresponding stats just come back null.
+ */
+export async function computeAggregateStats(args: {
+  profileBlob: Buffer | null;
+  serverBlob: Buffer | null;
+}): Promise<AggregateStats> {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let normalized: number | null = null;
+  let prefillP90: number | null = null;
+
+  if (args.profileBlob) {
+    try {
+      const jsonl = gunzipSync(args.profileBlob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+      const derived = computeDerivedFromBlob(jsonl);
+      normalized = derived.normalized_session_time_s;
+      prefillP90 = derived.p90_prefill_tps_per_user;
+    } catch {
+      // ignore malformed blob — leave nulls
+    }
+  }
+
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+  if (args.serverBlob) {
+    let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+    try {
+      const json = gunzipSync(args.serverBlob).toString('utf8');
+      server = extractServerMetricSamples(json);
+    } catch (error) {
+      const code = error && (error as NodeJS.ErrnoException).code;
+      const msg = error instanceof Error ? error.message : String(error);
+      // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to
+      // pull just the metric subtrees we need without materializing the
+      // full 500+ MB JSON string.
+      if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+        try {
+          server = await streamExtractServer(args.serverBlob);
+        } catch {
+          // stream fallback failed too — leave nulls
+        }
+      }
+    }
+    if (server) {
+      kvPct = percentilesOf(server.kvCacheUtil);
+      prefixPct = percentilesOf(server.prefixCacheHitRate);
+    }
+  }
+
+  return {
+    version: STATS_VERSION,
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+    normalizedSessionTimeS: normalized,
+    p90PrefillTpsPerUser: prefillP90,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8c6d92b6..423f70e7 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -12,6 +12,8 @@ import { gzipSync } from 'node:zlib';
 
 import type postgres from 'postgres';
 
+import { computeAggregateStats } from './compute-aggregate-stats.js';
+
 type Sql = ReturnType<typeof postgres>;
 
 /**
@@ -55,6 +57,15 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
+  // Pre-compute the aggregate stats so the detail page / aggregates view
+  // doesn't have to re-parse these blobs on every request. The compute
+  // function tolerates one-or-both blobs being null and falls back to a
+  // streaming parser for oversized server_metrics blobs.
+  const aggregateStats = await computeAggregateStats({
+    profileBlob: profileGz,
+    serverBlob: metricsJsonGz,
+  });
+
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
       profile_export_jsonl_gz,
@@ -62,7 +73,8 @@ export async function insertTraceReplay(
       server_metrics_csv,
       server_metrics_csv_size,
       server_metrics_json_gz,
-      server_metrics_json_uncompressed_size
+      server_metrics_json_uncompressed_size,
+      aggregate_stats
     )
     values (
       ${profileGz},
@@ -70,7 +82,8 @@ export async function insertTraceReplay(
       ${serverMetricsCsv},
       ${csvSize},
       ${metricsJsonGz},
-      ${metricsJsonSize}
+      ${metricsJsonSize},
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 22ec7b28..8ac4f678 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -24,6 +24,14 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
 
+/**
+ * Bump when the aggregate-stats computation algorithm changes — the backfill
+ * script recomputes any row whose stored `aggregate_stats.version` is older.
+ * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
+ * import: the compute helper depends on the percentile utilities below.
+ */
+export const STATS_VERSION = 1;
+
 export interface MetricPercentiles {
   mean: number;
   p50: number;
@@ -254,9 +262,55 @@ export async function getAgenticAggregates(
   if (benchmarkResultIds.length === 0) return {};
 
   const result: AgenticAggregateMap = {};
-  // ── Pass 1: profile_export blobs (cheap; large batches). ────────────────
-  for (let i = 0; i < benchmarkResultIds.length; i += PROFILE_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + PROFILE_CHUNK_SIZE);
+
+  // Fast path: read the pre-computed `aggregate_stats` JSONB written by the
+  // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One
+  // round-trip pulls everything we need for every requested id with no blob
+  // decompression, so the slow blob-parsing fallback only runs for ids
+  // whose stats are missing or were produced by an older `STATS_VERSION`.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: AggregateStatsRow | null;
+  }[];
+
+  const idsNeedingProfile: number[] = [];
+  const idsNeedingServer: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    const agg = blankAggregate(id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      agg.isl = row.stats.isl ?? null;
+      agg.osl = row.stats.osl ?? null;
+      agg.kvCacheUtil = row.stats.kvCacheUtil ?? null;
+      agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null;
+    } else {
+      // No stats (or stale version) — schedule the blob-parse fallback below
+      // so the response still surfaces data. Backfill should drain these.
+      idsNeedingProfile.push(id);
+      idsNeedingServer.push(id);
+    }
+    result[id] = agg;
+  }
+  // Ids that returned no row at all (no trace_replay link) get a blank
+  // aggregate — keep the caller contract: every id we know about lands in the map.
+  for (const id of benchmarkResultIds) {
+    if (!(id in result)) result[id] = blankAggregate(id);
+  }
+
+  if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) {
+    return result;
+  }
+
+  // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ──────
+  for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -280,12 +334,12 @@ export async function getAgenticAggregates(
       }
     }
   }
-  // ── Pass 2: server_metrics blobs (huge; one at a time). ────────────────
+  // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ───────
   // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
   // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
   // path runs at most once per sibling set.
-  for (let i = 0; i < benchmarkResultIds.length; i += SERVER_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + SERVER_CHUNK_SIZE);
+  for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -325,6 +379,17 @@ export async function getAgenticAggregates(
   return result;
 }
 
+/** Shape of the JSONB column when read back via postgres-js. */
+interface AggregateStatsRow {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+}
+
 function blankAggregate(id: number): AgenticAggregate {
   return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index ac6fd38d..a14a1727 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -21,6 +21,7 @@
 import { gunzipSync } from 'node:zlib';
 
 import type { DbClient } from '../connection.js';
+import { STATS_VERSION } from './agentic-aggregates.js';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */
@@ -190,9 +191,50 @@ export async function getDerivedAgenticMetrics(
 ): Promise<DerivedAgenticMetricMap> {
   if (benchmarkResultIds.length === 0) return {};
 
+  const result: DerivedAgenticMetricMap = {};
+
+  // Fast path: read the pre-computed values out of `aggregate_stats`. The
+  // ingest pipeline computes both metrics in the same pass that produces the
+  // percentile bundles, so a single SQL round-trip covers most ids without
+  // touching the gzipped profile blob.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: {
+      version?: number;
+      normalizedSessionTimeS?: number | null;
+      p90PrefillTpsPerUser?: number | null;
+    } | null;
+  }[];
+
+  const idsNeedingBlob: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      result[id] = {
+        id,
+        normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null,
+        p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null,
+      };
+    } else {
+      idsNeedingBlob.push(id);
+    }
+  }
+
+  if (idsNeedingBlob.length === 0) return result;
+
+  // Fallback: parse the profile blob directly. Used for rows whose
+  // `aggregate_stats` is null or computed by an older STATS_VERSION; the
+  // backfill script drains the population so this path should be rare.
   const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
-  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+  for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE);
     const chunkRows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -205,7 +247,6 @@ export async function getDerivedAgenticMetrics(
     rows.push(...chunkRows);
   }
 
-  const result: DerivedAgenticMetricMap = {};
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');

From 9d9c7c13413c16a147b176691782827d5ee8d21d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:07:30 -0500
Subject: [PATCH 038/111] fix(agentic-aggregates): drop .js extension on
 app-route-traced import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turbopack doesn't do TypeScript's `.js → .ts` substitution when an
app-route bundles an intra-package value import, so the new
`STATS_VERSION` import broke the /api/v1/derived-agentic-metrics
route. The same `.js` value-import pattern works for files not pulled
into an app route (e.g. workflow-run.ts → run-overrides.ts) so the
existing intra-package imports are left alone.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/queries/derived-agentic-metrics.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index a14a1727..35a4b76c 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -21,7 +21,7 @@
 import { gunzipSync } from 'node:zlib';
 
 import type { DbClient } from '../connection.js';
-import { STATS_VERSION } from './agentic-aggregates.js';
+import { STATS_VERSION } from './agentic-aggregates';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */

From 6063d01e2d563951d70dea699edd30a6b06df81a Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:18:31 -0500
Subject: [PATCH 039/111] feat(agentic-detail): pre-compute chart_series at
 ingest time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detail page was parsing the entire server_metrics_json_gz blob on every
request — fine for small rows, but TP+EP high-conc rows decompress past
Node's 512 MB max-string-length cap and threw ERR_STRING_TOO_LONG,
killing the page for point 206242 et al.

Extends the Alec-pattern to the time-series path: new `chart_series`
JSONB column holds pre-extracted kvCacheUsage, prefixCacheHitRate,
queueDepth, prefillTps, decodeTps, and promptTokensBySource arrays.
The API fast-path is a single SQL row read; the slow path (compute
from blob, with stream-parse fallback for oversized rows) only runs
for rows whose chart_series is missing or stale-versioned.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../migrations/009_agentic_chart_series.sql   |  19 ++
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-chart-series.ts      | 154 ++++++++++
 .../db/src/etl/compute-chart-series.test.ts   | 129 +++++++++
 packages/db/src/etl/compute-chart-series.ts   | 268 ++++++++++++++++++
 packages/db/src/etl/trace-replay-ingest.ts    |  21 +-
 .../db/src/queries/trace-server-metrics.ts    | 261 +++++------------
 7 files changed, 654 insertions(+), 199 deletions(-)
 create mode 100644 packages/db/migrations/009_agentic_chart_series.sql
 create mode 100644 packages/db/src/backfill-chart-series.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.test.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.ts

diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql
new file mode 100644
index 00000000..b42718b9
--- /dev/null
+++ b/packages/db/migrations/009_agentic_chart_series.sql
@@ -0,0 +1,19 @@
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (migration 008): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+  add column chart_series jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index f3f92311..f97c442a 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -20,6 +20,7 @@
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
+    "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
new file mode 100644
index 00000000..66156b45
--- /dev/null
+++ b/packages/db/src/backfill-chart-series.ts
@@ -0,0 +1,154 @@
+/**
+ * Backfill `agentic_trace_replay.chart_series` for rows that are missing it
+ * or were computed by an older `CHART_SERIES_VERSION`.
+ *
+ * The ingest path now computes the time-series inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION`
+ * bumps.
+ *
+ * Strategy:
+ *   - Stream rows one at a time (server_metrics_json_gz can decompress
+ *     past 500 MB on high-conc TP+EP points — one in memory at a time
+ *     avoids OOM).
+ *   - Skip rows whose stored version already matches.
+ *   - Recompute via the same `computeChartSeries()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+/** Parse --force / --limit N; N must be a positive integer (0 would silently drop the LIMIT). */
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    if (process.argv[i] === '--force') force = true;
+    else if (process.argv[i] === '--limit') {
+      const parsed = Number(process.argv[++i]);
+      if (!Number.isInteger(parsed) || parsed <= 0) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+      limit = parsed;
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-chart-series ===');
+  console.log(`  CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows that actually have a server_metrics blob can produce a
+  // chart_series. Rows without the blob legitimately keep `chart_series`
+  // null; `getTraceServerMetrics` returns null for them before it reaches
+  // either the fast or the slow path, so the page falls into the
+  // "no stored trace_replay blob" branch.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+          and (
+            chart_series is null
+            or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+        select server_metrics_json_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+
+      const series = await computeChartSeries(row.server_metrics_json_gz);
+
+      await sql`
+        update agentic_trace_replay
+        set chart_series = ${
+          series === null
+            ? null
+            : sql.json(structuredClone(series) as unknown as Parameters<typeof sql.json>[0])
+        }
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-chart-series failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
new file mode 100644
index 00000000..dafc7200
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -0,0 +1,129 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js';
+
+/**
+ * Build a minimal server_metrics_json blob covering the metrics the chart
+ * consumes. Each timeslice is one second long starting at t=0.
+ */
+function makeBlob(opts?: {
+  prefixHits?: number;
+  prefixQueries?: number;
+  promptTokensRate?: number;
+}) {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1e9, avg: 0.1 },
+              { start_ns: 1e9, end_ns: 2e9, avg: 0.4 },
+              { start_ns: 2e9, end_ns: 3e9, avg: 0.7 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }],
+      },
+      'vllm:num_requests_running': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }],
+      },
+      'vllm:num_requests_waiting': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }],
+      },
+      'vllm:prompt_tokens': {
+        series: [
+          { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] },
+        ],
+      },
+      'vllm:generation_tokens': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }],
+      },
+      'vllm:prompt_tokens_by_source': {
+        series: [
+          {
+            labels: { source: 'local_cache_hit' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }],
+          },
+          {
+            labels: { source: 'miss' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }],
+          },
+        ],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeChartSeries', () => {
+  it('returns null when the blob is null', async () => {
+    expect(await computeChartSeries(null)).toBeNull();
+  });
+
+  it('returns the current CHART_SERIES_VERSION in the bundle', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.version).toBe(CHART_SERIES_VERSION);
+  });
+
+  it('extracts kvCacheUsage points with t=seconds-from-start', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.kvCacheUsage).toEqual([
+      { t: 0, value: 0.1 },
+      { t: 1, value: 0.4 },
+      { t: 2, value: 0.7 },
+    ]);
+  });
+
+  it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
+    expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
+  });
+
+  it('drops prefixCacheHitRate windows where queries.rate is 0', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 }));
+    expect(series?.prefixCacheHitRate).toEqual([]);
+  });
+
+  it('pairs running + waiting into queueDepth points', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]);
+  });
+
+  it('extracts prefillTps + decodeTps from counter rates', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]);
+    expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]);
+  });
+
+  it('splits promptTokensBySource by label and skips empty series', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([
+      'local_cache_hit',
+      'miss',
+    ]);
+    expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]);
+    expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]);
+  });
+
+  it('computes timing metadata from the widest metric window', async () => {
+    const series = await computeChartSeries(makeBlob());
+    // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9.
+    expect(series?.startNs).toBe(0);
+    expect(series?.endNs).toBe(3e9);
+    expect(series?.durationS).toBeCloseTo(3, 6);
+    expect(series?.timeslicesCount).toBe(3);
+  });
+
+  it('returns null on a malformed (non-gzip) blob', async () => {
+    const result = await computeChartSeries(Buffer.from('not-gzip-data'));
+    expect(result).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
new file mode 100644
index 00000000..3cb4181b
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -0,0 +1,268 @@
+/**
+ * Pre-compute the time-series for the agentic detail page chart, so the
+ * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every
+ * request. The output lands in `agentic_trace_replay.chart_series` and is
+ * read directly by `getTraceServerMetrics`.
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `CHART_SERIES_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const CHART_SERIES_VERSION = 1;
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+
+export interface ChartSeries {
+  version: number;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+// ── Raw blob shapes (subset we read) ────────────────────────────────────
+
+interface RawSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+}
+
+interface RawSeries {
+  labels?: Record<string, string>;
+  timeslices?: RawSlice[];
+}
+
+interface RawMetric {
+  series?: RawSeries[];
+}
+
+type MetricsMap = Record<string, RawMetric>;
+
+/** The set of metric subtrees the chart consumes. */
+const CHART_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:num_requests_running',
+  'vllm:num_requests_waiting',
+  'vllm:prompt_tokens',
+  'vllm:generation_tokens',
+  'vllm:prompt_tokens_by_source',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect only the metric
+ * subtrees the chart needs. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows.
+ */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: MetricsMap = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: RawMetric };
+      if (CHART_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return collected;
+}
+
+/**
+ * Parse the gzipped server_metrics blob into the metric map. Tries the
+ * synchronous fast path first; falls back to stream-parse on
+ * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed.
+ */
+async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
+  try {
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap };
+    return obj.metrics ?? {};
+  } catch (error) {
+    const code = error && (error as NodeJS.ErrnoException).code;
+    const msg = error instanceof Error ? error.message : String(error);
+    if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+      return await streamCollectMetrics(buffer);
+    }
+    throw error;
+  }
+}
+
+/**
+ * Build chart-ready time-series arrays from a gzipped server_metrics blob.
+ * Shared by ingest, backfill, and the `getTraceServerMetrics` slow path so
+ * all three run the same extraction. Resolves to null for a null/unparseable blob.
+ */
+export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeries | null> {
+  if (!blob) return null;
+  let metrics: MetricsMap;
+  try {
+    metrics = await parseMetrics(blob);
+  } catch {
+    // Any parse failure (bad gzip, bad JSON) is swallowed and reported as null.
+    return null;
+  }
+  return buildSeriesFromMetrics(metrics);
+}
+
+/** Return the first series recorded under metric `name`, or undefined when there is none. */
+function firstSeries(metrics: MetricsMap, name: string): RawSeries | undefined {
+  const [head] = metrics[name]?.series ?? [];
+  return head;
+}
+
+function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
+  // Timing reference: earliest first-slice start_ns and latest last-slice
+  // end_ns across every series in `metrics` (widest window even if some
+  // series start late). NOTE(review): the sync parse passes all metrics, the
+  // stream-parse fallback only CHART_METRIC_KEYS, so the window may differ.
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1)
+  const kvCacheUsage: TimeSeriesPoint[] = [];
+  const kvSeries =
+    firstSeries(metrics, 'vllm:kv_cache_usage_perc') ??
+    firstSeries(metrics, 'vllm:gpu_cache_usage_perc');
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number' && typeof ts.start_ns === 'number') {
+      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
+    }
+  }
+
+  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
+  const hitsTs = firstSeries(metrics, 'vllm:prefix_cache_hits')?.timeslices ?? [];
+  const qsTs = firstSeries(metrics, 'vllm:prefix_cache_queries')?.timeslices ?? [];
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  const minLen = Math.min(hitsTs.length, qsTs.length);
+  for (let i = 0; i < minLen; i++) {
+    const h = hitsTs[i]!;
+    const q = qsTs[i]!;
+    if (
+      typeof q.rate === 'number' &&
+      q.rate > 0 &&
+      typeof h.rate === 'number' &&
+      typeof h.start_ns === 'number'
+    ) {
+      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
+    }
+  }
+
+  // Queue depth: pair running + waiting by index.
+  const runTs = firstSeries(metrics, 'vllm:num_requests_running')?.timeslices ?? [];
+  const waitTs = firstSeries(metrics, 'vllm:num_requests_waiting')?.timeslices ?? [];
+  const queueDepth: QueueDepthPoint[] = [];
+  const qlen = Math.min(runTs.length, waitTs.length);
+  for (let i = 0; i < qlen; i++) {
+    const r = runTs[i]!;
+    const w = waitTs[i]!;
+    if (typeof r.start_ns !== 'number') continue;
+    const running = typeof r.avg === 'number' ? r.avg : 0;
+    const waiting = typeof w.avg === 'number' ? w.avg : 0;
+    queueDepth.push({
+      t: tOf(r.start_ns),
+      running,
+      waiting,
+      total: running + waiting,
+    });
+  }
+
+  // Throughput: extract counter `rate` (already per-second from aiperf).
+  const counterRate = (name: string): TimeSeriesPoint[] => {
+    const s = firstSeries(metrics, name);
+    if (!s) return [];
+    const out: TimeSeriesPoint[] = [];
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        out.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    return out;
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens');
+
+  // Per-source prompt tokens — emit one TS array per source label.
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    const arr: TimeSeriesPoint[] = [];
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 423f70e7..f70200ff 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -13,6 +13,7 @@ import { gzipSync } from 'node:zlib';
 import type postgres from 'postgres';
 
 import { computeAggregateStats } from './compute-aggregate-stats.js';
+import { computeChartSeries } from './compute-chart-series.js';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -57,14 +58,14 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
-  // Pre-compute the aggregate stats so the detail page / aggregates view
-  // doesn't have to re-parse these blobs on every request. The compute
-  // function tolerates one-or-both blobs being null and falls back to a
+  // Pre-compute the aggregate stats + chart-ready time-series so the
+  // detail page / aggregates view doesn't have to re-parse these blobs on
+  // every request. Both helpers tolerate a null blob and fall back to a
   // streaming parser for oversized server_metrics blobs.
-  const aggregateStats = await computeAggregateStats({
-    profileBlob: profileGz,
-    serverBlob: metricsJsonGz,
-  });
+  const [aggregateStats, chartSeries] = await Promise.all([
+    computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
+    computeChartSeries(metricsJsonGz),
+  ]);
 
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
@@ -74,7 +75,8 @@ export async function insertTraceReplay(
       server_metrics_csv_size,
       server_metrics_json_gz,
       server_metrics_json_uncompressed_size,
-      aggregate_stats
+      aggregate_stats,
+      chart_series
     )
     values (
       ${profileGz},
@@ -83,7 +85,8 @@ export async function insertTraceReplay(
       ${csvSize},
       ${metricsJsonGz},
       ${metricsJsonSize},
-      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])}
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 822ae633..624b6ed3 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -1,73 +1,26 @@
 /**
- * Parse aiperf's `server_metrics_export.json` blob (gzipped in
- * `agentic_trace_replay.server_metrics_json_gz`) and return a slim, chart-ready
- * time-series for one benchmark point.
+ * Time-series view of one agentic benchmark point: chart-ready arrays for
+ * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS,
+ * and per-source prompt-token counts.
  *
- * The raw JSON has shape:
- *   metrics: {
- *     "<metric_name>": {
- *       series: [
- *         {
- *           labels: { ... },
- *           stats: { ... summary ... },
- *           timeslices: [
- *             { start_ns, end_ns, avg, min, max }            // gauges
- *             { start_ns, end_ns, total, rate }              // counters
- *           ]
- *         }
- *       ]
- *     }
- *   }
- *
- * Timeslices are ~1 Hz windows. The benchmark window can be tens of minutes
- * (1800+ windows). We return them as `[{ t, ...}]` arrays with `t` measured
- * in seconds from the benchmark start so the frontend doesn't need to
- * shuffle bigint nanoseconds around.
+ * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest
+ * time, see `etl/compute-chart-series.ts`). Fast path: return the stored
+ * series (no gunzip/parse; the query still selects the gz blob). Slow path:
+ * re-compute from `server_metrics_json_gz` when the column is missing or its
+ * `CHART_SERIES_VERSION` is stale (the backfill script should drain that).
  */
 
-import { gunzipSync } from 'node:zlib';
+import {
+  CHART_SERIES_VERSION,
+  computeChartSeries,
+  type ChartSeries,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '../etl/compute-chart-series.js';
 
 import type { DbClient } from '../connection.js';
 
-interface GaugeSlice {
-  start_ns: number;
-  end_ns: number;
-  avg?: number;
-  min?: number;
-  max?: number;
-}
-
-interface CounterSlice {
-  start_ns: number;
-  end_ns: number;
-  total?: number;
-  rate?: number;
-}
-
-interface Series {
-  endpoint_url?: string;
-  labels?: Record<string, string>;
-  stats?: Record<string, unknown>;
-  timeslices?: (GaugeSlice & CounterSlice)[];
-}
-
-interface MetricsJson {
-  metrics?: Record<string, { type?: string; description?: string; series?: Series[] }>;
-}
-
-export interface TimeSeriesPoint {
-  /** Seconds from benchmark start. */
-  t: number;
-  value: number;
-}
-
-export interface QueueDepthPoint {
-  t: number;
-  running: number;
-  waiting: number;
-  /** Optional total — frontend can compute too. */
-  total: number;
-}
+export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series.js';
 
 export interface PointMeta {
   id: number;
@@ -120,30 +73,13 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
 }
 
-export async function getTraceServerMetrics(
-  sql: DbClient,
-  benchmarkResultId: number,
-): Promise<TraceServerMetrics | null> {
-  const rows = (await sql`
-    select
-      atr.server_metrics_json_gz as blob,
-      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
-      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
-      br.date::text,
-      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
-      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
-      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
-    from benchmark_results br
-    join configs c on c.id = br.config_id
-    join workflow_runs wr on wr.id = br.workflow_run_id
-    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
-    where br.id = ${benchmarkResultId}
-  `) as unknown as ({ blob: Buffer | null } & PointMeta)[];
-  const row = rows[0];
-  if (!row) return null;
-  const blob = row.blob;
-  if (!blob) return null;
-  const pointMeta: PointMeta = {
+interface RawMetaRow extends PointMeta {
+  blob: Buffer | null;
+  chart_series: ChartSeries | null;
+}
+
+function buildMeta(row: RawMetaRow): PointMeta {
+  return {
     id: Number(row.id),
     hardware: row.hardware,
     framework: row.framework,
@@ -163,113 +99,58 @@ export async function getTraceServerMetrics(
     server_cpu_cache_hit_rate:
       row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
   };
+}
 
-  const parsed = JSON.parse(gunzipSync(blob).toString('utf8')) as MetricsJson;
-  const metrics = parsed.metrics ?? {};
-
-  const firstSeries = (name: string): Series | undefined => {
-    const s = metrics[name]?.series;
-    return s && s.length > 0 ? s[0] : undefined;
+function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
+  return {
+    meta,
+    startNs: series.startNs,
+    endNs: series.endNs,
+    durationS: series.durationS,
+    timeslicesCount: series.timeslicesCount,
+    kvCacheUsage: series.kvCacheUsage,
+    prefixCacheHitRate: series.prefixCacheHitRate,
+    queueDepth: series.queueDepth,
+    promptTokensBySource: series.promptTokensBySource,
+    prefillTps: series.prefillTps,
+    decodeTps: series.decodeTps,
   };
+}
 
-  // Compute timing reference from the first gauge metric we can find.
-  let startNs = Number.POSITIVE_INFINITY;
-  let endNs = 0;
-  let timeslicesCount = 0;
-  for (const metricMeta of Object.values(metrics)) {
-    for (const s of metricMeta?.series ?? []) {
-      const ts = s.timeslices ?? [];
-      if (ts.length === 0) continue;
-      timeslicesCount = Math.max(timeslicesCount, ts.length);
-      const first = ts[0]!;
-      const last = ts.at(-1)!;
-      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
-      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
-    }
-  }
-  if (!Number.isFinite(startNs)) startNs = 0;
-  const tOf = (ns: number) => (ns - startNs) / 1e9;
-
-  // KV cache usage (gauge, 0..1)
-  const kvCacheUsage: TimeSeriesPoint[] = [];
-  const kvSeries =
-    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number') {
-      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
-    }
-  }
-
-  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
-  // `rate` is already per-window delta; we just divide.
-  const hitsTs = firstSeries('vllm:prefix_cache_hits')?.timeslices ?? [];
-  const qsTs = firstSeries('vllm:prefix_cache_queries')?.timeslices ?? [];
-  const prefixCacheHitRate: TimeSeriesPoint[] = [];
-  const minLen = Math.min(hitsTs.length, qsTs.length);
-  for (let i = 0; i < minLen; i++) {
-    const h = hitsTs[i]!;
-    const q = qsTs[i]!;
-    if (typeof q.rate === 'number' && q.rate > 0 && typeof h.rate === 'number') {
-      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
-    }
-  }
-
-  // Queue depth: pair running + waiting by index.
-  const runTs = firstSeries('vllm:num_requests_running')?.timeslices ?? [];
-  const waitTs = firstSeries('vllm:num_requests_waiting')?.timeslices ?? [];
-  const queueDepth: QueueDepthPoint[] = [];
-  const qlen = Math.min(runTs.length, waitTs.length);
-  for (let i = 0; i < qlen; i++) {
-    const r = runTs[i]!;
-    const w = waitTs[i]!;
-    const running = typeof r.avg === 'number' ? r.avg : 0;
-    const waiting = typeof w.avg === 'number' ? w.avg : 0;
-    queueDepth.push({
-      t: tOf(r.start_ns),
-      running,
-      waiting,
-      total: running + waiting,
-    });
-  }
-
-  // Throughput: extract counter `rate` (already per-second delta from aiperf).
-  const counterRateSeries = (name: string): TimeSeriesPoint[] => {
-    const s = firstSeries(name);
-    if (!s) return [];
-    const out: TimeSeriesPoint[] = [];
-    for (const ts of s.timeslices ?? []) {
-      if (typeof ts.rate === 'number') out.push({ t: tOf(ts.start_ns), value: ts.rate });
-    }
-    return out;
-  };
-  const prefillTps = counterRateSeries('vllm:prompt_tokens');
-  const decodeTps = counterRateSeries('vllm:generation_tokens');
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      atr.server_metrics_json_gz as blob,
+      atr.chart_series,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawMetaRow[];
+  const row = rows[0];
+  if (!row) return null;
+  if (!row.blob) return null;
+  const meta = buildMeta(row);
 
-  // Per-source prompt tokens — emit one TS array per source label.
-  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
-  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
-    const labels = series.labels ?? {};
-    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
-    const arr: TimeSeriesPoint[] = [];
-    for (const ts of series.timeslices ?? []) {
-      if (typeof ts.rate === 'number') {
-        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
-      }
-    }
-    if (arr.length > 0) promptTokensBySource[source] = arr;
+  // Fast path: pre-computed chart_series at the current version.
+  if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) {
+    return merge(meta, row.chart_series);
   }
 
-  return {
-    meta: pointMeta,
-    startNs,
-    endNs,
-    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
-    timeslicesCount,
-    kvCacheUsage,
-    prefixCacheHitRate,
-    queueDepth,
-    promptTokensBySource,
-    prefillTps,
-    decodeTps,
-  };
+  // Slow path: compute from the blob. `computeChartSeries` handles
+  // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP
+  // rows succeed even before the backfill drains them.
+  const series = await computeChartSeries(row.blob);
+  if (!series) return null;
+  return merge(meta, series);
 }

From 24fe8feae5175d80a53002fd4f3b3b77bb42e8c4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 22 May 2026 14:00:37 -0500
Subject: [PATCH 040/111] feat(agentic-detail): per-request Gantt timeline view
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a "Request timeline" view on the agentic point detail page, modeled
after the agent-timeline in semianalysis-claude-code-proxy. Each row is
a conversation (with sub-agent rows nested + indented under their
parent), each bar is one HTTP request from request_start → request_end
with a thin lead-in showing credit_issued → request_start queue wait.

Hover any bar for per-request stats (TTFT, ISL/OSL, queue wait, phase,
worker, agent depth). Move anywhere over the chart for a crosshair
that shows the cursor time + how many requests are running / waiting /
completed at that instant — O(log n) sweep counts so it stays smooth
on big runs.

Same Alec pattern as 008/009: migration 010 adds a `request_timeline`
JSONB column on agentic_trace_replay, computed at ingest time and
backfilled for existing rows. ~30 KB per row vs the ~1-3 MB raw blob.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/request-timeline/route.ts  |  40 +
 .../agentic-point/agentic-point-detail.tsx    |  25 +-
 .../agentic-point/request-timeline.tsx        | 821 ++++++++++++++++++
 .../app/src/hooks/api/use-request-timeline.ts |  59 ++
 .../010_agentic_request_timeline.sql          |  15 +
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-request-timeline.ts  | 144 +++
 .../src/etl/compute-request-timeline.test.ts  | 153 ++++
 .../db/src/etl/compute-request-timeline.ts    | 182 ++++
 packages/db/src/etl/trace-replay-ingest.ts    |  18 +-
 packages/db/src/queries/request-timeline.ts   |  48 +
 11 files changed, 1498 insertions(+), 8 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/request-timeline/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.tsx
 create mode 100644 packages/app/src/hooks/api/use-request-timeline.ts
 create mode 100644 packages/db/migrations/010_agentic_request_timeline.sql
 create mode 100644 packages/db/src/backfill-request-timeline.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.test.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.ts
 create mode 100644 packages/db/src/queries/request-timeline.ts

diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
new file mode 100644
index 00000000..6c884fb2
--- /dev/null
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getRequestTimeline,
+  type RequestTimeline,
+} from '@semianalysisai/inferencex-db/queries/request-timeline';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic'; // Next.js route segment config: always render this handler dynamically
+
+const getCachedRequestTimeline = cachedQuery(
+  (id: number): Promise<RequestTimeline | null> => getRequestTimeline(getDb(), id),
+  'request-timeline', // NOTE(review): presumably the cache key / namespace — confirm in '@/lib/api-cache'
+  { blobOnly: true }, // NOTE(review): blobOnly semantics are defined by cachedQuery — confirm in '@/lib/api-cache'
+);
+
+/**
+ * GET /api/v1/request-timeline?id=N
+ *
+ * Returns the per-request Gantt timeline for one agentic benchmark point.
+ * Each request entry has ns-from-start offsets for credit/start/ack/end,
+ * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 400 unless
+ * `id` is a positive integer; 404 if the point has no stored profile_export.jsonl blob.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) { // rejects missing, NaN, fractional, negative, unsafe-large
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedRequestTimeline(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching request timeline:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index a5bca4e0..2e43b4fb 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -6,6 +6,7 @@ import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
+import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -19,6 +20,7 @@ import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/seg
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
+import { RequestTimelineView } from './request-timeline';
 import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
@@ -82,9 +84,10 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
-type DetailView = 'point' | 'aggregates';
+type DetailView = 'point' | 'timeline' | 'aggregates';
 const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' },
   { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
 ];
 
@@ -120,6 +123,8 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+  // Per-request timeline fetched only when the timeline view is active.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline');
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -176,6 +181,11 @@ export function AgenticPointDetail({ id }: Props) {
             {aggregatesQuery.isLoading ? ' · loading…' : ''}
           </span>
         )}
+        {view === 'timeline' && timelineQuery.data && (
+          <span className="text-xs text-muted-foreground">
+            {timelineQuery.data.requests.length} requests
+          </span>
+        )}
       </div>
 
       {view === 'aggregates' ? (
@@ -184,6 +194,19 @@ export function AgenticPointDetail({ id }: Props) {
           aggregates={aggregatesQuery.data}
           isLoading={aggregatesQuery.isLoading}
         />
+      ) : view === 'timeline' ? (
+        timelineQuery.isLoading ? (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            Loading request timeline…
+          </div>
+        ) : timelineQuery.data ? (
+          <RequestTimelineView data={timelineQuery.data} />
+        ) : (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
+            isn&apos;t stored for this row.
+          </div>
+        )
       ) : (
         <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
           <ExpandableChart
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
new file mode 100644
index 00000000..bcbe105a
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -0,0 +1,821 @@
+'use client';
+
+import { useCallback, useMemo, useRef, useState } from 'react';
+
+import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+
+/**
+ * Gantt-style request timeline for one agentic benchmark point.
+ *
+ * Rows are conversations (or workers — toggle in the header). Bars are
+ * individual HTTP requests, drawn from request_start to request_end with a
+ * thin lead-in segment from credit_issued (load gen queue). Scroll-wheel
+ * zooms, drag pans, hover shows per-request stats.
+ *
+ * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy.
+ */
+
+type RowMode = 'conversation' | 'worker';
+
+const ROW_MODE_OPTIONS: SegmentedToggleOption<RowMode>[] = [
+  { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' },
+  { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' },
+];
+
+type PhaseFilter = 'all' | 'profiling';
+
+const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' },
+  { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' },
+];
+
+/** Color palette; buildRows cycles through it by row position (modulo its length), not by a key hash. */
+const ROW_COLORS = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
+/** Phase color overlay drawn as a thin strip at the bottom of each bar. */
+const PHASE_COLORS: Record<string, string> = {
+  profiling: '#22c55e',
+  warmup: '#94a3b8',
+  unknown: '#64748b',
+};
+
+interface Row {
+  key: string; // unique per row; used as the React key for the label and separator elements
+  label: string; // display text for the label column (already shortened by buildRows)
+  color: string; // hex color taken from ROW_COLORS
+  requests: RequestRecord[]; // sorted by start in buildRows; empty for placeholder parent rows
+  /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */
+  depth: number;
+  /** True if this row is a sub-agent ("Subagent N of parent X"). */
+  isSubagent: boolean;
+}
+
+/**
+ * Separates a conversation id into its parent id and sub-agent label.
+ * Sub-agent ids look like `<parent_cid>::sa:subagent_<N>_<hash>` and are cut
+ * at the first `::sa:` (a 5-character marker); any other id comes back whole
+ * as the parent, with a null sub-agent.
+ */
+function splitCid(cid: string): { parent: string; subagent: string | null } {
+  const at = cid.indexOf('::sa:');
+  const nested = at !== -1;
+  return { parent: nested ? cid.slice(0, at) : cid, subagent: nested ? cid.slice(at + 5) : null };
+}
+
+/**
+ * Bucket requests by conversation or worker id and turn each bucket into a
+ * display row.
+ *
+ * - `worker` mode: one flat row per worker id, ordered by first request start.
+ * - `conversation` mode: one row per parent conversation, ordered by the
+ *   family's first request start, each followed by depth-1 rows for its
+ *   sub-agents (ids containing `::sa:`).
+ * Palette colors come from ROW_COLORS, cycled by row / family position.
+ *
+ * @param requests Requests to lay out; the array itself is not reordered.
+ * @param mode Which request field groups the rows: `cid` or `wid`.
+ * @returns Rows in display order; each row's requests are sorted by start.
+ */
+function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
+  const byConversation = mode === 'conversation';
+  const buckets = new Map<string, RequestRecord[]>();
+  for (const request of requests) {
+    const bucketKey = byConversation ? request.cid : request.wid;
+    const bucket = buckets.get(bucketKey);
+    if (bucket) bucket.push(request);
+    else buckets.set(bucketKey, [request]);
+  }
+  // Order every bucket by request start up front: both modes below rely on
+  // element [0] being the bucket's earliest request.
+  for (const bucket of buckets.values()) bucket.sort((a, b) => a.start - b.start);
+
+  if (!byConversation) {
+    // Worker mode is flat. Palette slots follow Map insertion order (the
+    // order workers first appear in `requests`); only afterwards are the
+    // rows arranged by their earliest request start.
+    const flat = [...buckets].map(
+      ([key, list], slot): Row => ({
+        key,
+        label: shortenWid(key),
+        color: ROW_COLORS[slot % ROW_COLORS.length]!,
+        requests: list,
+        depth: 0,
+        isSubagent: false,
+      }),
+    );
+    return flat.sort((a, b) => a.requests[0]!.start - b.requests[0]!.start);
+  }
+
+  // Conversation mode: gather each parent cid together with its sub-agent
+  // buckets so a parent row is immediately followed by its children. The
+  // whole family shares one palette color so the grouping reads visually.
+  interface Family {
+    parentCid: string;
+    /** Requests issued under the parent cid itself; null if none were seen. */
+    own: RequestRecord[] | null;
+    /** Sub-agent label → that sub-agent's requests. */
+    children: Map<string, RequestRecord[]>;
+    /** Earliest request start across the parent and all of its sub-agents. */
+    firstStart: number;
+  }
+  const families = new Map<string, Family>();
+  for (const [cid, list] of buckets) {
+    const { parent, subagent } = splitCid(cid);
+    let family = families.get(parent);
+    if (!family) {
+      family = {
+        parentCid: parent,
+        own: null,
+        children: new Map(),
+        firstStart: Number.POSITIVE_INFINITY,
+      };
+      families.set(parent, family);
+    }
+    // A null sub-agent label means `cid` is the parent conversation itself.
+    if (subagent === null) family.own = list;
+    else family.children.set(subagent, list);
+    const earliest = list[0]!.start;
+    if (earliest < family.firstStart) family.firstStart = earliest;
+  }
+
+  // Families appear in order of first activity; the palette slot is the
+  // family's position in that order.
+  const ordered = [...families.values()].toSorted((a, b) => a.firstStart - b.firstStart);
+  return ordered.flatMap((family, slot): Row[] => {
+    const color = ROW_COLORS[slot % ROW_COLORS.length]!;
+    // A family whose parent issued no requests still gets an empty header
+    // row (under a synthetic key) so its sub-agents have something to nest
+    // beneath.
+    const header: Row = {
+      key: family.own ? family.parentCid : `__parent_${family.parentCid}`,
+      label: shortenCid(family.parentCid),
+      color,
+      requests: family.own ?? [],
+      depth: 0,
+      isSubagent: false,
+    };
+    // Sub-agents follow their parent, earliest first.
+    const children = [...family.children]
+      .toSorted((a, b) => a[1][0]!.start - b[1][0]!.start)
+      .map(
+        ([saLabel, list]): Row => ({
+          key: `${family.parentCid}::${saLabel}`,
+          label: `↳ ${formatSubagentLabel(saLabel)}`,
+          color,
+          requests: list,
+          depth: 1,
+          isSubagent: true,
+        }),
+      );
+    return [header, ...children];
+  });
+}
+
+/** Compacts `subagent_<N>_<hex>` to `subagent <N> · <first 4 hex>`; other labels pass through. */
+function formatSubagentLabel(raw: string): string {
+  const parts = raw.match(/^subagent_(\d+)_([0-9a-f]+)$/i);
+  const [, ordinal, hash] = parts ?? [];
+  return parts ? `subagent ${ordinal} · ${hash!.slice(0, 4)}` : raw;
+}
+
+function shortenCid(cid: string): string {
+  // Ids longer than 12 chars are elided to their first 8 and last 4 chars around an ellipsis.
+  return cid.length > 12 ? `${cid.slice(0, 8)}…${cid.slice(-4)}` : cid;
+}
+
+function shortenWid(wid: string): string {
+  // Swap a leading `worker_` for `w_`, then cap at 12 chars: worker_4ae87bea → w_4ae87bea
+  return wid.replace(/^worker_/, 'w_').slice(0, 12);
+}
+
+/** Renders a nanosecond offset as an axis label such as "+850ms", "+1.5s", "+12s" or "+1.2m". */
+function formatTickLabel(ns: number): string {
+  const ms = ns / 1e6;
+  const digits = ms < 10_000 ? 1 : 0; // one decimal only below ten seconds
+  if (ms < 1000) return `+${ms.toFixed(0)}ms`;
+  return ms < 60_000 ? `+${(ms / 1000).toFixed(digits)}s` : `+${(ms / 60_000).toFixed(1)}m`;
+}
+
+function formatDuration(ms: number): string {
+  // Unit is picked by magnitude: whole ms, then seconds / minutes with two decimals.
+  if (ms < 1000) return `${ms.toFixed(0)}ms`;
+  return ms < 60_000 ? `${(ms / 1000).toFixed(2)}s` : `${(ms / 60_000).toFixed(2)}m`;
+}
+
+/** Upper-bound binary search: how many entries of an ascending array are <= target. */
+function countLeq(sorted: number[], target: number): number {
+  // Invariant: entries before `low` are <= target; entries from `high` onward are not.
+  let low = 0;
+  for (let high = sorted.length; low < high; ) {
+    const probe = low + Math.floor((high - low) / 2);
+    if (sorted[probe]! <= target) low = probe + 1;
+    else high = probe;
+  }
+  return low;
+}
+/** Lower-bound binary search: how many entries of an ascending array are strictly < target. */
+function countLt(sorted: number[], target: number): number {
+  // Invariant: entries before `low` are < target; entries from `high` onward are not.
+  let low = 0;
+  for (let high = sorted.length; low < high; ) {
+    const probe = low + Math.floor((high - low) / 2);
+    if (sorted[probe]! < target) low = probe + 1;
+    else high = probe;
+  }
+  return low;
+}
+
+interface TooltipData {
+  x: number; // pointer clientX captured in the bar's onMouseMove
+  y: number; // pointer clientY captured in the bar's onMouseMove
+  row: Row; // row that owns the hovered bar (supplies label + color)
+  req: RequestRecord; // the hovered request
+}
+/** Fixed-position hover card for one request bar; its top-left sits 12px right / 10px above the pointer. */
+function Tooltip({ data }: { data: TooltipData }) {
+  const { row, req } = data; // NOTE(review): "Started at" prints req.start unshifted, while axis ticks are dataStart-relative — confirm
+  const totalMs = (req.end - req.start) / 1e6; // request start → end, ns → ms
+  const queueMs = (req.start - req.credit) / 1e6; // credit → request start wait, ns → ms
+  return (
+    <div
+      className="fixed z-50 pointer-events-none rounded-md border border-border bg-card p-2.5 shadow-lg text-[11px]"
+      style={{ left: data.x + 12, top: data.y - 10, maxWidth: 280 }}
+    >
+      <div className="flex items-center gap-2 font-medium text-foreground">
+        <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: row.color }} />
+        <span className="truncate">{row.label}</span>
+        <span className="text-muted-foreground">· turn {req.ti}</span>
+        {req.cancelled && <span className="text-destructive">· cancelled</span>}
+      </div>
+      <div className="mt-1.5 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>Total</span>
+        <span className="text-foreground text-right tabular-nums">{formatDuration(totalMs)}</span>
+        <span>Queue wait</span>
+        <span className="text-foreground text-right tabular-nums">
+          {queueMs > 0.5 ? formatDuration(queueMs) : '—'}
+        </span>
+        {req.ttftMs !== null && (
+          <>
+            <span>TTFT</span>
+            <span className="text-foreground text-right tabular-nums">
+              {formatDuration(req.ttftMs)}
+            </span>
+          </>
+        )}
+        {req.isl !== null && (
+          <>
+            <span>ISL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.isl.toLocaleString()}
+            </span>
+          </>
+        )}
+        {req.osl !== null && (
+          <>
+            <span>OSL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.osl.toLocaleString()}
+            </span>
+          </>
+        )}
+        <span>Phase</span>
+        <span className="text-foreground text-right">{req.phase}</span>
+        {req.ad > 0 && (
+          <>
+            <span>Agent depth</span>
+            <span className="text-foreground text-right tabular-nums">{req.ad}</span>
+          </>
+        )}
+        <span>Worker</span>
+        <span className="text-foreground text-right truncate">{shortenWid(req.wid)}</span>
+      </div>
+      <div className="mt-1.5 pt-1 border-t border-border/40 text-[10px] text-muted-foreground">
+        Started at {formatTickLabel(req.start)}
+      </div>
+    </div>
+  );
+}
+
+export function RequestTimelineView({ data }: { data: RequestTimeline }) {
+  const [rowMode, setRowMode] = useState<RowMode>('conversation');
+  const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
+  const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+  const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
+
+  // Apply phase filter, then group into rows.
+  const filtered = useMemo(
+    () =>
+      phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
+    [data.requests, phaseFilter],
+  );
+  const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]);
+
+  // Pre-sort the timestamp columns so the cursor-time stats popover can
+  // count "running / waiting at time t" in O(log n). With a few hundred
+  // requests this is overkill — but it stays smooth on huge runs too.
+  const sortedTimes = useMemo(() => {
+    const credits = filtered.map((r) => r.credit).toSorted((a, b) => a - b);
+    const starts = filtered.map((r) => r.start).toSorted((a, b) => a - b);
+    const ends = filtered.map((r) => r.end).toSorted((a, b) => a - b);
+    return { credits, starts, ends };
+  }, [filtered]);
+
+  // Cursor state (vertical line + stats popover). null when the mouse
+  // isn't over the chart. xPx is svg-local; tNs is the ns offset from
+  // dataStart that the cursor is pointing at.
+  const [cursor, setCursor] = useState<{
+    xPx: number;
+    tNs: number;
+    clientX: number;
+    clientY: number;
+  } | null>(null);
+
+  // Timeline extent (clamped to actual data — if we filtered out warmup
+  // the visible window should shrink to just the profiling phase).
+  const dataStart = filtered.length === 0 ? 0 : Math.min(...filtered.map((r) => r.credit));
+  const dataEnd = filtered.length === 0 ? 1 : Math.max(...filtered.map((r) => r.end));
+  const totalNs = Math.max(dataEnd - dataStart, 1);
+
+  // Visible window state (ns offsets, relative to dataStart).
+  const [viewStart, setViewStart] = useState(0);
+  const [viewEnd, setViewEnd] = useState<number | null>(null);
+  const vStart = viewStart;
+  const vEnd = viewEnd ?? totalNs;
+  const visibleDur = Math.max(vEnd - vStart, 1);
+  const isZoomed = viewEnd !== null;
+
+  // Layout
+  const LABEL_WIDTH = 160;
+  const ROW_HEIGHT = 22;
+  const ROW_GAP = 3;
+  const HEADER_HEIGHT = 24;
+  const PADDING_RIGHT = 12;
+  const chartWidth = 920;
+  const svgHeight = HEADER_HEIGHT + rows.length * (ROW_HEIGHT + ROW_GAP) + 6;
+  const scale = (chartWidth - PADDING_RIGHT) / visibleDur;
+  // Local coords: convert ns offset from dataStart to x px.
+  const xOf = (ns: number) => (ns - dataStart - vStart) * scale;
+
+  // Time-axis ticks (~8 across visible window, snapped to nice second multiples).
+  const niceMs = [
+    100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000,
+  ];
+  const targetMs = visibleDur / 1e6 / 8;
+  const tickMs = niceMs.find((n) => n >= targetMs) ?? targetMs;
+  const tickNs = tickMs * 1e6;
+  const ticks: number[] = [];
+  const tickStart = Math.floor(vStart / tickNs) * tickNs;
+  for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) {
+    if (t >= vStart && t <= vEnd) ticks.push(t);
+  }
+
+  const handleWheel = useCallback(
+    (e: React.WheelEvent<SVGSVGElement>) => {
+      e.preventDefault();
+      const rect = e.currentTarget.getBoundingClientRect();
+      const mouseX = e.clientX - rect.left;
+      const mouseRatio = Math.max(0, Math.min(1, mouseX / (chartWidth - PADDING_RIGHT)));
+      const curStart = vStart;
+      const curEnd = vEnd;
+      const curDur = curEnd - curStart;
+      const factor = e.deltaY > 0 ? 1.2 : 1 / 1.2;
+      const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs);
+      const pivot = curStart + mouseRatio * curDur;
+      let newStart = pivot - mouseRatio * newDur;
+      let newEnd = pivot + (1 - mouseRatio) * newDur;
+      if (newStart < 0) {
+        newEnd -= newStart;
+        newStart = 0;
+      }
+      if (newEnd > totalNs) {
+        newStart -= newEnd - totalNs;
+        newEnd = totalNs;
+        if (newStart < 0) newStart = 0;
+      }
+      if (newEnd - newStart >= totalNs * 0.99) {
+        setViewStart(0);
+        setViewEnd(null);
+      } else {
+        setViewStart(newStart);
+        setViewEnd(newEnd);
+      }
+    },
+    [vStart, vEnd, totalNs, chartWidth],
+  );
+
+  const handleMouseDown = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      if (e.button !== 0) return;
+      dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd };
+    },
+    [vStart, vEnd],
+  );
+
+  const handleMouseMove = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      // Dragging takes precedence over cursor tracking — panning the view.
+      if (dragRef.current) {
+        const dx = e.clientX - dragRef.current.startX;
+        const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT);
+        const delta = -dx * nsPerPx;
+        let ns = dragRef.current.vs + delta;
+        let ne = dragRef.current.ve + delta;
+        const dur = ne - ns;
+        if (ns < 0) {
+          ns = 0;
+          ne = dur;
+        }
+        if (ne > totalNs) {
+          ne = totalNs;
+          ns = totalNs - dur;
+          if (ns < 0) ns = 0;
+        }
+        setViewStart(ns);
+        setViewEnd(ne);
+        setTooltip(null);
+        setCursor(null);
+        return;
+      }
+      // Track the cursor position in svg-local px and the matching ns offset
+      // so the crosshair + stats popover can render. Clamped to the chart
+      // plot area (don't show a cursor on the axis labels gutter).
+      const rect = e.currentTarget.getBoundingClientRect();
+      const xPx = Math.max(0, Math.min(chartWidth - PADDING_RIGHT, e.clientX - rect.left));
+      const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT);
+      const tNs = vStart + xPx * nsPerPx;
+      setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY });
+    },
+    [visibleDur, chartWidth, totalNs, vStart],
+  );
+
+  const handleMouseUp = useCallback(() => {
+    dragRef.current = null;
+  }, []);
+
+  const handleMouseLeave = useCallback(() => {
+    dragRef.current = null;
+    setCursor(null);
+  }, []);
+
+  const resetZoom = useCallback(() => {
+    setViewStart(0);
+    setViewEnd(null);
+  }, []);
+
+  if (rows.length === 0) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        No requests in the current filter.
+      </div>
+    );
+  }
+
+  const totalRequests = filtered.length;
+
+  return (
+    <div className="space-y-3">
+      {/* Controls */}
+      <div className="flex flex-wrap items-center gap-2">
+        <SegmentedToggle
+          value={rowMode}
+          options={ROW_MODE_OPTIONS}
+          onValueChange={setRowMode}
+          ariaLabel="Row mode"
+          testId="timeline-row-mode"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        <SegmentedToggle
+          value={phaseFilter}
+          options={PHASE_OPTIONS}
+          onValueChange={setPhaseFilter}
+          ariaLabel="Phase filter"
+          testId="timeline-phase-filter"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        <span className="ml-auto text-xs text-muted-foreground">
+          {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
+          {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
+          {formatDuration((dataEnd - dataStart) / 1e6)}
+          {isZoomed && (
+            <>
+              {' · '}
+              <button type="button" onClick={resetZoom} className="text-foreground hover:underline">
+                reset zoom
+              </button>
+            </>
+          )}
+        </span>
+      </div>
+
+      {/* Chart container */}
+      <div className="rounded-md border border-border/60 bg-card overflow-hidden">
+        <div className="flex">
+          {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
+          <div
+            className="flex-shrink-0 border-r border-border/60 bg-card/80"
+            style={{ width: LABEL_WIDTH }}
+          >
+            <div
+              className="border-b border-border/60 flex items-end px-2 pb-1"
+              style={{ height: HEADER_HEIGHT }}
+            >
+              <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
+                {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+              </span>
+            </div>
+            {rows.map((row) => (
+              <div
+                key={row.key}
+                className="flex items-center gap-1.5 overflow-hidden pr-2"
+                style={{
+                  height: ROW_HEIGHT + ROW_GAP,
+                  paddingLeft: 8 + row.depth * 12,
+                }}
+              >
+                <span
+                  className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                  style={{
+                    backgroundColor: row.color,
+                    opacity: row.isSubagent ? 0.55 : 1,
+                  }}
+                />
+                <span
+                  className="text-[10px] font-mono truncate"
+                  style={{ color: row.color, opacity: row.isSubagent ? 0.85 : 1 }}
+                >
+                  {row.label}
+                </span>
+                <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                  {row.requests.length > 0 ? row.requests.length : '—'}
+                </span>
+              </div>
+            ))}
+          </div>
+
+          {/* Scrollable SVG */}
+          <div className="flex-1 overflow-x-auto">
+            <svg
+              width={chartWidth}
+              height={svgHeight}
+              className="block"
+              style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
+              onWheel={handleWheel}
+              onMouseDown={handleMouseDown}
+              onMouseMove={handleMouseMove}
+              onMouseUp={handleMouseUp}
+              onMouseLeave={handleMouseLeave}
+            >
+              {/* Header / time-axis baseline */}
+              <line
+                x1={0}
+                y1={HEADER_HEIGHT}
+                x2={chartWidth}
+                y2={HEADER_HEIGHT}
+                stroke="currentColor"
+                opacity={0.15}
+              />
+
+              {/* Time axis ticks */}
+              {ticks.map((t) => {
+                // Convert visible-window ns offset → x px (the tick array
+                // is already in dataStart-relative coords).
+                const x = (t - vStart) * scale;
+                return (
+                  <g key={t}>
+                    <line
+                      x1={x}
+                      y1={HEADER_HEIGHT}
+                      x2={x}
+                      y2={svgHeight}
+                      stroke="currentColor"
+                      opacity={0.08}
+                      strokeDasharray="2 4"
+                    />
+                    <text
+                      x={x + 2}
+                      y={HEADER_HEIGHT - 6}
+                      fill="currentColor"
+                      opacity={0.55}
+                      fontSize={9}
+                      fontFamily="ui-monospace, SFMono-Regular, monospace"
+                    >
+                      {formatTickLabel(t)}
+                    </text>
+                  </g>
+                );
+              })}
+
+              {/* Row separators */}
+              {rows.map((row, idx) => (
+                <line
+                  key={`sep-${row.key}`}
+                  x1={0}
+                  y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  x2={chartWidth}
+                  y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  stroke="currentColor"
+                  opacity={0.04}
+                />
+              ))}
+
+              {/* Request bars */}
+              {rows.map((row, rowIdx) => {
+                const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+                const barH = ROW_HEIGHT - 4;
+                return row.requests.map((req) => {
+                  const xCredit = xOf(req.credit);
+                  const xStart = xOf(req.start);
+                  const xEnd = xOf(req.end);
+                  // Cull bars entirely outside the visible window so big
+                  // benchmarks don't render thousands of zero-width rects.
+                  if (xEnd < -2 || xCredit > chartWidth + 2) return null;
+                  const runW = Math.max(xEnd - xStart, 1);
+                  const queueW = Math.max(xStart - xCredit, 0);
+                  const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+                  return (
+                    <g
+                      key={`${req.cid}-${req.ti}-${req.start}`}
+                      onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
+                      onMouseLeave={() => setTooltip(null)}
+                    >
+                      {/* Queue lead-in (faint) — only drawn when noticeable. */}
+                      {queueW >= 1 && (
+                        <rect
+                          x={xCredit}
+                          y={yTop + barH / 2 - 1}
+                          width={queueW}
+                          height={2}
+                          fill={row.color}
+                          opacity={0.35}
+                        />
+                      )}
+                      {/* Main bar */}
+                      <rect
+                        x={xStart}
+                        y={yTop}
+                        width={runW}
+                        height={barH}
+                        rx={2}
+                        fill={row.color}
+                        opacity={req.cancelled ? 0.35 : row.isSubagent ? 0.6 : 0.85}
+                      />
+                      {/* Phase strip at bottom */}
+                      <rect
+                        x={xStart}
+                        y={yTop + barH - 2}
+                        width={runW}
+                        height={2}
+                        rx={1}
+                        fill={phaseColor}
+                        opacity={0.85}
+                      />
+                      {/* Cancelled X overlay */}
+                      {req.cancelled && runW > 6 && (
+                        <line
+                          x1={xStart + 1}
+                          y1={yTop + 1}
+                          x2={xStart + runW - 1}
+                          y2={yTop + barH - 1}
+                          stroke="currentColor"
+                          strokeWidth={0.7}
+                          opacity={0.6}
+                        />
+                      )}
+                    </g>
+                  );
+                });
+              })}
+
+              {/* Cursor crosshair — drawn on top of bars so it stays visible
+                  through dense rows. Stats popover is rendered as fixed
+                  HTML below the SVG block. */}
+              {cursor && (
+                <line
+                  x1={cursor.xPx}
+                  x2={cursor.xPx}
+                  y1={0}
+                  y2={svgHeight}
+                  stroke="currentColor"
+                  strokeWidth={1}
+                  opacity={0.45}
+                  pointerEvents="none"
+                />
+              )}
+            </svg>
+          </div>
+        </div>
+      </div>
+
+      {/* Footer / legend */}
+      <div className="flex flex-wrap items-center gap-x-4 gap-y-1 px-1 text-[11px] text-muted-foreground">
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm bg-current opacity-30" />
+          queue wait
+        </span>
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#22c55e' }} />
+          profiling
+        </span>
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#94a3b8' }} />
+          warmup
+        </span>
+        <span className="ml-auto opacity-70">scroll to zoom · drag to pan</span>
+      </div>
+
+      {/* Cursor stats popover: count of in-flight / waiting at the cursor's
+          ns offset. Hidden when the user is hovering an individual bar
+          (per-request tooltip wins). */}
+      {cursor && !tooltip && (
+        <CursorPopover
+          cursor={cursor}
+          dataStart={dataStart}
+          startTimes={sortedTimes.starts}
+          endTimes={sortedTimes.ends}
+          creditTimes={sortedTimes.credits}
+        />
+      )}
+
+      {/* Tooltip */}
+      {tooltip && <Tooltip data={tooltip} />}
+    </div>
+  );
+}
+
+function CursorPopover({
+  cursor,
+  dataStart,
+  startTimes,
+  endTimes,
+  creditTimes,
+}: {
+  cursor: { xPx: number; tNs: number; clientX: number; clientY: number };
+  dataStart: number;
+  startTimes: number[]; // NOTE(review): caller passes sortedTimes.*; countLeq/countLt presumably need ascending order — confirm
+  endTimes: number[];
+  creditTimes: number[];
+}) {
+  // Fixed-position stats box for the cursor time t = cursor.tNs (ns from dataStart):
+  //   running  = #(start <= t) - #(end < t)
+  //   waiting  = #(credit <= t) - #(start <= t)
+  //   completed= #(end <= t)   (a request ending exactly at t is in running AND completed)
+  const t = cursor.tNs;
+  const startsLeq = countLeq(startTimes, t);
+  const endsLt = countLt(endTimes, t);
+  const creditsLeq = countLeq(creditTimes, t);
+  const endsLeq = countLeq(endTimes, t);
+  const running = Math.max(0, startsLeq - endsLt);
+  const waiting = Math.max(0, creditsLeq - startsLeq);
+  const completed = endsLeq;
+  const inflight = running + waiting;
+  // Seconds elapsed since the timeline origin (dataStart) — a relative offset, not wall-clock.
+  const tSec = t / 1e9;
+  // Position the popover 14 px right of the cursor without overflowing the viewport.
+  // Budgets 220 px for the box (minWidth is 180); flips to the left of the cursor if it would clip the right.
+  const wantLeft = cursor.clientX + 14;
+  const left =
+    typeof window === 'undefined' || wantLeft + 220 < window.innerWidth
+      ? wantLeft
+      : cursor.clientX - 220;
+  return (
+    <div
+      className="fixed z-40 pointer-events-none rounded-md border border-border bg-card/95 backdrop-blur p-2 shadow-lg text-[11px] font-mono"
+      style={{ left, top: cursor.clientY - 60, minWidth: 180 }}
+    >
+      <div className="flex justify-between gap-3 text-foreground">
+        <span className="text-muted-foreground">t =</span>
+        <span className="tabular-nums">
+          {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`}
+        </span>
+      </div>
+      <div className="mt-1 pt-1 border-t border-border/40 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>In flight</span>
+        <span className="text-foreground text-right tabular-nums">{inflight}</span>
+        <span className="pl-3 text-[10px]">running</span>
+        <span className="text-foreground text-right tabular-nums">{running}</span>
+        <span className="pl-3 text-[10px]">waiting</span>
+        <span className="text-foreground text-right tabular-nums">{waiting}</span>
+        <span>Completed</span>
+        <span className="text-foreground text-right tabular-nums">{completed}</span>
+      </div>
+      {/* dataStart is informational — the displayed t is relative to it. */}
+      <div className="mt-1 pt-1 border-t border-border/40 text-[9px] text-muted-foreground">
+        relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock)
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
new file mode 100644
index 00000000..d3ceaab8
--- /dev/null
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -0,0 +1,59 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session). */
+  cid: string;
+  /** Zero-based turn index within the conversation. */
+  ti: number;
+  /** Worker id (concurrency slot that handled this request). */
+  wid: string;
+  /** Sub-agent depth (0 = top-level). */
+  ad: number;
+  /** `warmup` or `profiling`; the ETL writes `unknown` when the source record has no phase. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  ttftMs: number | null; // time-to-first-token in ms; null when the record had no numeric value
+  isl: number | null; // input sequence length (tokens), or null
+  osl: number | null; // output sequence length (tokens), or null
+  cancelled: boolean; // whether the request was flagged as cancelled
+}
+
+export interface RequestTimeline {
+  version: number; // algorithm version stamped by the ETL (REQUEST_TIMELINE_VERSION there)
+  startNs: number; // ns timestamp used as the origin for every per-request offset
+  endNs: number; // latest request end, on the same clock as startNs (not an offset)
+  durationS: number; // (endNs - startNs) / 1e9, or 0 when endNs <= startNs
+  requests: RequestRecord[]; // sorted by `start` ascending by the ETL
+}
+
+async function fetchRequestTimeline( // GET the timeline JSON for one id from the v1 API route
+  id: number,
+  signal?: AbortSignal,
+): Promise<RequestTimeline | null> {
+  const res = await fetch(`/api/v1/request-timeline?id=${id}`, { signal });
+  if (res.status === 404) return null; // 404 is mapped to null rather than an error
+  if (!res.ok) throw new Error(`request-timeline ${res.status}`); // any other non-2xx rejects
+  return (await res.json()) as RequestTimeline; // unchecked cast: the body is not validated here
+}
+
+/**
+ * Lazy-fetch the per-request Gantt timeline for one agentic point. The query
+ * runs only when `enabled` is true AND `id` is truthy (null or 0 disables it),
+ * so the payload (~30 KB per point) isn't paid for every page load.
+ */
+export function useRequestTimeline(id: number | null, enabled = false) {
+  return useQuery({
+    queryKey: ['request-timeline', id] as const, // one cache entry per id
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      id ? fetchRequestTimeline(id, signal) : Promise.resolve(null), // falsy id resolves to null without fetching
+    enabled: enabled && Boolean(id),
+    staleTime: 5 * 60 * 1000, // 5 minutes, in ms
+  });
+}
diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql
new file mode 100644
index 00000000..756b775e
--- /dev/null
+++ b/packages/db/migrations/010_agentic_request_timeline.sql
@@ -0,0 +1,15 @@
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+  add column request_timeline jsonb; -- no NOT NULL / DEFAULT: existing rows start out NULL
diff --git a/packages/db/package.json b/packages/db/package.json
index f97c442a..710089f1 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -21,6 +21,7 @@
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+    "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
new file mode 100644
index 00000000..327099d0
--- /dev/null
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -0,0 +1,144 @@
+/**
+ * Backfill `agentic_trace_replay.request_timeline` for rows that are
+ * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`.
+ *
+ * The ingest path now computes the timeline inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 010 and any time the version bumps.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+} from './etl/compute-request-timeline.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null; // value of `--limit N`; null when the flag is absent
+  force: boolean; // true when `--force` was passed
+}
+
+function parseFlags(): CliFlags { // Parse --force / --limit N from argv; exits(1) on a bad --limit, ignores other args.
+  let limit: number | null = null; // null = no LIMIT clause
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) { // argv[0..1] are the node binary + script path
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      const next = Number(process.argv[++i] || NaN); // missing or empty value -> NaN
+      if (!Number.isInteger(next) || next <= 0) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+      limit = next; // never 0: main() tests `flags.limit` for truthiness, so 0 would silently drop the LIMIT
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags(); // parsed once at module load; read by main()
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1, // NOTE(review): presumably the connection-pool cap (rows are processed one at a time) — confirm in createAdminSql
+  onnotice: () => {}, // no-op notice handler
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-request-timeline ===');
+  console.log(`  REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows with a profile_export blob can produce a timeline. Rows
+  // without the blob keep `request_timeline` null and the API serves them
+  // as "no timeline data".
+  const candidates = flags.force // --force: every row with a blob; otherwise only missing/stale-version rows
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+          and (
+            request_timeline is null
+            or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) '); // boolean, scoped to this block only
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0; // rows written (distinct from the block-scoped boolean `ok` above)
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      // Blobs are fetched one row at a time; the candidate query selected ids only.
+      const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>`
+        select profile_export_jsonl_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue; // counted as neither ok nor failed
+      }
+      const timeline = computeRequestTimeline(row.profile_export_jsonl_gz); // null when no timeline can be built
+      await sql`
+        update agentic_trace_replay
+        set request_timeline = ${
+          timeline === null
+            ? null
+            : sql.json(structuredClone(timeline) as unknown as Parameters<typeof sql.json>[0])
+        }
+        where id = ${id}
+      `;
+      ok++; // also counts rows written back as NULL; those match `request_timeline is null` again next run
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++; // per-row failure: log and keep going with the remaining candidates
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1; // report failure via the exit code
+}
+
+main() // script entry point
+  .catch((error) => {
+    console.error('backfill-request-timeline failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end()); // runs on success and failure alike
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
new file mode 100644
index 00000000..64512aca
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -0,0 +1,153 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js';
+
+interface SyntheticRequest { // Compact test-side description of one JSONL record; see makeBlob for defaults.
+  cid: string;
+  ti: number;
+  wid?: string; // default 'worker_default'
+  ad?: number; // default 0
+  phase?: string; // default 'profiling'
+  credit: number;
+  start: number;
+  end: number;
+  ack?: number | null; // undefined -> request_ack_ns key omitted
+  ttftMs?: number | null; // null -> JSON null; undefined -> 50
+  isl?: number | null; // null or undefined -> 100
+  osl?: number | null; // null or undefined -> 10
+  cancelled?: boolean; // default false
+}
+
+function makeBlob(requests: SyntheticRequest[]) { // Gzipped JSONL fixture: one {metadata, metrics} line per request.
+  const lines = requests.map((r) =>
+    JSON.stringify({
+      metadata: {
+        conversation_id: r.cid,
+        turn_index: r.ti,
+        worker_id: r.wid ?? 'worker_default',
+        agent_depth: r.ad ?? 0,
+        benchmark_phase: r.phase ?? 'profiling',
+        credit_issued_ns: r.credit,
+        request_start_ns: r.start,
+        ...(r.ack === undefined ? {} : { request_ack_ns: r.ack }), // omit the key entirely when ack is not supplied
+        request_end_ns: r.end,
+        was_cancelled: r.cancelled ?? false,
+      },
+      metrics: {
+        time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' }, // explicit null -> JSON null
+        input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' }, // a null isl cannot be expressed: it becomes 100
+        output_sequence_length: { value: r.osl ?? 10, unit: 'tokens' }, // likewise, a null osl becomes 10
+      },
+    }),
+  );
+  return gzipSync(Buffer.from(lines.join('\n'))); // no trailing newline after the last record
+}
+
+describe('computeRequestTimeline', () => {
+  it('returns null when the blob is null', () => {
+    expect(computeRequestTimeline(null)).toBeNull();
+  });
+
+  it('returns null on a malformed (non-gzip) blob', () => {
+    expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull();
+  });
+
+  it('returns null when the blob has no parseable records', () => {
+    expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull(); // valid gzip, only empty lines
+  });
+
+  it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]),
+    );
+    expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION);
+  });
+
+  it('shifts ns timestamps to be relative to the earliest credit_issued', () => {
+    // Two requests with absolute ns starting at 1_000_000_000 (the first credit is the origin).
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 },
+        { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 },
+      ]),
+    );
+    expect(tl?.startNs).toBe(1_000_000_000); // startNs / endNs stay absolute
+    expect(tl?.endNs).toBe(1_030_000_000);
+    expect(tl?.durationS).toBeCloseTo(0.03, 6); // (1_030_000_000 - 1_000_000_000) / 1e9
+    expect(tl?.requests[0]?.credit).toBe(0); // per-request fields are offsets from startNs
+    expect(tl?.requests[0]?.end).toBe(10_000_000);
+    expect(tl?.requests[1]?.start).toBe(21_000_000);
+  });
+
+  it('sorts requests by start time, regardless of input order', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 },
+        { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 },
+        { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 },
+      ]),
+    );
+    expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]); // origin is 0 here, so offsets equal the inputs
+  });
+
+  it('preserves conversation/worker grouping fields', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'conv-A',
+          ti: 5,
+          wid: 'worker_abcd1234',
+          ad: 2,
+          phase: 'profiling',
+          credit: 0,
+          start: 10,
+          end: 100,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cid).toBe('conv-A');
+    expect(r.ti).toBe(5);
+    expect(r.wid).toBe('worker_abcd1234');
+    expect(r.ad).toBe(2);
+    expect(r.phase).toBe('profiling');
+  });
+
+  it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          ttftMs: 25.5,
+          isl: 1024,
+          osl: 256,
+          cancelled: true,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cancelled).toBe(true);
+    expect(r.ttftMs).toBeCloseTo(25.5, 6); // unwrapped from the {value, unit} envelope makeBlob writes
+    expect(r.isl).toBe(1024);
+    expect(r.osl).toBe(256);
+  });
+
+  it('skips records missing both credit_issued_ns and request_start_ns', () => {
+    // Build a record with only request_end_ns — the helper rejects it, leaving zero records.
+    const broken = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 },
+          metrics: {},
+        }),
+      ),
+    );
+    expect(computeRequestTimeline(broken)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
new file mode 100644
index 00000000..a1134f7a
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -0,0 +1,182 @@
+/**
+ * Pre-compute the per-request timeline for the agentic detail page's
+ * Gantt view. Output lands in `agentic_trace_replay.request_timeline`
+ * and is read directly by the timeline API route.
+ *
+ * Shape is a thin array — ~150 bytes per request × ~200 requests per
+ * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw
+ * gzipped JSONL blob (~1-3 MB).
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const REQUEST_TIMELINE_VERSION = 1;
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session). */
+  cid: string;
+  /** Zero-based turn index within the conversation. */
+  ti: number;
+  /** Worker id (concurrency slot that handled this request). */
+  wid: string;
+  /** Sub-agent depth (0 = top-level). */
+  ad: number;
+  /** `warmup` or `profiling`; `unknown` when the record has no `benchmark_phase`. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  /** Time-to-first-token in ms. */
+  ttftMs: number | null;
+  /** Input sequence length (tokens). */
+  isl: number | null;
+  /** Output sequence length (tokens). */
+  osl: number | null;
+  cancelled: boolean; // true only when the record's `was_cancelled` is exactly `true`
+}
+
+export interface RequestTimeline {
+  version: number; // REQUEST_TIMELINE_VERSION at the time the row was computed
+  /** Wall-clock ns: smallest per-record `credit_issued_ns ?? request_start_ns` (the relative-time origin). */
+  startNs: number;
+  /** Wall-clock ns of the latest `request_end_ns`. */
+  endNs: number;
+  /** Total span in seconds: (endNs - startNs) / 1e9, or 0 when endNs <= startNs. */
+  durationS: number;
+  requests: RequestRecord[]; // sorted by `start` ascending
+}
+
+interface RawMetadata { // `metadata` object of one JSONL line; every key optional because input is unvalidated
+  conversation_id?: string;
+  turn_index?: number;
+  worker_id?: string;
+  agent_depth?: number;
+  benchmark_phase?: string;
+  credit_issued_ns?: number; // preferred lifecycle start; request_start_ns is the fallback
+  request_start_ns?: number;
+  request_ack_ns?: number;
+  request_end_ns?: number; // records without a numeric value here are dropped
+  was_cancelled?: boolean;
+}
+
+interface RawMetricValue { // `{value, unit}` metric envelope; only `value` is read
+  value?: number;
+}
+
+interface RawRecord { // One parsed JSONL line, restricted to the fields this module reads
+  metadata?: RawMetadata;
+  metrics?: {
+    time_to_first_token?: RawMetricValue | number; // envelope or bare number; readNum accepts both
+    input_sequence_length?: RawMetricValue | number;
+    output_sequence_length?: RawMetricValue | number;
+  };
+}
+
+/** Unwrap a metric given as a bare number or a `{value, unit}` envelope; undefined otherwise. */
+function readNum(v: unknown): number | undefined {
+  // Envelope form: inspect its `value`; anything else is inspected as-is.
+  const isEnvelope = typeof v === 'object' && v !== null && 'value' in v;
+  const candidate: unknown = isEnvelope ? (v as { value?: unknown }).value : v;
+  // Only finite numbers count; NaN, ±Infinity and non-numbers read as absent.
+  if (typeof candidate !== 'number') return undefined;
+  return Number.isFinite(candidate) ? candidate : undefined;
+}
+
+/**
+ * Gunzip + parse `profile_export.jsonl` into a chart-ready timeline; bad lines are skipped.
+ * Returns null for a missing/non-gzip blob or when no line yields a usable record.
+ */
+export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null {
+  if (!blob) return null;
+  let text: string;
+  try {
+    text = gunzipSync(blob).toString('utf8');
+  } catch {
+    return null;
+  }
+
+  // First pass: parse + collect raw turns; find timeline origin.
+  const raw: {
+    meta: RawMetadata;
+    ttftMs: number | null;
+    isl: number | null;
+    osl: number | null;
+  }[] = [];
+  let originNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+
+  for (const line of text.split('\n')) {
+    if (!line) continue;
+    let rec: RawRecord | null; // a line holding the JSON literal `null` parses successfully to null
+    try {
+      rec = JSON.parse(line) as RawRecord | null;
+    } catch {
+      continue;
+    }
+    const meta = rec?.metadata ?? {}; // `?.` so a `null` line is skipped below instead of throwing
+    // Use credit_issued_ns when available (the true start of the request's
+    // lifecycle), falling back to request_start_ns. Skip rows missing both.
+    const cStart = meta.credit_issued_ns ?? meta.request_start_ns;
+    const cEnd = meta.request_end_ns;
+    if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue;
+
+    if (cStart < originNs) originNs = cStart;
+    if (cEnd > endNs) endNs = cEnd;
+
+    raw.push({
+      meta,
+      ttftMs: readNum(rec?.metrics?.time_to_first_token) ?? null,
+      isl: readNum(rec?.metrics?.input_sequence_length) ?? null,
+      osl: readNum(rec?.metrics?.output_sequence_length) ?? null,
+    });
+  }
+
+  if (raw.length === 0) return null;
+  if (!Number.isFinite(originNs)) originNs = 0; // defensive: only possible if a timestamp parsed as non-finite
+
+  // Second pass: shift timestamps to be relative to originNs (smaller
+  // numbers fit in JSON nicely and the frontend doesn't need bigint math).
+  const requests: RequestRecord[] = [];
+  for (const r of raw) {
+    const m = r.meta;
+    const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs;
+    const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs;
+    const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null;
+    const end = (m.request_end_ns ?? originNs) - originNs;
+    requests.push({
+      cid: m.conversation_id ?? 'unknown',
+      ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      wid: m.worker_id ?? 'unknown',
+      ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
+      phase: m.benchmark_phase ?? 'unknown',
+      credit,
+      start,
+      ack,
+      end,
+      ttftMs: r.ttftMs,
+      isl: r.isl,
+      osl: r.osl,
+      cancelled: m.was_cancelled === true,
+    });
+  }
+
+  // Stable order so backfill output is deterministic.
+  requests.sort((a, b) => a.start - b.start);
+
+  return {
+    version: REQUEST_TIMELINE_VERSION,
+    startNs: originNs, // absolute; the per-request fields above are offsets from it
+    endNs,
+    durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0,
+    requests,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index f70200ff..8cc03f2a 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -14,6 +14,7 @@ import type postgres from 'postgres';
 
 import { computeAggregateStats } from './compute-aggregate-stats.js';
 import { computeChartSeries } from './compute-chart-series.js';
+import { computeRequestTimeline } from './compute-request-timeline.js';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -58,13 +59,14 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
-  // Pre-compute the aggregate stats + chart-ready time-series so the
-  // detail page / aggregates view doesn't have to re-parse these blobs on
-  // every request. Both helpers tolerate a null blob and fall back to a
-  // streaming parser for oversized server_metrics blobs.
-  const [aggregateStats, chartSeries] = await Promise.all([
+  // Pre-compute aggregate stats + chart-ready time-series + per-request
+  // timeline so the detail page doesn't have to re-parse these blobs on
+  // every request. Each helper tolerates a null blob; the server_metrics
+  // helpers fall back to a streaming parser for oversized blobs.
+  const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
     computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
     computeChartSeries(metricsJsonGz),
+    Promise.resolve(computeRequestTimeline(profileGz)),
   ]);
 
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
@@ -76,7 +78,8 @@ export async function insertTraceReplay(
       server_metrics_json_gz,
       server_metrics_json_uncompressed_size,
       aggregate_stats,
-      chart_series
+      chart_series,
+      request_timeline
     )
     values (
       ${profileGz},
@@ -86,7 +89,8 @@ export async function insertTraceReplay(
       ${metricsJsonGz},
       ${metricsJsonSize},
       ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
-      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])}
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])},
+      ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
new file mode 100644
index 00000000..2bd3e251
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.ts
@@ -0,0 +1,48 @@
+/**
+ * Per-request timeline for the agentic detail page's Gantt view.
+ *
+ * Backed by `agentic_trace_replay.request_timeline` (pre-computed at
+ * ingest time, see `etl/compute-request-timeline.ts`). The fast path is
+ * a single SQL row read; the slow path re-computes from
+ * `profile_export_jsonl_gz` and is only taken when the column is missing
+ * or the stored `REQUEST_TIMELINE_VERSION` is stale.
+ */
+
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+  type RequestTimeline,
+} from '../etl/compute-request-timeline';
+
+import type { DbClient } from '../connection.js';
+
+export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
+
+interface RawRow {
+  blob: Buffer | null;
+  request_timeline: RequestTimeline | null;
+}
+
+/** Fetch a result's request timeline, recomputing it when the stored copy is unusable. */
+export async function getRequestTimeline(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<RequestTimeline | null> {
+  const [match] = (await sql`
+    select
+      atr.profile_export_jsonl_gz as blob,
+      atr.request_timeline
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawRow[];
+  // Inner join: no row means the result is unknown or has no trace replay.
+  if (!match) return null;
+
+  const { blob, request_timeline: stored } = match;
+  const storedIsCurrent =
+    Boolean(stored) && Number(stored?.version) === REQUEST_TIMELINE_VERSION;
+  // Stale or missing column (rare): rebuild from the `profile_export_jsonl_gz` blob.
+  if (!storedIsCurrent) return computeRequestTimeline(blob);
+  return stored;
+}

From f2618f44d6eafa38bffb3b9b9ec39c5224d62b76 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 22 May 2026 14:21:24 -0500
Subject: [PATCH 041/111] fix(agentic-detail): aggregate vllm metrics across
 all engine series
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chart_series + aggregate_stats helpers were only reading series[0]
for each metric, which under-counted by Nx on multi-engine DP/PP vllm
deployments (each engine reports its own series in
vllm:num_requests_running, kv_cache_usage_perc, prompt_tokens, etc.).

Worst-case visible effect: for point 206032 (b200, dsv4, conc=24,
8-engine cluster), the queue-depth chart maxed at ~3 while the
per-request timeline correctly showed ~22 concurrent. Other metrics
were similarly clipped — prefix-cache hit rate, throughput, KV util.

Now we sum gauges + counter rates across all engines, and average
kv_cache_usage_perc (since it's a per-engine fraction). After fix, the
same row's peak queue depth reads 24 (running 21 + waiting 3), matching
the timeline.

Bumps STATS_VERSION + CHART_SERIES_VERSION to 2 so the backfill scripts
recompute existing rows; both were re-run against 130/26 rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../db/src/etl/compute-chart-series.test.ts   |  80 +++++++++
 packages/db/src/etl/compute-chart-series.ts   | 154 ++++++++++--------
 packages/db/src/queries/agentic-aggregates.ts |  90 ++++++----
 3 files changed, 226 insertions(+), 98 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
index dafc7200..4c6f8791 100644
--- a/packages/db/src/etl/compute-chart-series.test.ts
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -63,6 +63,48 @@ function makeBlob(opts?: {
   return gzipSync(Buffer.from(json));
 }
 
+/** Per-engine synthetic vLLM series: each metric has two timeslices, at 0 ns and 1e9 ns. */
+function buildEngineSeries(engineId: number, baseRunning: number) {
+  const labels = { engine: String(engineId) };
+  return {
+    runningSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: baseRunning },
+        { start_ns: 1e9, avg: baseRunning + 1 },
+      ],
+    },
+    waitingSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0 },
+        { start_ns: 1e9, avg: 0 },
+      ],
+    },
+    kvSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0.25 },
+        { start_ns: 1e9, avg: 0.5 },
+      ],
+    },
+    promptSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 100 },
+        { start_ns: 1e9, rate: 200 },
+      ],
+    },
+    genSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 50 },
+        { start_ns: 1e9, rate: 75 },
+      ],
+    },
+  };
+}
+
 describe('computeChartSeries', () => {
   it('returns null when the blob is null', async () => {
     expect(await computeChartSeries(null)).toBeNull();
@@ -126,4 +168,42 @@ describe('computeChartSeries', () => {
     const result = await computeChartSeries(Buffer.from('not-gzip-data'));
     expect(result).toBeNull();
   });
+
+  it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => {
+    // Simulate a 4-engine deployment: each engine reports its own series for
+    // every metric. Cluster-wide value should be SUM for running/waiting and
+    // counter rates, AVG for kv_cache_usage_perc (per-engine fraction).
+    const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) },
+        'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) },
+        'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) },
+        'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) },
+        'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) },
+      },
+    });
+    const blob = gzipSync(Buffer.from(json));
+    const cs = await computeChartSeries(blob);
+    expect(cs).not.toBeNull();
+    // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1
+    expect(cs!.queueDepth).toEqual([
+      { t: 0, running: 12, waiting: 0, total: 12 },
+      { t: 1, running: 16, waiting: 0, total: 16 },
+    ]);
+    // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value)
+    expect(cs!.kvCacheUsage).toEqual([
+      { t: 0, value: 0.25 },
+      { t: 1, value: 0.5 },
+    ]);
+    // prefillTps / decodeTps = Σ rates over 4 engines: 100→400, 200→800 and 50→200, 75→300
+    expect(cs!.prefillTps).toEqual([
+      { t: 0, value: 400 },
+      { t: 1, value: 800 },
+    ]);
+    expect(cs!.decodeTps).toEqual([
+      { t: 0, value: 200 },
+      { t: 1, value: 300 },
+    ]);
+  });
 });
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 3cb4181b..530600cf 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -17,8 +17,16 @@ import { parser } from 'stream-json';
 import { pick } from 'stream-json/filters/pick.js';
 import { streamObject } from 'stream-json/streamers/stream-object.js';
 
-/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const CHART_SERIES_VERSION = 1;
+/**
+ * Bump when the extraction algorithm changes — backfill recomputes anything
+ * older.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP
+ * deployments — most visible as a request-queue-depth chart that maxed out
+ * at ~3 when the timeline clearly showed 20+ in-flight).
+ */
+export const CHART_SERIES_VERSION = 2;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -147,17 +155,44 @@ export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeri
   return buildSeriesFromMetrics(metrics);
 }
 
-/** Pull the first series under a metric key, or undefined. */
-function firstSeries(metrics: MetricsMap, name: string): RawSeries | undefined {
-  const s = metrics[name]?.series;
-  return s && s.length > 0 ? s[0] : undefined;
+/**
+ * Collapse all series of one metric into a single value per `start_ns`.
+ * Multi-engine vllm deployments emit one series per engine; `combine` picks
+ * the merge: 'sum' for cluster totals (running/waiting gauges, counter
+ * rates), 'avg' for per-engine fractions such as kv_cache_usage_perc.
+ */
+function aggregateByStart(
+  series: readonly RawSeries[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const buckets = new Map<number, { total: number; n: number }>();
+  for (const { timeslices } of series ?? []) {
+    for (const slice of timeslices ?? []) {
+      const at = slice.start_ns;
+      const val = slice[field];
+      if (typeof at !== 'number' || typeof val !== 'number' || !Number.isFinite(val)) continue;
+      const prev = buckets.get(at);
+      buckets.set(at, { total: (prev?.total ?? 0) + val, n: (prev?.n ?? 0) + 1 });
+    }
+  }
+  const merged = new Map<number, number>();
+  for (const [at, { total, n }] of buckets) {
+    merged.set(at, combine === 'sum' ? total : total / n);
+  }
+  return merged;
+}
+
+/** Stable order: emit one point per unique start_ns, chronologically. */
+function sortedEntries(m: Map<number, number>): [number, number][] {
+  return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
 }
 
 function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   // Timing reference: smallest start_ns and largest end_ns across every
-  // timeslice we extracted. (Same logic as the original getTraceServerMetrics
-  // — looking at every metric gives the widest possible window even if some
-  // series start late.)
+  // timeslice we extracted. timeslicesCount is the length of any single
+  // series (engines are scraped on the same cadence), so picking the max
+  // length across all series of all metrics is safe.
   let startNs = Number.POSITIVE_INFINITY;
   let endNs = 0;
   let timeslicesCount = 0;
@@ -175,83 +210,70 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   if (!Number.isFinite(startNs)) startNs = 0;
   const tOf = (ns: number) => (ns - startNs) / 1e9;
 
-  // KV cache usage (gauge, 0..1)
-  const kvCacheUsage: TimeSeriesPoint[] = [];
+  // KV cache usage (gauge, 0..1) — average across engines so the value
+  // stays a fraction (each engine has its own KV pool).
   const kvSeries =
-    firstSeries(metrics, 'vllm:kv_cache_usage_perc') ??
-    firstSeries(metrics, 'vllm:gpu_cache_usage_perc');
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number' && typeof ts.start_ns === 'number') {
-      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
-    }
-  }
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
+    aggregateByStart(kvSeries, 'avg', 'avg'),
+  ).map(([t, v]) => ({ t: tOf(t), value: v }));
 
-  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
-  const hitsTs = firstSeries(metrics, 'vllm:prefix_cache_hits')?.timeslices ?? [];
-  const qsTs = firstSeries(metrics, 'vllm:prefix_cache_queries')?.timeslices ?? [];
+  // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
+  // engines, joined on start_ns.
+  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
+  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
   const prefixCacheHitRate: TimeSeriesPoint[] = [];
-  const minLen = Math.min(hitsTs.length, qsTs.length);
-  for (let i = 0; i < minLen; i++) {
-    const h = hitsTs[i]!;
-    const q = qsTs[i]!;
-    if (
-      typeof q.rate === 'number' &&
-      q.rate > 0 &&
-      typeof h.rate === 'number' &&
-      typeof h.start_ns === 'number'
-    ) {
-      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
-    }
+  for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) {
+    const q = qsByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q });
   }
 
-  // Queue depth: pair running + waiting by index.
-  const runTs = firstSeries(metrics, 'vllm:num_requests_running')?.timeslices ?? [];
-  const waitTs = firstSeries(metrics, 'vllm:num_requests_waiting')?.timeslices ?? [];
+  // Queue depth: sum running + waiting across engines per timeslice.
+  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
+  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
   const queueDepth: QueueDepthPoint[] = [];
-  const qlen = Math.min(runTs.length, waitTs.length);
-  for (let i = 0; i < qlen; i++) {
-    const r = runTs[i]!;
-    const w = waitTs[i]!;
-    if (typeof r.start_ns !== 'number') continue;
-    const running = typeof r.avg === 'number' ? r.avg : 0;
-    const waiting = typeof w.avg === 'number' ? w.avg : 0;
-    queueDepth.push({
-      t: tOf(r.start_ns),
-      running,
-      waiting,
-      total: running + waiting,
-    });
+  // Union of timestamps so we surface activity even if one of the gauges
+  // didn't report a sample on a given tick.
+  const allTimes = new Set<number>([...runByT.keys(), ...waitByT.keys()]);
+  for (const t of [...allTimes].toSorted((a, b) => a - b)) {
+    const running = runByT.get(t) ?? 0;
+    const waiting = waitByT.get(t) ?? 0;
+    queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting });
   }
 
-  // Throughput: extract counter `rate` (already per-second from aiperf).
-  const counterRate = (name: string): TimeSeriesPoint[] => {
-    const s = firstSeries(metrics, name);
-    if (!s) return [];
-    const out: TimeSeriesPoint[] = [];
-    for (const ts of s.timeslices ?? []) {
-      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
-        out.push({ t: tOf(ts.start_ns), value: ts.rate });
-      }
-    }
-    return out;
-  };
+  // Throughput: sum the counter `rate` (already per-second) across engines.
+  const counterRate = (name: string): TimeSeriesPoint[] =>
+    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+      t: tOf(t),
+      value: v,
+    }));
   const prefillTps = counterRate('vllm:prompt_tokens');
   const decodeTps = counterRate('vllm:generation_tokens');
 
-  // Per-source prompt tokens — emit one TS array per source label.
-  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  // Per-source prompt tokens — sum across engines per source label.
+  const promptBySrcByT = new Map<string, Map<number, number>>();
   for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
     const labels = series.labels ?? {};
     const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
-    const arr: TimeSeriesPoint[] = [];
+    let byT = promptBySrcByT.get(source);
+    if (!byT) {
+      byT = new Map<number, number>();
+      promptBySrcByT.set(source, byT);
+    }
     for (const ts of series.timeslices ?? []) {
       if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
-        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+        byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
       }
     }
+  }
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const [source, byT] of promptBySrcByT) {
+    const arr: TimeSeriesPoint[] = [];
+    for (const [t, v] of [...byT.entries()].toSorted((a, b) => a[0] - b[0])) {
+      if (v > 0) arr.push({ t: tOf(t), value: v });
+    }
     if (arr.length > 0) promptTokensBySource[source] = arr;
   }
-
   return {
     version: CHART_SERIES_VERSION,
     startNs,
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 8ac4f678..1ad7fd7f 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -29,8 +29,11 @@ import type { DbClient } from '../connection.js';
  * script recomputes any row whose stored `aggregate_stats.version` is older.
  * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
  * import: the compute helper depends on the percentile utilities below.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
  */
-export const STATS_VERSION = 1;
+export const STATS_VERSION = 2;
 
 export interface MetricPercentiles {
   mean: number;
@@ -154,10 +157,47 @@ interface MetricsJson {
   metrics?: Record<string, MetricMeta>;
 }
 
+/**
+ * Merge every series of a metric into one value per timeslice, keyed by the
+ * timeslice's `start_ns`. On multi-engine DP/PP deployments vllm reports a
+ * separate series per engine, so the cluster-wide figure at each timeslice
+ * has to be built from all of them.
+ *
+ * `field` names the numeric timeslice property to read: `avg` for gauges,
+ * `rate` for counter deltas. `combine` sets the cross-engine math: 'sum'
+ * where the cluster total is additive (running/waiting, throughput
+ * counters); 'avg' for KV cache utilization, a per-engine value in [0, 1]
+ * that is averaged for the cluster view. Non-finite samples are ignored.
+ */
+function aggregateSeriesByStart(
+  metricSeries: readonly Series[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const acc = new Map<number, [number, number]>();
+  for (const entry of metricSeries ?? []) {
+    for (const slice of entry.timeslices ?? []) {
+      const startNs = slice.start_ns;
+      const reading = slice[field];
+      if (typeof startNs !== 'number') continue;
+      if (typeof reading !== 'number' || !Number.isFinite(reading)) continue;
+      const [sum, count] = acc.get(startNs) ?? [0, 0];
+      acc.set(startNs, [sum + reading, count + 1]);
+    }
+  }
+  // 'sum' keeps the totals; 'avg' divides by how many series reported that slice.
+  const combined = new Map<number, number>();
+  for (const [t, [sum, count]] of acc) combined.set(t, combine === 'sum' ? sum : sum / count);
+  return combined;
+}
+
 /**
  * Parse the server_metrics_json → time-series arrays for KV cache util and
  * prefix cache hit rate (per-interval, computed from the prometheus
  * counters the same way trace-server-metrics does it).
+ *
+ * Aggregates across all engine series so multi-engine DP/PP deployments are
+ * counted correctly (previously we only read engine 0).
  */
 export function extractServerMetricSamples(json: string): {
   kvCacheUtil: number[];
@@ -165,40 +205,26 @@ export function extractServerMetricSamples(json: string): {
 } {
   const parsed = JSON.parse(json) as MetricsJson;
   const metrics = parsed.metrics ?? {};
-  const firstSeries = (name: string): Series | undefined => {
-    const s = metrics[name]?.series;
-    return s && s.length > 0 ? s[0] : undefined;
-  };
 
-  // KV cache util — gauge in [0, 1].
-  const kvSeries =
-    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
-  const kvCacheUtil: number[] = [];
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number') kvCacheUtil.push(ts.avg);
-  }
+  // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
+  // value stays a fraction in [0, 1]; summing would give a meaningless 0..N.
+  const kvSeriesAll =
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
 
-  // Prefix cache hit rate per interval = hits.rate / queries.rate.
-  // Matches the derivation in queries/trace-server-metrics.ts.
-  // Metric names: vllm exposes these as `vllm:prefix_cache_*` (no `gpu_`
-  // prefix); falls back to the `gpu_`-prefixed names in case a future
-  // vllm version renames them.
+  // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across
+  // all engines. Sum first, then divide.
+  const hitsAll =
+    metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series;
+  const queriesAll =
+    metrics['vllm:prefix_cache_queries']?.series ??
+    metrics['vllm:gpu_prefix_cache_queries']?.series;
+  const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
+  const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
   const prefixCacheHitRate: number[] = [];
-  const hitsSeries =
-    firstSeries('vllm:prefix_cache_hits') ?? firstSeries('vllm:gpu_prefix_cache_hits');
-  const queriesSeries =
-    firstSeries('vllm:prefix_cache_queries') ?? firstSeries('vllm:gpu_prefix_cache_queries');
-  if (hitsSeries && queriesSeries) {
-    const qByStart = new Map<number, TimeSlice>();
-    for (const q of queriesSeries.timeslices ?? []) {
-      if (typeof q.start_ns === 'number') qByStart.set(q.start_ns, q);
-    }
-    for (const h of hitsSeries.timeslices ?? []) {
-      if (typeof h.start_ns !== 'number' || typeof h.rate !== 'number') continue;
-      const q = qByStart.get(h.start_ns);
-      if (!q || typeof q.rate !== 'number' || q.rate === 0) continue;
-      prefixCacheHitRate.push(h.rate / q.rate);
-    }
+  for (const [t, h] of hitsByT) {
+    const q = qByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push(h / q);
   }
 
   return { kvCacheUtil, prefixCacheHitRate };

From b3e315ccd66bfc5476fc7bf28b1b3c52628ffd8d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 26 May 2026 18:28:33 -0500
Subject: [PATCH 042/111] fix(scenario-selector): wrap "Deprecated" in
 SelectLabel + lead with agentic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two visual issues in the Scenario dropdown:
1. The "Deprecated" sub-header rendered as a bare span while sibling
   group labels ("Fixed Sequence Length") use SelectLabel — so
   "Deprecated" came out in body-text size, looking out of place.
2. Agentic Traces sat below the deprecated fixed-seq entries, visually
   implying it was part of the deprecated section.

Wraps DeprecatedSectionTitle in SelectLabel so the styling matches its
peers across all selectors (Scenario, Model, Hardware) that use it.
Moves the Agentic group to the top of the Scenario dropdown so it's
visually distinct from the fixed-seq + deprecated entries.

Agentic Traces was already the preferred default when available
(GlobalFilterContext.tsx); no behavior change there.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/ui/chart-selectors.tsx | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 19b4bfb0..8b91059a 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -33,7 +33,7 @@ import {
 
 function DeprecatedSectionTitle({ reason }: { reason: string }) {
   return (
-    <span className="flex items-center gap-1">
+    <SelectLabel className="flex items-center gap-1">
       Deprecated
       <TooltipRoot>
         <TooltipTrigger asChild>
@@ -43,7 +43,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) {
           <span>{reason}</span>
         </TooltipContent>
       </TooltipRoot>
-    </span>
+    </SelectLabel>
   );
 }
 
@@ -261,6 +261,17 @@ export function ScenarioSelector({
           <SelectValue />
         </SelectTrigger>
         <SelectContent>
+          {/* Agentic first — preferred default scenario when available. */}
+          {agentic.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Agentic</SelectLabel>
+              {agentic.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+            </SelectGroup>
+          )}
           {fixedSeq.length > 0 && (
             <SelectGroup>
               <SelectLabel>Fixed Sequence Length</SelectLabel>
@@ -281,11 +292,6 @@ export function ScenarioSelector({
               )}
             </SelectGroup>
           )}
-          {agentic.map((seq) => (
-            <SelectItem key={seq} value={seq}>
-              {getSequenceLabel(seq as Sequence)}
-            </SelectItem>
-          ))}
         </SelectContent>
       </Select>
     </div>

From 19b99586353cd39bccd4072bd6e2a2afcaf73367 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 26 May 2026 18:32:26 -0500
Subject: [PATCH 043/111] fix(scenario-selector): wrap Deprecated header in
 SelectLabel only inside Select
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous commit (b3e315c) changed DeprecatedSectionTitle to render
SelectLabel internally, which throws at runtime ("SelectLabel must be
used within SelectGroup") in callsites that render the header via
MultiSelect — MultiSelect wraps the header in its own div, not a Radix
SelectGroup.

Revert the component to a plain styled span (MultiSelect's div wrapper
supplies the small/muted styling), and wrap with SelectLabel only at
the ScenarioSelector callsite, where the header sits directly inside
a SelectGroup.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/ui/chart-selectors.tsx     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 8b91059a..49ea3f1a 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -31,9 +31,16 @@ import {
   sequenceKind,
 } from '@/lib/data-mappings';
 
+/**
+ * "Deprecated" sub-header used by selectors. Rendered as a span (not
+ * SelectLabel) because some callsites use `MultiSelect`, which wraps
+ * headers in its own div and isn't a SelectGroup. The span only sets
+ * layout (flex row + gap); the parent context supplies the muted/small
+ * text treatment. ScenarioSelector wraps it in a SelectLabel itself.
+ */
 function DeprecatedSectionTitle({ reason }: { reason: string }) {
   return (
-    <SelectLabel className="flex items-center gap-1">
+    <span className="flex items-center gap-1">
       Deprecated
       <TooltipRoot>
         <TooltipTrigger asChild>
@@ -43,7 +50,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) {
           <span>{reason}</span>
         </TooltipContent>
       </TooltipRoot>
-    </SelectLabel>
+    </span>
   );
 }
 
@@ -282,7 +289,9 @@ export function ScenarioSelector({
               ))}
               {fixedGroups.deprecated.length > 0 && (
                 <>
-                  <DeprecatedSectionTitle reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  <SelectLabel>
+                    <DeprecatedSectionTitle reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  </SelectLabel>
                   {fixedGroups.deprecated.map((seq) => (
                     <SelectItem key={seq} value={seq}>
                       {getSequenceLabel(seq as Sequence)}

From 7114833409b92a206f7c22b80846db527e01da43 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 13:22:13 -0500
Subject: [PATCH 044/111] feat(agentic-detail): add cumulative input tokens
 chart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surfaces a new chart on the agentic detail page showing the running
total of input (prompt) tokens served over the course of the run —
useful for seeing how the load actually accumulates vs the
instantaneous prefill_tps line we already plot.

Adds a `cumulativeSum` helper alongside the existing `cumulativeAverage`
and `sumSeries` time-series utilities. No backfill needed — the source
data (`chart_series.prefillTps`) is already pre-computed at ingest time
for every blob-bearing row.

(Input throughput as a Pareto axis is already wired via the existing
`y_inputTputPerGpu` y-axis option; no change there.)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 24 +++++++++++++++++++
 .../agentic-point/time-series-chart.tsx       | 17 +++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 2e43b4fb..1a61b93b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -26,6 +26,7 @@ import {
   StackedAreaChart,
   TimeSeriesChart,
   cumulativeAverage,
+  cumulativeSum,
   rollingAverage,
   sumSeries,
 } from './time-series-chart';
@@ -381,6 +382,29 @@ export function AgenticPointDetail({ id }: Props) {
               );
             }}
           />
+
+          <ExpandableChart
+            title="Total input tokens over time"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Cumulative input tokens',
+                      data: cumulativeSum(metrics.prefillTps),
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens"
+                  {...size}
+                />
+              );
+            }}
+          />
         </div>
       )}
     </div>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index cd10aff7..042c4331 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -58,6 +58,23 @@ export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Prefix sums of a per-interval rate series: output point i keeps input
+ * point i's `t` and carries the sum of `value` over points 0..i. Values are
+ * added as-is, so this equals the integral of the rate only when each point
+ * covers a 1-second window (assumed to be aiperf's scrape interval). Use for
+ * "total tokens served so far" from a tokens-per-second series.
+ */
+export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  // An empty series is handed back as-is (same array instance).
+  if (data.length === 0) return data;
+  let runningTotal = 0;
+  return data.map(({ t, value }) => {
+    runningTotal += value;
+    return { t, value: runningTotal };
+  });
+}
+
 /** Pointwise sum of two arrays sharing the same t index. */
 export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
   const n = Math.min(a.length, b.length);

From c6697de8ff3d8263924986fd71b4622f1369f9a3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 14:44:19 -0500
Subject: [PATCH 045/111] feat(agentic-detail): plot cumulative unique input
 tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the "Total input tokens over time" chart with "Total unique
input tokens over time" — cumsum of (prompt-token rate − prefix-cache-
hit rate per second), which equals the cumulative tokens vllm actually
had to prefill from scratch (= vllm:request_prefill_kv_computed_tokens).

Adds `prefixCacheHitsTps` to the chart_series JSONB (extracted by
summing vllm:prefix_cache_hits.rate across all engine series, same DP-
aware path as prefillTps). Bumps CHART_SERIES_VERSION to 3; the
existing trace-server-metrics query defaults the field to [] for any
older v2 rows so reads stay safe before backfill catches up.

Backfilled 62 rows to v3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx       | 14 +++++++++++---
 .../src/hooks/api/use-trace-server-metrics.ts    |  2 ++
 packages/db/src/etl/compute-chart-series.ts      | 16 +++++++++++++++-
 packages/db/src/queries/trace-server-metrics.ts  |  4 ++++
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 1a61b93b..4bebd37c 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -384,16 +384,24 @@ export function AgenticPointDetail({ id }: Props) {
           />
 
           <ExpandableChart
-            title="Total input tokens over time"
+            title="Total unique input tokens over time"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
+              // Unique = total prompt tokens vllm received minus the tokens
+              // it served from the prefix cache. The cache-miss portion is
+              // what actually constitutes "new content" the GPU had to
+              // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens.
+              const unique = sumSeries(
+                metrics.prefillTps,
+                metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })),
+              );
               return (
                 <TimeSeriesChart
                   series={[
                     {
-                      name: 'Cumulative input tokens',
-                      data: cumulativeSum(metrics.prefillTps),
+                      name: 'Cumulative unique input tokens',
+                      data: cumulativeSum(unique),
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 8418aa4f..664bc6c7 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -42,6 +42,8 @@ export interface TraceServerMetrics {
   promptTokensBySource: Record<string, TimeSeriesPoint[]>;
   prefillTps: TimeSeriesPoint[];
   decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 530600cf..91e89521 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -25,8 +25,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * only series[0], which under-counted by Nx on multi-engine DP/PP
  * deployments — most visible as a request-queue-depth chart that maxed out
  * at ~3 when the timeline clearly showed 20+ in-flight).
+ *
+ * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
+ * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
  */
-export const CHART_SERIES_VERSION = 2;
+export const CHART_SERIES_VERSION = 3;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -57,6 +60,13 @@ export interface ChartSeries {
   promptTokensBySource: Record<string, TimeSeriesPoint[]>;
   prefillTps: TimeSeriesPoint[];
   decodeTps: TimeSeriesPoint[];
+  /**
+   * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across
+   * engines. Detail page derives "cumulative unique input tokens" as
+   * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually
+   * saved vs the raw queries that came in.
+   */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -249,6 +259,9 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }));
   const prefillTps = counterRate('vllm:prompt_tokens');
   const decodeTps = counterRate('vllm:generation_tokens');
+  // Tokens served from prefix cache per scrape. Lets the frontend derive
+  // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits');
 
   // Per-source prompt tokens — sum across engines per source label.
   const promptBySrcByT = new Map<string, Map<number, number>>();
@@ -286,5 +299,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     promptTokensBySource,
     prefillTps,
     decodeTps,
+    prefixCacheHitsTps,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 624b6ed3..76775e77 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -71,6 +71,8 @@ export interface TraceServerMetrics {
   prefillTps: TimeSeriesPoint[];
   /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
   decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -114,6 +116,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     promptTokensBySource: series.promptTokensBySource,
     prefillTps: series.prefillTps,
     decodeTps: series.decodeTps,
+    // v2 chart_series rows pre-backfill don't have this field — default to []
+    prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
   };
 }
 

From b5679bb10acfd6a6765b48a5864b2a0ec73d4915 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:00:12 -0500
Subject: [PATCH 046/111] feat(request-timeline): expandable subagent -> stream
 rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The harness fans a single subagent into multiple parallel ":sN" streams
when its inner requests overlap in time (weka_trace._pack_into_streams).
Previously each :sN got its own swimlane row, which made one parent
conversation with 5 subagents (each fanned into 2-8 streams) render as
23 separate rows — visually implying 23 distinct subagent invocations
when really there are 5.

Now: each subagent shows as one row by default with a chevron + stream
count chip ("subagent 003 · f1e7 ×8"). The collapsed row draws the
union of all stream bars overlaid, so the concurrency burst is still
visible at a glance. Click the chevron to fan into per-stream rows;
click again to collapse.

For conv 0f5b266f in benchmark 206360: 23 rows → 5 rows by default.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 325 ++++++++++++------
 1 file changed, 226 insertions(+), 99 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index bcbe105a..8762a158 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -53,44 +53,84 @@ const PHASE_COLORS: Record<string, string> = {
   unknown: '#64748b',
 };
 
+/**
+ * Row kinds:
+ *   parent           — top-level conversation (depth 0)
+ *   worker           — worker swimlane (depth 0, worker mode)
+ *   subagent         — a subagent invocation (depth 1). Either a single
+ *                      stream (renders its own bars), or a multi-stream
+ *                      container whose bars are the union of its streams
+ *                      when collapsed.
+ *   stream           — one :sN stream of a multi-stream subagent (depth 2).
+ *                      Hidden by default; toggled in via the parent's chevron.
+ */
+type RowKind = 'parent' | 'worker' | 'subagent' | 'stream';
+
 interface Row {
   key: string;
   label: string;
   color: string;
   requests: RequestRecord[];
-  /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */
   depth: number;
-  /** True if this row is a sub-agent ("Subagent N of parent X"). */
-  isSubagent: boolean;
+  kind: RowKind;
+  /** Number of streams under this subagent (>=1). Only set for subagent rows. */
+  streamCount?: number;
+  /** For stream rows: the parent subagent's row key (drives expand/collapse). */
+  parentRowKey?: string;
 }
 
 /**
  * Conversation ids for subagent calls look like
- *   <parent_cid>::sa:subagent_<N>_<hash>
- * Split into the parent cid and a sub-agent label (or the whole thing if
- * this is a top-level conversation).
+ *   <parent_cid>::sa:<agent_id>[:s<stream_idx>]
+ * The optional `:s<N>` suffix is set when the harness fans a single
+ * subagent into multiple parallel "streams" (interval-graph
+ * decomposition in weka_trace._pack_into_streams). We split it off so
+ * we can group all streams of one subagent under a single header row.
  */
-function splitCid(cid: string): { parent: string; subagent: string | null } {
+function splitCid(cid: string): {
+  parent: string;
+  subagentBase: string | null;
+  stream: number | null;
+} {
   const sep = cid.indexOf('::sa:');
-  if (sep === -1) return { parent: cid, subagent: null };
-  return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) };
+  if (sep === -1) return { parent: cid, subagentBase: null, stream: null };
+  const parent = cid.slice(0, sep);
+  const raw = cid.slice(sep + 5);
+  const m = /^(.*):s(\d+)$/.exec(raw);
+  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) };
+  return { parent, subagentBase: raw, stream: null };
 }
 
-/** Group requests into rows; in conversation mode subagents nest under parents. */
-function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
-  const groups = new Map<string, RequestRecord[]>();
-  for (const r of requests) {
-    const key = mode === 'conversation' ? r.cid : r.wid;
-    let list = groups.get(key);
-    if (!list) {
-      list = [];
-      groups.set(key, list);
-    }
-    list.push(r);
-  }
-
+/**
+ * Group requests into rows. In conversation mode, output order is:
+ *   parent_conv
+ *     subagent_001                  (collapsed by default, container)
+ *       :s0                         (hidden unless expanded)
+ *       :s1
+ *     subagent_002
+ *     ...
+ *
+ * `expandedSubagents` controls which subagent containers reveal their
+ * stream children. Bars on a collapsed subagent are the UNION of all its
+ * streams' requests — overlapping bars visually communicate the
+ * stream-level parallelism without expanding.
+ */
+function buildRows(
+  requests: RequestRecord[],
+  mode: RowMode,
+  expandedSubagents: ReadonlySet<string>,
+): Row[] {
   if (mode !== 'conversation') {
     // Worker mode: flat rows, sorted by first activity.
+    const groups = new Map<string, RequestRecord[]>();
+    for (const r of requests) {
+      let list = groups.get(r.wid);
+      if (!list) {
+        list = [];
+        groups.set(r.wid, list);
+      }
+      list.push(r);
+    }
     const rows: Row[] = [];
     let i = 0;
     for (const [key, list] of groups) {
@@ -101,7 +141,7 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
         color: ROW_COLORS[i % ROW_COLORS.length]!,
         requests: list,
         depth: 0,
-        isSubagent: false,
+        kind: 'worker',
       });
       i++;
     }
@@ -109,36 +149,40 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
     return rows;
   }
 
-  // Conversation mode: build a parent → [subagents] tree so each parent
-  // group renders as one parent row followed by its sub-agent rows. Color
-  // is shared inside a tree so the visual grouping reads.
+  // Conversation mode — tree: parent → subagent → stream.
   interface Tree {
     parentCid: string;
-    parentRow: { key: string; requests: RequestRecord[] } | null;
-    subagents: Map<string, RequestRecord[]>; // subagent label → requests
+    parentReqs: RequestRecord[];
+    // subagentBase → (streamIndex|null → requests)
+    subagents: Map<string, Map<number | null, RequestRecord[]>>;
     firstStart: number;
   }
   const trees = new Map<string, Tree>();
-  for (const [cid, list] of groups) {
-    list.sort((a, b) => a.start - b.start);
-    const { parent, subagent } = splitCid(cid);
+  for (const r of requests) {
+    const { parent, subagentBase, stream } = splitCid(r.cid);
     let tree = trees.get(parent);
     if (!tree) {
       tree = {
         parentCid: parent,
-        parentRow: null,
+        parentReqs: [],
         subagents: new Map(),
         firstStart: Number.POSITIVE_INFINITY,
       };
       trees.set(parent, tree);
     }
-    if (subagent === null) {
-      tree.parentRow = { key: cid, requests: list };
+    if (subagentBase === null) {
+      tree.parentReqs.push(r);
     } else {
-      tree.subagents.set(subagent, list);
+      let saMap = tree.subagents.get(subagentBase);
+      if (!saMap) {
+        saMap = new Map();
+        tree.subagents.set(subagentBase, saMap);
+      }
+      const list = saMap.get(stream);
+      if (list) list.push(r);
+      else saMap.set(stream, [r]);
     }
-    const earliest = list[0]!.start;
-    if (earliest < tree.firstStart) tree.firstStart = earliest;
+    if (r.start < tree.firstStart) tree.firstStart = r.start;
   }
 
   const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
@@ -147,39 +191,66 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
   for (const tree of sortedTrees) {
     const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
     colorIdx++;
-    if (tree.parentRow) {
+    // Parent row (use a placeholder key if the parent itself wasn't replayed).
+    tree.parentReqs.sort((a, b) => a.start - b.start);
+    rows.push({
+      key: tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`,
+      label: tree.parentCid,
+      color,
+      requests: tree.parentReqs,
+      depth: 0,
+      kind: 'parent',
+    });
+
+    // One subagent row per base (which may contain N streams).
+    const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => {
+      const aStart = Math.min(
+        ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      const bStart = Math.min(
+        ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      return aStart - bStart;
+    });
+    for (const [saBase, streams] of subagentEntries) {
+      const subagentKey = `${tree.parentCid}::sa:${saBase}`;
+      // Union of all stream requests for collapsed-view bars.
+      const allReqs: RequestRecord[] = [];
+      for (const reqs of streams.values()) allReqs.push(...reqs);
+      allReqs.sort((a, b) => a.start - b.start);
+      const streamCount = streams.size;
       rows.push({
-        key: tree.parentRow.key,
-        label: shortenCid(tree.parentCid),
+        key: subagentKey,
+        label: `↳ ${formatSubagentLabel(saBase)}`,
         color,
-        requests: tree.parentRow.requests,
-        depth: 0,
-        isSubagent: false,
-      });
-    } else {
-      // Pseudo-parent header so orphan subagents still render under
-      // something they belong to.
-      rows.push({
-        key: `__parent_${tree.parentCid}`,
-        label: shortenCid(tree.parentCid),
-        color,
-        requests: [],
-        depth: 0,
-        isSubagent: false,
-      });
-    }
-    const subagentEntries = [...tree.subagents.entries()].toSorted(
-      (a, b) => a[1][0]!.start - b[1][0]!.start,
-    );
-    for (const [saLabel, list] of subagentEntries) {
-      rows.push({
-        key: `${tree.parentCid}::${saLabel}`,
-        label: `↳ ${formatSubagentLabel(saLabel)}`,
-        color,
-        requests: list,
+        requests: allReqs,
         depth: 1,
-        isSubagent: true,
+        kind: 'subagent',
+        streamCount,
       });
+
+      // Stream children only when expanded AND there's more than one
+      // stream (a single-stream subagent has nothing extra to show).
+      if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
+        const streamEntries = [...streams.entries()].toSorted((a, b) => {
+          // Sort by stream index (null first as the "default" stream)
+          const ai = a[0] ?? -1;
+          const bi = b[0] ?? -1;
+          return ai - bi;
+        });
+        for (const [streamIdx, reqs] of streamEntries) {
+          reqs.sort((a, b) => a.start - b.start);
+          rows.push({
+            key: `${subagentKey}:s${streamIdx ?? '∅'}`,
+            label: `stream ${streamIdx ?? '∅'}`,
+            color,
+            requests: reqs,
+            depth: 2,
+            kind: 'stream',
+            parentRowKey: subagentKey,
+          });
+        }
+      }
     }
   }
   return rows;
@@ -192,11 +263,6 @@ function formatSubagentLabel(raw: string): string {
   return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`;
 }
 
-function shortenCid(cid: string): string {
-  if (cid.length <= 12) return cid;
-  return `${cid.slice(0, 8)}…${cid.slice(-4)}`;
-}
-
 function shortenWid(wid: string): string {
   // worker_4ae87bea → w_4ae8
   return wid.replace(/^worker_/, 'w_').slice(0, 12);
@@ -314,6 +380,17 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   const [rowMode, setRowMode] = useState<RowMode>('conversation');
   const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+  // Which multi-stream subagents currently have their per-stream rows
+  // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
+  const [expandedSubagents, setExpandedSubagents] = useState<ReadonlySet<string>>(() => new Set());
+  const toggleSubagent = useCallback((key: string) => {
+    setExpandedSubagents((prev) => {
+      const next = new Set(prev);
+      if (next.has(key)) next.delete(key);
+      else next.add(key);
+      return next;
+    });
+  }, []);
   const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
 
   // Apply phase filter, then group into rows.
@@ -322,7 +399,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
       phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
     [data.requests, phaseFilter],
   );
-  const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]);
+  const rows = useMemo(
+    () => buildRows(filtered, rowMode, expandedSubagents),
+    [filtered, rowMode, expandedSubagents],
+  );
 
   // Pre-sort the timestamp columns so the cursor-time stats popover can
   // count "running / waiting at time t" in O(log n). With a few hundred
@@ -359,7 +439,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   const isZoomed = viewEnd !== null;
 
   // Layout
-  const LABEL_WIDTH = 160;
+  // Wide enough for a full 36-char conversation id at 10px font, plus the
+  // indent + color stripe + count badge. Subagent rows inherit the same
+  // width but truncate the longer "↳ subagent N · hash" tail with ellipsis.
+  const LABEL_WIDTH = 360;
   const ROW_HEIGHT = 22;
   const ROW_GAP = 3;
   const HEADER_HEIGHT = 24;
@@ -537,33 +620,58 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                 {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
               </span>
             </div>
-            {rows.map((row) => (
-              <div
-                key={row.key}
-                className="flex items-center gap-1.5 overflow-hidden pr-2"
-                style={{
-                  height: ROW_HEIGHT + ROW_GAP,
-                  paddingLeft: 8 + row.depth * 12,
-                }}
-              >
-                <span
-                  className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+            {rows.map((row) => {
+              const isSubagentRow = row.kind === 'subagent';
+              const isStreamRow = row.kind === 'stream';
+              const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+              const isExpanded = isExpandable && expandedSubagents.has(row.key);
+              return (
+                <div
+                  key={row.key}
+                  className="flex items-center gap-1 overflow-hidden pr-2"
                   style={{
-                    backgroundColor: row.color,
-                    opacity: row.isSubagent ? 0.55 : 1,
+                    height: ROW_HEIGHT + ROW_GAP,
+                    paddingLeft: 4 + row.depth * 10,
                   }}
-                />
-                <span
-                  className="text-[10px] font-mono truncate"
-                  style={{ color: row.color, opacity: row.isSubagent ? 0.85 : 1 }}
                 >
-                  {row.label}
-                </span>
-                <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
-                  {row.requests.length > 0 ? row.requests.length : '—'}
-                </span>
-              </div>
-            ))}
+                  {isExpandable ? (
+                    <button
+                      type="button"
+                      onClick={() => toggleSubagent(row.key)}
+                      className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+                      aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                      title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                    >
+                      <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
+                    </button>
+                  ) : (
+                    <span className="size-3.5 shrink-0" />
+                  )}
+                  <span
+                    className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                    style={{
+                      backgroundColor: row.color,
+                      opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                    }}
+                  />
+                  <span
+                    className="text-[10px] font-mono truncate"
+                    style={{
+                      color: row.color,
+                      opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                    }}
+                  >
+                    {row.label}
+                    {isExpandable && (
+                      <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                    )}
+                  </span>
+                  <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                    {row.requests.length > 0 ? row.requests.length : '—'}
+                  </span>
+                </div>
+              );
+            })}
           </div>
 
           {/* Scrollable SVG */}
@@ -636,6 +744,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
               {rows.map((row, rowIdx) => {
                 const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
                 const barH = ROW_HEIGHT - 4;
+                // For multi-stream subagent containers, suppress the union
+                // bars when expanded — the child stream rows draw them
+                // individually instead, so we'd double-draw otherwise.
+                if (
+                  row.kind === 'subagent' &&
+                  (row.streamCount ?? 1) > 1 &&
+                  expandedSubagents.has(row.key)
+                ) {
+                  return null;
+                }
                 return row.requests.map((req) => {
                   const xCredit = xOf(req.credit);
                   const xStart = xOf(req.start);
@@ -663,7 +781,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                           opacity={0.35}
                         />
                       )}
-                      {/* Main bar */}
+                      {/* Main bar — opacity stepped down with depth so
+                          parent > subagent > stream reads visually. */}
                       <rect
                         x={xStart}
                         y={yTop}
@@ -671,7 +790,15 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                         height={barH}
                         rx={2}
                         fill={row.color}
-                        opacity={req.cancelled ? 0.35 : row.isSubagent ? 0.6 : 0.85}
+                        opacity={
+                          req.cancelled
+                            ? 0.35
+                            : row.kind === 'stream'
+                              ? 0.5
+                              : row.kind === 'subagent'
+                                ? 0.6
+                                : 0.85
+                        }
                       />
                       {/* Phase strip at bottom */}
                       <rect

From 2e1f1ce33da85dbc8058bf41feffffc04ba7ee26 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:07:27 -0500
Subject: [PATCH 047/111] fix(agentic-detail): make unique-input-tokens chart
 monotonic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vllm's per-scrape prompt_tokens.rate and prefix_cache_hits.rate counters
can lag each other by several seconds across scrapes (we see prefill=0
at one tick with hits=1.1M, then prefill=1.5M with hits=452K six ticks
later — lifetime totals agree but per-tick they don't). Computing
cumsum(prefill - hits) per tick made the chart dip well negative at
the start.

Replaces the per-tick subtraction with `cumulativeDifferenceMonotonic`:
union the two series by timestamp, accumulate each independently, take
the diff, then enforce a running max so the curve never decreases.
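End-of-run totals are unchanged for the skew we see (hits reported
ahead of prompt tokens, so both counters converge to the right value
and the difference peaks at the end); transient skew just looks like a
brief plateau instead of a negative dip. If prompt tokens were ever
reported ahead of hits, the running max would retain that overshoot.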

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 21 ++++++-----
 .../agentic-point/time-series-chart.tsx       | 37 +++++++++++++++++++
 2 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 4bebd37c..1abf64e6 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -26,7 +26,7 @@ import {
   StackedAreaChart,
   TimeSeriesChart,
   cumulativeAverage,
-  cumulativeSum,
+  cumulativeDifferenceMonotonic,
   rollingAverage,
   sumSeries,
 } from './time-series-chart';
@@ -388,20 +388,21 @@ export function AgenticPointDetail({ id }: Props) {
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
-              // Unique = total prompt tokens vllm received minus the tokens
-              // it served from the prefix cache. The cache-miss portion is
-              // what actually constitutes "new content" the GPU had to
-              // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens.
-              const unique = sumSeries(
-                metrics.prefillTps,
-                metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })),
-              );
+              // Unique = total prompt tokens received minus tokens served
+              // from the prefix cache. Equivalent to cumsum of
+              // vllm:request_prefill_kv_computed_tokens. We compute it as
+              // monotonic-non-decreasing cumulative-diff so per-scrape
+              // timing skew between the prompt_tokens and prefix_cache_hits
+              // counters can't make the line dip negative.
               return (
                 <TimeSeriesChart
                   series={[
                     {
                       name: 'Cumulative unique input tokens',
-                      data: cumulativeSum(unique),
+                      data: cumulativeDifferenceMonotonic(
+                        metrics.prefillTps,
+                        metrics.prefixCacheHitsTps,
+                      ),
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 042c4331..25d5a672 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -75,6 +75,43 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Monotonic-non-decreasing cumulative difference of two rate series:
+ * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce
+ * a running max so the curve never dips below its prior value.
+ *
+ * Use this to plot things like "cumulative cache-missed tokens" where the
+ * true value can only ever grow, but the underlying per-tick rates can
+ * temporarily look negative due to counter timing skew between scrapes
+ * (vllm's `prefix_cache_hits` and `prompt_tokens` counters can lag each
+ * other by ~5-10 s in our data even though their lifetime totals agree).
+ *
+ * `a` and `b` may have different (or overlapping) timestamp sets — both
+ * are unioned and walked in time order. Output has one point per unique
+ * timestamp present in either input.
+ */
+export function cumulativeDifferenceMonotonic(
+  a: TimeSeriesPoint[],
+  b: TimeSeriesPoint[],
+): TimeSeriesPoint[] {
+  const aByT = new Map(a.map((p) => [p.t, p.value]));
+  const bByT = new Map(b.map((p) => [p.t, p.value]));
+  const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y);
+  const out: TimeSeriesPoint[] = Array.from({ length: allT.length });
+  let cumA = 0;
+  let cumB = 0;
+  let runningMax = 0;
+  for (let i = 0; i < allT.length; i++) {
+    const t = allT[i]!;
+    cumA += aByT.get(t) ?? 0;
+    cumB += bByT.get(t) ?? 0;
+    const diff = cumA - cumB;
+    if (diff > runningMax) runningMax = diff;
+    out[i] = { t, value: runningMax };
+  }
+  return out;
+}
+
 /** Pointwise sum of two arrays sharing the same t index. */
 export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
   const n = Math.min(a.length, b.length);

From 08bbe6650c73935d7ac7a9fa29a722b141911bc9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:15:05 -0500
Subject: [PATCH 048/111] feat(agentic-detail): add unique input tokens in
 flight chart

New chart on the per-point view that plots the deduped count of
input tokens currently held by in-flight requests, as a 30s time-
weighted rolling average with the raw step series rendered as faint
scatter behind it. Useful for seeing the working set the model has
to hold KV cache for at any instant.

Computation (frontend, from request_timeline):
  - At each request start/end event, maintain active ISL per cid
    (within one cid turns are sequential, so each cid contributes
    at most one in-flight ISL at a time)
  - total_in_flight(t) = sum over cids with active request of that
    cid's current ISL
  - Across cids we treat content as independent (cross-conv prefix
    sharing measured at <1 pp, so summing is a tight approximation)

Adds timeRollingAverage helper: time-weighted (vs sample-count)
moving average suitable for irregularly-sampled event series like
this one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 43 ++++++++-
 .../agentic-point/time-series-chart.tsx       | 96 +++++++++++++++++++
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 1abf64e6..2db2809b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -27,8 +27,10 @@ import {
   TimeSeriesChart,
   cumulativeAverage,
   cumulativeDifferenceMonotonic,
+  inflightUniqueTokens,
   rollingAverage,
   sumSeries,
+  timeRollingAverage,
 } from './time-series-chart';
 
 interface Props {
@@ -124,8 +126,10 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
-  // Per-request timeline fetched only when the timeline view is active.
-  const timelineQuery = useRequestTimeline(id, view === 'timeline');
+  // Per-request timeline used by both the timeline view AND the per-point
+  // "Unique input tokens in flight" chart, so fetch whenever we're on
+  // either view.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -414,6 +418,41 @@ export function AgenticPointDetail({ id }: Props) {
               );
             }}
           />
+
+          <ExpandableChart
+            title="Unique input tokens in flight"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!timelineQuery.data) {
+                return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+              }
+              // Step function: at each request start/end, sum the ISLs of
+              // currently-active requests across distinct cids. Within one
+              // cid turns are sequential so each cid contributes at most
+              // one in-flight ISL; across cids we treat content as
+              // independent (cross-conv prefix sharing adds <1pp in
+              // practice). Smooth with a 30s time-weighted rolling average
+              // so brief turn-handoff dips don't dominate the chart.
+              const raw = inflightUniqueTokens(timelineQuery.data.requests);
+              const smoothed = timeRollingAverage(raw, 30);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'In flight (avg 30s)',
+                      data: smoothed,
+                      rawData: raw,
+                      color: '#a855f7',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={timelineQuery.data.durationS}
+                  yAxisLabel="Tokens"
+                  {...size}
+                />
+              );
+            }}
+          />
         </div>
       )}
     </div>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 25d5a672..520b3ed6 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -27,6 +27,39 @@ interface TimeSeriesChartProps {
   height?: number;
 }
 
+/**
+ * Time-weighted trailing average of a step series (each value is held
+ * until the next sample). For every sample at time t it integrates over
+ * [max(0, t - windowS), t] and divides by that clamped span, so samples
+ * with t < `windowS` average over a shorter window. Returns `data`
+ * unchanged when it is empty or `windowS <= 0`. Unlike the sample-count
+ * `rollingAverage`, bursts of close-together events are not over-weighted.
+ */
+export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] {
+  if (data.length === 0 || windowS <= 0) return data;
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  for (let i = 0; i < data.length; i++) {
+    const tEnd = data[i]!.t;
+    const tStart = Math.max(0, tEnd - windowS);
+    // Find the first sample j whose t is >= tStart (rescanned from 0 for
+    // each i); the held value at tStart is data[j-1].value, or data[0].value if j is 0.
+    let j = 0;
+    while (j < data.length && data[j]!.t < tStart) j++;
+    let prevT = tStart;
+    let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value;
+    let area = 0;
+    for (; j <= i; j++) {
+      const curT = data[j]!.t;
+      area += prevV * (curT - prevT);
+      prevT = curT;
+      prevV = data[j]!.value;
+    }
+    const dur = tEnd - tStart; // min(tEnd, windowS): <= 0 only for samples at t <= 0
+    out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value };
+  }
+  return out;
+}
+
 /** Centered rolling average over `windowSize` samples. */
 export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
   if (data.length === 0 || windowSize <= 1) return data;
@@ -75,6 +108,69 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Per-event step series: at each request start/end, sum the ISLs of
+ * currently-active requests across distinct `cid`s. Within a single
+ * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N),
+ * so each cid contributes at most one in-flight ISL at a time. Across
+ * different cids we assume content is independent (parent ↔ subagent
+ * and conv ↔ conv share negligible prefix in practice — cross-conv
+ * dedup added ~0.25 pp to theoretical hit rate), so the sum closely
+ * approximates the true in-flight unique token count. Requests whose
+ * `isl` is null or <= 0 are ignored.
+ *
+ * Output is a step function: a leading {t: 0, value: 0} point, then one
+ * point per event, value held constant until the next event. `t` is the
+ * event timestamp in seconds (ns / 1e9), not re-based to the first event.
+ */
+export function inflightUniqueTokens(
+  requests: readonly { cid: string; start: number; end: number; isl: number | null }[],
+): TimeSeriesPoint[] {
+  if (requests.length === 0) return [];
+  // The request_timeline timestamps are ns-relative to its own origin.
+  // Convert events to seconds and emit a step series.
+  interface Event {
+    tNs: number;
+    kind: 'start' | 'end';
+    cid: string;
+    isl: number;
+  }
+  const events: Event[] = [];
+  for (const r of requests) {
+    const isl = r.isl ?? 0;
+    if (isl <= 0) continue;
+    events.push({ tNs: r.start, kind: 'start', cid: r.cid, isl });
+    events.push({ tNs: r.end, kind: 'end', cid: r.cid, isl });
+  }
+  if (events.length === 0) return [];
+  // Sort by time; ties run 'end' first so a same-instant turn handoff in one cid doesn't
+  // double-count. NOTE(review): a start === end request stays counted until the cid's next end.
+  events.sort((a, b) => a.tNs - b.tNs || (a.kind === 'end' ? -1 : 1));
+
+  // Active ISL per cid. If one cid ever overlaps, the max ISL is kept and the
+  // first 'end' clears the cid; in practice it's 0 or 1 request per cid.
+  const activeByCid = new Map<string, number>();
+  let total = 0;
+  const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }];
+  for (const e of events) {
+    const tSec = e.tNs / 1e9;
+    if (e.kind === 'start') {
+      const prev = activeByCid.get(e.cid) ?? 0;
+      const next = Math.max(prev, e.isl);
+      activeByCid.set(e.cid, next);
+      total += next - prev;
+    } else {
+      const cur = activeByCid.get(e.cid) ?? 0;
+      if (cur > 0) {
+        total -= cur;
+        activeByCid.delete(e.cid);
+      }
+    }
+    out.push({ t: tSec, value: Math.max(0, total) });
+  }
+  return out;
+}
+
 /**
  * Monotonic-non-decreasing cumulative difference of two rate series:
  * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce

From 7561deb1cc5a210ce6cd074ab0d4771b3b9f8342 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 20:30:39 -0500
Subject: [PATCH 049/111] feat(chart-series): extract SGLang metrics alongside
 vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our chart_series + aggregate_stats extractors hardcoded vllm:* metric
names, so SGLang runs (e.g. qwen3.5/h100/sglang) ingested cleanly but
the per-point detail page rendered empty charts — chart_series fields
were all zero-length arrays.

Adds fallback chains in each extractor:

  KV cache util      vllm:kv_cache_usage_perc  → sglang:token_usage
  Prefix cache hits  vllm:prefix_cache_hits    → sglang:cached_tokens
  Prefix cache qrys  vllm:prefix_cache_queries → vllm:prompt_tokens → sglang:prompt_tokens
  Requests running   vllm:num_requests_running → sglang:num_running_reqs
  Requests waiting   vllm:num_requests_waiting → sglang:num_queue_reqs
  Prompt tokens rate vllm:prompt_tokens        → sglang:prompt_tokens
  Generation rate    vllm:generation_tokens    → sglang:generation_tokens

The `pickSeries` (chart_series) and `pickFirstNonEmpty` (aggregate_stats)
helpers walk the chain and use the first series that has data, so a future
framework (mori-sglang, dynamo, etc.) can plug in by adding its names to
each chain — no per-framework branching.

CHART_SERIES_VERSION → 4, STATS_VERSION → 3. Both backfills re-ran (86
chart_series rows, 190 aggregate_stats rows). SGLang chart_series for
qwen3.5 run 944 verified — was 0-length arrays before, now ~1800
samples each.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts   | 67 +++++++++++++++----
 packages/db/src/queries/agentic-aggregates.ts | 56 +++++++++++++---
 2 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 91e89521..86b79925 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -28,8 +28,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  *
  * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
  * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
+ *
+ * v4: extract sglang:* metrics too (fallback chain in each picker), so
+ * SGLang runs populate the chart_series the same way vllm runs do.
  */
-export const CHART_SERIES_VERSION = 3;
+export const CHART_SERIES_VERSION = 4;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -89,8 +92,13 @@ interface RawMetric {
 
 type MetricsMap = Record<string, RawMetric>;
 
-/** The set of metric subtrees the chart consumes. */
+/**
+ * The set of metric subtrees the chart consumes. Includes both vllm:* and
+ * sglang:* names so the stream-parse fallback collects whichever framework
+ * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric.
+ */
 const CHART_METRIC_KEYS = new Set([
+  // vLLM
   'vllm:kv_cache_usage_perc',
   'vllm:gpu_cache_usage_perc',
   'vllm:prefix_cache_hits',
@@ -100,6 +108,13 @@ const CHART_METRIC_KEYS = new Set([
   'vllm:prompt_tokens',
   'vllm:generation_tokens',
   'vllm:prompt_tokens_by_source',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
+  'sglang:generation_tokens',
+  'sglang:num_running_reqs',
+  'sglang:num_queue_reqs',
 ]);
 
 /**
@@ -220,18 +235,37 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   if (!Number.isFinite(startNs)) startNs = 0;
   const tOf = (ns: number) => (ns - startNs) / 1e9;
 
+  // Pick the first metric name whose series array has any data; fallback
+  // chain lets the same code path serve both vllm:* and sglang:* blobs.
+  const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => {
+    for (const name of names) {
+      const s = metrics[name]?.series;
+      if (s && s.length > 0) return s;
+    }
+    return undefined;
+  };
+
   // KV cache usage (gauge, 0..1) — average across engines so the value
   // stays a fraction (each engine has its own KV pool).
-  const kvSeries =
-    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvSeries = pickSeries(
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
   const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
     aggregateByStart(kvSeries, 'avg', 'avg'),
   ).map(([t, v]) => ({ t: tOf(t), value: v }));
 
   // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
-  // engines, joined on start_ns.
-  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
-  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
+  // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
+  const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+  const qsSeries = pickSeries(
+    'vllm:prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
+  const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum');
+  const qsByT = aggregateByStart(qsSeries, 'rate', 'sum');
   const prefixCacheHitRate: TimeSeriesPoint[] = [];
   for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) {
     const q = qsByT.get(t);
@@ -239,8 +273,10 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   }
 
   // Queue depth: sum running + waiting across engines per timeslice.
-  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
-  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
+  const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs');
+  const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs');
+  const runByT = aggregateByStart(runSeries, 'avg', 'sum');
+  const waitByT = aggregateByStart(waitSeries, 'avg', 'sum');
   const queueDepth: QueueDepthPoint[] = [];
   // Union of timestamps so we surface activity even if one of the gauges
   // didn't report a sample on a given tick.
@@ -252,16 +288,19 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   }
 
   // Throughput: sum the counter `rate` (already per-second) across engines.
-  const counterRate = (name: string): TimeSeriesPoint[] =>
-    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+  // Takes a fallback chain so vllm:* and sglang:* both work.
+  const counterRate = (...names: string[]): TimeSeriesPoint[] => {
+    const s = pickSeries(...names);
+    return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({
       t: tOf(t),
       value: v,
     }));
-  const prefillTps = counterRate('vllm:prompt_tokens');
-  const decodeTps = counterRate('vllm:generation_tokens');
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens');
   // Tokens served from prefix cache per scrape. Lets the frontend derive
   // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
-  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits');
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
   // Per-source prompt tokens — sum across engines per source label.
   const promptBySrcByT = new Map<string, Map<number, number>>();
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 1ad7fd7f..da5d18a0 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -32,8 +32,12 @@ import type { DbClient } from '../connection.js';
  *
  * v2: aggregate vllm gauges/counters across all engine series (was reading
  * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
+ *
+ * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
+ * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
+ * they do for vllm runs.
  */
-export const STATS_VERSION = 2;
+export const STATS_VERSION = 3;
 
 export interface MetricPercentiles {
   mean: number;
@@ -199,6 +203,18 @@ function aggregateSeriesByStart(
  * Aggregates across all engine series so multi-engine DP/PP deployments are
  * counted correctly (previously we only read engine 0).
  */
+/** Series of the first `names` entry with data, else undefined (vllm → sglang fallback). */
+function pickFirstNonEmpty(
+  metrics: Record<string, MetricMeta>,
+  ...names: string[]
+): Series[] | undefined {
+  for (const name of names) {
+    const s = metrics[name]?.series;
+    if (s && s.length > 0) return s;
+  }
+  return undefined;
+}
+
 export function extractServerMetricSamples(json: string): {
   kvCacheUtil: number[];
   prefixCacheHitRate: number[];
@@ -208,17 +224,29 @@ export function extractServerMetricSamples(json: string): {
 
   // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
   // value stays a percentage; summing would give meaningless 0..N.
-  const kvSeriesAll =
-    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvSeriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
   const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
 
   // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across
-  // all engines. Sum first, then divide.
-  const hitsAll =
-    metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series;
-  const queriesAll =
-    metrics['vllm:prefix_cache_queries']?.series ??
-    metrics['vllm:gpu_prefix_cache_queries']?.series;
+  // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens.
+  const hitsAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_hits',
+    'vllm:gpu_prefix_cache_hits',
+    'sglang:cached_tokens',
+  );
+  const queriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_queries',
+    'vllm:gpu_prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
   const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
   const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
   const prefixCacheHitRate: number[] = [];
@@ -232,12 +260,18 @@ export function extractServerMetricSamples(json: string): {
 
 /** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
 const TARGET_METRIC_KEYS = new Set([
+  // vLLM
   'vllm:kv_cache_usage_perc',
-  'vllm:gpu_cache_usage_perc', // older fallback name
+  'vllm:gpu_cache_usage_perc',
   'vllm:prefix_cache_hits',
   'vllm:prefix_cache_queries',
-  'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+  'vllm:gpu_prefix_cache_hits',
   'vllm:gpu_prefix_cache_queries',
+  'vllm:prompt_tokens',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
 ]);
 
 /**

From 625d6e85e411cf8081977d3b76ad98d1805ad3c5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 20:48:58 -0500
Subject: [PATCH 050/111] fix(ingest): derive GPU cache hit rate for SGLang at
 ingest time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SGLang runs' harness JSON doesn't populate server_gpu_cache_hit_rate
(vLLM runs do), so the detail-page header and inference chart tooltip
showed "—" for SGLang points. Now at trace_replay ingest, when the
computed chart_series has a prefill series with a positive total,
derive the lifetime cluster ratio as sum(hits.rate) / sum(prompt.rate)
and write it into the metrics JSONB of every newly linked
benchmark_results row whose server_gpu_cache_hit_rate is null. The
hits series is not checked, so an empty one yields a stored rate of 0.

Already-stored SGLang rows from runs 944/945 backfilled via a one-off
UPDATE earlier in this session (8 rows, mostly ~87-89% hit rate, one
high-conc outlier at 2.4%).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8cc03f2a..8d1e01b8 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -100,4 +100,23 @@ export async function insertTraceReplay(
     set trace_replay_id = ${traceReplayId}
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
+
+  // Derive a lifetime GPU cache hit rate from chart_series for any linked
+  // row whose harness JSON didn't set one (SGLang runs don't populate
+  // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has
+  // no usable prefill data — leaves the field null in that case, matching
+  // legacy "no trace_replay" behavior.
+  if (chartSeries && chartSeries.prefillTps.length > 0) {
+    const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
+    const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+    if (sumPrompts > 0) {
+      const rate = sumHits / sumPrompts;
+      await sql`
+        update benchmark_results
+        set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric))
+        where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+          and (metrics->>'server_gpu_cache_hit_rate') is null
+      `;
+    }
+  }
 }

From aa76e9eca423d3ab2c7079ff28d74b70adefae1c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 14:38:52 -0500
Subject: [PATCH 051/111] feat(chart-series): map sglang:realtime_tokens to
 promptTokensBySource
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Cumulative prompt token source breakdown" chart was empty for
SGLang runs because the vllm-specific vllm:prompt_tokens_by_source
metric doesn't exist on SGLang. Maps sglang:realtime_tokens (which has
mode={prefill_cache, prefill_compute, decode}) into the same source
breakdown when no vllm series is present, filtered to prefill_* modes
(decode tokens are output throughput, not prompt-token volume).

CHART_SERIES_VERSION → 5. Backfilled 128 rows; SGLang rows from runs
944/946/947 now have prefill_cache + prefill_compute sources populated.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts | 31 ++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 86b79925..0807e238 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -31,8 +31,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  *
  * v4: extract sglang:* metrics too (fallback chain in each picker), so
  * SGLang runs populate the chart_series the same way vllm runs do.
+ *
+ * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
+ * into promptTokensBySource so the cumulative prompt-token-source-breakdown
+ * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
  */
-export const CHART_SERIES_VERSION = 4;
+export const CHART_SERIES_VERSION = 5;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -115,6 +119,7 @@ const CHART_METRIC_KEYS = new Set([
   'sglang:generation_tokens',
   'sglang:num_running_reqs',
   'sglang:num_queue_reqs',
+  'sglang:realtime_tokens',
 ]);
 
 /**
@@ -303,6 +308,12 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
   // Per-source prompt tokens — sum across engines per source label.
+  //   vllm: vllm:prompt_tokens_by_source has one series per source label
+  //         (local_cache_hit, external_cache_hit, miss, ...). Use the
+  //         `source`/`reason`/`kind` label as the breakdown key.
+  //   sglang: sglang:realtime_tokens uses a `mode` label with values
+  //         {prefill_cache, prefill_compute, decode}. Filter to prefill_*
+  //         since decode isn't prompt-token volume.
   const promptBySrcByT = new Map<string, Map<number, number>>();
   for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
     const labels = series.labels ?? {};
@@ -318,6 +329,24 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
       }
     }
   }
+  // SGLang fallback: only consider when the vllm metric wasn't found.
+  if (promptBySrcByT.size === 0) {
+    for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const mode = labels['mode'] ?? 'unknown';
+      if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens)
+      let byT = promptBySrcByT.get(mode);
+      if (!byT) {
+        byT = new Map<number, number>();
+        promptBySrcByT.set(mode, byT);
+      }
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+          byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+        }
+      }
+    }
+  }
   const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
   for (const [source, byT] of promptBySrcByT) {
     const arr: TimeSeriesPoint[] = [];

From 5872a3d8d3c6f5e6feee879e2f8f6f5d0ddd04ac Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 14:48:27 -0500
Subject: [PATCH 052/111] feat(chart-series): break out SGLang cache hits by
 cache_source
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously SGLang detail pages showed two stacked-area layers in the
prompt-token source breakdown: prefill_cache (everything that hit the
cache) + prefill_compute (cache miss). The user wanted finer
granularity — specifically a distinction between on-GPU HBM cache and
CPU-offloaded (hicache) host cache.

SGLang's sglang:cached_tokens metric carries a cache_source label that
varies per cache tier:
  - "device" → on-GPU HBM cache hit
  - "host"   → CPU-offload (hicache) cache hit
  - "total"  → older sglang, single series with no tier breakdown

Switches the cache-hit portion of the breakdown from the coarse
`prefill_cache` mode label to per-cache_source series:
  - device → "cache hit (HBM)"
  - host   → "cache hit (CPU offload)"
  - total  → "cache hit"
  - other  → "cache hit (<src>)"

Cache misses still come from realtime_tokens[mode=prefill_compute],
relabeled "compute (miss)" for symmetry.

Current data only contains device/total (no hicache runs ingested
yet) — when hicache runs come in, the chart will automatically split
cache hits into HBM + CPU-offload layers with no further code change.

CHART_SERIES_VERSION → 6. Backfilled 128 rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts | 47 +++++++++++++++++++--
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 0807e238..1996708f 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -35,8 +35,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
  * into promptTokensBySource so the cumulative prompt-token-source-breakdown
  * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
+ *
+ * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source
+ * breakdown from sglang:cached_tokens — current runs always have one
+ * cache_source ("device" / HBM) but hicache (CPU offload) runs would
+ * split into "device" + "host" automatically once ingested.
  */
-export const CHART_SERIES_VERSION = 5;
+export const CHART_SERIES_VERSION = 6;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -330,15 +335,49 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }
   }
   // SGLang fallback: only consider when the vllm metric wasn't found.
+  //   - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]`
+  //   - Cache hits, split by tier: per-series `sglang:cached_tokens` where each
+  //     series carries a `cache_source` label ("device" = HBM, "host" = CPU
+  //     offload via hicache). Current runs have only `device`; when hicache
+  //     runs land, additional series will appear and the chart will split.
   if (promptBySrcByT.size === 0) {
     for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
       const labels = series.labels ?? {};
       const mode = labels['mode'] ?? 'unknown';
-      if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens)
-      let byT = promptBySrcByT.get(mode);
+      // Only carry the cache-miss line over — cache hits come from
+      // sglang:cached_tokens broken out by cache_source below, so we'd
+      // double-count if we kept `prefill_cache` here too.
+      if (mode !== 'prefill_compute') continue;
+      const label = 'compute (miss)';
+      let byT = promptBySrcByT.get(label);
+      if (!byT) {
+        byT = new Map<number, number>();
+        promptBySrcByT.set(label, byT);
+      }
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+          byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+        }
+      }
+    }
+    // Cache hits broken out per cache_source. Strip the noisy "total" label
+    // (older sglang versions emit a single un-broken-out series labelled
+    // total — show that as just "cache hit").
+    for (const series of metrics['sglang:cached_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const src = labels['cache_source'] ?? 'cache hit';
+      const label =
+        src === 'device'
+          ? 'cache hit (HBM)'
+          : src === 'host'
+            ? 'cache hit (CPU offload)'
+            : src === 'total'
+              ? 'cache hit'
+              : `cache hit (${src})`;
+      let byT = promptBySrcByT.get(label);
       if (!byT) {
         byT = new Map<number, number>();
-        promptBySrcByT.set(mode, byT);
+        promptBySrcByT.set(label, byT);
       }
       for (const ts of series.timeslices ?? []) {
         if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {

From 94a3e8b1986e54165c062e2a14eda60d9e9dd146 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:01:24 -0500
Subject: [PATCH 053/111] feat(chart-series): host cache util line + fix SGLang
 stacked-area colors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related fixes for SGLang hicache rendering on the agentic detail page:

1. KV cache utilization chart was GPU-HBM-only. SGLang hicache runs also
   expose sglang:hicache_host_{used,total}_tokens — the CPU offload
   pool's tokens-in-use over its capacity. Extracted as a new
   `hostKvCacheUsage` time series; frontend overlays it as a second
   orange line on the existing chart when the row has hicache data.

2. The cumulative-prompt-token-source-breakdown chart rendered ALL
   three SGLang sources in the same color, because the colors dict
   only knew vllm-style names (local_compute, local_cache_hit, etc.).
   Added explicit colors for the SGLang label names ('cache hit
   (HBM)', 'cache hit (CPU offload)', 'cache hit', 'compute (miss)')
   plus a memoized fallback palette so any future unknown source name
   gets a distinct color rather than falling through to gray.

CHART_SERIES_VERSION → 7. Backfilled 128 rows; hicache rows from
workflow_run 947 (8 rows) now have ~1830 hostKvCacheUsage samples
matching their HBM samples.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 16 ++++++++-
 .../agentic-point/time-series-chart.tsx       | 30 ++++++++++++++--
 .../src/hooks/api/use-trace-server-metrics.ts |  2 ++
 packages/db/src/etl/compute-chart-series.ts   | 36 ++++++++++++++++++-
 .../db/src/queries/trace-server-metrics.ts    |  3 ++
 5 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 2db2809b..b047ea8f 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -236,16 +236,30 @@ export function AgenticPointDetail({ id }: Props) {
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
+              // For SGLang hicache rows we have both GPU (HBM) util and
+              // host (CPU offload pool) util — overlay them as two lines.
+              const hasHost = metrics.hostKvCacheUsage.length > 0;
               return (
                 <TimeSeriesChart
                   series={[
                     {
-                      name: 'GPU KV cache (avg n=50)',
+                      name: hasHost ? 'GPU HBM (avg n=50)' : 'GPU KV cache (avg n=50)',
                       data: rollingAverage(metrics.kvCacheUsage, 50),
                       rawData: metrics.kvCacheUsage,
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
+                    ...(hasHost
+                      ? [
+                          {
+                            name: 'CPU offload pool (avg n=50)',
+                            data: rollingAverage(metrics.hostKvCacheUsage, 50),
+                            rawData: metrics.hostKvCacheUsage,
+                            color: '#f97316',
+                            strokeWidth: 2,
+                          },
+                        ]
+                      : []),
                   ]}
                   durationS={metrics.durationS}
                   yMax={1}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 520b3ed6..15a15869 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -485,10 +485,16 @@ export function StackedAreaChart({
   }, [sourceSeries]);
 
   const colors: Record<string, string> = {
+    // vLLM source names
     local_compute: '#f97316',
     local_cache_hit: '#3b82f6',
     external_kv_transfer: '#22c55e',
     miss: '#f97316',
+    // SGLang source names (set by compute-chart-series for sglang rows)
+    'cache hit (HBM)': '#3b82f6',
+    'cache hit (CPU offload)': '#22c55e',
+    'cache hit': '#3b82f6',
+    'compute (miss)': '#f97316',
   };
   const labelFor: Record<string, string> = {
     local_compute: 'Prefill',
@@ -496,6 +502,26 @@ export function StackedAreaChart({
     external_kv_transfer: 'Offload Cache Hit',
     miss: 'Miss',
   };
+  // Fallback palette for source names missing from `colors`. Shades already used
+  // by a `colors` entry are skipped so an unknown layer never duplicates a known one.
+  const fallbackPalette = [
+    '#3b82f6',
+    '#f97316',
+    '#22c55e',
+    '#a855f7',
+    '#ef4444',
+    '#06b6d4',
+    '#f59e0b',
+    '#ec4899',
+  ];
+  let fallbackIdx = 0;
+  const colorFor = (name: string): string => {
+    if (colors[name]) return colors[name]!;
+    const unused = fallbackPalette.find((shade) => !Object.values(colors).includes(shade));
+    const c = unused ?? fallbackPalette[fallbackIdx++ % fallbackPalette.length]!;
+    colors[name] = c; // memoize so the SAME unknown name always gets the same color
+    return c;
+  };
 
   if (!computed) {
     return (
@@ -522,7 +548,7 @@ export function StackedAreaChart({
       .toReversed()
       .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
       .join(' ')} Z`;
-    const color = colors[name] ?? '#6b7280';
+    const color = colorFor(name);
     for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
     return { name, color, d };
   });
@@ -540,7 +566,7 @@ export function StackedAreaChart({
       }
     }
     const items: HoverItem[] = stackOrder.map((name) => ({
-      color: colors[name] ?? '#6b7280',
+      color: colorFor(name),
       label: labelFor[name] ?? name,
       value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
     }));
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 664bc6c7..bac67a50 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -44,6 +44,8 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
   /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 1996708f..8105961e 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -40,8 +40,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * breakdown from sglang:cached_tokens — current runs always have one
  * cache_source ("device" / HBM) but hicache (CPU offload) runs would
  * split into "device" + "host" automatically once ingested.
+ *
+ * v7: extract sglang:hicache_host_{used,total}_tokens into a new
+ * hostKvCacheUsage series so the KV cache utilization chart can plot
+ * the CPU offload pool's usage alongside the on-GPU HBM line.
  */
-export const CHART_SERIES_VERSION = 6;
+export const CHART_SERIES_VERSION = 7;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -79,6 +83,12 @@ export interface ChartSeries {
    * saved vs the raw queries that came in.
    */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /**
+   * Host (CPU offload) KV cache utilization, 0..1. Only populated for
+   * SGLang hicache runs (derived as hicache_host_used / hicache_host_total).
+   * Frontend overlays this on the KV cache util chart as a second line.
+   */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -125,6 +135,8 @@ const CHART_METRIC_KEYS = new Set([
   'sglang:num_running_reqs',
   'sglang:num_queue_reqs',
   'sglang:realtime_tokens',
+  'sglang:hicache_host_used_tokens',
+  'sglang:hicache_host_total_tokens',
 ]);
 
 /**
@@ -312,6 +324,27 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
   const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
+  // SGLang hicache: host-pool KV cache utilization as used/total per
+  // timeslice. Both metrics are gauges in absolute tokens. Total stays
+  // constant (it's the pool size), used fluctuates.
+  const hostUsedByT = aggregateByStart(
+    metrics['sglang:hicache_host_used_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostTotalByT = aggregateByStart(
+    metrics['sglang:hicache_host_total_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostKvCacheUsage: TimeSeriesPoint[] = [];
+  for (const [t, used] of [...hostUsedByT.entries()].toSorted((a, b) => a[0] - b[0])) {
+    const total = hostTotalByT.get(t);
+    if (total !== undefined && total > 0) {
+      hostKvCacheUsage.push({ t: tOf(t), value: used / total });
+    }
+  }
+
   // Per-source prompt tokens — sum across engines per source label.
   //   vllm: vllm:prompt_tokens_by_source has one series per source label
   //         (local_cache_hit, external_cache_hit, miss, ...). Use the
@@ -407,5 +440,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     prefillTps,
     decodeTps,
     prefixCacheHitsTps,
+    hostKvCacheUsage,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 76775e77..eccb0a0c 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -73,6 +73,8 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
   /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -118,6 +120,7 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     decodeTps: series.decodeTps,
     // v2 chart_series rows pre-backfill don't have this field — default to []
     prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
+    hostKvCacheUsage: series.hostKvCacheUsage ?? [],
   };
 }
 

From 93e197b7e54d140acfe65b61aeb4f5c48ca27091 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:19:20 -0500
Subject: [PATCH 054/111] fix(stacked-area): align sources by timestamp before
 computing shares
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cumulative-prompt-token-source-breakdown chart was showing huge
"100% compute (miss)" plateaus around minute 20-24 of many SGLang runs.

Root cause: the chart computed cumulative shares per ARRAY INDEX (not
timestamp), but in SGLang's per-scrape metrics, cache hits and misses
fire on different ticks — one scrape reports 193K hits + 0 miss, the
next reports 0 hits + 8K miss. So each source has a different timestamp
array. Indexing them in lockstep mixed values from different moments
and made the share calculation flap to 100% one side or the other.

Fix: union timestamps across all sources, then for each unique
timestamp carry forward each source's cumulative sum (a source that
didn't report at time t holds its previous cumulative value rather
than appearing as 0).

After fix: shares change smoothly over time as each source's cumulative
sum grows; transient single-tick gaps no longer drive the visible
share to either extreme.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/time-series-chart.tsx       | 31 ++++++++++++++++---
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 15a15869..75d7bb1e 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -464,15 +464,36 @@ export function StackedAreaChart({
   const computed = useMemo(() => {
     const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
     if (entries.length === 0) return null;
-    const tValues = entries[0]![1].map((p) => p.t);
+
+    // Different sources can land on different scrape timestamps
+    // (SGLang's hits/misses fire on alternating ticks), so we MUST
+    // align across all sources before computing shares — otherwise the
+    // share calculation indexes into each source's own time axis and
+    // mixes values from different moments.
+    //
+    // Approach: union all timestamps across sources, then for each
+    // unique timestamp carry forward the cumulative sum for every
+    // source (a source that didn't report at time t holds its previous
+    // cumulative value rather than dropping to 0).
+    const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted(
+      (a, b) => a - b,
+    );
+
+    // For each source, walk its (sorted) array and produce a parallel
+    // cumulative-sum array indexed against `tValues` via carry-forward.
     const cum: Record<string, number[]> = {};
     for (const [name, arr] of entries) {
+      const valByT = new Map(arr.map((p) => [p.t, p.value]));
+      const out: number[] = Array.from({ length: tValues.length });
       let acc = 0;
-      cum[name] = arr.map((p) => {
-        acc += p.value;
-        return acc;
-      });
+      for (let i = 0; i < tValues.length; i++) {
+        const v = valByT.get(tValues[i]!);
+        if (v !== undefined) acc += v;
+        out[i] = acc;
+      }
+      cum[name] = out;
     }
+
     const shares: Record<string, number[]> = {};
     for (const name of Object.keys(cum)) shares[name] = [];
     for (let i = 0; i < tValues.length; i++) {

From c14e19e277930495e4a43c3a6d6f42a611fec336 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:44:07 -0500
Subject: [PATCH 055/111] fix(ingest): split GPU vs CPU cache hit rate for
 SGLang hicache rows

Previous inline derivation (commit 625d6e8) summed ALL cache hit
sources into server_gpu_cache_hit_rate, which conflated GPU HBM hits
with CPU offload hits on SGLang hicache rows. The harness JSON also
never sets server_cpu_cache_hit_rate.

Now derives both metrics from chart_series.promptTokensBySource:
  server_gpu_cache_hit_rate = sum(HBM + 'cache hit') / sum(prompts)
  server_cpu_cache_hit_rate = sum(CPU offload) / sum(prompts) or null
                              (null when no CPU offload source exists)

Falls back to prefixCacheHitsTps for vLLM rows where promptTokensBySource
isn't broken out by cache source. Overwrites any pre-existing value so
the derivation stays consistent with what the detail-page charts plot.

Backfilled all existing rows via two-phase SQL update earlier in the
session:
  - 8 hicache rows in workflow_run 947 now show GPU ~1-2% / CPU ~87-91%
  - Other SGLang rows show GPU ~87% / CPU null
  - vLLM rows restored to their original GPU hit rates

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 40 +++++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8d1e01b8..43655d9a 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -101,21 +101,43 @@ export async function insertTraceReplay(
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
 
-  // Derive a lifetime GPU cache hit rate from chart_series for any linked
-  // row whose harness JSON didn't set one (SGLang runs don't populate
-  // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has
-  // no usable prefill data — leaves the field null in that case, matching
-  // legacy "no trace_replay" behavior.
+  // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang
+  // runs don't populate these in the harness JSON; vLLM runs do but only
+  // for GPU. We always recompute to keep the derivation consistent with
+  // what the detail-page charts plot — overwriting any pre-existing value.
+  //
+  // For hicache (CPU offload) rows the chart_series.promptTokensBySource
+  // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)"
+  // sources, letting us split GPU vs CPU hit rate. Other rows just have
+  // a single cache-hit source (either "cache hit (HBM)" / "cache hit"
+  // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps
+  // sum which equals the single cache source's total).
   if (chartSeries && chartSeries.prefillTps.length > 0) {
     const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
-    const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
     if (sumPrompts > 0) {
-      const rate = sumHits / sumPrompts;
+      const sumOf = (name: string): number =>
+        (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
+      const cpuHits = sumOf('cache hit (CPU offload)');
+      const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit');
+      // If the source breakdown has a HBM entry, use it (covers SGLang).
+      // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path).
+      const gpuHits =
+        hbmFromBreakdown > 0
+          ? hbmFromBreakdown
+          : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+      const gpuRate = gpuHits / sumPrompts;
+      const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null;
       await sql`
         update benchmark_results
-        set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric))
+        set metrics = jsonb_set(
+          case when ${cpuRate}::numeric is not null
+            then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric))
+            else metrics
+          end,
+          '{server_gpu_cache_hit_rate}',
+          to_jsonb(${gpuRate}::numeric)
+        )
         where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
-          and (metrics->>'server_gpu_cache_hit_rate') is null
       `;
     }
   }

From 268617ccd85ccc8aea6ed12dd4bd61273c8a37c1 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 3 Jun 2026 10:40:04 -0500
Subject: [PATCH 056/111] fix(ingest): recognize vLLM LMCache
 external_kv_transfer as CPU hit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inline cache-hit-rate derivation only handled SGLang's hicache label
('cache hit (CPU offload)'). vLLM with LMCache uses 'external_kv_transfer'
in its prompt_tokens_by_source breakdown for the same concept (CPU
offload pool serving tokens to GPU). Those vLLM rows had cpu rate
null even when external_kv_transfer was the dominant source.

Adds external_kv_transfer + local_cache_hit to the source name aliases:
  GPU hits  = local_cache_hit + cache hit (HBM) + cache hit
  CPU hits  = external_kv_transfer + cache hit (CPU offload)
  fallback  = prefixCacheHitsTps total (for single-source rows)

Backfilled 132 affected rows via SQL — vLLM LMCache rows now show CPU
rate where present (e.g. dsv4 b300 conc=128 offload=on shows GPU ~1%
+ CPU ~87%, matching the actual cache topology).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 23 ++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 43655d9a..cb022ca9 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -106,21 +106,24 @@ export async function insertTraceReplay(
   // for GPU. We always recompute to keep the derivation consistent with
   // what the detail-page charts plot — overwriting any pre-existing value.
   //
-  // For hicache (CPU offload) rows the chart_series.promptTokensBySource
-  // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)"
-  // sources, letting us split GPU vs CPU hit rate. Other rows just have
-  // a single cache-hit source (either "cache hit (HBM)" / "cache hit"
-  // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps
-  // sum which equals the single cache source's total).
+  // Source label naming differs by framework / cache topology:
+  //   SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)'
+  //   SGLang older:   'cache hit'      (no tier breakdown)
+  //   vLLM LMCache:   'local_cache_hit' + 'external_kv_transfer'  (+ 'local_compute' for miss)
+  //   vLLM single:    falls back to prefixCacheHitsTps total (= local cache only)
   if (chartSeries && chartSeries.prefillTps.length > 0) {
     const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
     if (sumPrompts > 0) {
       const sumOf = (name: string): number =>
         (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
-      const cpuHits = sumOf('cache hit (CPU offload)');
-      const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit');
-      // If the source breakdown has a HBM entry, use it (covers SGLang).
-      // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path).
+      // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer.
+      const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer');
+      // GPU/HBM hits from source breakdown, summed across known aliases.
+      const hbmFromBreakdown =
+        sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit');
+      // If the source breakdown has any GPU entry, use it. Otherwise fall back
+      // to total prefixCacheHitsTps sum (single-source vLLM path with no
+      // by_source metric — equals the lone cache counter's lifetime).
       const gpuHits =
         hbmFromBreakdown > 0
           ? hbmFromBreakdown

From 7fc6b4f7b5a49aa370d912d6df36b40d80b813a6 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:02:34 -0500
Subject: [PATCH 057/111] fix(scatter): use lightweight presence endpoint for
 View charts button
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chart pre-fetched full trace_replay JSONL blobs for every visible
agentic point just to decide whether to render the "View charts" button
in pinned tooltips. With the latest run's 8x8 conc=512 rows pushing up
to 13 MB compressed per blob, 12-id chunks blew past Neon's 64 MB
per-HTTP-response cap and 500'd — hiding the button for every point.

New /api/v1/trace-availability returns {id: true} for ids that have a
stored blob; ScatterGraph uses that boolean instead. trace-histograms
is still used by the detail page (single id, no chunking issue).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/api/v1/trace-availability/route.ts    | 59 +++++++++++++++++++
 .../components/inference/ui/ScatterGraph.tsx  | 23 ++++----
 .../inference/utils/tooltipUtils.ts           | 15 ++---
 .../src/hooks/api/use-trace-availability.ts   | 29 +++++++++
 packages/db/src/queries/trace-availability.ts | 34 +++++++++++
 5 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/trace-availability/route.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-availability.ts
 create mode 100644 packages/db/src/queries/trace-availability.ts

diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts
new file mode 100644
index 00000000..2484ceaf
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-availability/route.ts
@@ -0,0 +1,59 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceAvailability,
+  type TraceAvailabilityMap,
+} from '@semianalysisai/inferencex-db/queries/trace-availability';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceAvailability = cachedQuery(
+  (ids: number[]): Promise<TraceAvailabilityMap> => getTraceAvailability(getDb(), ids),
+  'trace-availability',
+);
+
+const MAX_IDS_PER_REQUEST = 500;
+
+/**
+ * GET /api/v1/trace-availability?ids=1,2,3
+ *
+ * Returns `{[id]: true}` for ids that have a stored trace_replay blob (used by
+ * the scatter tooltip's "View charts" button — see queries/trace-availability.ts).
+ * Ids must be positive safe integers; anything else is dropped before the query.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const sorted = [...ids].toSorted((a, b) => a - b);
+    const availability = await getCachedTraceAvailability(sorted);
+    return cachedJson(availability);
+  } catch (error) {
+    console.error('Error fetching trace availability:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index fdcf8952..b93799db 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -6,7 +6,7 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
-import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
 import { useRouter } from 'next/navigation';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
@@ -497,8 +497,11 @@ const ScatterGraph = React.memo(
     // All official points for rendering (unfiltered — visibility via opacity)
     const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
 
-    // Trace-replay histograms (ISL / OSL distributions) for agentic points.
-    // Pre-fetch the whole visible set so tooltip render stays synchronous.
+    // Bulk presence lookup for agentic points: which ids have a stored
+    // trace_replay blob → controls the "View charts" button in the pinned
+    // tooltip. We deliberately don't fetch the histograms themselves here;
+    // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through
+    // Neon's HTTP API and trip its 64 MB per-response cap.
     const agenticIds = useMemo(() => {
       const ids: number[] = [];
       for (const p of pointsData) {
@@ -506,7 +509,7 @@ const ScatterGraph = React.memo(
       }
       return ids;
     }, [pointsData]);
-    const { data: traceHistograms } = useTraceHistograms(agenticIds);
+    const { data: traceAvailability } = useTraceAvailability(agenticIds);
     const router = useRouter();
 
     // Gradient label data
@@ -774,8 +777,7 @@ const ScatterGraph = React.memo(
             hardwareConfig,
             isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
             runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
-            traceHistogram:
-              typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined,
+            hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
           }),
         getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
         getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -842,10 +844,11 @@ const ScatterGraph = React.memo(
         removeTrackedConfig,
         chartDefinition.chartType,
         selectedPrecisions,
-        // Tooltip content closure reads traceHistograms to decide whether to
-        // show the "View charts" button — rebuild config when the histogram
-        // fetch resolves so the button appears for points that have data.
-        traceHistograms,
+        // Tooltip content closure reads traceAvailability to decide whether
+        // to render the "View charts" button — rebuild config when the
+        // presence fetch resolves so the button appears for points that
+        // have a trace_replay blob.
+        traceAvailability,
         router,
       ],
     );
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index ccc371f9..ed68c41b 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -20,12 +20,13 @@ export interface TooltipConfig {
   /** URL to the GitHub Actions workflow run */
   runUrl?: string;
   /**
-   * Per-request ISL/OSL arrays for agentic points, sourced from the stored
-   * aiperf `profile_export.jsonl`. Used to detect whether the point has any
-   * trace data (so the "View charts" button can appear); the actual
-   * distributions are rendered on the detail page, not inline.
+   * Whether this agentic point has a stored trace_replay blob. Controls
+   * visibility of the "View charts" button — the actual distributions are
+   * rendered on the detail page, not inline, so all the tooltip needs is a
+   * presence boolean (sourced from the bulk `/api/v1/trace-availability`
+   * call so we don't ship megabytes of profile JSONL just for this check).
    */
-  traceHistogram?: { isl: number[]; osl: number[] } | undefined;
+  hasTrace?: boolean;
 }
 
 export interface OverlayTooltipConfig extends TooltipConfig {
@@ -221,7 +222,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
     selectedYAxisMetric,
     hardwareConfig,
     runUrl,
-    traceHistogram,
+    hasTrace,
   } = config;
 
   return `
@@ -271,7 +272,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
-      ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))}
+      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
diff --git a/packages/app/src/hooks/api/use-trace-availability.ts b/packages/app/src/hooks/api/use-trace-availability.ts
new file mode 100644
index 00000000..02176d59
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-availability.ts
@@ -0,0 +1,29 @@
+import { useQuery } from '@tanstack/react-query';
+
+export type TraceAvailabilityMap = Record<number, true>;
+
+async function fetchTraceAvailability(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<TraceAvailabilityMap> {
+  if (ids.length === 0) return {};
+  const res = await fetch(`/api/v1/trace-availability?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`trace-availability ${res.status}`);
+  return (await res.json()) as TraceAvailabilityMap;
+}
+
+/**
+ * Bulk presence lookup: which of the given `benchmark_results.id`s have a
+ * stored trace_replay blob. Used by the scatter chart to decide whether to
+ * surface the "View charts" button — cheap boolean per id instead of
+ * shipping multi-MB profile blobs just for the check.
+ */
+export function useTraceAvailability(ids: number[], enabled = true) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: ['trace-availability', sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceAvailability(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/db/src/queries/trace-availability.ts b/packages/db/src/queries/trace-availability.ts
new file mode 100644
index 00000000..155b3d4c
--- /dev/null
+++ b/packages/db/src/queries/trace-availability.ts
@@ -0,0 +1,34 @@
+/**
+ * Bulk "does this point have a trace_replay blob?" lookup. Used by the
+ * inference scatter chart to decide whether to render a "View charts"
+ * button in the pinned tooltip — a pure presence check that doesn't need
+ * the multi-megabyte blob payload `getTraceHistograms` ships.
+ *
+ * Going through `trace-histograms` for this trips Neon's 64 MB
+ * per-HTTP-response cap as soon as one chunk's combined gzip payload
+ * exceeds the cap (high-conc 8×8 rows can be 13 MB compressed each).
+ */
+
+import type { DbClient } from '../connection.js';
+
+/** Map of `benchmark_results.id` → true for each id that has a trace_replay blob. */
+export type TraceAvailabilityMap = Record<number, true>;
+
+export async function getTraceAvailability(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceAvailabilityMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows = (await sql`
+    select br.id
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+      and atr.profile_export_jsonl_gz is not null
+  `) as { id: number }[];
+
+  const result: TraceAvailabilityMap = {};
+  for (const row of rows) result[Number(row.id)] = true;
+  return result;
+}

From 80468ebbb3f733db613de9241b82b6c159685b4d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:02:50 -0500
Subject: [PATCH 058/111] feat(chart-series): per-DP-rank KV cache utilization
 overlay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cluster-average KV util line hides load skew on DEP configs — 8
ranks averaging 20% can hide one rank at 12% and another at 23%.

Bump CHART_SERIES_VERSION 7 -> 8 to keep one entry per engine in
kvCacheUsageByEngine. The detail page draws each rank in the
request-timeline palette (so DP indices read as the same color in
both views) and overlays the bold red "Avg" line on top.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 87 ++++++++++++++-----
 .../agentic-point/time-series-chart.tsx       | 15 +++-
 .../src/hooks/api/use-trace-server-metrics.ts |  5 ++
 packages/db/src/etl/compute-chart-series.ts   | 41 ++++++++-
 .../db/src/queries/trace-server-metrics.ts    |  7 ++
 5 files changed, 131 insertions(+), 24 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index b047ea8f..1ce321ee 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -87,6 +87,25 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
+// Per-DP-rank color palette for DEP runs (one distinct color per rank in
+// the KV cache utilization overlay). Mirrors the request-timeline row
+// palette so the same DP index reads as the same color across both views.
+// Wraps mod-N if more than 12 ranks ever land.
+const DP_RANK_PALETTE = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
 type DetailView = 'point' | 'timeline' | 'aggregates';
 const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
@@ -239,28 +258,56 @@ export function AgenticPointDetail({ id }: Props) {
               // For SGLang hicache rows we have both GPU (HBM) util and
               // host (CPU offload pool) util — overlay them as two lines.
               const hasHost = metrics.hostKvCacheUsage.length > 0;
+              // DEP runs report one series per engine. When there's more
+              // than one, draw one line per rank in distinct colors so
+              // load skew is visible at a glance; cluster-average sits on
+              // top in bold red so it stands out.
+              const perEngine = metrics.kvCacheUsageByEngine ?? [];
+              const hasPerEngine = perEngine.length > 1;
+              // Render order matters: per-engine first → average drawn on top.
+              const series = [
+                ...(hasPerEngine
+                  ? perEngine.map((e, i) => ({
+                      name: `DP ${e.engineLabel}`,
+                      data: rollingAverage(e.points, 50),
+                      color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
+                      // Thin + translucent so the Avg line on top reads as
+                      // the headline number, not just one more series.
+                      strokeWidth: 1,
+                      strokeOpacity: 0.5,
+                    }))
+                  : []),
+                {
+                  name: hasHost
+                    ? 'GPU HBM (avg n=50)'
+                    : hasPerEngine
+                      ? 'Avg'
+                      : 'GPU KV cache (avg n=50)',
+                  data: rollingAverage(metrics.kvCacheUsage, 50),
+                  // Skip raw scatter when per-engine overlay is on — the
+                  // DP-rank lines already convey the spread, dots would be noise.
+                  rawData: hasPerEngine ? undefined : metrics.kvCacheUsage,
+                  // Bold red Avg sits on top of the translucent per-DP lines.
+                  // DP 1 in the palette is #ef4444 (lighter red); the darker
+                  // #dc2626 here plus the heavier stroke keeps it distinct.
+                  color: hasPerEngine ? '#dc2626' : '#3b82f6',
+                  strokeWidth: hasPerEngine ? 3.5 : 2,
+                },
+                ...(hasHost
+                  ? [
+                      {
+                        name: 'CPU offload pool (avg n=50)',
+                        data: rollingAverage(metrics.hostKvCacheUsage, 50),
+                        rawData: metrics.hostKvCacheUsage,
+                        color: '#f97316',
+                        strokeWidth: 2,
+                      },
+                    ]
+                  : []),
+              ];
               return (
                 <TimeSeriesChart
-                  series={[
-                    {
-                      name: hasHost ? 'GPU HBM (avg n=50)' : 'GPU KV cache (avg n=50)',
-                      data: rollingAverage(metrics.kvCacheUsage, 50),
-                      rawData: metrics.kvCacheUsage,
-                      color: '#3b82f6',
-                      strokeWidth: 2,
-                    },
-                    ...(hasHost
-                      ? [
-                          {
-                            name: 'CPU offload pool (avg n=50)',
-                            data: rollingAverage(metrics.hostKvCacheUsage, 50),
-                            rawData: metrics.hostKvCacheUsage,
-                            color: '#f97316',
-                            strokeWidth: 2,
-                          },
-                        ]
-                      : []),
-                  ]}
+                  series={series}
                   durationS={metrics.durationS}
                   yMax={1}
                   yFmt={(v) => `${(v * 100).toFixed(0)}%`}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 75d7bb1e..399f965d 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -15,6 +15,11 @@ interface Series {
   color: string;
   /** Override default stroke width (1.8). Use higher values for emphasis lines. */
   strokeWidth?: number;
+  /** Stroke opacity (0..1). Use < 1 for background/underlay lines. */
+  strokeOpacity?: number;
+  /** Hide from the hover tooltip and the legend chip row (e.g. per-engine
+   *  underlay lines that would clutter them). The path still renders. */
+  hideFromHover?: boolean;
 }
 
 interface TimeSeriesChartProps {
@@ -287,6 +292,7 @@ export function TimeSeriesChart({
     const t = fraction * xMax;
     const items: HoverItem[] = [];
     for (const s of series) {
+      if (s.hideFromHover) continue;
       const v = interpAt(s.data, t);
       if (v === null || !Number.isFinite(v)) continue;
       items.push({ color: s.color, label: s.name, value: yFmt(v) });
@@ -363,6 +369,7 @@ export function TimeSeriesChart({
             fill="none"
             stroke={s.color}
             strokeWidth={s.strokeWidth ?? 1.8}
+            strokeOpacity={s.strokeOpacity ?? 1}
           />
         );
       })}
@@ -418,11 +425,13 @@ export function TimeSeriesChart({
         </text>
       )}
 
-      {/* Legend */}
+      {/* Legend — skip series flagged hideFromHover so per-engine
+          underlays don't clutter the chip row. */}
       {(() => {
+        const visible = series.filter((s) => !s.hideFromHover);
         const chipY = H - 8;
-        const chipW = innerW / Math.max(1, series.length);
-        return series.map((s, i) => {
+        const chipW = innerW / Math.max(1, visible.length);
+        return visible.map((s, i) => {
           const x = PAD.left + i * chipW;
           return (
             <g key={`leg${i}`}>
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index bac67a50..11905aaa 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -46,6 +46,11 @@ export interface TraceServerMetrics {
   prefixCacheHitsTps: TimeSeriesPoint[];
   /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
   hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+   * the cluster-average `kvCacheUsage` line covers that case alone.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 8105961e..46600f7d 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -44,8 +44,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * v7: extract sglang:hicache_host_{used,total}_tokens into a new
  * hostKvCacheUsage series so the KV cache utilization chart can plot
  * the CPU offload pool's usage alongside the on-GPU HBM line.
+ *
+ * v8: keep the per-engine dimension on kv_cache_usage_perc as
+ * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average
+ * line hides load skew on DEP configs; the detail page overlays the
+ * per-rank lines so a hot rank is visible at a glance.
  */
-export const CHART_SERIES_VERSION = 7;
+export const CHART_SERIES_VERSION = 8;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -89,6 +94,15 @@ export interface ChartSeries {
    * Frontend overlays this on the KV cache util chart as a second line.
    */
   hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization (0..1 each). One entry per engine
+   * series with usable points, ordered by its numeric `engine` /
+   * `engine_idx` / `dp_rank` label (series-array index otherwise). Empty for
+   * single-engine deployments — the average `kvCacheUsage` line covers that.
+   * The detail page overlays these on the same chart so DEP load skew is
+   * visible without changing the headline number.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -277,6 +291,30 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
     aggregateByStart(kvSeries, 'avg', 'avg'),
   ).map(([t, v]) => ({ t: tOf(t), value: v }));
+  // Per-engine breakdown of the same metric. We only emit it when there's
+  // more than one series — single-engine deployments would just duplicate
+  // the cluster-average line.
+  const kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[] = [];
+  if (kvSeries && kvSeries.length > 1) {
+    // Sort by numeric engine label when present so rank 0..N renders in
+    // order; fall back to series-array index otherwise.
+    const decorated = kvSeries.map((s, idx) => {
+      const raw =
+        s.labels?.['engine'] ?? s.labels?.['engine_idx'] ?? s.labels?.['dp_rank'] ?? String(idx);
+      const numeric = Number(raw);
+      return { series: s, idx, label: raw, sortKey: Number.isFinite(numeric) ? numeric : idx };
+    });
+    decorated.sort((a, b) => a.sortKey - b.sortKey);
+    for (const { series, label } of decorated) {
+      const pts: TimeSeriesPoint[] = [];
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.start_ns !== 'number' || typeof ts.avg !== 'number') continue;
+        if (!Number.isFinite(ts.avg)) continue;
+        pts.push({ t: tOf(ts.start_ns), value: ts.avg });
+      }
+      if (pts.length > 0) kvCacheUsageByEngine.push({ engineLabel: label, points: pts });
+    }
+  }
 
   // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
   // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
@@ -441,5 +479,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     decodeTps,
     prefixCacheHitsTps,
     hostKvCacheUsage,
+    kvCacheUsageByEngine,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index eccb0a0c..5594d514 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -75,6 +75,11 @@ export interface TraceServerMetrics {
   prefixCacheHitsTps: TimeSeriesPoint[];
   /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
   hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+   * the cluster-average `kvCacheUsage` line covers that case alone.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -121,6 +126,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     // v2 chart_series rows pre-backfill don't have this field — default to []
     prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
     hostKvCacheUsage: series.hostKvCacheUsage ?? [],
+    // v8+ field; older chart_series rows lack it → omit per-engine overlay.
+    kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [],
   };
 }
 

From 3a5ef158f615ba2177e7b911639d3ccd832159f2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:24:56 -0500
Subject: [PATCH 059/111] feat(scatter): restrict non-e2e xmodes to e2e-pareto
 points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TTFT, interactivity, session-time, and prefill-tps charts used to
compute their own Pareto frontiers on the swapped x metric. That let a
vendor benchmark-hack: tune a config to top TTFT while quietly tanking
decode (or vice versa), and post a chart-topping point that didn't
reflect real e2e performance.

When xmode != 'e2e', filter the displayed point set to those that sit
on the (e2e_latency, y) Pareto frontier — same set of points across
every non-e2e chart, just rendered at the chosen x metric. The e2e
chart itself is unchanged and remains the source of truth.

Per Oren's review:
  "all and only the points that show up on e2e latency pareto should
   show up on ttft & interactivity & prefill tok/s/user pareto."

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |   1 +
 .../inference/hooks/useChartData.ts           | 104 ++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 2e5a245f..c446dc71 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -285,6 +285,7 @@ export function InferenceProvider({
     selectedPercentile,
     compareGpuPair ?? null,
     benchmarkRunId,
+    selectedXAxisMode,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 328750f0..397572df 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -22,6 +22,84 @@ import {
 import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
 import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
+import {
+  paretoFrontLowerLeft,
+  paretoFrontLowerRight,
+  paretoFrontUpperLeft,
+  paretoFrontUpperRight,
+} from '@/lib/chart-utils';
+
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+
+/**
+ * Resolve the percentile-prefixed e2e-latency field name for the given
+ * percentile (e.g. 'median_e2el', 'p90_e2el').
+ */
+function e2elFieldFor(percentile: string): string {
+  return withPercentile('median_e2el', percentile);
+}
+
+/**
+ * Compute the set of benchmark_results.id values that sit on the
+ * (e2e_latency, y) Pareto frontier within each (hwKey, precision, date)
+ * group. Used to restrict the non-e2e xmode charts (ttft, interactivity,
+ * session-time, prefill-tps) so they show *only* the points that win on
+ * end-to-end latency — preventing benchmark-hacking where a config tops
+ * one axis while tanking the other.
+ *
+ * Returns null when there is no e2e chart definition or it declares no
+ * roofline direction for the y-metric (caller then skips the restriction).
+ */
+function e2eParetoIds(
+  points: InferenceData[],
+  selectedYAxisMetric: string,
+  percentile: string,
+): Set<number> | null {
+  const e2eChartDef = (chartDefinitions as ChartDefinition[]).find((c) => c.chartType === 'e2e');
+  if (!e2eChartDef) return null;
+  const dir = e2eChartDef[`${selectedYAxisMetric}_roofline` as keyof ChartDefinition] as
+    | 'upper_right'
+    | 'upper_left'
+    | 'lower_left'
+    | 'lower_right'
+    | undefined;
+  if (!dir) return null;
+  const frontierFn =
+    dir === 'upper_right'
+      ? paretoFrontUpperRight
+      : dir === 'upper_left'
+        ? paretoFrontUpperLeft
+        : dir === 'lower_left'
+          ? paretoFrontLowerLeft
+          : paretoFrontLowerRight;
+  const e2elField = e2elFieldFor(percentile);
+  const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+
+  // Re-frame each candidate point in (e2el, y) space, then compute the
+  // pareto per (hwKey, precision, date) bucket — frontiers don't span dates
+  // (a May 17 point can't dominate a May 15 plot).
+  const byGroup = new Map<string, InferenceData[]>();
+  for (const p of points) {
+    const yValue = (p[metricKey] as { y?: number } | undefined)?.y;
+    const xValue = (p as unknown as Record<string, unknown>)[e2elField];
+    if (typeof xValue !== 'number' || !Number.isFinite(xValue)) continue;
+    if (typeof yValue !== 'number' || !Number.isFinite(yValue)) continue;
+    const key = `${p.hwKey}|${p.precision}|${p.date}`;
+    let bucket = byGroup.get(key);
+    if (!bucket) {
+      bucket = [];
+      byGroup.set(key, bucket);
+    }
+    bucket.push({ ...p, x: xValue, y: yValue });
+  }
+  const ids = new Set<number>();
+  for (const bucket of byGroup.values()) {
+    for (const f of frontierFn(bucket)) {
+      if (typeof f.id === 'number') ids.add(f.id);
+    }
+  }
+  return ids;
+}
 
 /** Build deduplicated comparison dates, excluding the main run date. */
 export function buildComparisonDates(
@@ -92,6 +170,15 @@ export function useChartData(
    * config — disambiguates when two runs land on the same date.
    */
   selectedRunId?: string,
+  /**
+   * Current x-axis mode. When set to anything other than 'e2e', the displayed
+   * data is filtered to the (e2e-latency, y) Pareto frontier so the ttft /
+   * interactivity / session-time / prefill-tps charts show only points that
+   * also win on end-to-end latency — preventing benchmark-hacking where a
+   * config tops one metric while tanking the other. The 'e2e' mode is the
+   * source of truth and keeps the full point set.
+   */
+  selectedXAxisMode: XAxisMode = 'e2e',
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
@@ -357,6 +444,21 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
+        // When the user is NOT viewing the e2e latency chart, restrict the
+        // displayed points to those that sit on the (e2e_latency, y) Pareto
+        // frontier — i.e. "this is the e2e chart, we're just plotting the
+        // ttft value." Prevents benchmark-hacking where a config tops one
+        // axis (TTFT, interactivity, prefill-tps) while quietly tanking
+        // end-to-end latency.
+        if (selectedXAxisMode !== 'e2e') {
+          const paretoIds = e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
+          if (paretoIds) {
+            filteredData = filteredData.filter(
+              (d) => typeof d.id === 'number' && paretoIds.has(d.id),
+            );
+          }
+        }
+
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
         const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
@@ -407,6 +509,8 @@ export function useChartData(
     userPowers,
     stableChartDefinitions,
     compareGpuPair,
+    selectedXAxisMode,
+    selectedPercentile,
   ]);
 
   return { graphs, loading, error, hardwareConfig };

From 5035e17a8fdc2b6c9b86511075d22687a2b1f731 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:36:22 -0500
Subject: [PATCH 060/111] fix(scatter): keep non-pareto points visible on
 non-e2e xmodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous change filtered the displayed data down to e2e-Pareto winners,
which hid every dominated config from the TTFT / interactivity /
session-time / prefill-tps views. Users couldn't see where the
non-optimal configs actually sit on the alternative axes — losing
diagnostic visibility just to enforce the anti-benchmark-hack rule.

Switch from hard filter to a per-point `isOnE2eFrontier` flag: every
point still renders as scatter, only the e2e-Pareto winners feed the
frontier line. ScatterGraph honors the flag in its roofline compute
so the line stays restricted to non-hackable configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../inference/hooks/useChartData.ts           | 30 ++++++++++---------
 .../app/src/components/inference/types.ts     | 11 +++++++
 .../components/inference/ui/ScatterGraph.tsx  | 14 ++++++++-
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 397572df..50e6d87d 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -444,20 +444,17 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
-        // When the user is NOT viewing the e2e latency chart, restrict the
-        // displayed points to those that sit on the (e2e_latency, y) Pareto
-        // frontier — i.e. "this is the e2e chart, we're just plotting the
-        // ttft value." Prevents benchmark-hacking where a config tops one
-        // axis (TTFT, interactivity, prefill-tps) while quietly tanking
-        // end-to-end latency.
-        if (selectedXAxisMode !== 'e2e') {
-          const paretoIds = e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
-          if (paretoIds) {
-            filteredData = filteredData.filter(
-              (d) => typeof d.id === 'number' && paretoIds.has(d.id),
-            );
-          }
-        }
+        // When the user is NOT viewing the e2e latency chart, mark each
+        // point with whether it sits on the (e2e_latency, y) Pareto
+        // frontier for its (hwKey, precision, date) group. The chart
+        // still renders every point as scatter — only e2e-Pareto winners
+        // feed the roofline (ScatterGraph honors the flag). Prevents
+        // benchmark-hacking the TTFT / interactivity line by tanking
+        // decode (or vice versa) without hiding non-optimal configs.
+        const e2eParetoSet =
+          selectedXAxisMode === 'e2e'
+            ? null
+            : e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
@@ -470,11 +467,16 @@ export function useChartData(
                 const yValue = (d[metricKey] as { y: number })?.y ?? d.y;
                 const roof = (d[metricKey] as { roof: boolean })?.roof ?? false;
                 const xValue = (d as any)[xAxisField] ?? d.x;
+                const isOnE2eFrontier =
+                  e2eParetoSet === null
+                    ? undefined
+                    : typeof d.id === 'number' && e2eParetoSet.has(d.id);
                 return {
                   ...d,
                   x: xValue,
                   y: yValue,
                   roof,
+                  isOnE2eFrontier,
                 };
               })
               // When TTFT is on the x-axis, apply the latency limit to filter
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index bedded40..219e6bd7 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -158,6 +158,17 @@ export interface InferenceData extends Partial<Omit<AggDataEntry, AggDataConflic
   x: number;
   y: number;
   hidden?: boolean;
+  /**
+   * Whether this point sits on the (e2e_latency, y-metric) Pareto frontier.
+   * Set by useChartData when `selectedXAxisMode !== 'e2e'`. The TTFT /
+   * interactivity / session-time / prefill-tps charts use this flag to
+   * restrict their roofline computation to e2e-Pareto winners — vendors
+   * can't benchmark-hack TTFT by tanking decode (or vice versa) and still
+   * appear on the frontier line — while keeping every point visible as
+   * scatter so the user can see where dominated configs actually sit.
+   * Undefined when the chart is in e2e mode (no remapping needed).
+   */
+  isOnE2eFrontier?: boolean;
 
   // Overridden fields with narrower types
   hwKey: string;
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index b93799db..a5cbc9cf 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -383,7 +383,19 @@ const ScatterGraph = React.memo(
         }
         const combined: InferenceData[] = [];
         for (const datePoints of byDate.values()) {
-          combined.push(...frontierFn(datePoints));
+          // In non-e2e xmodes, useChartData stamps every point with an
+          // `isOnE2eFrontier` flag so the line is restricted to the
+          // e2e-Pareto winners — same set of points across every chart,
+          // just re-plotted at the chosen x metric. When the flag is
+          // present on ANY point in the bucket, narrow to the winners
+          // before paretoing (otherwise we'd recompute a fresh frontier
+          // on the swapped x axis and reintroduce the benchmark hack).
+          const flagged = datePoints.some((p) => p.isOnE2eFrontier !== undefined);
+          const seedPoints = flagged
+            ? datePoints.filter((p) => p.isOnE2eFrontier === true)
+            : datePoints;
+          if (seedPoints.length === 0) continue;
+          combined.push(...frontierFn(seedPoints));
         }
         combined.sort((a, b) => a.x - b.x);
         result[hwKey] = combined;

From 2bfea38c9b200c4fbd09592cddf8b0e788e4a580 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:43:09 -0500
Subject: [PATCH 061/111] fix(scatter): scope e2e-pareto restriction to agentic
 only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed-seq workloads don't have the multi-turn / session-time framing
that motivated the anti-hack rule — their e2e IS the request latency,
so a TTFT hack there reads honestly on e2e too. Reverting fixed-seq
to the prior per-axis Pareto avoids changing established leaderboard
semantics for non-agentic runs.

Agentic continues to mark `isOnE2eFrontier` on each point so the TTFT,
interactivity, session-time and prefill-tps lines stay restricted to
e2e-winning configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../inference/hooks/useChartData.ts           | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 50e6d87d..3c67ff90 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -444,22 +444,30 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
-        // When the user is NOT viewing the e2e latency chart, mark each
-        // point with whether it sits on the (e2e_latency, y) Pareto
-        // frontier for its (hwKey, precision, date) group. The chart
-        // still renders every point as scatter — only e2e-Pareto winners
-        // feed the roofline (ScatterGraph honors the flag). Prevents
-        // benchmark-hacking the TTFT / interactivity line by tanking
-        // decode (or vice versa) without hiding non-optimal configs.
+        // For AGENTIC workloads only: when the user is NOT viewing the
+        // e2e latency chart, mark each point with whether it sits on the
+        // (e2e_latency, y) Pareto frontier for its (hwKey, precision,
+        // date) group. The chart still renders every point as scatter —
+        // only e2e-Pareto winners feed the roofline (ScatterGraph honors
+        // the flag). Prevents benchmark-hacking the TTFT / interactivity
+        // line by tanking decode (or vice versa) without hiding the
+        // non-optimal configs from view.
+        //
+        // Fixed-seq workloads keep the existing per-axis Pareto since
+        // there's no separate "session-time" notion of total latency —
+        // their e2e IS the request latency, so a TTFT hack there reads
+        // honestly on e2e too. The anti-hack constraint is specifically
+        // about multi-turn agentic where TTFT measures a tiny fraction
+        // of the user-visible session time.
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const e2eParetoSet =
-          selectedXAxisMode === 'e2e'
-            ? null
-            : e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
+          isAgentic && selectedXAxisMode !== 'e2e'
+            ? e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile)
+            : null;
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
         const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
-        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)

From cbeeb695a15391fde615792bf9ad9e3e4233b220 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:56:40 -0500
Subject: [PATCH 062/111] feat(legend): info tooltip on Optimal Only for
 agentic non-e2e modes

Add an optional infoTooltip field to LegendSwitchConfig that renders a
small info icon next to the switch label. On agentic + non-e2e xmodes,
hovering it explains that "optimal" means on the end-to-end Pareto
frontier (not a per-axis Pareto), so users understand why off-frontier
points may appear above the line.

Hit target widened (-m-1.5 p-1.5) and delay dropped to 100ms so the
tiny icon isn't flaky to hover.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 15 ++++++++++-
 .../app/src/components/ui/chart-legend.tsx    | 26 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index a5cbc9cf..2552a334 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -12,7 +12,7 @@ import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import { getHardwareConfig, getModelSortIndex } from '@/lib/constants';
-import { getChartWatermark } from '@/lib/data-mappings';
+import { getChartWatermark, Sequence } from '@/lib/data-mappings';
 import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
 import { D3Chart } from '@/lib/d3-chart/D3Chart';
 import type {
@@ -242,6 +242,8 @@ const ScatterGraph = React.memo(
       trackedConfigs,
       addTrackedConfig,
       removeTrackedConfig,
+      selectedXAxisMode,
+      selectedSequence,
     } = useInference();
 
     const {
@@ -2281,6 +2283,17 @@ const ScatterGraph = React.memo(
                   setHideNonOptimal(checked);
                   track('latency_hide_non_optimal_toggled', { enabled: checked });
                 },
+                // On agentic + non-e2e chart, "optimal" means "on the
+                // e2e-latency Pareto frontier" (not a per-axis Pareto on the
+                // current x metric). Explain that so users don't wonder why
+                // a point sitting above the line is still considered
+                // dominated.
+                ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
+                  ? {
+                      infoTooltip:
+                        "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
+                    }
+                  : {}),
               },
               {
                 id: 'scatter-hide-point-labels',
diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx
index 81d5f261..a20c9959 100644
--- a/packages/app/src/components/ui/chart-legend.tsx
+++ b/packages/app/src/components/ui/chart-legend.tsx
@@ -6,6 +6,7 @@ import {
   ArrowRightToLine,
   Circle,
   Diamond,
+  Info,
   Square,
   Triangle,
   X,
@@ -36,6 +37,8 @@ export interface LegendSwitchConfig {
   label: string;
   checked: boolean;
   onCheckedChange: (checked: boolean) => void;
+  /** Optional explainer rendered as an info-icon tooltip next to the label. */
+  infoTooltip?: React.ReactNode;
 }
 
 export interface LegendActionConfig {
@@ -273,6 +276,29 @@ export default function ChartLegend({
             >
               {sw.label}
             </Label>
+            {sw.infoTooltip && (
+              <TooltipProvider delayDuration={100}>
+                <TooltipRoot>
+                  <TooltipTrigger asChild>
+                    <button
+                      type="button"
+                      data-testid={`${sw.id}-info`}
+                      aria-label={`More info about ${sw.label}`}
+                      className="text-muted-foreground hover:text-foreground cursor-help -m-1.5 p-1.5 inline-flex items-center"
+                    >
+                      <Info size={14} />
+                    </button>
+                  </TooltipTrigger>
+                  <TooltipContent
+                    side="top"
+                    sideOffset={6}
+                    className="max-w-[260px] text-xs leading-snug"
+                  >
+                    {sw.infoTooltip}
+                  </TooltipContent>
+                </TooltipRoot>
+              </TooltipProvider>
+            )}
           </div>
         ))}
       </div>

From de5e51a1330d7c24f51850e729a19a2d8802d990 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 14:50:42 -0500
Subject: [PATCH 063/111] fix(inference): don't scope chart to one run when
 runs cover different hardware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two workflow runs landing on the same date for the same model+precision
but DIFFERENT hardware (e.g. a B300 dsv4 run and a B200 dsv4 run) each
get their own changelog entry. The single-run scoping guard matched runs
by model+precision only, so both counted as "runs with a changelog for
this model", the `length > 1` check tripped, and selecting either run
scoped the benchmarks query to that one workflow run — hiding the other
GPU's curve entirely (carry-forward across hardware silently broke).

Scope to a single run only when two runs contest the SAME full config_key
(model-precision-hardware-framework) — a genuine same-day re-run of one
hardware, where a DISTINCT ON merge could mix them. Complementary
different-hardware runs now both render via the normal date carry-forward.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index c446dc71..244c713c 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -242,25 +242,42 @@ export function InferenceProvider({
   const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING)
     .filter(([, model]) => model === selectedModel)
     .map(([prefix]) => prefix);
-  const runIdsWithModelChangelog: string[] = [];
+  // Map each FULL config_key (model-precision-hardware-framework) a run's
+  // changelog claims to the set of runs claiming it. Single-run scoping should
+  // only kick in when two runs contest the SAME full key — e.g. a same-day
+  // re-run of one hardware — because then a DISTINCT ON merge could mix them
+  // and the user needs to pick which run wins. Runs covering DIFFERENT hardware
+  // of the same model (e.g. a B300 run and a B200 run on the same date) are
+  // complementary: both must render via carry-forward. Matching on model+
+  // precision alone (the old behavior) wrongly treated those as alternatives
+  // and scoped the chart to one run, hiding the other GPU's curve.
+  const runsByConfigKey = new Map<string, Set<string>>();
   if (availableRuns) {
     for (const [runId, runInfo] of Object.entries(availableRuns)) {
       if (!runInfo.changelog) continue;
-      const matches = runInfo.changelog.entries.some((entry) =>
-        entry.config_keys.some((key) => {
+      for (const entry of runInfo.changelog.entries) {
+        for (const key of entry.config_keys) {
           const parts = key.split('-');
-          return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!);
-        }),
-      );
-      if (matches) runIdsWithModelChangelog.push(runId);
+          if (modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) {
+            let runs = runsByConfigKey.get(key);
+            if (!runs) {
+              runs = new Set<string>();
+              runsByConfigKey.set(key, runs);
+            }
+            runs.add(runId);
+          }
+        }
+      }
     }
   }
+  // A run is "contested" only if some full config_key it claims is also claimed
+  // by another run. Only then does picking a run disambiguate anything.
+  const contestedRunIds = new Set<string>();
+  for (const runs of runsByConfigKey.values()) {
+    if (runs.size > 1) for (const r of runs) contestedRunIds.add(r);
+  }
   const benchmarkRunId =
-    selectedRunId &&
-    runIdsWithModelChangelog.length > 1 &&
-    runIdsWithModelChangelog.includes(selectedRunId)
-      ? String(selectedRunId)
-      : undefined;
+    selectedRunId && contestedRunIds.has(String(selectedRunId)) ? String(selectedRunId) : undefined;
 
   const {
     graphs,

From af8766ddbe9a3077b9a226cd3487f4f4e040e58b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 11 Jun 2026 11:24:29 -0500
Subject: [PATCH 064/111] fix(inference): carry forward un-contested configs
 when a run is selected
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selecting a workflow run in the picker scoped the ENTIRE benchmarks query
to that run, so any same-day config living in a different workflow run
vanished — e.g. with two vLLM runs and one SGLang run on the same date,
picking either vLLM run (contested, so scoping kicks in) hid the SGLang
curve entirely, while picking the SGLang run (uncontested, no scoping)
showed everything.

Fetch both the normal latest-per-config rows and the run-scoped rows, and
merge: the selected run wins for every (model, precision, hardware,
framework, benchmark_type) group it actually produced — preserving the
disambiguation that scoping exists for, including dropping base rows for
concs the run didn't cover so DISTINCT-ON mixing can't sneak back — and
every other config carries forward from the base rows. benchmark_type is
part of the replacement key so an agentic-only run can't hide the same
config's fixed-seq carry-forward.

The base query is the default view query so it's effectively always
cached; run selection adds no extra latency in practice.

Verified live: Jun 10, DSv4 B300, run 3/3 (vLLM affinity run) now renders
both b300_vllm (run-scoped) and b300_sglang (carried forward).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |  4 ++
 .../inference/hooks/useChartData.ts           | 41 ++++++++++---
 .../app/src/lib/benchmark-transform.test.ts   | 60 ++++++++++++++++++-
 packages/app/src/lib/benchmark-transform.ts   | 29 +++++++++
 4 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 3b994367..5d165e60 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -286,6 +286,10 @@ export function InferenceProvider({
   }
   // A run is "contested" only if some full config_key it claims is also claimed
   // by another run. Only then does picking a run disambiguate anything.
+  // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the
+  // WHOLE chart to the run: only the configs the run actually produced are
+  // pinned to it, and every other config (e.g. another framework's same-day
+  // run) still carries forward from the normal latest-per-config rows.
   const contestedRunIds = new Set<string>();
   for (const runs of runsByConfigKey.values()) {
     if (runs.size > 1) for (const r of runs) contestedRunIds.add(r);
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 019d0691..e76c3123 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -19,7 +19,11 @@ import {
   getModelSortIndex,
   hardwareKeyMatchesAnyBase,
 } from '@/lib/constants';
-import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
+import {
+  mergeRunScopedRows,
+  transformBenchmarkRows,
+  withPercentile,
+} from '@/lib/benchmark-transform';
 import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 import {
@@ -183,19 +187,40 @@ export function useChartData(
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
   // materialized view instead of firing a redundant second fetch with identical data.
-  // When a specific run is selected, we always go through the runId branch and the
-  // date is effectively ignored — keep queryDate set so React Query still has a
-  // distinct cache key per date if the user navigates back to "latest".
   const queryDate =
     selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
       ? ''
       : selectedRunDate;
 
+  // Two queries: the normal latest-per-config view (always), plus the
+  // run-scoped rows when a specific workflow run is selected. The merged
+  // result pins ONLY the configs the selected run produced to that run, and
+  // carries every other config forward from the base rows — selecting one of
+  // two same-day vLLM runs must not hide the day's SGLang curve just because
+  // it lives in a different workflow run. The base query is the default view
+  // query, so it's almost always already in the React Query cache.
+  const {
+    data: baseRows,
+    isLoading: baseLoading,
+    error: baseError,
+  } = useBenchmarks(selectedModel, queryDate, enabled);
   const {
-    data: allRows,
-    isLoading: queryLoading,
-    error: queryError,
-  } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId);
+    data: runRows,
+    isLoading: runLoading,
+    error: runError,
+  } = useBenchmarks(selectedModel, queryDate, enabled && Boolean(selectedRunId), selectedRunId);
+
+  const allRows = useMemo(() => {
+    if (!selectedRunId) return baseRows;
+    // Wait for the run rows before rendering a scoped view — rendering base
+    // rows first would flash the un-scoped chart, then swap contested points.
+    if (!runRows) return undefined;
+    if (!baseRows) return runRows;
+    return mergeRunScopedRows(runRows, baseRows);
+  }, [selectedRunId, runRows, baseRows]);
+
+  const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading);
+  const queryError = baseError ?? (selectedRunId ? runError : null);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 62cc1809..077e8c3e 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -2,7 +2,11 @@ import { describe, it, expect, vi } from 'vitest';
 
 import type { BenchmarkRow } from '@/lib/api';
 
-import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform';
+import {
+  mergeRunScopedRows,
+  rowToAggDataEntry,
+  transformBenchmarkRows,
+} from './benchmark-transform';
 
 function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
@@ -776,3 +780,57 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => {
     expect(point.decode_dp_attention).toBe(true);
   });
 });
+
+describe('mergeRunScopedRows', () => {
+  const vllmRun = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over });
+  const sglangBase = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over });
+
+  it('pins configs the run covers to the run rows, replacing base rows', () => {
+    const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })];
+    const baseRows = [vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a
+    // partial-sweep run must fully own its config or the DISTINCT-ON mixing
+    // the scoping exists to prevent comes right back.
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]);
+  });
+
+  it('carries forward configs the run does not cover (the same-day other-framework curve)', () => {
+    const runRows = [vllmRun({ id: 10 })];
+    const baseRows = [
+      vllmRun({ id: 90 }),
+      sglangBase({ id: 91 }),
+      sglangBase({ id: 92, conc: 128 }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]);
+  });
+
+  it('keeps base rows of other hardware / precision / model untouched', () => {
+    const runRows = [vllmRun({ id: 10 })];
+    const baseRows = [
+      vllmRun({ id: 90, hardware: 'b200' }),
+      vllmRun({ id: 91, precision: 'fp8' }),
+      vllmRun({ id: 92, model: 'kimik2.5' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]);
+  });
+
+  it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => {
+    const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })];
+    const baseRows = [
+      vllmRun({ id: 90, benchmark_type: 'agentic_traces' }),
+      vllmRun({ id: 91, benchmark_type: 'single_turn' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]);
+  });
+
+  it('returns base rows unchanged when the run produced nothing', () => {
+    const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })];
+    expect(mergeRunScopedRows([], baseRows)).toBe(baseRows);
+  });
+});
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 9f6b43d1..8329c84b 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -172,6 +172,35 @@ export function withPercentile(key: string, percentile: string): string {
   return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
 }
 
+// Replacement granularity for single-run scoping: the changelog config_key
+// tuple (model-precision-hardware-framework) plus benchmark_type, so an
+// agentic-only run never hides the same config's fixed-seq carry-forward.
+const runScopeKey = (r: BenchmarkRow): string =>
+  `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`;
+
+/**
+ * Merge run-scoped benchmark rows with the normal latest-per-config rows.
+ *
+ * When the user picks a specific workflow run (to disambiguate two same-day
+ * sweeps of the same config), only the configs that run actually produced
+ * should be pinned to it — every other config must keep its normal
+ * carry-forward rows. Scoping the whole chart to the run (the old behavior)
+ * silently hid complementary configs that happened to land on the same date,
+ * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve
+ * vanish because it lived in a different workflow run.
+ *
+ * Run rows win for every (model, precision, hardware, framework,
+ * benchmark_type) group they cover; base rows fill in the rest.
+ */
+export function mergeRunScopedRows(
+  runRows: BenchmarkRow[],
+  baseRows: BenchmarkRow[],
+): BenchmarkRow[] {
+  if (runRows.length === 0) return baseRows;
+  const claimed = new Set(runRows.map(runScopeKey));
+  return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))];
+}
+
 /**
  * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
  * Returns one InferenceData[] per chart definition (e2e, interactivity).

From d6d31436abf38eb32e6383ab692ff0b8519ca32c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:25:49 -0500
Subject: [PATCH 065/111] fix: reconcile agentic data after master merge

---
 .../component/inference-chart-controls.cy.tsx |   4 +-
 .../inference/hooks/useChartData.ts           |   8 +-
 .../components/inference/ui/ChartDisplay.tsx  | 481 +++++++++---------
 .../components/inference/ui/ScatterGraph.tsx  |   5 +-
 .../components/unofficial-run-provider.tsx    |  10 +-
 packages/app/src/lib/api.ts                   |  15 +-
 packages/db/src/queries/benchmarks.ts         |  21 +-
 7 files changed, 282 insertions(+), 262 deletions(-)

diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx
index 03e6a50c..5a6311f4 100644
--- a/packages/app/cypress/component/inference-chart-controls.cy.tsx
+++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx
@@ -14,8 +14,8 @@ describe('Inference ChartControls', () => {
 
   it('renders the sequence selector with the current sequence', () => {
     // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K"
-    cy.get('#sequence-select').should('be.visible');
-    cy.get('#sequence-select').should('contain.text', '8K / 1K');
+    cy.get('#scenario-select').should('be.visible');
+    cy.get('#scenario-select').should('contain.text', '8K / 1K');
   });
 
   it('renders the precision multi-select with the current precision', () => {
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 0d1eac64..ee5acb88 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -220,13 +220,7 @@ export function useChartData(
     data: runRows,
     isLoading: runLoading,
     error: runError,
-  } = useBenchmarks(
-    selectedModel,
-    '',
-    enabled && Boolean(selectedRunId),
-    selectedRunId,
-    true,
-  );
+  } = useBenchmarks(selectedModel, '', enabled && Boolean(selectedRunId), selectedRunId, true);
 
   const allRows = useMemo(() => {
     if (!selectedRunId) return baseRows;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 3a431440..caf713cc 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -429,217 +429,206 @@ export default function ChartDisplay() {
     });
   }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
 
-  const displayGraphs = isFirstLoad || isDerivedLoading
-    ? [
-        <Card key="skeleton-0">
-          <Skeleton className="h-7 w-2/4 mb-1" />
-          <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-[600px] w-full" />
-        </Card>,
-      ]
-    : renderableGraphs.length === 0
-      ? []
-      : renderableGraphs.map((graph, graphIndex) => {
-          const isTimelineMode = Boolean(
-            selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
-          );
-          const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
-          return (
-            <section key={graphIndex} className="pt-8 md:pt-0">
-              <figure data-testid="chart-figure" className="relative rounded-lg">
-                <ChartButtons
-                  chartId={`chart-${graphIndex}`}
-                  analyticsPrefix={
-                    isTimelineMode
-                      ? 'gpu_timeseries'
-                      : graph.chartDefinition.chartType === 'e2e'
-                        ? 'latency'
-                        : 'interactivity'
-                  }
-                  leadingControls={
-                    <SegmentedToggle
-                      value={getViewMode(graphIndex)}
-                      options={VIEW_MODE_OPTIONS}
-                      onValueChange={(v) => handleViewModeChange(graphIndex, v)}
-                      ariaLabel="View mode"
-                      testId={`inference-view-toggle-${graphIndex}`}
-                    />
-                  }
-                  hideImageExport={getViewMode(graphIndex) === 'table'}
-                  setIsLegendExpanded={setIsLegendExpanded}
-                  exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
-                  onExportMp4={
-                    replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
-                  }
-                  onExportCsv={() => {
-                    const visibleData = graph.data.filter((d) =>
+  const displayGraphs =
+    isFirstLoad || isDerivedLoading
+      ? [
+          <Card key="skeleton-0">
+            <Skeleton className="h-7 w-2/4 mb-1" />
+            <Skeleton className="h-5 w-3/4 mb-2" />
+            <Skeleton className="h-[600px] w-full" />
+          </Card>,
+        ]
+      : renderableGraphs.length === 0
+        ? []
+        : renderableGraphs.map((graph, graphIndex) => {
+            const isTimelineMode = Boolean(
+              selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+            );
+            const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+            return (
+              <section key={graphIndex} className="pt-8 md:pt-0">
+                <figure data-testid="chart-figure" className="relative rounded-lg">
+                  <ChartButtons
+                    chartId={`chart-${graphIndex}`}
+                    analyticsPrefix={
                       isTimelineMode
-                        ? activeDates.has(`${d.date}_${d.hwKey}`)
-                        : activeHwTypes.has(d.hwKey as string) &&
-                          selectedPrecisions.includes(d.precision),
-                    );
-                    const { headers, rows } = inferenceChartToCsv(
-                      visibleData,
-                      graph.model,
-                      graph.sequence,
-                    );
-                    // Match warnings against the same series the chart annotates,
-                    // including visible unofficial-run overlay series.
-                    const overlay =
-                      graph.chartDefinition.chartType === 'e2e'
-                        ? overlayDataByChartType.e2e
-                        : overlayDataByChartType.interactivity;
-                    const visibleOverlayRows = isTimelineMode
-                      ? []
-                      : (overlay?.data ?? []).filter(
-                          (p) =>
-                            activeOverlayHwTypes.has(p.hwKey as string) &&
-                            selectedPrecisions.includes(p.precision),
-                        );
-                    const issueNotes = matchKnownConfigIssues(graph.model, [
-                      ...visibleData,
-                      ...visibleOverlayRows,
-                    ]).map((issue) =>
-                      knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))),
-                    );
-                    exportToCsv(
-                      `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
-                      headers,
-                      rows,
-                      issueNotes,
-                    );
-                  }}
-                />
-                <Card>
-                  {(() => {
-                    const chartCaption = (
-                      <>
-                        <h2 className="text-lg font-semibold">
-                          {
-                            graph.chartDefinition[
-                              `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                            ]
-                          }{' '}
-                          {(() => {
-                            // For Input metrics with dynamic x-axis, use dynamic heading
-                            const metricTitle =
-                              (graph.chartDefinition[
-                                `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                              ] as string) || '';
-                            const isInputMetric = metricTitle.toLowerCase().includes('input');
-                            if (
-                              graph.chartDefinition.chartType === 'interactivity' &&
-                              isInputMetric &&
-                              selectedXAxisMetric
-                            ) {
-                              if (selectedXAxisMetric === 'p99_ttft') {
-                                return 'vs. P99 Time To First Token';
-                              } else if (selectedXAxisMetric === 'median_ttft') {
-                                return 'vs. Median Time To First Token';
-                              }
-                            }
-
-                            // The e2e chart heading follows the branch-level x-axis mode
-                            // selector, including agentic-only derived metrics.
-                            if (graph.chartDefinition.chartType === 'e2e') {
-                              if (selectedXAxisMode === 'session-time') {
-                                return 'vs. Mean Normalized Session Time';
-                              }
-                              if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. P90 Prefill TPS / user';
-                              }
-                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
-                                const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
-                                const word =
-                                  percentile === 'median' ? 'Median' : percentile.toUpperCase();
-                                return `vs. ${word} Time To First Token`;
-                              }
-                              return isAgentic
-                                ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency`
-                                : 'vs. End-to-end Latency';
-                            }
-
-                            // Fall back to configured heading
-                            return (
-                              graph.chartDefinition[
-                                `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                              ] || graph.chartDefinition.heading
-                            );
-                          })()}
-                        </h2>
-                        <p className="text-sm text-muted-foreground mb-2">
-                          {getModelLabel(graph.model as Model)} •{' '}
-                          {selectedPrecisions
-                            .map((prec) => getPrecisionLabel(prec as Precision))
-                            .join(', ')}{' '}
-                          • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
-                          {isUnofficialRun
-                            ? 'Source: UNOFFICIAL'
-                            : 'Source: SemiAnalysis InferenceX™'}
-                          {selectedRunDate && (
-                            <>
-                              {' '}
-                              • Updated:{' '}
-                              {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
-                                'en-US',
-                                {
-                                  year: 'numeric',
-                                  month: '2-digit',
-                                  day: '2-digit',
-                                  timeZone: 'UTC',
-                                },
-                              )}
-                            </>
-                          )}
-                        </p>
-                        <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
-                        <UnofficialDomainNotice />
-                      </>
-                    );
-
-                    if (getViewMode(graphIndex) === 'table') {
+                        ? 'gpu_timeseries'
+                        : graph.chartDefinition.chartType === 'e2e'
+                          ? 'latency'
+                          : 'interactivity'
+                    }
+                    leadingControls={
+                      <SegmentedToggle
+                        value={getViewMode(graphIndex)}
+                        options={VIEW_MODE_OPTIONS}
+                        onValueChange={(v) => handleViewModeChange(graphIndex, v)}
+                        ariaLabel="View mode"
+                        testId={`inference-view-toggle-${graphIndex}`}
+                      />
+                    }
+                    hideImageExport={getViewMode(graphIndex) === 'table'}
+                    setIsLegendExpanded={setIsLegendExpanded}
+                    exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+                    onExportMp4={
+                      replayAvailable
+                        ? () => replayHandlesRef.current[graphIndex]?.open()
+                        : undefined
+                    }
+                    onExportCsv={() => {
+                      const visibleData = graph.data.filter((d) =>
+                        isTimelineMode
+                          ? activeDates.has(`${d.date}_${d.hwKey}`)
+                          : activeHwTypes.has(d.hwKey as string) &&
+                            selectedPrecisions.includes(d.precision),
+                      );
+                      const { headers, rows } = inferenceChartToCsv(
+                        visibleData,
+                        graph.model,
+                        graph.sequence,
+                      );
+                      // Match warnings against the same series the chart annotates,
+                      // including visible unofficial-run overlay series.
                       const overlay =
                         graph.chartDefinition.chartType === 'e2e'
                           ? overlayDataByChartType.e2e
                           : overlayDataByChartType.interactivity;
-                      const overlayRows = (overlay?.data ?? []).filter((p) =>
-                        selectedPrecisions.includes(p.precision),
+                      const visibleOverlayRows = isTimelineMode
+                        ? []
+                        : (overlay?.data ?? []).filter(
+                            (p) =>
+                              activeOverlayHwTypes.has(p.hwKey as string) &&
+                              selectedPrecisions.includes(p.precision),
+                          );
+                      const issueNotes = matchKnownConfigIssues(graph.model, [
+                        ...visibleData,
+                        ...visibleOverlayRows,
+                      ]).map((issue) =>
+                        knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))),
                       );
-                      return (
+                      exportToCsv(
+                        `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+                        headers,
+                        rows,
+                        issueNotes,
+                      );
+                    }}
+                  />
+                  <Card>
+                    {(() => {
+                      const chartCaption = (
                         <>
-                          {chartCaption}
-                          <InferenceTable
-                            data={
-                              overlayRows.length > 0 ? [...graph.data, ...overlayRows] : graph.data
-                            }
-                            chartDefinition={graph.chartDefinition}
-                            selectedYAxisMetric={selectedYAxisMetric}
-                          />
+                          <h2 className="text-lg font-semibold">
+                            {
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                              ]
+                            }{' '}
+                            {(() => {
+                              // For Input metrics with dynamic x-axis, use dynamic heading
+                              const metricTitle =
+                                (graph.chartDefinition[
+                                  `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                                ] as string) || '';
+                              const isInputMetric = metricTitle.toLowerCase().includes('input');
+                              if (
+                                graph.chartDefinition.chartType === 'interactivity' &&
+                                isInputMetric &&
+                                selectedXAxisMetric
+                              ) {
+                                if (selectedXAxisMetric === 'p99_ttft') {
+                                  return 'vs. P99 Time To First Token';
+                                } else if (selectedXAxisMetric === 'median_ttft') {
+                                  return 'vs. Median Time To First Token';
+                                }
+                              }
+
+                              // The e2e chart heading follows the branch-level x-axis mode
+                              // selector, including agentic-only derived metrics.
+                              if (graph.chartDefinition.chartType === 'e2e') {
+                                if (selectedXAxisMode === 'session-time') {
+                                  return 'vs. Mean Normalized Session Time';
+                                }
+                                if (selectedXAxisMode === 'prefill-tps') {
+                                  return 'vs. P90 Prefill TPS / user';
+                                }
+                                const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                                if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                  const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                  const word =
+                                    percentile === 'median' ? 'Median' : percentile.toUpperCase();
+                                  return `vs. ${word} Time To First Token`;
+                                }
+                                return isAgentic
+                                  ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency`
+                                  : 'vs. End-to-end Latency';
+                              }
+
+                              // Fall back to configured heading
+                              return (
+                                graph.chartDefinition[
+                                  `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
+                                ] || graph.chartDefinition.heading
+                              );
+                            })()}
+                          </h2>
+                          <p className="text-sm text-muted-foreground mb-2">
+                            {getModelLabel(graph.model as Model)} •{' '}
+                            {selectedPrecisions
+                              .map((prec) => getPrecisionLabel(prec as Precision))
+                              .join(', ')}{' '}
+                            • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+                            {isUnofficialRun
+                              ? 'Source: UNOFFICIAL'
+                              : 'Source: SemiAnalysis InferenceX™'}
+                            {selectedRunDate && (
+                              <>
+                                {' '}
+                                • Updated:{' '}
+                                {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+                                  'en-US',
+                                  {
+                                    year: 'numeric',
+                                    month: '2-digit',
+                                    day: '2-digit',
+                                    timeZone: 'UTC',
+                                  },
+                                )}
+                              </>
+                            )}
+                          </p>
+                          <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          <UnofficialDomainNotice />
                         </>
                       );
-                    }
 
-                    return selectedGPUs.length > 0 &&
-                      ((selectedDateRange.startDate && selectedDateRange.endDate) ||
-                        selectedDates.length > 0) ? (
-                      <GPUGraph
-                        chartId={`chart-${graphIndex}`}
-                        modelLabel={graph.model}
-                        data={graph.data}
-                        xLabel={graph.chartDefinition.x_label}
-                        yLabel={`${
-                          graph.chartDefinition[
-                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                          ]
-                        }`}
-                        chartDefinition={graph.chartDefinition}
-                        caption={chartCaption}
-                        runNumbering={runNumbering}
-                      />
-                    ) : (
-                      <div className="relative">
-                        <ScatterGraph
+                      if (getViewMode(graphIndex) === 'table') {
+                        const overlay =
+                          graph.chartDefinition.chartType === 'e2e'
+                            ? overlayDataByChartType.e2e
+                            : overlayDataByChartType.interactivity;
+                        const overlayRows = (overlay?.data ?? []).filter((p) =>
+                          selectedPrecisions.includes(p.precision),
+                        );
+                        return (
+                          <>
+                            {chartCaption}
+                            <InferenceTable
+                              data={
+                                overlayRows.length > 0
+                                  ? [...graph.data, ...overlayRows]
+                                  : graph.data
+                              }
+                              chartDefinition={graph.chartDefinition}
+                              selectedYAxisMetric={selectedYAxisMetric}
+                            />
+                          </>
+                        );
+                      }
+
+                      return selectedGPUs.length > 0 &&
+                        ((selectedDateRange.startDate && selectedDateRange.endDate) ||
+                          selectedDates.length > 0) ? (
+                        <GPUGraph
                           chartId={`chart-${graphIndex}`}
                           modelLabel={graph.model}
                           data={graph.data}
@@ -651,44 +640,60 @@ export default function ChartDisplay() {
                           }`}
                           chartDefinition={graph.chartDefinition}
                           caption={chartCaption}
-                          overlayData={
-                            graph.chartDefinition.chartType === 'e2e'
-                              ? (overlayDataByChartType.e2e ?? undefined)
-                              : (overlayDataByChartType.interactivity ?? undefined)
-                          }
+                          runNumbering={runNumbering}
                         />
-                        {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
-                          selectedDates.length === 0 && (
-                            <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
-                              <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range or add a run to view GPU comparison
-                              </p>
-                            </div>
-                          )}
-                      </div>
-                    );
-                  })()}
-                  {replayAvailable && (
-                    <ReplayLauncher
-                      ref={(handle) => {
-                        replayHandlesRef.current[graphIndex] = handle;
-                      }}
-                      parentChartId={`chart-${graphIndex}`}
-                      chartDefinition={graph.chartDefinition}
-                      yLabel={`${
-                        graph.chartDefinition[
-                          `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                        ]
-                      }`}
-                      xLabel={graph.chartDefinition.x_label}
-                    />
-                  )}
-                </Card>
-              </figure>
-            </section>
-          );
-        });
+                      ) : (
+                        <div className="relative">
+                          <ScatterGraph
+                            chartId={`chart-${graphIndex}`}
+                            modelLabel={graph.model}
+                            data={graph.data}
+                            xLabel={graph.chartDefinition.x_label}
+                            yLabel={`${
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                              ]
+                            }`}
+                            chartDefinition={graph.chartDefinition}
+                            caption={chartCaption}
+                            overlayData={
+                              graph.chartDefinition.chartType === 'e2e'
+                                ? (overlayDataByChartType.e2e ?? undefined)
+                                : (overlayDataByChartType.interactivity ?? undefined)
+                            }
+                          />
+                          {selectedGPUs.length > 0 &&
+                            (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
+                            selectedDates.length === 0 && (
+                              <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
+                                <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
+                                  Select a date range or add a run to view GPU comparison
+                                </p>
+                              </div>
+                            )}
+                        </div>
+                      );
+                    })()}
+                    {replayAvailable && (
+                      <ReplayLauncher
+                        ref={(handle) => {
+                          replayHandlesRef.current[graphIndex] = handle;
+                        }}
+                        parentChartId={`chart-${graphIndex}`}
+                        chartDefinition={graph.chartDefinition}
+                        yLabel={`${
+                          graph.chartDefinition[
+                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                          ]
+                        }`}
+                        xLabel={graph.chartDefinition.x_label}
+                      />
+                    )}
+                  </Card>
+                </figure>
+              </section>
+            );
+          });
 
   return (
     <div data-testid="inference-chart-display" className="flex flex-col gap-4">
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 982c24d2..e1cad1a4 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -7,7 +7,6 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
 import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
-import { useRouter } from 'next/navigation';
 import { pointNearestX } from '@/components/inference/ui/line-label-anchor';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
@@ -582,7 +581,6 @@ const ScatterGraph = React.memo(
       return ids;
     }, [pointsData]);
     const { data: traceAvailability } = useTraceAvailability(agenticIds);
-    const router = useRouter();
 
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
@@ -902,7 +900,7 @@ const ScatterGraph = React.memo(
               });
               chartRef.current?.dismissTooltip();
               chartRef.current?.hideTooltip();
-              router.push(`/inference/agentic/${pointId}`);
+              window.location.assign(`/inference/agentic/${pointId}`);
             });
           }
         },
@@ -923,7 +921,6 @@ const ScatterGraph = React.memo(
         // presence fetch resolves so the button appears for points that
         // have a trace_replay blob.
         traceAvailability,
-        router,
       ],
     );
 
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index b8e76f38..54b470ff 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -279,11 +279,11 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) {
       // Filter chart data by stamped `run_url`. A row belongs to the dismissed
       // run if its URL matches exactly OR the numeric id parses to the same.
       const belongsToDismissed = (rowUrl?: string | null) => {
-          if (!rowUrl) return false;
-          if (rowUrl === target.url) return true;
-          const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u);
-          return m?.groups?.runId === runId;
-        };
+        if (!rowUrl) return false;
+        if (rowUrl === target.url) return true;
+        const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u);
+        return m?.groups?.runId === runId;
+      };
 
       // Compute the filtered chart data BEFORE any setState so we can pass the
       // same value to setUnofficialChartData and parseAvailableModelsAndSequences.
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 0dac5883..a9d66715 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types';
 import type { SubmissionsResponse } from './submissions-types';
 
 export interface BenchmarkRow {
+  /** Stable per-point id from benchmark_results; used for agentic detail lookups. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -25,9 +27,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 49c60604..6833756a 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -11,6 +11,8 @@ import type { WorkerPower } from '../etl/benchmark-mapper.js';
 export type BenchmarkWorkerRow = WorkerPower;
 
 export interface BenchmarkRow {
+  /** Stable benchmark_results id used for agentic detail lookups. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -28,9 +30,11 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -95,6 +99,7 @@ export async function getLatestBenchmarks(
         : sql``;
     const rows = await sql`
       SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+        br.id,
         c.hardware,
         c.framework,
         c.model,
@@ -112,6 +117,8 @@ export async function getLatestBenchmarks(
         c.decode_num_workers,
         c.num_prefill_gpu,
         c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
         br.isl,
         br.osl,
         br.conc,
@@ -136,6 +143,7 @@ export async function getLatestBenchmarks(
   // No date filter: use materialized view for instant lookups
   const rows = await sql`
     SELECT
+      lb.id,
       c.hardware,
       c.framework,
       c.model,
@@ -153,6 +161,8 @@ export async function getLatestBenchmarks(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      lb.benchmark_type,
+      lb.offload_mode,
       lb.isl,
       lb.osl,
       lb.conc,
@@ -185,6 +195,7 @@ export async function getBenchmarksForRun(
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
     SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+      br.id,
       c.hardware,
       c.framework,
       c.model,
@@ -202,6 +213,8 @@ export async function getBenchmarksForRun(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
+      br.offload_mode,
       br.isl,
       br.osl,
       br.conc,
@@ -235,6 +248,7 @@ export async function getAllBenchmarksForHistory(
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
     SELECT
+      br.id,
       c.hardware,
       c.framework,
       c.model,
@@ -252,9 +266,12 @@ export async function getAllBenchmarksForHistory(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
+      br.offload_mode,
       br.isl,
       br.osl,
       br.conc,
+      br.image,
       br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics,
       br.workers,
       br.date::text,

From f60ef9c7f18a1782edd5542510328b242048a2de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:34:00 -0500
Subject: [PATCH 066/111] fix(gpu-compare): show concurrency (C=) over points

GPU compare mode (GPUGraph) labeled points with only the parallelism/tp
string, dropping the C=<conc> suffix that the single-run scatter chart
(ScatterGraph) shows. Append it so compare-mode points are annotated the
same way.

Verified live in compare mode: points now read e.g. 'DEP8 / C=2048',
'TP4 / C=64'.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/GPUGraph.tsx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index e7737a2e..24b1266f 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -759,7 +759,11 @@ const GPUGraph = React.memo(
             config: {
               getColor,
               hideLabels: hidePointLabels,
-              getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+              // Match ScatterGraph: append the concurrency (C=) to the
+              // parallelism/tp label so compare-mode points are annotated the
+              // same way as the single-run scatter chart.
+              getLabelText: (d) =>
+                useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
               foreground: 'var(--foreground)',
               dataAttrs: {
                 series: (d) => `${d.date}_${d.hwKey}`,

From 22028ccfe3141aa632b4c23aaca26b9c4bd51b58 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:43:42 -0500
Subject: [PATCH 067/111] fix(agentic-timeline): hide no-op phase toggle;
 fixed-height scroll window
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes to the conversation/request-timeline view:

1. The Profiling vs 'All (incl. warmup)' toggle never did anything —
   aiperf's profile_export only contains profiling-phase requests, so
   every stored record has phase='profiling' (verified: 297k/297k rows).
   Hide the toggle unless a non-profiling request actually exists, so it
   reappears and works only if warmup is ever exported.

2. The timeline grew to fit every conversation/worker, making the card
   arbitrarily tall. Cap the body at a maximum height (480px) and scroll
   the rows vertically inside it. Few-row runs still size to content
   (no empty space); the label column and bars scroll together since
   they share a single scroll container.

Verified live on a 3475-request point: phase toggle absent, row-mode
toggle still present, window clientHeight 480 with ~3745px scrolling
inside.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 474 +++++++++---------
 1 file changed, 249 insertions(+), 225 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 3c032fdd..2313775e 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -30,6 +30,11 @@ const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
   { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' },
 ];
 
+// The timeline body is capped at this height and scrolls internally, so a run
+// with many conversations/workers doesn't make the card grow unbounded and push
+// the rest of the detail page down. Sized to show ~16 rows + the header.
+const TIMELINE_BODY_MAX_HEIGHT = 480;
+
 /** A stable color palette indexed by row-key hash. */
 const ROW_COLORS = [
   '#3b82f6',
@@ -393,11 +398,24 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   }, []);
   const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
 
-  // Apply phase filter, then group into rows.
+  // The phase toggle only means something when warmup requests are actually
+  // present. aiperf's profile_export only contains profiling-phase requests, so
+  // in practice every record is `profiling` and the toggle is a no-op — hide it
+  // unless a non-profiling request exists (keeps it working if warmup is ever
+  // exported).
+  const hasWarmup = useMemo(
+    () => data.requests.some((r) => r.phase !== 'profiling'),
+    [data.requests],
+  );
+
+  // Apply phase filter, then group into rows. With no warmup data the filter
+  // collapses to "profiling" regardless of the (hidden) toggle state.
   const filtered = useMemo(
     () =>
-      phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
-    [data.requests, phaseFilter],
+      phaseFilter === 'all' && hasWarmup
+        ? data.requests
+        : data.requests.filter((r) => r.phase === 'profiling'),
+    [data.requests, phaseFilter, hasWarmup],
   );
   const rows = useMemo(
     () => buildRows(filtered, rowMode, expandedSubagents),
@@ -581,14 +599,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
           testId="timeline-row-mode"
           buttonClassName="px-2.5 py-1 text-xs"
         />
-        <SegmentedToggle
-          value={phaseFilter}
-          options={PHASE_OPTIONS}
-          onValueChange={setPhaseFilter}
-          ariaLabel="Phase filter"
-          testId="timeline-phase-filter"
-          buttonClassName="px-2.5 py-1 text-xs"
-        />
+        {hasWarmup && (
+          <SegmentedToggle
+            value={phaseFilter}
+            options={PHASE_OPTIONS}
+            onValueChange={setPhaseFilter}
+            ariaLabel="Phase filter"
+            testId="timeline-phase-filter"
+            buttonClassName="px-2.5 py-1 text-xs"
+          />
+        )}
         <span className="ml-auto text-xs text-muted-foreground">
           {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
           {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
@@ -606,243 +626,247 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
 
       {/* Chart container */}
       <div className="rounded-md border border-border/60 bg-card overflow-hidden">
-        <div className="flex">
-          {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
-          <div
-            className="flex-shrink-0 border-r border-border/60 bg-card/80"
-            style={{ width: LABEL_WIDTH }}
-          >
+        {/* Fixed-height window: the rows scroll vertically inside it instead of
+            the card growing to fit every conversation/worker. */}
+        <div className="overflow-y-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
+          <div className="flex">
+            {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
             <div
-              className="border-b border-border/60 flex items-end px-2 pb-1"
-              style={{ height: HEADER_HEIGHT }}
+              className="flex-shrink-0 border-r border-border/60 bg-card/80"
+              style={{ width: LABEL_WIDTH }}
             >
-              <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
-                {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
-              </span>
-            </div>
-            {rows.map((row) => {
-              const isSubagentRow = row.kind === 'subagent';
-              const isStreamRow = row.kind === 'stream';
-              const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
-              const isExpanded = isExpandable && expandedSubagents.has(row.key);
-              return (
-                <div
-                  key={row.key}
-                  className="flex items-center gap-1 overflow-hidden pr-2"
-                  style={{
-                    height: ROW_HEIGHT + ROW_GAP,
-                    paddingLeft: 4 + row.depth * 10,
-                  }}
-                >
-                  {isExpandable ? (
-                    <button
-                      type="button"
-                      onClick={() => toggleSubagent(row.key)}
-                      className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
-                      aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
-                      title={isExpanded ? 'Collapse streams' : 'Expand streams'}
-                    >
-                      <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
-                    </button>
-                  ) : (
-                    <span className="size-3.5 shrink-0" />
-                  )}
-                  <span
-                    className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
-                    style={{
-                      backgroundColor: row.color,
-                      opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
-                    }}
-                  />
-                  <span
-                    className="text-[10px] font-mono truncate"
+              <div
+                className="border-b border-border/60 flex items-end px-2 pb-1"
+                style={{ height: HEADER_HEIGHT }}
+              >
+                <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
+                  {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+                </span>
+              </div>
+              {rows.map((row) => {
+                const isSubagentRow = row.kind === 'subagent';
+                const isStreamRow = row.kind === 'stream';
+                const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+                const isExpanded = isExpandable && expandedSubagents.has(row.key);
+                return (
+                  <div
+                    key={row.key}
+                    className="flex items-center gap-1 overflow-hidden pr-2"
                     style={{
-                      color: row.color,
-                      opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                      height: ROW_HEIGHT + ROW_GAP,
+                      paddingLeft: 4 + row.depth * 10,
                     }}
                   >
-                    {row.label}
-                    {isExpandable && (
-                      <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                    {isExpandable ? (
+                      <button
+                        type="button"
+                        onClick={() => toggleSubagent(row.key)}
+                        className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+                        aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                        title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                      >
+                        <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
+                      </button>
+                    ) : (
+                      <span className="size-3.5 shrink-0" />
                     )}
-                  </span>
-                  <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
-                    {row.requests.length > 0 ? row.requests.length : '—'}
-                  </span>
-                </div>
-              );
-            })}
-          </div>
-
-          {/* Scrollable SVG */}
-          <div className="flex-1 overflow-x-auto">
-            <svg
-              width={chartWidth}
-              height={svgHeight}
-              className="block"
-              style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
-              onWheel={handleWheel}
-              onMouseDown={handleMouseDown}
-              onMouseMove={handleMouseMove}
-              onMouseUp={handleMouseUp}
-              onMouseLeave={handleMouseLeave}
-            >
-              {/* Header / time-axis baseline */}
-              <line
-                x1={0}
-                y1={HEADER_HEIGHT}
-                x2={chartWidth}
-                y2={HEADER_HEIGHT}
-                stroke="currentColor"
-                opacity={0.15}
-              />
-
-              {/* Time axis ticks */}
-              {ticks.map((t) => {
-                // Convert visible-window ns offset → x px (the tick array
-                // is already in dataStart-relative coords).
-                const x = (t - vStart) * scale;
-                return (
-                  <g key={t}>
-                    <line
-                      x1={x}
-                      y1={HEADER_HEIGHT}
-                      x2={x}
-                      y2={svgHeight}
-                      stroke="currentColor"
-                      opacity={0.08}
-                      strokeDasharray="2 4"
+                    <span
+                      className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                      style={{
+                        backgroundColor: row.color,
+                        opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                      }}
                     />
-                    <text
-                      x={x + 2}
-                      y={HEADER_HEIGHT - 6}
-                      fill="currentColor"
-                      opacity={0.55}
-                      fontSize={9}
-                      fontFamily="ui-monospace, SFMono-Regular, monospace"
+                    <span
+                      className="text-[10px] font-mono truncate"
+                      style={{
+                        color: row.color,
+                        opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                      }}
                     >
-                      {formatTickLabel(t)}
-                    </text>
-                  </g>
+                      {row.label}
+                      {isExpandable && (
+                        <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                      )}
+                    </span>
+                    <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                      {row.requests.length > 0 ? row.requests.length : '—'}
+                    </span>
+                  </div>
                 );
               })}
+            </div>
 
-              {/* Row separators */}
-              {rows.map((row, idx) => (
+            {/* Scrollable SVG */}
+            <div className="flex-1 overflow-x-auto">
+              <svg
+                width={chartWidth}
+                height={svgHeight}
+                className="block"
+                style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
+                onWheel={handleWheel}
+                onMouseDown={handleMouseDown}
+                onMouseMove={handleMouseMove}
+                onMouseUp={handleMouseUp}
+                onMouseLeave={handleMouseLeave}
+              >
+                {/* Header / time-axis baseline */}
                 <line
-                  key={`sep-${row.key}`}
                   x1={0}
-                  y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  y1={HEADER_HEIGHT}
                   x2={chartWidth}
-                  y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  y2={HEADER_HEIGHT}
                   stroke="currentColor"
-                  opacity={0.04}
+                  opacity={0.15}
                 />
-              ))}
-
-              {/* Request bars */}
-              {rows.map((row, rowIdx) => {
-                const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
-                const barH = ROW_HEIGHT - 4;
-                // For multi-stream subagent containers, suppress the union
-                // bars when expanded — the child stream rows draw them
-                // individually instead, so we'd double-draw otherwise.
-                if (
-                  row.kind === 'subagent' &&
-                  (row.streamCount ?? 1) > 1 &&
-                  expandedSubagents.has(row.key)
-                ) {
-                  return null;
-                }
-                return row.requests.map((req) => {
-                  const xCredit = xOf(req.credit);
-                  const xStart = xOf(req.start);
-                  const xEnd = xOf(req.end);
-                  // Cull bars entirely outside the visible window so big
-                  // benchmarks don't render thousands of zero-width rects.
-                  if (xEnd < -2 || xCredit > chartWidth + 2) return null;
-                  const runW = Math.max(xEnd - xStart, 1);
-                  const queueW = Math.max(xStart - xCredit, 0);
-                  const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+
+                {/* Time axis ticks */}
+                {ticks.map((t) => {
+                  // Convert visible-window ns offset → x px (the tick array
+                  // is already in dataStart-relative coords).
+                  const x = (t - vStart) * scale;
                   return (
-                    <g
-                      key={`${req.cid}-${req.ti}-${req.start}`}
-                      onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
-                      onMouseLeave={() => setTooltip(null)}
-                    >
-                      {/* Queue lead-in (faint) — only drawn when noticeable. */}
-                      {queueW >= 1 && (
+                    <g key={t}>
+                      <line
+                        x1={x}
+                        y1={HEADER_HEIGHT}
+                        x2={x}
+                        y2={svgHeight}
+                        stroke="currentColor"
+                        opacity={0.08}
+                        strokeDasharray="2 4"
+                      />
+                      <text
+                        x={x + 2}
+                        y={HEADER_HEIGHT - 6}
+                        fill="currentColor"
+                        opacity={0.55}
+                        fontSize={9}
+                        fontFamily="ui-monospace, SFMono-Regular, monospace"
+                      >
+                        {formatTickLabel(t)}
+                      </text>
+                    </g>
+                  );
+                })}
+
+                {/* Row separators */}
+                {rows.map((row, idx) => (
+                  <line
+                    key={`sep-${row.key}`}
+                    x1={0}
+                    y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                    x2={chartWidth}
+                    y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                    stroke="currentColor"
+                    opacity={0.04}
+                  />
+                ))}
+
+                {/* Request bars */}
+                {rows.map((row, rowIdx) => {
+                  const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+                  const barH = ROW_HEIGHT - 4;
+                  // For multi-stream subagent containers, suppress the union
+                  // bars when expanded — the child stream rows draw them
+                  // individually instead, so we'd double-draw otherwise.
+                  if (
+                    row.kind === 'subagent' &&
+                    (row.streamCount ?? 1) > 1 &&
+                    expandedSubagents.has(row.key)
+                  ) {
+                    return null;
+                  }
+                  return row.requests.map((req) => {
+                    const xCredit = xOf(req.credit);
+                    const xStart = xOf(req.start);
+                    const xEnd = xOf(req.end);
+                    // Cull bars entirely outside the visible window so big
+                    // benchmarks don't render thousands of zero-width rects.
+                    if (xEnd < -2 || xCredit > chartWidth + 2) return null;
+                    const runW = Math.max(xEnd - xStart, 1);
+                    const queueW = Math.max(xStart - xCredit, 0);
+                    const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+                    return (
+                      <g
+                        key={`${req.cid}-${req.ti}-${req.start}`}
+                        onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
+                        onMouseLeave={() => setTooltip(null)}
+                      >
+                        {/* Queue lead-in (faint) — only drawn when noticeable. */}
+                        {queueW >= 1 && (
+                          <rect
+                            x={xCredit}
+                            y={yTop + barH / 2 - 1}
+                            width={queueW}
+                            height={2}
+                            fill={row.color}
+                            opacity={0.35}
+                          />
+                        )}
+                        {/* Main bar — opacity stepped down with depth so
+                          parent > subagent > stream reads visually. */}
                         <rect
-                          x={xCredit}
-                          y={yTop + barH / 2 - 1}
-                          width={queueW}
-                          height={2}
+                          x={xStart}
+                          y={yTop}
+                          width={runW}
+                          height={barH}
+                          rx={2}
                           fill={row.color}
-                          opacity={0.35}
+                          opacity={
+                            req.cancelled
+                              ? 0.35
+                              : row.kind === 'stream'
+                                ? 0.5
+                                : row.kind === 'subagent'
+                                  ? 0.6
+                                  : 0.85
+                          }
                         />
-                      )}
-                      {/* Main bar — opacity stepped down with depth so
-                          parent > subagent > stream reads visually. */}
-                      <rect
-                        x={xStart}
-                        y={yTop}
-                        width={runW}
-                        height={barH}
-                        rx={2}
-                        fill={row.color}
-                        opacity={
-                          req.cancelled
-                            ? 0.35
-                            : row.kind === 'stream'
-                              ? 0.5
-                              : row.kind === 'subagent'
-                                ? 0.6
-                                : 0.85
-                        }
-                      />
-                      {/* Phase strip at bottom */}
-                      <rect
-                        x={xStart}
-                        y={yTop + barH - 2}
-                        width={runW}
-                        height={2}
-                        rx={1}
-                        fill={phaseColor}
-                        opacity={0.85}
-                      />
-                      {/* Cancelled X overlay */}
-                      {req.cancelled && runW > 6 && (
-                        <line
-                          x1={xStart + 1}
-                          y1={yTop + 1}
-                          x2={xStart + runW - 1}
-                          y2={yTop + barH - 1}
-                          stroke="currentColor"
-                          strokeWidth={0.7}
-                          opacity={0.6}
+                        {/* Phase strip at bottom */}
+                        <rect
+                          x={xStart}
+                          y={yTop + barH - 2}
+                          width={runW}
+                          height={2}
+                          rx={1}
+                          fill={phaseColor}
+                          opacity={0.85}
                         />
-                      )}
-                    </g>
-                  );
-                });
-              })}
-
-              {/* Cursor crosshair — drawn on top of bars so it stays visible
+                        {/* Cancelled X overlay */}
+                        {req.cancelled && runW > 6 && (
+                          <line
+                            x1={xStart + 1}
+                            y1={yTop + 1}
+                            x2={xStart + runW - 1}
+                            y2={yTop + barH - 1}
+                            stroke="currentColor"
+                            strokeWidth={0.7}
+                            opacity={0.6}
+                          />
+                        )}
+                      </g>
+                    );
+                  });
+                })}
+
+                {/* Cursor crosshair — drawn on top of bars so it stays visible
                   through dense rows. Stats popover is rendered as fixed
                   HTML below the SVG block. */}
-              {cursor && (
-                <line
-                  x1={cursor.xPx}
-                  x2={cursor.xPx}
-                  y1={0}
-                  y2={svgHeight}
-                  stroke="currentColor"
-                  strokeWidth={1}
-                  opacity={0.45}
-                  pointerEvents="none"
-                />
-              )}
-            </svg>
+                {cursor && (
+                  <line
+                    x1={cursor.xPx}
+                    x2={cursor.xPx}
+                    y1={0}
+                    y2={svgHeight}
+                    stroke="currentColor"
+                    strokeWidth={1}
+                    opacity={0.45}
+                    pointerEvents="none"
+                  />
+                )}
+              </svg>
+            </div>
           </div>
         </div>
       </div>

From 28d25a53b7e3543a3d91e9c19f05b2409c20c032 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:50:26 -0500
Subject: [PATCH 068/111] feat(agentic-timeline): sticky bottom h-scroll +
 double-click to reset zoom

The fixed-height window put the chart's horizontal scrollbar at the
bottom of the tall (full-height) content, below the fold and unreachable.
Make the window itself the single scroll container (overflow-auto, both
axes) and pin the label column with position:sticky left-0, so the
horizontal scrollbar stays at the window's bottom edge while the label
column stays put during horizontal scroll and scrolls with the rows
vertically.

Also add double-click anywhere on the timeline to reset zoom/pan (same
resetZoom the existing button calls) and note it in the hint text.

Verified live: window scrollW 1280 > clientW 879 (h-scroll present and
working), label column sticky, rows scroll vertically.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 2313775e..7c5fdab0 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -626,13 +626,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
 
       {/* Chart container */}
       <div className="rounded-md border border-border/60 bg-card overflow-hidden">
-        {/* Fixed-height window: the rows scroll vertically inside it instead of
-            the card growing to fit every conversation/worker. */}
-        <div className="overflow-y-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
-          <div className="flex">
-            {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
+        {/* Fixed-height window: rows scroll vertically and the chart scrolls
+            horizontally inside it, so the card doesn't grow to fit every
+            conversation/worker AND the horizontal scrollbar stays pinned to the
+            window's bottom edge (rather than the bottom of the tall content). */}
+        <div className="overflow-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
+          <div className="flex w-max">
+            {/* Label column — pinned left (sticky) so it stays put during
+                horizontal scroll, while scrolling vertically with the rows. */}
             <div
-              className="flex-shrink-0 border-r border-border/60 bg-card/80"
+              className="sticky left-0 z-10 flex-shrink-0 border-r border-border/60 bg-card"
               style={{ width: LABEL_WIDTH }}
             >
               <div
@@ -697,8 +700,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
               })}
             </div>
 
-            {/* Scrollable SVG */}
-            <div className="flex-1 overflow-x-auto">
+            {/* Chart column — horizontal scrolling is handled by the window
+                container above so its scrollbar stays pinned to the window's
+                bottom edge; double-click anywhere resets the zoom. */}
+            <div className="flex-shrink-0">
               <svg
                 width={chartWidth}
                 height={svgHeight}
@@ -709,6 +714,7 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                 onMouseMove={handleMouseMove}
                 onMouseUp={handleMouseUp}
                 onMouseLeave={handleMouseLeave}
+                onDoubleClick={resetZoom}
               >
                 {/* Header / time-axis baseline */}
                 <line
@@ -885,7 +891,9 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
           <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#94a3b8' }} />
           warmup
         </span>
-        <span className="ml-auto opacity-70">scroll to zoom · drag to pan</span>
+        <span className="ml-auto opacity-70">
+          scroll to zoom · drag to pan · double-click to reset
+        </span>
       </div>
 
       {/* Cursor stats popover: count of in-flight / waiting at the cursor's

From 6e56bbfb2a29c6ffad2e4d4484bfcb6673fdacfd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 09:29:18 -0500
Subject: [PATCH 069/111] fix(gpu-compare): show CPU-offload halo on points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dashed offload-mode ring (drawn in ScatterGraph's onRender for every
point with offload_mode='on') was missing from GPU compare mode
(GPUGraph), so the CPU-offloading indicator never appeared there. Mirror
it in GPUGraph's onRender — same dashed var(--foreground) ring at
POINT_SIZE+4, appended inside each .dot-group so it travels with the
point on zoom/pan.

Verified live in compare mode (DSv4 B200/B300 agentic): offload points
now render the dashed halo (5 rings, r=7.5, dash 3 2).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/components/inference/ui/GPUGraph.tsx  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index 24b1266f..19ba574f 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -26,6 +26,7 @@ import {
   formatLargeNumber,
   getShapeKeyForPrecision,
   logTickFormat,
+  POINT_SIZE,
 } from '@/lib/chart-rendering';
 import {
   paretoFrontLowerLeft,
@@ -827,6 +828,28 @@ const GPUGraph = React.memo(
           }
           // Set foreground color on scatter point labels
           ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)');
+
+          // Offload halo: dashed ring on every point that used KV offload
+          // (mirrors ScatterGraph so compare mode shows the same CPU-offload
+          // indicator). The ring is a child of the dot-group, so it travels
+          // with the point on zoom/pan without a separate onZoom pass.
+          ctx.layout.zoomGroup
+            .selectAll<SVGGElement, InferenceData>('.dot-group')
+            .each(function (d) {
+              const showHalo = d.offload_mode === 'on';
+              d3.select(this)
+                .selectAll<SVGCircleElement, boolean>('.offload-halo')
+                .data(showHalo ? [true] : [])
+                .join('circle')
+                .attr('class', 'offload-halo')
+                .attr('r', POINT_SIZE + 4)
+                .attr('fill', 'none')
+                .attr('stroke', 'var(--foreground)')
+                .attr('stroke-width', 1.5)
+                .attr('stroke-dasharray', '3 2')
+                .attr('opacity', 0.9)
+                .attr('pointer-events', 'none');
+            });
         }}
         legendElement={
           <ChartLegend

From 2c060090278d660f1ad59e01646f5cdf0950e7d4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 12:56:08 -0500
Subject: [PATCH 070/111] fix(high-contrast): use full hue wheel for
 single-vendor comparisons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

generateHighContrastColors clamps each vendor's series into its brand hue
zone (NVIDIA=green, AMD=red) at <=PREFERRED_MAX items. The point of that
clamp is to keep DIFFERENT vendors apart at a glance — but when only one
vendor is present (the common all-NVIDIA agentic comparison: B200/B300 x
vLLM/SGLang), there's no rival to separate from, so every series collapses
into the same narrow green band and high-contrast mode looks like it does
nothing.

When a single vendor is present, skip the brand zone and rival-ban and use
the full hue wheel for maximum separation. Verified on an all-NVIDIA
agentic view: HC now spreads pink/blue/gold/green (hues 45/99/227/330,
min adjacent gap 54deg) instead of four near-identical greens. Multi-vendor
behavior is unchanged — vendors keep their brand zones so they stay
distinguishable. The non-HC palette still carries vendor identity.

Updated the single-vendor color tests to assert separability across the
full wheel rather than brand-zone confinement.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/app/src/lib/chart-utils.test.ts | 39 ++++++++++--------------
 packages/app/src/lib/chart-utils.ts      | 19 ++++++++++--
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts
index 061037ed..f6828ce2 100644
--- a/packages/app/src/lib/chart-utils.test.ts
+++ b/packages/app/src/lib/chart-utils.test.ts
@@ -353,30 +353,29 @@ describe('generateHighContrastColors', () => {
     expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(','));
   });
 
-  // ---------- Tier 1: few items → brand zone ----------
-
-  it('3 NVIDIA GPUs are not red', () => {
+  // ---------- Single vendor: full wheel for maximum contrast ----------
+  // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the
+  // vendors stay visually separable). With a single vendor there's no rival to
+  // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed
+  // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case
+  // where every series otherwise collapsed into the green brand band).
+
+  it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => {
     const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotReddish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(3);
     assertMinDist(result, 30);
   });
 
-  it('2 AMD GPUs are not green', () => {
+  it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => {
     const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotGreenish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(2);
     assertMinDist(result, 30);
   });
 
-  it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => {
+  it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => {
     const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm'];
     const result = generateHighContrastColors(keys, 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotReddish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(4);
     assertMinDist(result, 25);
   });
 
@@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => {
     assertMinDist(result, 25);
   });
 
-  // ---------- Tier 2: moderate items → full wheel minus rival color ----------
+  // ---------- Single vendor, many items → full wheel, best spacing ----------
 
-  it('10 NVIDIA GPUs: no red hues, still distinguishable', () => {
+  it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => {
     const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200'];
     const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]);
     const result = generateHighContrastColors(keys, 'dark');
-    // Should not be reddish (banned)
-    for (const color of Object.values(result)) {
-      const rgb = parseRgb(color);
-      // Not red-dominant with low green — i.e. not in the red/pink zone
-      const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150;
-      expect(isRedPink).toBe(false);
-    }
+    expect(Object.keys(result)).toHaveLength(10);
     assertMinDist(result, 20);
   });
 
diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts
index 33a5b4e3..3eeda15b 100644
--- a/packages/app/src/lib/chart-utils.ts
+++ b/packages/app/src/lib/chart-utils.ts
@@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map<string, string[]>();
 /**
  * Generates high-contrast colors using iwanthue (k-means in CIELab space).
  *
- * Tiered strategy per vendor:
+ * Tiered strategy per vendor (only when >1 vendor is present):
  *   ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red)
  *   ≤ BAN_MAX       → full wheel minus rival's brand color
  *   > BAN_MAX       → full wheel, no restrictions, best spacing wins
+ *
+ * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 ×
+ * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a
+ * glance, but with one vendor there's no rival — clamping every series into the
+ * same narrow hue band just collapses the contrast HC is supposed to maximize.
+ * So skip both restrictions and use the full wheel, giving the series the widest
+ * possible separation.
  */
 export const generateHighContrastColors = (
   keys: string[],
@@ -91,6 +98,12 @@ export const generateHighContrastColors = (
     list.push(key);
   }
 
+  // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a
+  // single vendor present there's nothing to separate from, so those
+  // restrictions only shrink the usable hue range and kill contrast — open the
+  // full wheel instead (the common all-NVIDIA agentic comparison case).
+  const multiVendor = groups.size > 1;
+
   for (const [vendor, vendorKeys] of groups) {
     const count = vendorKeys.length;
     const isBanned = BANNED_HUE_TEST[vendor] ?? null;
@@ -99,8 +112,8 @@ export const generateHighContrastColors = (
     // Tier 1: few items → brand zone only
     // Tier 2: moderate  → full wheel minus rival color
     // Tier 3: many      → full wheel, no restrictions
-    const usePreferred = preferred && count <= PREFERRED_MAX;
-    const useBan = !usePreferred && isBanned && count <= BAN_MAX;
+    const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX;
+    const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX;
 
     // Everything iwanthue's output depends on (the ban filter and preferred
     // zone are functions of vendor; the seed is vendor+theme).

From 6275aa70bf0162cd83762ff79a2e0a5c053270e2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 10:17:42 -0500
Subject: [PATCH 071/111] feat(inference): default line labels off, parallelism
 labels + high contrast on

Change the inference chart's default toggle states:
- Line Labels: on -> off  (i_linelabel=1 overrides on)
- Parallelism Labels: off -> on, which also defaults point labels on since
  parallelism labels ARE point labels (i_advlabel=0 overrides off)
- High Contrast: off -> on, via a new opt-in defaultHighContrast on
  useChartUIState so reliability/evaluation (r_/e_ prefixes) stay off;
  i_hc=0 overrides off. Historical trends shares the inference context so
  it inherits the high-contrast default too.

URL serialization flipped to omit each param at its new default and only
write the override value, so share links stay clean. Updated line-labels,
gradient-labels, and url-params E2E specs to the new defaults.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../app/cypress/e2e/gradient-labels.cy.ts     | 16 +++++-----
 packages/app/cypress/e2e/line-labels.cy.ts    | 31 ++++++++++++-------
 packages/app/cypress/e2e/url-params.cy.ts     | 14 +++++++--
 .../components/inference/InferenceContext.tsx | 25 ++++++++-------
 packages/app/src/hooks/useChartContext.ts     | 12 +++++--
 5 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts
index 333baa6d..a0753e90 100644
--- a/packages/app/cypress/e2e/gradient-labels.cy.ts
+++ b/packages/app/cypress/e2e/gradient-labels.cy.ts
@@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => {
     cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels');
   });
 
-  it('Parallelism Labels toggle is off by default', () => {
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked');
+  it('Parallelism Labels toggle is on by default', () => {
+    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
   });
 
   it('per-point labels are visible by default (gradient labels off)', () => {
@@ -60,21 +60,19 @@ describe('Gradient Labels Toggle', () => {
   });
 
   it('both toggles can be enabled simultaneously', () => {
-    // Turn on Gradient Labels (off by default)
+    // Parallelism Labels is on by default; ensure it's on, then turn on Gradient.
+    cy.get('#scatter-parallelism-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     cy.get('#scatter-gradient-labels').click();
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
 
-    // Turn on Parallelism Labels
-    cy.get('#scatter-parallelism-labels').click();
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
-
     // Both should be checked
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
     cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
 
-    // Reset for next tests
+    // Reset gradient for next tests (parallelism stays at its default-on).
     cy.get('#scatter-gradient-labels').click();
-    cy.get('#scatter-parallelism-labels').click();
   });
 
   it('URL param i_gradlabel=1 enables gradient labels on load', () => {
diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts
index 84e655f8..23b372df 100644
--- a/packages/app/cypress/e2e/line-labels.cy.ts
+++ b/packages/app/cypress/e2e/line-labels.cy.ts
@@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => {
     cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels');
   });
 
-  it('Line Labels toggle is on by default', () => {
-    cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
-
-    // Line labels render without any interaction
-    cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
-  });
-
-  it('toggling Line Labels off then back on removes and restores label elements', () => {
-    // On by default — turn it off first.
-    cy.get('#scatter-line-labels').click();
+  it('Line Labels toggle is off by default', () => {
     cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+
+    // No line labels render without interaction
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
+  });
 
-    // Turn it back on — labels return.
+  it('toggling Line Labels on then back off adds and removes label elements', () => {
+    // Off by default — turn it on first.
     cy.get('#scatter-line-labels').click();
     cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
+
+    // Turn it back off — labels disappear.
+    cy.get('#scatter-line-labels').click();
+    cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+    cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
   });
 
   it('line labels have colored background rects and text', () => {
+    // Off by default — ensure on (idempotent; prior test left them off).
+    cy.get('#scatter-line-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     // Each line label group should contain a background rect and text
     cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should(
       'have.length.greaterThan',
@@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => {
   });
 
   it('line labels render in the foreground, after the scatter points', () => {
-    // Labels were toggled on in the test above and remain on here.
+    // Off by default — ensure on (idempotent; previous test leaves them on).
+    cy.get('#scatter-line-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
 
     cy.get('[data-testid="scatter-graph"] svg').then(($svg) => {
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 33282b9c..3c480686 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -236,9 +236,15 @@ describe('URL Parameter Persistence', () => {
   });
 
   describe('High contrast mode', () => {
-    it('page loads without high contrast by default', () => {
+    it('inference loads with high contrast on by default', () => {
       visitWithDismissedModal('/inference');
       cy.get('[data-testid="scatter-graph"]').should('exist');
+      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked');
+    });
+
+    it('i_hc=0 disables high contrast on load', () => {
+      visitWithDismissedModal('/inference?i_hc=0');
+      cy.get('[data-testid="scatter-graph"]').should('exist');
       cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
     });
 
@@ -267,10 +273,12 @@ describe('URL Parameter Persistence', () => {
       cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
-    it('historical trends tab has high contrast switch off by default', () => {
+    it('historical trends tab shares the inference high-contrast default (on)', () => {
+      // Historical reads highContrast from the same InferenceContext as the
+      // scatter chart, so it inherits the default-on behavior.
       visitWithDismissedModal('/historical');
       cy.get('[data-testid="historical-trends-display"]').should('exist');
-      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
+      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
     it('i_hc=1 enables historical trends high contrast', () => {
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index d66febd0..c2c599ff 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -195,6 +195,8 @@ export function InferenceProvider({
   );
   const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({
     urlPrefix: 'i_',
+    // Inference chart defaults to high contrast (?i_hc=0 overrides off).
+    defaultHighContrast: true,
   });
 
   const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0');
@@ -202,21 +204,22 @@ export function InferenceProvider({
     // Legacy `?i_nolabel=1` from before the rename: keep hiding point labels
     // explicitly so the share link's intent survives future default changes.
     if (getUrlParam('i_nolabel') === '1') return false;
+    if (getUrlParam('i_label') === '0') return false;
     if (getUrlParam('i_label') === '1') return true;
-    // Old share links set `?i_advlabel=1` while keeping the labels default
-    // (shown). Mirror the toggle's auto-enable side-effect on load so those
-    // links still render advanced labels under the new default-off behavior.
-    if (getUrlParam('i_advlabel') === '1') return true;
-    return false;
+    // Default on: parallelism labels (also default on) are point labels and
+    // are pointless without them shown.
+    return true;
   });
   const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1');
+  // Parallelism labels default on (?i_advlabel=0 overrides off).
   const [useAdvancedLabels, setUseAdvancedLabels] = useState(
-    () => getUrlParam('i_advlabel') === '1',
+    () => getUrlParam('i_advlabel') !== '0',
   );
   const [showGradientLabels, setShowGradientLabels] = useState(
     () => getUrlParam('i_gradlabel') === '1',
   );
-  const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0');
+  // Line labels default off (?i_linelabel=1 overrides on).
+  const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1');
   const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1');
   const [showMinecraftOverlay, setShowMinecraftOverlay] = useState(
     () => getUrlParam('i_mc') === '1',
@@ -983,17 +986,17 @@ export function InferenceProvider({
       i_dstart: selectedDateRange.startDate,
       i_dend: selectedDateRange.endDate,
       i_optimal: hideNonOptimal ? '' : '0',
-      i_label: showPointLabels ? '1' : '',
-      i_hc: highContrast ? '1' : '',
+      i_label: showPointLabels ? '' : '0',
+      i_hc: highContrast ? '' : '0',
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
       i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
-      i_advlabel: useAdvancedLabels ? '1' : '',
+      i_advlabel: useAdvancedLabels ? '' : '0',
       i_gradlabel: showGradientLabels ? '1' : '',
-      i_linelabel: showLineLabels ? '' : '0',
+      i_linelabel: showLineLabels ? '1' : '',
       i_speed: showSpeedOverlay ? '1' : '',
       i_mc: showMinecraftOverlay ? '1' : '',
       i_active: iActiveStr,
diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts
index 49812c3e..be095430 100644
--- a/packages/app/src/hooks/useChartContext.ts
+++ b/packages/app/src/hooks/useChartContext.ts
@@ -37,6 +37,12 @@ export function reconcileActiveSet<T>(
 interface UseChartStateConfig {
   /** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */
   urlPrefix: string;
+  /**
+   * Initial high-contrast value when the URL has no `<prefix>hc` param.
+   * Defaults to false; the inference chart opts in to true. A `<prefix>hc=0`
+   * URL param overrides it back off.
+   */
+  defaultHighContrast?: boolean;
 }
 
 /**
@@ -44,7 +50,7 @@ interface UseChartStateConfig {
  * Includes mobile-specific legend collapse behavior.
  */
 export function useChartUIState(config: UseChartStateConfig) {
-  const { urlPrefix } = config;
+  const { urlPrefix, defaultHighContrast = false } = config;
   const { getUrlParam } = useUrlState();
 
   const hcParam = `${urlPrefix}hc` as any;
@@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) {
 
   // Initialize with safe defaults that match SSR output to avoid hydration mismatches.
   // URL-param values are applied in a mount effect so the state is only set client-side.
-  const [highContrast, setHighContrast] = useState(false);
+  const [highContrast, setHighContrast] = useState(defaultHighContrast);
   const [isLegendExpanded, setIsLegendExpanded] = useState(true);
   const didInit = useRef(false);
 
@@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) {
     if (didInit.current) return;
     didInit.current = true;
     const hcVal = getUrlParam(hcParam);
+    // Respect both overrides so the toggle round-trips regardless of the default.
     if (hcVal === '1') setHighContrast(true);
+    else if (hcVal === '0') setHighContrast(false);
     const legendVal = getUrlParam(legendParam);
     if (legendVal === '0') setIsLegendExpanded(false);
   }, [getUrlParam, hcParam, legendParam]);

From 5c290a49f50d7a0834a544d3e837bc1d1ccad5de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 14:30:44 -0500
Subject: [PATCH 072/111] feat(agentic): use the chart's TP/EP/DEP/TEP
 parallelism labels on sibling chips
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agentic detail page's sibling navigator labeled configs with an ad-hoc
`TP{n}EP{n}` / `{p}P+{d}D` scheme that ignored dp-attention and the
TEP/DEP collapse, so a DEP4 config read as plain TP4EP4 (and, mid-deploy
before the API carried dp_attention, as TEP4).

Extract the scatter chart's labeler into a shared parallelism-label module
(configSegmentLabel + parallelismLabel) and route both getPointLabel and the
sibling chipLabel through it, so the two surfaces describe a config
identically (TP/EP/TEP/DEP/DPA…, multinode-disagg worker segments).

Carry the fields the labeler needs through the siblings query/API/hook:
decode/prefill dp_attention + num_workers + is_multinode.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../inference/agentic-point/sibling-nav.tsx   | 20 ++++-
 .../inference/utils/parallelism-label.test.ts | 58 ++++++++++++++
 .../inference/utils/parallelism-label.ts      | 79 +++++++++++++++++++
 .../inference/utils/tooltipUtils.ts           | 69 ++++++----------
 .../src/hooks/api/use-benchmark-siblings.ts   |  5 ++
 packages/db/src/queries/benchmark-siblings.ts | 20 ++++-
 6 files changed, 202 insertions(+), 49 deletions(-)
 create mode 100644 packages/app/src/components/inference/utils/parallelism-label.test.ts
 create mode 100644 packages/app/src/components/inference/utils/parallelism-label.ts

diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index aa727fdc..f92d6b63 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -4,6 +4,7 @@ import { useRouter } from 'next/navigation';
 import { ChevronLeft, ChevronRight } from 'lucide-react';
 
 import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
 
 const HW_LABELS: Record<string, string> = {
   b200: 'B200',
@@ -49,9 +50,22 @@ function frameworkLabel(fw: string) {
 
 /** Short label for a sibling chip: parallelism + concurrency. */
 export function chipLabel(s: BenchmarkSibling): string {
-  const parallel = s.disagg
-    ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D`
-    : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`;
+  // Same parallelism labeler the chart points use (TP/EP/TEP/DEP/DPA…).
+  const parallel = parallelismLabel({
+    tp: s.decode_tp,
+    ep: s.decode_ep,
+    dpAttention: s.decode_dp_attention,
+    disagg: s.disagg,
+    isMultinode: s.is_multinode,
+    prefillTp: s.prefill_tp,
+    prefillEp: s.prefill_ep,
+    prefillDpAttention: s.prefill_dp_attention,
+    prefillNumWorkers: s.prefill_num_workers,
+    decodeTp: s.decode_tp,
+    decodeEp: s.decode_ep,
+    decodeDpAttention: s.decode_dp_attention,
+    decodeNumWorkers: s.decode_num_workers,
+  });
   const offload = s.offload_mode === 'on' ? ' • off=ON' : '';
   return `${parallel} • c=${s.conc}${offload}`;
 }
diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts
new file mode 100644
index 00000000..aaf715d3
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+
+import { configSegmentLabel, parallelismLabel } from './parallelism-label';
+
+describe('configSegmentLabel', () => {
+  it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => {
+    expect(configSegmentLabel(8, 8, false)).toBe('TEP8');
+    expect(configSegmentLabel(8, 8, true)).toBe('DEP8');
+  });
+
+  it('uses EP / DPAEP when ep>1 and tp!==ep', () => {
+    expect(configSegmentLabel(4, 16, false)).toBe('EP16');
+    expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16');
+  });
+
+  it('uses TP / DPATP when ep<=1 or absent', () => {
+    expect(configSegmentLabel(8, 1, false)).toBe('TP8');
+    expect(configSegmentLabel(8, undefined, false)).toBe('TP8');
+    expect(configSegmentLabel(8, 1, true)).toBe('DPATP8');
+  });
+});
+
+describe('parallelismLabel', () => {
+  it('falls back to bare tp when no ep data', () => {
+    expect(parallelismLabel({ tp: 8 })).toBe('8');
+  });
+
+  it('labels a single-segment config', () => {
+    expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8');
+    expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8');
+  });
+
+  it('builds multinode-disagg per-role worker segments', () => {
+    expect(
+      parallelismLabel({
+        tp: 8,
+        ep: 4,
+        disagg: true,
+        isMultinode: true,
+        prefillTp: 4,
+        prefillEp: 4,
+        prefillDpAttention: false,
+        prefillNumWorkers: 2,
+        decodeTp: 8,
+        decodeEp: 8,
+        decodeDpAttention: true,
+        decodeNumWorkers: 1,
+      }),
+    ).toBe('2xTEP4+1xDEP8');
+  });
+
+  it('single-node disagg uses the single (decode) segment, not worker syntax', () => {
+    // is_multinode false → no "NxPrefill+MxDecode" expansion.
+    expect(
+      parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }),
+    ).toBe('TEP8');
+  });
+});
diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts
new file mode 100644
index 00000000..98207110
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.ts
@@ -0,0 +1,79 @@
+/**
+ * Shared parallelism-config labeling — the single source of truth for the
+ * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels.
+ *
+ * Used by the scatter/GPU chart point labels (via getPointLabel) and the
+ * agentic detail page's sibling navigator chips, so both surfaces describe a
+ * config identically.
+ */
+
+/**
+ * Generates a short config segment label from parallelism params.
+ * - tp == ep (with ep > 1) and dp-attn false: "TEP{N}"
+ * - tp == ep (with ep > 1) and dp-attn true: "DEP{N}"
+ * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
+ * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
+ */
+export const configSegmentLabel = (
+  tp: number,
+  ep: number | undefined,
+  dpAttention: boolean | undefined,
+): string => {
+  if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
+    return dpAttention ? `DEP${tp}` : `TEP${tp}`;
+  }
+  const dpaPrefix = dpAttention ? 'DPA' : '';
+  if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
+  return `${dpaPrefix}EP${ep}`;
+};
+
+/** Parallelism params for one benchmark config, framework-agnostic. */
+export interface ParallelismFields {
+  tp: number;
+  ep?: number;
+  dpAttention?: boolean;
+  disagg?: boolean;
+  isMultinode?: boolean;
+  prefillTp?: number;
+  prefillEp?: number;
+  prefillDpAttention?: boolean;
+  prefillNumWorkers?: number;
+  decodeTp?: number;
+  decodeEp?: number;
+  decodeDpAttention?: boolean;
+  decodeNumWorkers?: number;
+}
+
+/**
+ * Returns the short parallelism label for a config.
+ * - Neither ep nor prefillEp set (old rows): falls back to the bare tp value (e.g. "8").
+ * - Multinode disagg: per-role segments with worker counts,
+ *   e.g. "2xEP4+1xDPAEP32".
+ * - Otherwise: a single segment from (tp, ep, dpAttention).
+ */
+export const parallelismLabel = (f: ParallelismFields): string => {
+  if (
+    (f.ep === null || f.ep === undefined) &&
+    (f.prefillEp === null || f.prefillEp === undefined)
+  ) {
+    return String(f.tp);
+  }
+
+  if (f.isMultinode && f.disagg) {
+    const prefillLabel = configSegmentLabel(
+      f.prefillTp ?? f.tp,
+      f.prefillEp ?? f.ep,
+      f.prefillDpAttention ?? f.dpAttention,
+    );
+    const decodeLabel = configSegmentLabel(
+      f.decodeTp ?? f.tp,
+      f.decodeEp ?? f.ep,
+      f.decodeDpAttention ?? f.dpAttention,
+    );
+    const pw = f.prefillNumWorkers ?? 1;
+    const dw = f.decodeNumWorkers ?? 1;
+    return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
+  }
+
+  return configSegmentLabel(f.tp, f.ep, f.dpAttention);
+};
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 14d3b553..ea039336 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -1,6 +1,7 @@
 import { formatNumber, getDisplayLabel } from '@/lib/utils';
 
 import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
 
 export interface TooltipConfig {
   /** The data point to display */
@@ -34,57 +35,37 @@ export interface OverlayTooltipConfig extends TooltipConfig {
   overlayData: OverlayData;
 }
 
-/**
- * Generates a short config segment label from parallelism params.
- * - tp == ep and dp-attn false: "TEP{N}"
- * - tp == ep and dp-attn true: "DEP{N}"
- * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
- * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
- */
-const configSegmentLabel = (
-  tp: number,
-  ep: number | undefined,
-  dpAttention: boolean | undefined,
-): string => {
-  if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
-    return dpAttention ? `DEP${tp}` : `TEP${tp}`;
-  }
-  const dpaPrefix = dpAttention ? 'DPA' : '';
-  if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
-  return `${dpaPrefix}EP${ep}`;
-};
+// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the
+// transform narrows "true"/"false" → boolean). Coerce to a plain boolean for
+// the shared labeler: only the exact string "true" is true, so "false" is not truthy.
+const asBool = (v: boolean | string | undefined): boolean | undefined =>
+  typeof v === 'string' ? v === 'true' : v;
 
 /**
  * Returns the short label for a data point on the chart.
  * - Non-multinode: e.g. "TP8", "EP8", "TEP8", "DEP8", "DPAEP8"
  * - Multinode disagg: e.g. "2xEP4+1xDPAEP32"
  * - Old data (no ep field): falls back to tp value
+ *
+ * Delegates to the shared {@link parallelismLabel} so the chart points and the
+ * agentic sibling navigator describe a config identically.
  */
-export const getPointLabel = (d: InferenceData): string => {
-  if (
-    (d.ep === null || d.ep === undefined) &&
-    (d.prefill_ep === null || d.prefill_ep === undefined)
-  )
-    return String(d.tp);
-
-  if (d.is_multinode && d.disagg) {
-    const prefillLabel = configSegmentLabel(
-      d.prefill_tp ?? d.tp,
-      d.prefill_ep ?? d.ep,
-      d.prefill_dp_attention ?? d.dp_attention,
-    );
-    const decodeLabel = configSegmentLabel(
-      d.decode_tp ?? d.tp,
-      d.decode_ep ?? d.ep,
-      d.decode_dp_attention ?? d.dp_attention,
-    );
-    const pw = d.prefill_num_workers ?? 1;
-    const dw = d.decode_num_workers ?? 1;
-    return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
-  }
-
-  return configSegmentLabel(d.tp, d.ep, d.dp_attention);
-};
+export const getPointLabel = (d: InferenceData): string =>
+  parallelismLabel({
+    tp: d.tp,
+    ep: d.ep,
+    dpAttention: asBool(d.dp_attention),
+    disagg: d.disagg,
+    isMultinode: d.is_multinode,
+    prefillTp: d.prefill_tp,
+    prefillEp: d.prefill_ep,
+    prefillDpAttention: asBool(d.prefill_dp_attention),
+    prefillNumWorkers: d.prefill_num_workers,
+    decodeTp: d.decode_tp,
+    decodeEp: d.decode_ep,
+    decodeDpAttention: asBool(d.decode_dp_attention),
+    decodeNumWorkers: d.decode_num_workers,
+  });
 
 const runLinkHTML = (runUrl?: string) =>
   runUrl
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
index 1ea90c0d..e6bc4906 100644
--- a/packages/app/src/hooks/api/use-benchmark-siblings.ts
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -6,11 +6,16 @@ export interface BenchmarkSibling {
   offload_mode: string | null;
   decode_tp: number;
   decode_ep: number;
+  decode_dp_attention: boolean;
+  decode_num_workers: number;
   prefill_tp: number;
   prefill_ep: number;
+  prefill_dp_attention: boolean;
+  prefill_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
   disagg: boolean;
+  is_multinode: boolean;
   is_current: boolean;
   has_trace: boolean;
 }
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
index 245a1170..241a48ba 100644
--- a/packages/db/src/queries/benchmark-siblings.ts
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -14,11 +14,16 @@ export interface BenchmarkSibling {
   offload_mode: string | null;
   decode_tp: number;
   decode_ep: number;
+  decode_dp_attention: boolean;
+  decode_num_workers: number;
   prefill_tp: number;
   prefill_ep: number;
+  prefill_dp_attention: boolean;
+  prefill_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
   disagg: boolean;
+  is_multinode: boolean;
   /** True if this row IS the point passed in. */
   is_current: boolean;
   /** Whether the row has a stored trace_replay blob (for navigation hint). */
@@ -74,8 +79,9 @@ export async function getBenchmarkSiblings(
   const rows = (await sql`
     select
       br.id, br.conc, br.offload_mode,
-      c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep,
-      c.num_prefill_gpu, c.num_decode_gpu, c.disagg,
+      c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers,
+      c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode,
       (br.trace_replay_id is not null) as has_trace
     from benchmark_results br
     join configs c on c.id = br.config_id
@@ -93,11 +99,16 @@ export async function getBenchmarkSiblings(
     offload_mode: string | null;
     decode_tp: number;
     decode_ep: number;
+    decode_dp_attention: boolean;
+    decode_num_workers: number;
     prefill_tp: number;
     prefill_ep: number;
+    prefill_dp_attention: boolean;
+    prefill_num_workers: number;
     num_prefill_gpu: number;
     num_decode_gpu: number;
     disagg: boolean;
+    is_multinode: boolean;
     has_trace: boolean;
   }[];
 
@@ -107,11 +118,16 @@ export async function getBenchmarkSiblings(
     offload_mode: r.offload_mode,
     decode_tp: r.decode_tp,
     decode_ep: r.decode_ep,
+    decode_dp_attention: r.decode_dp_attention,
+    decode_num_workers: r.decode_num_workers,
     prefill_tp: r.prefill_tp,
     prefill_ep: r.prefill_ep,
+    prefill_dp_attention: r.prefill_dp_attention,
+    prefill_num_workers: r.prefill_num_workers,
     num_prefill_gpu: r.num_prefill_gpu,
     num_decode_gpu: r.num_decode_gpu,
     disagg: r.disagg,
+    is_multinode: r.is_multinode,
     is_current: Number(r.id) === benchmarkResultId,
     has_trace: r.has_trace,
   }));

From 32adf6bec66f41ffe2cfa4f08251afcb333c007d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 14:53:17 -0500
Subject: [PATCH 073/111] feat(agentic): sort dropdown for the sibling point
 navigator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a 'Sort by' dropdown to the agentic detail page's point navigator:
- Default (DB order)
- Concurrency ↑
- Parallelism (groups all TP, then TEP/DEP/EP… by ep→tp→dpa→disagg; offload off→on, then conc within)
- Throughput/GPU ↓
- Total requests ↓

Carry tput_per_gpu and total_requests (total_requests_completed, falling
back to legacy num_requests_total) through the siblings query/API/hook.

prev/next follow the sorted order, and the chosen sort is persisted in the
URL (?sort=) — read on mount and threaded through every point link plus a
router.replace — so navigating to another point no longer resets it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../inference/agentic-point/sibling-nav.tsx   | 131 ++++++++++++++++--
 .../src/hooks/api/use-benchmark-siblings.ts   |   2 +
 packages/db/src/queries/benchmark-siblings.ts |  16 +++
 3 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index f92d6b63..a1a5d1ab 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -1,10 +1,19 @@
 'use client';
 
+import { useMemo, useState } from 'react';
 import { useRouter } from 'next/navigation';
 import { ChevronLeft, ChevronRight } from 'lucide-react';
 
 import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
 import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
 
 const HW_LABELS: Record<string, string> = {
   b200: 'B200',
@@ -70,12 +79,83 @@ export function chipLabel(s: BenchmarkSibling): string {
   return `${parallel} • c=${s.conc}${offload}`;
 }
 
+type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests';
+
+const SORT_OPTIONS: { value: SortMode; label: string }[] = [
+  { value: 'default', label: 'Default' },
+  { value: 'conc', label: 'Concurrency ↑' },
+  { value: 'parallelism', label: 'Parallelism' },
+  { value: 'tput', label: 'Throughput/GPU ↓' },
+  { value: 'requests', label: 'Total requests ↓' },
+];
+
+// Group key for the "parallelism" sort, from the decode-side config only (prefill
+// fields are ignored): ep first (so TP/EP1 sorts ahead of EP/TEP/DEP groups), then
+// tp, dp-attention, disagg. sortSiblings breaks ties by offload, then concurrency.
+const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => [
+  s.decode_ep ?? 0,
+  s.decode_tp ?? 0,
+  s.decode_dp_attention ? 1 : 0,
+  s.disagg ? 1 : 0,
+];
+
+function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] {
+  if (mode === 'default') return siblings;
+  const out = [...siblings];
+  if (mode === 'conc') {
+    out.sort((a, b) => a.conc - b.conc);
+  } else if (mode === 'tput') {
+    // Highest throughput/GPU first; rows missing the metric sink to the end.
+    out.sort((a, b) => (b.tput_per_gpu ?? -Infinity) - (a.tput_per_gpu ?? -Infinity));
+  } else if (mode === 'requests') {
+    // Most total requests first; rows missing the metric sink to the end.
+    out.sort((a, b) => (b.total_requests ?? -Infinity) - (a.total_requests ?? -Infinity));
+  } else {
+    out.sort((a, b) => {
+      const ra = parallelRank(a);
+      const rb = parallelRank(b);
+      for (let i = 0; i < ra.length; i++) {
+        if (ra[i] !== rb[i]) return ra[i] - rb[i];
+      }
+      // Within a parallelism group: offload off before on, then concurrency.
+      const oa = a.offload_mode === 'on' ? 1 : 0;
+      const ob = b.offload_mode === 'on' ? 1 : 0;
+      return oa - ob || a.conc - b.conc;
+    });
+  }
+  return out;
+}
+
+const isSortMode = (v: string | null): v is SortMode =>
+  v !== null && SORT_OPTIONS.some((o) => o.value === v);
+
 export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) {
   const router = useRouter();
-  const currentIdx = siblings.findIndex((s) => s.is_current);
-  const prev = currentIdx > 0 ? siblings[currentIdx - 1] : null;
-  const next =
-    currentIdx !== -1 && currentIdx < siblings.length - 1 ? siblings[currentIdx + 1] : null;
+  // Persist the sort in the URL so clicking a point (which remounts this
+  // component on the new route) keeps the chosen order instead of resetting.
+  // Read it once from the URL on mount — this component only renders after the
+  // client-side siblings query resolves, so `window` is always available here
+  // (no SSR/hydration mismatch). Matches the app's window-based url-state read.
+  const [sortMode, setSortMode] = useState<SortMode>(() => {
+    if (typeof window === 'undefined') return 'default';
+    const v = new URLSearchParams(window.location.search).get('sort');
+    return isSortMode(v) ? v : 'default';
+  });
+
+  const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]);
+
+  // prev/next follow the displayed (sorted) order so navigation matches the row.
+  const currentIdx = sorted.findIndex((s) => s.is_current);
+  const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null;
+  const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null;
+
+  // Carry the active sort through every point-to-point link.
+  const hrefFor = (id: number) =>
+    sortMode === 'default'
+      ? `/inference/agentic/${id}`
+      : `/inference/agentic/${id}?sort=${sortMode}`;
+
+  const currentId = siblings.find((s) => s.is_current)?.id;
 
   const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`;
 
@@ -88,23 +168,58 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
         </span>
       </div>
       <div className="flex items-center gap-2 flex-wrap">
+        <div className="flex items-center gap-1.5">
+          <span className="text-xs text-muted-foreground">Sort by</span>
+          <Select
+            value={sortMode}
+            onValueChange={(v) => {
+              const mode = v as SortMode;
+              setSortMode(mode);
+              track('agentic_siblings_sorted', { mode });
+              // Mirror into the URL (replace, no history spam) so a refresh —
+              // and the next point's mount — keep the chosen order.
+              if (currentId !== undefined) {
+                const href =
+                  mode === 'default'
+                    ? `/inference/agentic/${currentId}`
+                    : `/inference/agentic/${currentId}?sort=${mode}`;
+                router.replace(href, { scroll: false });
+              }
+            }}
+          >
+            <SelectTrigger
+              className="h-7 w-[10rem] text-xs"
+              aria-label="Sort points"
+              data-testid="sibling-sort-select"
+            >
+              <SelectValue />
+            </SelectTrigger>
+            <SelectContent>
+              {SORT_OPTIONS.map((o) => (
+                <SelectItem key={o.value} value={o.value} className="text-xs">
+                  {o.label}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
         <button
           type="button"
           disabled={!prev}
-          onClick={() => prev && router.push(`/inference/agentic/${prev.id}`)}
+          onClick={() => prev && router.push(hrefFor(prev.id))}
           className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
           aria-label="Previous point"
         >
           <ChevronLeft className="size-3.5" /> prev
         </button>
         <div className="flex items-center gap-1 flex-wrap">
-          {siblings.map((s) => {
+          {sorted.map((s) => {
             const active = s.is_current;
             return (
               <button
                 key={s.id}
                 type="button"
-                onClick={() => !active && router.push(`/inference/agentic/${s.id}`)}
+                onClick={() => !active && router.push(hrefFor(s.id))}
                 className={`px-2 py-1 rounded-md text-xs border transition-colors ${
                   active
                     ? 'border-primary bg-primary text-primary-foreground font-medium'
@@ -120,7 +235,7 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
         <button
           type="button"
           disabled={!next}
-          onClick={() => next && router.push(`/inference/agentic/${next.id}`)}
+          onClick={() => next && router.push(hrefFor(next.id))}
           className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
           aria-label="Next point"
         >
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
index e6bc4906..55720bdf 100644
--- a/packages/app/src/hooks/api/use-benchmark-siblings.ts
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -16,6 +16,8 @@ export interface BenchmarkSibling {
   num_decode_gpu: number;
   disagg: boolean;
   is_multinode: boolean;
+  tput_per_gpu: number | null;
+  total_requests: number | null;
   is_current: boolean;
   has_trace: boolean;
 }
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
index 241a48ba..c7e4a317 100644
--- a/packages/db/src/queries/benchmark-siblings.ts
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -24,6 +24,13 @@ export interface BenchmarkSibling {
   num_decode_gpu: number;
   disagg: boolean;
   is_multinode: boolean;
+  /** Throughput per GPU (tok/s/gpu) for this point; null if the metric is absent. */
+  tput_per_gpu: number | null;
+  /**
+   * Total requests for this point — `total_requests_completed` (aiperf runner)
+   * falling back to the legacy `num_requests_total`; null if neither is present.
+   */
+  total_requests: number | null;
   /** True if this row IS the point passed in. */
   is_current: boolean;
   /** Whether the row has a stored trace_replay blob (for navigation hint). */
@@ -82,6 +89,11 @@ export async function getBenchmarkSiblings(
       c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers,
       c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers,
       c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode,
+      (br.metrics->>'tput_per_gpu')::float8 as tput_per_gpu,
+      coalesce(
+        (br.metrics->>'total_requests_completed')::float8,
+        (br.metrics->>'num_requests_total')::float8
+      ) as total_requests,
       (br.trace_replay_id is not null) as has_trace
     from benchmark_results br
     join configs c on c.id = br.config_id
@@ -109,6 +121,8 @@ export async function getBenchmarkSiblings(
     num_decode_gpu: number;
     disagg: boolean;
     is_multinode: boolean;
+    tput_per_gpu: number | null;
+    total_requests: number | null;
     has_trace: boolean;
   }[];
 
@@ -128,6 +142,8 @@ export async function getBenchmarkSiblings(
     num_decode_gpu: r.num_decode_gpu,
     disagg: r.disagg,
     is_multinode: r.is_multinode,
+    tput_per_gpu: r.tput_per_gpu === null ? null : Number(r.tput_per_gpu),
+    total_requests: r.total_requests === null ? null : Number(r.total_requests),
     is_current: Number(r.id) === benchmarkResultId,
     has_trace: r.has_trace,
   }));

From 60c5c2db0d73e9858e2cab84bb5e507be18ebf1e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 15:49:57 -0500
Subject: [PATCH 074/111] feat(datasets): add 011 schema for datasets +
 dataset_conversations

Additive migration backing the new /datasets area: a registry of ingested
HF cc-traces-weka dataset versions (summary + precomputed chart_data) and one
row per conversation holding a flamegraph-ready structure JSONB. Drop snippet
in the migration header for revert.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/migrations/011_datasets.sql | 55 +++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 packages/db/migrations/011_datasets.sql

diff --git a/packages/db/migrations/011_datasets.sql b/packages/db/migrations/011_datasets.sql
new file mode 100644
index 00000000..7a70d83f
--- /dev/null
+++ b/packages/db/migrations/011_datasets.sql
@@ -0,0 +1,55 @@
+-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
+-- the agentic benchmarks replay) + their per-conversation trace structure.
+--
+-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
+-- not the source traces. These two tables back the new /datasets area: a
+-- registry of ingested dataset versions with precomputed summary + chart data,
+-- and one row per conversation holding a flamegraph-ready `structure` (turns +
+-- subagent groups with input split into cached-prefix vs uncached-suffix). The
+-- raw hash_ids are NOT stored — they're only needed at ingest to derive the
+-- cached/uncached split, so the runtime read is a single small JSONB.
+--
+-- Additive only. To revert this migration:
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   delete from schema_migrations where filename = '011_datasets.sql';
+
+create table datasets (
+  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
+  id          text primary key,
+  -- URL key, e.g. 'cc-traces-weka-062126'.
+  slug        text not null unique,
+  label       text not null,
+  -- 'full' | '256k' | 'no-subagents' (the published variants).
+  variant     text not null default 'full',
+  description text,
+  hf_url      text,
+  license     text,
+  conversation_count integer not null default 0,
+  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
+  summary     jsonb not null default '{}'::jsonb,
+  -- Precomputed distributions for the dataset-detail cards (input/output length,
+  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
+  chart_data  jsonb not null default '{}'::jsonb,
+  dataset_version integer not null default 1,
+  ingested_at timestamptz not null default now()
+);
+
+create table dataset_conversations (
+  id          bigserial primary key,
+  dataset_id  text not null references datasets(id) on delete cascade,
+  -- The conversation id from the dataset record (trace id).
+  conv_id     text not null,
+  models      text[] not null default '{}',
+  num_turns           integer not null default 0,
+  num_subagent_groups integer not null default 0,
+  total_in    bigint not null default 0,
+  total_out   bigint not null default 0,
+  total_cached bigint not null default 0,
+  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
+  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
+  structure   jsonb not null,
+  unique (dataset_id, conv_id)
+);
+
+create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);

From 71e388f83c8d20f76738daa2b877962c9e3533bd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 15:49:57 -0500
Subject: [PATCH 075/111] feat(datasets): weka trace structure + cached-prefix
 builder

Pure transforms (no DB) turning a raw cc-traces-weka conversation into a
flamegraph-ready structure: ordered turn/subagent nodes with input split into
cached-prefix vs uncached-suffix. Ports _count_seen_prefix_blocks from the
aiperf weka loader; subagents run against a spawn-time snapshot of the parent
prefix cache. Includes linear/log histogram helpers for the detail cards and
13 unit tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/weka-structure.test.ts | 158 ++++++++++++
 packages/db/src/etl/weka-structure.ts      | 275 +++++++++++++++++++++
 2 files changed, 433 insertions(+)
 create mode 100644 packages/db/src/etl/weka-structure.test.ts
 create mode 100644 packages/db/src/etl/weka-structure.ts

diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
new file mode 100644
index 00000000..95bfef38
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -0,0 +1,158 @@
+import { describe, it, expect } from 'vitest';
+import {
+  countSeenPrefixBlocks,
+  buildConversationStructure,
+  linearHistogram,
+  logHistogram,
+  type RawWekaConversation,
+  type SubagentNode,
+  type TurnNode,
+} from './weka-structure.js';
+
+describe('countSeenPrefixBlocks', () => {
+  it('counts only the contiguous leading run already seen', () => {
+    const seen = new Set([1, 2, 3, 9]);
+    // 1,2,3 seen contiguously; 4 breaks the run even though 9 is seen later.
+    expect(countSeenPrefixBlocks([1, 2, 3, 4, 9], seen)).toBe(3);
+  });
+
+  it('returns 0 when the first block is unseen', () => {
+    expect(countSeenPrefixBlocks([7, 1, 2], new Set([1, 2]))).toBe(0);
+  });
+
+  it('returns the full length when every block is seen', () => {
+    expect(countSeenPrefixBlocks([1, 2], new Set([1, 2, 3]))).toBe(2);
+  });
+
+  it('handles empty hash list', () => {
+    expect(countSeenPrefixBlocks([], new Set([1]))).toBe(0);
+  });
+});
+
+describe('buildConversationStructure', () => {
+  it('splits input into cached-prefix vs uncached as the prefix cache warms', () => {
+    const conv: RawWekaConversation = {
+      id: 'c1',
+      block_size: 64,
+      requests: [
+        // Turn 0: nothing seen yet → all uncached.
+        { type: 'n', model: 'm', in: 128, out: 10, hash_ids: [1, 2] },
+        // Turn 1: blocks 1,2 already seen, 3 is new → 2 blocks cached.
+        { type: 'n', model: 'm', in: 192, out: 20, hash_ids: [1, 2, 3] },
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    const t0 = s.nodes[0] as TurnNode;
+    const t1 = s.nodes[1] as TurnNode;
+    expect(t0).toMatchObject({ kind: 'turn', in: 128, cached: 0, uncached: 128, out: 10 });
+    expect(t1.cached).toBe(128); // 2 blocks × 64
+    expect(t1.uncached).toBe(64); // 192 - 128
+    expect(s.totals).toMatchObject({
+      in: 320,
+      out: 30,
+      cached: 128,
+      uncached: 192,
+      numTurns: 2,
+      numSubagentGroups: 0,
+    });
+  });
+
+  it('clamps cached to the effective input on a partial last block', () => {
+    const conv: RawWekaConversation = {
+      id: 'c2',
+      block_size: 64,
+      requests: [
+        { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // 2 blocks but in=100 (partial)
+        { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // both seen → cached clamped to 100
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    const t1 = s.nodes[1] as TurnNode;
+    expect(t1.cached).toBe(100);
+    expect(t1.uncached).toBe(0);
+  });
+
+  it('treats turns with no hash_ids as fully uncached', () => {
+    const conv: RawWekaConversation = {
+      id: 'c3',
+      requests: [{ type: 'n', in: 50, out: 5 }],
+    };
+    const t0 = buildConversationStructure(conv).nodes[0] as TurnNode;
+    expect(t0).toMatchObject({ cached: 0, uncached: 50 });
+  });
+
+  it('nests subagent groups with aggregated children and runs them against a spawn-time snapshot', () => {
+    const conv: RawWekaConversation = {
+      id: 'c4',
+      block_size: 64,
+      requests: [
+        { type: 'n', model: 'main', in: 64, out: 10, hash_ids: [1] },
+        {
+          type: 'subagent',
+          agent_id: 'a1',
+          subagent_type: 'Explore',
+          duration_ms: 1234,
+          requests: [
+            // sees parent block 1 (snapshot at spawn) → 1 block cached
+            { type: 'n', model: 'sub', in: 128, out: 7, hash_ids: [1, 5] },
+            // now block 5 is also seen within the subagent → 2 cached
+            { type: 'n', model: 'sub', in: 128, out: 3, hash_ids: [1, 5] },
+          ],
+        },
+        // Parent turn after subagent: block 5 must NOT be cached (subagent
+        // context not folded back); only block 1 is in the parent seen set.
+        { type: 'n', model: 'main', in: 128, out: 1, hash_ids: [1, 5] },
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    expect(s.totals.numTurns).toBe(2); // two top-level normal turns
+    expect(s.totals.numSubagentGroups).toBe(1);
+
+    const sub = s.nodes[1] as SubagentNode;
+    expect(sub.kind).toBe('subagent');
+    expect(sub.label).toBe('Explore');
+    expect(sub.agentId).toBe('a1');
+    expect(sub.durationMs).toBe(1234);
+    expect(sub.children).toHaveLength(2);
+    expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
+    expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
+    expect(sub.in).toBe(256);
+    expect(sub.out).toBe(10);
+
+    const afterSub = s.nodes[2] as TurnNode;
+    expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back
+  });
+
+  it('falls back to the default block size and a generic subagent label', () => {
+    const conv: RawWekaConversation = {
+      id: 'c5',
+      requests: [{ type: 'subagent', requests: [{ type: 'n', in: 10, out: 1, hash_ids: [1] }] }],
+    };
+    const s = buildConversationStructure(conv);
+    expect(s.blockSize).toBe(64);
+    expect((s.nodes[0] as SubagentNode).label).toBe('Subagent');
+  });
+});
+
+describe('histograms', () => {
+  it('linearHistogram buckets across [0, max] and totals the count', () => {
+    const bins = linearHistogram([0, 1, 2, 3, 4], 4);
+    expect(bins).toHaveLength(4);
+    expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(5);
+    expect(bins[0].x0).toBe(0);
+  });
+
+  it('linearHistogram handles all-zero input', () => {
+    expect(linearHistogram([0, 0])).toEqual([{ x0: 0, x1: 1, count: 2 }]);
+  });
+
+  it('logHistogram drops non-positive values and preserves the positive total', () => {
+    const bins = logHistogram([1, 10, 100, 1000, 0, -5], 3);
+    expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(4);
+  });
+
+  it('both return [] for empty input', () => {
+    expect(linearHistogram([])).toEqual([]);
+    expect(logHistogram([])).toEqual([]);
+  });
+});
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
new file mode 100644
index 00000000..e4113c68
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.ts
@@ -0,0 +1,275 @@
+/**
+ * Pure transforms for the HuggingFace cc-traces-weka datasets.
+ *
+ * Turns a raw conversation record (`{ id, block_size, requests[] }`, where each
+ * request is a normal turn or a subagent group) into a compact, flamegraph-ready
+ * `structure`: ordered nodes with input split into cached-prefix vs
+ * uncached-suffix. The cached split ports `_count_seen_prefix_blocks` from the
+ * aiperf weka loader (contiguous leading hash_ids already seen under an infinite
+ * KV cache). No DB access — safe to import anywhere and unit-test directly.
+ */
+
+export const DEFAULT_BLOCK_SIZE = 64;
+
+// ── Raw record shapes (subset we read) ──────────────────────────────────────
+
+export interface RawWekaRequest {
+  t?: number;
+  type?: string; // 'n' | 's'
+  model?: string;
+  in?: number;
+  out?: number;
+  hash_ids?: number[];
+  api_time?: number;
+}
+
+export interface RawWekaSubagent {
+  t?: number;
+  type: 'subagent';
+  agent_id?: string;
+  subagent_type?: string;
+  duration_ms?: number;
+  requests?: RawWekaRequest[];
+  models?: string[];
+}
+
+export type RawWekaEntry = RawWekaRequest | RawWekaSubagent;
+
+export interface RawWekaConversation {
+  id: string;
+  models?: string[];
+  block_size?: number;
+  hash_id_scope?: string;
+  requests?: RawWekaEntry[];
+}
+
+// ── Output structure (stored in dataset_conversations.structure) ─────────────
+
+export interface TurnNode {
+  kind: 'turn';
+  turnIndex: number;
+  model?: string;
+  in: number;
+  out: number;
+  /** Input tokens served from the prefix cache (≤ in). */
+  cached: number;
+  /** Input tokens that must be (re)computed (in - cached). */
+  uncached: number;
+}
+
+export interface SubagentNode {
+  kind: 'subagent';
+  label: string;
+  agentId?: string;
+  durationMs?: number;
+  in: number;
+  out: number;
+  cached: number;
+  uncached: number;
+  children: TurnNode[];
+}
+
+export type StructureNode = TurnNode | SubagentNode;
+
+export interface ConversationStructure {
+  blockSize: number;
+  nodes: StructureNode[];
+  totals: {
+    in: number;
+    out: number;
+    cached: number;
+    uncached: number;
+    numTurns: number;
+    numSubagentGroups: number;
+  };
+}
+
+const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent =>
+  (e as RawWekaSubagent).type === 'subagent';
+
+/**
+ * Count contiguous leading hash_ids already present in `seen`
+ * (port of aiperf `_count_seen_prefix_blocks`).
+ */
+export function countSeenPrefixBlocks(
+  hashIds: readonly number[],
+  seen: ReadonlySet<number>,
+): number {
+  let hits = 0;
+  for (const h of hashIds) {
+    if (!seen.has(h)) break;
+    hits += 1;
+  }
+  return hits;
+}
+
+/**
+ * Compute the {cached, uncached} input-token split for one request and fold its
+ * blocks into `seen`. `cached` is derived from blocks but clamped to the
+ * request's effective `in` so cached+uncached === in even when the last block is
+ * partial (in = hash_token_count, not always a multiple of blockSize).
+ */
+function splitInput(
+  req: RawWekaRequest,
+  seen: Set<number>,
+  blockSize: number,
+): { in: number; cached: number; uncached: number } {
+  const input = Math.max(0, Math.round(req.in ?? 0));
+  const hashIds = req.hash_ids ?? [];
+  if (hashIds.length === 0) {
+    return { in: input, cached: 0, uncached: input };
+  }
+  const cachedBlocks = countSeenPrefixBlocks(hashIds, seen);
+  for (const h of hashIds) seen.add(h);
+  const cached = Math.min(input, cachedBlocks * blockSize);
+  return { in: input, cached, uncached: input - cached };
+}
+
+function subagentLabel(s: RawWekaSubagent): string {
+  const base = s.subagent_type?.trim();
+  return base && base.length > 0 ? base : 'Subagent';
+}
+
+/**
+ * Build the flamegraph structure for one conversation. Main turns share a single
+ * accumulating prefix-cache `seen` set; each subagent group runs against a
+ * *copy* of the parent `seen` at spawn (its context is separate and is not
+ * folded back into the parent), mirroring the weka loader's parent/child split.
+ */
+export function buildConversationStructure(
+  conv: RawWekaConversation,
+  blockSizeOverride?: number,
+): ConversationStructure {
+  const blockSize = blockSizeOverride ?? conv.block_size ?? DEFAULT_BLOCK_SIZE;
+  const seen = new Set<number>();
+  const nodes: StructureNode[] = [];
+  let totalIn = 0;
+  let totalOut = 0;
+  let totalCached = 0;
+  let totalUncached = 0;
+  let numTurns = 0;
+  let numSubagentGroups = 0;
+  let turnIndex = 0;
+
+  for (const entry of conv.requests ?? []) {
+    if (isSubagent(entry)) {
+      const childSeen = new Set(seen); // snapshot at spawn; not merged back
+      const children: TurnNode[] = [];
+      let gin = 0;
+      let gout = 0;
+      let gcached = 0;
+      let guncached = 0;
+      for (const inner of entry.requests ?? []) {
+        const split = splitInput(inner, childSeen, blockSize);
+        const out = Math.max(0, Math.round(inner.out ?? 0));
+        children.push({
+          kind: 'turn',
+          turnIndex: turnIndex++,
+          model: inner.model,
+          in: split.in,
+          out,
+          cached: split.cached,
+          uncached: split.uncached,
+        });
+        gin += split.in;
+        gout += out;
+        gcached += split.cached;
+        guncached += split.uncached;
+      }
+      nodes.push({
+        kind: 'subagent',
+        label: subagentLabel(entry),
+        agentId: entry.agent_id,
+        durationMs: entry.duration_ms,
+        in: gin,
+        out: gout,
+        cached: gcached,
+        uncached: guncached,
+        children,
+      });
+      numSubagentGroups += 1;
+      totalIn += gin;
+      totalOut += gout;
+      totalCached += gcached;
+      totalUncached += guncached;
+    } else {
+      const split = splitInput(entry, seen, blockSize);
+      const out = Math.max(0, Math.round(entry.out ?? 0));
+      nodes.push({
+        kind: 'turn',
+        turnIndex: turnIndex++,
+        model: entry.model,
+        in: split.in,
+        out,
+        cached: split.cached,
+        uncached: split.uncached,
+      });
+      numTurns += 1;
+      totalIn += split.in;
+      totalOut += out;
+      totalCached += split.cached;
+      totalUncached += split.uncached;
+    }
+  }
+
+  return {
+    blockSize,
+    nodes,
+    totals: {
+      in: totalIn,
+      out: totalOut,
+      cached: totalCached,
+      uncached: totalUncached,
+      numTurns,
+      numSubagentGroups,
+    },
+  };
+}
+
+// ── Distribution binning (for the dataset-detail cards) ──────────────────────
+
+export interface HistogramBin {
+  x0: number;
+  x1: number;
+  count: number;
+}
+
+/**
+ * Linear-width histogram over [0, max]. Empty input → []; max ≤ 0 → a single [0, 1] bin.
+ * Values below 0 are counted in the first bin. `bins` is floored and clamped to at least 1.
+ */
+export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  if (values.length === 0) return [];
+  // Fold instead of `Math.max(...values)`: spreading a large array can overflow the call stack.
+  const max = values.reduce((m, v) => (v > m ? v : m), -Infinity);
+  if (max <= 0) return [{ x0: 0, x1: 1, count: values.length }];
+  // Guard the bin count: zero/negative/fractional `bins` would leave no valid bin to index.
+  const n = Math.max(1, Math.floor(bins) || 1);
+  const width = max / n;
+  const out: HistogramBin[] = [];
+  for (let i = 0; i < n; i += 1) out.push({ x0: i * width, x1: (i + 1) * width, count: 0 });
+  for (const v of values) out[Math.min(n - 1, Math.max(0, Math.floor(v / width)))].count += 1;
+  return out;
+}
+
+/** Log-width histogram over positive values (values ≤ 0 are dropped); `bins` is clamped to ≥ 1. */
+export function logHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  const pos = values.filter((v) => v > 0);
+  if (pos.length === 0) return [];
+  // Fold for the extremes: `Math.min/max(...pos)` can overflow the call stack on large inputs.
+  const min = pos.reduce((m, v) => (v < m ? v : m), Infinity);
+  const max = pos.reduce((m, v) => (v > m ? v : m), -Infinity);
+  const lo = Math.log10(min);
+  const hi = Math.log10(max);
+  if (hi <= lo) return [{ x0: min, x1: max <= min ? min * 10 : max, count: pos.length }];
+  const n = Math.max(1, Math.floor(bins) || 1);
+  const width = (hi - lo) / n;
+  const out: HistogramBin[] = [];
+  for (let i = 0; i < n; i += 1) {
+    out.push({ x0: 10 ** (lo + i * width), x1: 10 ** (lo + (i + 1) * width), count: 0 });
+  }
+  for (const v of pos) {
+    out[Math.min(n - 1, Math.max(0, Math.floor((Math.log10(v) - lo) / width)))].count += 1;
+  }
+  return out;
+}

From 9fbc7160057f60945adaa4bf3bc98b645f0c25f2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 15:54:11 -0500
Subject: [PATCH 076/111] feat(datasets): HF cc-traces-weka ingest script

Pages the HF datasets-server rows API (adaptive page length for the ~3.5MB
rows), builds the flamegraph structure + cached-prefix split per conversation,
accumulates dataset-level distributions (input/output length, turns/conv,
subagent fan-out, cached fraction) into datasets.chart_data, and upserts
datasets + dataset_conversations. DATABASE_WRITE_URL must be provided. Verified
the cached split against a hand computation on raw hash_ids.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/src/ingest-weka-dataset.ts | 386 +++++++++++++++++++++++++
 1 file changed, 386 insertions(+)
 create mode 100644 packages/db/src/ingest-weka-dataset.ts

diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
new file mode 100644
index 00000000..4ef5328e
--- /dev/null
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -0,0 +1,386 @@
+/**
+ * Ingest a HuggingFace cc-traces-weka dataset into the `datasets` +
+ * `dataset_conversations` tables that back the /datasets area.
+ *
+ * Public dataset, no token needed — fetched via the HF datasets-server rows API
+ * (rows are large, ~3.5 MB each, so we page in small chunks with adaptive
+ * backoff). Per conversation we build a flamegraph-ready `structure` (turns +
+ * subagent groups, input split into cached-prefix vs uncached) and accumulate
+ * dataset-level distributions for the detail cards. Raw hash_ids are discarded
+ * after the cached/uncached split is computed.
+ *
+ * Usage (DATABASE_WRITE_URL must be provided — never hardcoded):
+ *   DATABASE_WRITE_URL='postgres://…' pnpm exec tsx src/ingest-weka-dataset.ts \
+ *     semianalysisai/cc-traces-weka-062126 [--label "…"] [--variant full|256k] \
+ *     [--description "…"] [--limit N]
+ *
+ * Upsert: re-running replaces the dataset's rows (delete + re-insert).
+ * Remember to purge the API cache afterwards (POST /api/v1/invalidate).
+ */
+
+import { createAdminSql } from './etl/db-utils';
+import { hasNoSslFlag } from './cli-utils';
+import {
+  buildConversationStructure,
+  linearHistogram,
+  logHistogram,
+  type ConversationStructure,
+  type RawWekaConversation,
+  type TurnNode,
+} from './etl/weka-structure';
+
+const ROWS_API = 'https://datasets-server.huggingface.co/rows';
+const INFO_API = 'https://datasets-server.huggingface.co/info';
+
+interface CliArgs {
+  dataset: string;
+  label?: string;
+  variant?: string;
+  description?: string;
+  limit?: number;
+}
+
+/** Parse argv (dataset id first, then `--flag value` pairs); --limit must be a positive integer. */
+function parseArgs(): CliArgs {
+  const argv = process.argv.slice(2);
+  const dataset = argv.find((a) => !a.startsWith('--'));
+  if (!dataset) {
+    console.error(
+      'Usage: tsx src/ingest-weka-dataset.ts <hf-dataset-id> [--label …] [--variant full|256k] [--description …] [--limit N]',
+    );
+    process.exit(1);
+  }
+  const getFlag = (name: string): string | undefined =>
+    argv.includes(`--${name}`) ? argv[argv.indexOf(`--${name}`) + 1] : undefined;
+  const limit = argv.includes('--limit') ? Number(getFlag('limit')) : undefined;
+  const limitOk = limit === undefined || (Number.isInteger(limit) && limit > 0);
+  if (!limitOk) throw new Error('--limit must be a positive integer');
+  return {
+    dataset,
+    label: getFlag('label'),
+    variant: getFlag('variant'),
+    description: getFlag('description'),
+    limit,
+  };
+}
+
+async function fetchJson(url: string): Promise<unknown> {
+  const res = await fetch(url);
+  if (!res.ok) {
+    throw new Error(`${res.status} ${res.statusText} for ${url}`);
+  }
+  return res.json();
+}
+
+async function getRowCount(dataset: string): Promise<number> {
+  const info = (await fetchJson(`${INFO_API}?dataset=${encodeURIComponent(dataset)}`)) as {
+    dataset_info?: Record<string, { splits?: Record<string, { num_examples?: number }> }>;
+  };
+  const cfg = info.dataset_info?.['default'];
+  const num = cfg?.splits?.['train']?.num_examples;
+  return typeof num === 'number' ? num : 0;
+}
+
+/** Page through rows with adaptive length (halve on "too big"/error). */
+async function* iterRows(
+  dataset: string,
+  total: number,
+  limit?: number,
+): AsyncGenerator<RawWekaConversation> {
+  const cap = limit ? Math.min(limit, total) : total;
+  let offset = 0;
+  let length = 5; // ~18 MB/page at ~3.5 MB/row; backs off on failure
+  while (offset < cap) {
+    const want = Math.min(length, cap - offset);
+    const url = `${ROWS_API}?dataset=${encodeURIComponent(dataset)}&config=default&split=train&offset=${offset}&length=${want}`;
+    let payload: { rows?: { row: RawWekaConversation }[] };
+    try {
+      payload = (await fetchJson(url)) as { rows?: { row: RawWekaConversation }[] };
+    } catch (error) {
+      if (want > 1) {
+        length = Math.max(1, Math.floor(want / 2));
+        console.warn(
+          `  page @${offset} (len ${want}) failed (${String(error)}); retrying with len ${length}`,
+        );
+        continue;
+      }
+      throw error;
+    }
+    const rows = payload.rows ?? [];
+    if (rows.length === 0) break;
+    for (const r of rows) yield r.row;
+    offset += rows.length;
+    process.stdout.write(`\r  fetched ${Math.min(offset, cap)}/${cap} conversations`);
+  }
+  process.stdout.write('\n');
+}
+
+interface Accumulator {
+  inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children)
+  outputPerTurn: number[];
+  cachedFractionPerTurn: number[]; // cached/in, for turns with in>0
+  turnsPerConv: number[]; // main (top-level) turns
+  subagentGroupsPerConv: number[];
+  subagentTurnsPerGroup: number[];
+  totalIn: number;
+  totalOut: number;
+  totalCached: number;
+  mainTurns: number;
+  subagentGroups: number;
+  subagentTurns: number;
+  modelCounts: Record<string, number>;
+}
+
+function newAccumulator(): Accumulator {
+  return {
+    inputPerTurn: [],
+    outputPerTurn: [],
+    cachedFractionPerTurn: [],
+    turnsPerConv: [],
+    subagentGroupsPerConv: [],
+    subagentTurnsPerGroup: [],
+    totalIn: 0,
+    totalOut: 0,
+    totalCached: 0,
+    mainTurns: 0,
+    subagentGroups: 0,
+    subagentTurns: 0,
+    modelCounts: {},
+  };
+}
+
+function recordTurn(acc: Accumulator, t: TurnNode): void {
+  acc.inputPerTurn.push(t.in);
+  acc.outputPerTurn.push(t.out);
+  if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in);
+  if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 0) + 1;
+}
+
+function accumulate(acc: Accumulator, s: ConversationStructure): void {
+  acc.totalIn += s.totals.in;
+  acc.totalOut += s.totals.out;
+  acc.totalCached += s.totals.cached;
+  acc.mainTurns += s.totals.numTurns;
+  acc.subagentGroups += s.totals.numSubagentGroups;
+  acc.turnsPerConv.push(s.totals.numTurns);
+  acc.subagentGroupsPerConv.push(s.totals.numSubagentGroups);
+  for (const node of s.nodes) {
+    if (node.kind === 'turn') {
+      recordTurn(acc, node);
+    } else {
+      acc.subagentTurnsPerGroup.push(node.children.length);
+      acc.subagentTurns += node.children.length;
+      for (const child of node.children) recordTurn(acc, child);
+    }
+  }
+}
+
+interface NumberSummary {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p90: number;
+}
+
+/** count/min/max/mean/median/p90 of `values` (all zeros when empty); quantiles not interpolated. */
+function summarize(values: readonly number[]): NumberSummary {
+  if (values.length === 0) return { count: 0, min: 0, max: 0, mean: 0, median: 0, p90: 0 };
+  // `toSorted` already returns a fresh array, so no extra copy is needed to protect the input.
+  const sorted = values.toSorted((a, b) => a - b);
+  const n = sorted.length;
+  // Quantile by position; q(0)=min, q(1)=max — avoids array-tail indexing that
+  // the linter rewrites to `.at(-1)` (which widens the type to `| undefined`).
+  const q = (p: number) => sorted[Math.min(n - 1, Math.max(0, Math.floor(p * (n - 1))))];
+  const sum = sorted.reduce((a, b) => a + b, 0);
+  return {
+    count: n,
+    min: q(0),
+    max: q(1),
+    mean: sum / n,
+    median: q(0.5),
+    p90: q(0.9),
+  };
+}
+
+function buildChartData(acc: Accumulator) {
+  return {
+    version: 1,
+    inputTokensPerTurn: {
+      bins: logHistogram(acc.inputPerTurn),
+      stats: summarize(acc.inputPerTurn),
+    },
+    outputTokensPerTurn: {
+      bins: logHistogram(acc.outputPerTurn),
+      stats: summarize(acc.outputPerTurn),
+    },
+    turnsPerConversation: {
+      bins: linearHistogram(acc.turnsPerConv),
+      stats: summarize(acc.turnsPerConv),
+    },
+    subagentGroupsPerConversation: {
+      bins: linearHistogram(acc.subagentGroupsPerConv),
+      stats: summarize(acc.subagentGroupsPerConv),
+    },
+    cachedFractionPerTurn: {
+      bins: linearHistogram(acc.cachedFractionPerTurn, 20),
+      stats: summarize(acc.cachedFractionPerTurn),
+    },
+  };
+}
+
+function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | null) {
+  const cachedPct = acc.totalIn > 0 ? acc.totalCached / acc.totalIn : 0;
+  return {
+    version: 1,
+    blockSize,
+    hashIdScope,
+    totalIn: acc.totalIn,
+    totalOut: acc.totalOut,
+    totalCached: acc.totalCached,
+    cachedPct,
+    mainTurns: acc.mainTurns,
+    subagentGroups: acc.subagentGroups,
+    subagentTurns: acc.subagentTurns,
+    modelMix: acc.modelCounts,
+  };
+}
+
+function slugFromDataset(dataset: string): string {
+  return dataset.includes('/') ? dataset.slice(dataset.indexOf('/') + 1) : dataset;
+}
+
+function inferVariant(slug: string): string {
+  if (slug.endsWith('-256k')) return '256k';
+  if (slug.includes('no-subagent')) return 'no-subagents';
+  return 'full';
+}
+
+function defaultLabel(slug: string): string {
+  // cc-traces-weka-062126 → "CC Traces Weka 062126"
+  return slug
+    .split('-')
+    .map((p) => (/^\d+$/u.test(p) ? p : p.toUpperCase()))
+    .join(' ')
+    .replace(/^CC TRACES WEKA/u, 'CC Traces Weka');
+}
+
+async function main(): Promise<void> {
+  const args = parseArgs();
+  const slug = slugFromDataset(args.dataset);
+  const variant = args.variant ?? inferVariant(slug);
+  const label = args.label ?? defaultLabel(slug);
+  const hfUrl = `https://huggingface.co/datasets/${args.dataset}`;
+
+  console.log(`=== ingest-weka-dataset: ${args.dataset} ===`);
+  console.log(`  slug=${slug} variant=${variant} label="${label}"`);
+
+  const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1 });
+
+  const total = await getRowCount(args.dataset);
+  console.log(`  ${total} conversations on HF`);
+
+  const acc = newAccumulator();
+  let blockSize = 64;
+  let hashIdScope: string | null = null;
+
+  // Buffer the per-conversation rows; flush in batches to keep memory bounded.
+  interface ConvRow {
+    dataset_id: string;
+    conv_id: string;
+    models: string[];
+    num_turns: number;
+    num_subagent_groups: number;
+    total_in: number;
+    total_out: number;
+    total_cached: number;
+    structure: ConversationStructure;
+  }
+  const pending: ConvRow[] = [];
+
+  try {
+    // Upsert the dataset shell first (FK target). Counts/summary filled at the end.
+    await sql`
+      insert into datasets (id, slug, label, variant, description, hf_url, license)
+      values (${args.dataset}, ${slug}, ${label}, ${variant}, ${args.description ?? null}, ${hfUrl}, 'apache-2.0')
+      on conflict (id) do update set
+        slug = excluded.slug, label = excluded.label, variant = excluded.variant,
+        description = coalesce(excluded.description, datasets.description),
+        hf_url = excluded.hf_url, license = excluded.license, ingested_at = now()
+    `;
+    // Clear prior conversations for a clean re-ingest.
+    await sql`delete from dataset_conversations where dataset_id = ${args.dataset}`;
+
+    const flush = async () => {
+      if (pending.length === 0) return;
+      // postgres.js row-helper insert: serializes `structure` to jsonb and
+      // `models` to text[] per row (unnest can't carry a text[] column — a 2D
+      // array would flatten into scalar rows).
+      const rows = pending.map((p) => ({
+        dataset_id: args.dataset,
+        conv_id: p.conv_id,
+        models: p.models,
+        num_turns: p.num_turns,
+        num_subagent_groups: p.num_subagent_groups,
+        total_in: p.total_in,
+        total_out: p.total_out,
+        total_cached: p.total_cached,
+        structure: sql.json(p.structure as unknown as Parameters<typeof sql.json>[0]),
+      }));
+      await sql`insert into dataset_conversations ${sql(rows)}`;
+      pending.length = 0;
+    };
+
+    let count = 0;
+    for await (const conv of iterRows(args.dataset, total, args.limit)) {
+      blockSize = conv.block_size ?? blockSize;
+      hashIdScope = conv.hash_id_scope ?? hashIdScope;
+      const structure = buildConversationStructure(conv);
+      accumulate(acc, structure);
+      pending.push({
+        dataset_id: args.dataset,
+        conv_id: conv.id,
+        models: Array.isArray(conv.models) ? conv.models : [],
+        num_turns: structure.totals.numTurns,
+        num_subagent_groups: structure.totals.numSubagentGroups,
+        total_in: structure.totals.in,
+        total_out: structure.totals.out,
+        total_cached: structure.totals.cached,
+        structure,
+      });
+      count += 1;
+      if (pending.length >= 25) await flush();
+    }
+    await flush();
+
+    const summary = buildSummary(acc, blockSize, hashIdScope);
+    const chartData = buildChartData(acc);
+    await sql`
+      update datasets set
+        conversation_count = ${count},
+        summary = ${sql.json(summary as unknown as Parameters<typeof sql.json>[0])},
+        chart_data = ${sql.json(chartData as unknown as Parameters<typeof sql.json>[0])},
+        ingested_at = now()
+      where id = ${args.dataset}
+    `;
+
+    console.log(`\n  ingested ${count} conversations`);
+    console.log(
+      `  main turns=${acc.mainTurns} subagent groups=${acc.subagentGroups} subagent turns=${acc.subagentTurns}`,
+    );
+    console.log(
+      `  totals: in=${acc.totalIn.toLocaleString()} out=${acc.totalOut.toLocaleString()} ` +
+        `cached=${acc.totalCached.toLocaleString()} (${(summary.cachedPct * 100).toFixed(1)}% of input)`,
+    );
+    console.log('\n=== done ===');
+    console.log('  Purge the API cache: POST /api/v1/invalidate');
+  } finally {
+    await sql.end({ timeout: 5 });
+  }
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});

From b6be5a8d06f6f0ff118d8eee2d8c4a509d8be3ee Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:06:25 -0500
Subject: [PATCH 077/111] fix(datasets): handle HF 429 rate-limiting in ingest

Retry 429/5xx with exponential backoff (honoring Retry-After) instead of
shrinking page size, plus a 400ms inter-page delay. Lets the full 393-row
ingest complete without tripping the datasets-server rate limit.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/src/ingest-weka-dataset.ts | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
index 4ef5328e..22069419 100644
--- a/packages/db/src/ingest-weka-dataset.ts
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -64,8 +64,30 @@ function parseArgs(): CliArgs {
   };
 }
 
-async function fetchJson(url: string): Promise<unknown> {
+const sleep = (ms: number) =>
+  new Promise<void>((resolve) => {
+    setTimeout(resolve, ms);
+  });
+
+/**
+ * Fetch JSON, transparently retrying on HF rate-limiting (429) and transient
+ * 5xx with exponential backoff. Honors a Retry-After header when present.
+ */
+async function fetchJson(url: string, attempt = 0): Promise<unknown> {
   const res = await fetch(url);
+  if (res.status === 429 || res.status >= 500) {
+    if (attempt >= 6) {
+      throw new Error(`${res.status} ${res.statusText} after ${attempt} retries for ${url}`);
+    }
+    const retryAfter = Number(res.headers.get('retry-after'));
+    const waitMs =
+      Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1000 : 2000 * 2 ** attempt;
+    console.warn(
+      `  ${res.status} ${res.statusText}; waiting ${Math.round(waitMs / 1000)}s (attempt ${attempt + 1})`,
+    );
+    await sleep(waitMs);
+    return fetchJson(url, attempt + 1);
+  }
   if (!res.ok) {
     throw new Error(`${res.status} ${res.statusText} for ${url}`);
   }
@@ -111,6 +133,7 @@ async function* iterRows(
     for (const r of rows) yield r.row;
     offset += rows.length;
     process.stdout.write(`\r  fetched ${Math.min(offset, cap)}/${cap} conversations`);
+    if (offset < cap) await sleep(400); // be polite to the HF datasets-server
   }
   process.stdout.write('\n');
 }

From a376b5ba826463d447dcade4c5cc990ce7f22143 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:10:08 -0500
Subject: [PATCH 078/111] feat(datasets): DB queries, API routes, and React
 Query hooks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

queries/datasets.ts: listDatasets, getDataset (incl chart_data),
listConversations (paginated, searchable, 4 sort modes — separate per-sort
queries since the neon HTTP driver can't compose order-by fragments),
getConversation (flamegraph structure). Routes under /api/v1/datasets/* with
cachedQuery + gzip cachedJson. Hooks use-datasets.ts mirror the existing
benchmark-siblings hook style. Verified all four routes against the live branch.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../[slug]/conversations/[convId]/route.ts    |  33 +++
 .../v1/datasets/[slug]/conversations/route.ts |  53 +++++
 .../src/app/api/v1/datasets/[slug]/route.ts   |  29 +++
 packages/app/src/app/api/v1/datasets/route.ts |  24 ++
 packages/app/src/hooks/api/use-datasets.ts    | 183 +++++++++++++++
 packages/db/src/queries/datasets.ts           | 209 ++++++++++++++++++
 6 files changed, 531 insertions(+)
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/route.ts
 create mode 100644 packages/app/src/hooks/api/use-datasets.ts
 create mode 100644 packages/db/src/queries/datasets.ts

diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
new file mode 100644
index 00000000..84cc15e3
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
@@ -0,0 +1,33 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getConversation,
+  type ConversationDetail,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedConversation = cachedQuery(
+  (slug: string, convId: string): Promise<ConversationDetail | null> =>
+    getConversation(getDb(), slug, convId),
+  'dataset-conversation',
+);
+
+/** GET /api/v1/datasets/[slug]/conversations/[convId] — flamegraph structure. */
+export async function GET(
+  _request: NextRequest,
+  { params }: { params: Promise<{ slug: string; convId: string }> },
+) {
+  const { slug, convId } = await params;
+  try {
+    const data = await getCachedConversation(slug, decodeURIComponent(convId));
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversation:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
new file mode 100644
index 00000000..62b9e5b7
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
@@ -0,0 +1,53 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  listConversations,
+  type ConversationList,
+  type ListConversationsOpts,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const SORTS = new Set(['tokens', 'turns', 'subagents', 'id']);
+
+const getCachedConversations = cachedQuery(
+  (
+    slug: string,
+    search: string,
+    limit: number,
+    offset: number,
+    sort: string,
+  ): Promise<ConversationList | null> =>
+    listConversations(getDb(), slug, {
+      search: search || undefined,
+      limit,
+      offset,
+      sort: sort as ListConversationsOpts['sort'],
+    }),
+  'dataset-conversations',
+);
+
+/**
+ * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort= — paginated list
+ * (counts only, no flamegraph structure). limit/offset are parsed as base-10 integers first.
+ */
+export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) {
+  const { slug } = await params;
+  const sp = request.nextUrl.searchParams;
+  const search = sp.get('search') ?? '';
+  const limit = Math.min(200, Math.max(1, Number.parseInt(sp.get('limit') ?? '', 10) || 50));
+  const offset = Math.max(0, Number.parseInt(sp.get('offset') ?? '', 10) || 0);
+  const sortParam = sp.get('sort') ?? 'tokens';
+  const sort = SORTS.has(sortParam) ? sortParam : 'tokens';
+  try {
+    const data = await getCachedConversations(slug, search, limit, offset, sort);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversations:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
new file mode 100644
index 00000000..9e4af580
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
@@ -0,0 +1,29 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedDataset = cachedQuery(
+  (slug: string): Promise<DatasetDetail | null> => getDataset(getDb(), slug),
+  'dataset',
+);
+
+/** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. */
+export async function GET(
+  _request: NextRequest,
+  { params }: { params: Promise<{ slug: string }> },
+) {
+  const { slug } = await params;
+  try {
+    const data = await getCachedDataset(slug);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts
new file mode 100644
index 00000000..f0acca3c
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/route.ts
@@ -0,0 +1,24 @@
+import { NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedDatasets = cachedQuery(
+  (): Promise<DatasetRecord[]> => listDatasets(getDb()),
+  'datasets',
+);
+
+/** GET /api/v1/datasets — all ingested cc-traces-weka datasets (registry cards); JSON 500 on failure. */
+export async function GET() {
+  try {
+    const data = await getCachedDatasets();
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching datasets:', error); // details stay in the server log
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
new file mode 100644
index 00000000..3ce61a85
--- /dev/null
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -0,0 +1,183 @@
+import { useQuery, keepPreviousData } from '@tanstack/react-query';
+
+import type {
+  ConversationStructure,
+  StructureNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+export type { ConversationStructure, StructureNode };
+
+export interface DatasetSummary {
+  blockSize?: number;
+  hashIdScope?: string | null;
+  totalIn?: number;
+  totalOut?: number;
+  totalCached?: number;
+  cachedPct?: number;
+  mainTurns?: number;
+  subagentGroups?: number;
+  subagentTurns?: number;
+  modelMix?: Record<string, number>;
+  [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: DatasetSummary;
+  ingested_at: string;
+}
+
+export interface HistogramBin {
+  x0: number;
+  x1: number;
+  count: number;
+}
+
+export interface DistributionStats {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p90: number;
+}
+
+export interface Distribution {
+  bins: HistogramBin[];
+  stats: DistributionStats;
+}
+
+export interface DatasetChartData {
+  version?: number;
+  inputTokensPerTurn?: Distribution;
+  outputTokensPerTurn?: Distribution;
+  turnsPerConversation?: Distribution;
+  subagentGroupsPerConversation?: Distribution;
+  cachedFractionPerTurn?: Distribution;
+  [k: string]: unknown;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+  chart_data: DatasetChartData;
+}
+
+export interface ConversationListItem {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+}
+
+export interface ConversationList {
+  total: number;
+  items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: ConversationStructure;
+}
+
+export type ConversationSort = 'tokens' | 'turns' | 'subagents' | 'id';
+
+const DAY = 24 * 60 * 60 * 1000;
+
+/** All ingested datasets (registry cards). Rejects on any non-2xx response; data stays fresh for 24h. */
+export function useDatasets() {
+  return useQuery({
+    queryKey: ['datasets'] as const,
+    queryFn: async ({ signal }) => {
+      const res = await fetch('/api/v1/datasets', { signal }); // signal is supplied by useQuery
+      if (!res.ok) throw new Error(`datasets ${res.status}`);
+      return (await res.json()) as DatasetRecord[]; // unchecked cast — the payload is not validated
+    },
+    staleTime: DAY,
+  });
+}
+
+/** One dataset incl. chart_data. Resolves null on 404; the query stays disabled until slug is truthy. */
+export function useDataset(slug: string | null) {
+  return useQuery({
+    queryKey: ['dataset', slug] as const,
+    queryFn: async ({ signal }) => {
+      const res = await fetch(`/api/v1/datasets/${slug}`, { signal });
+      if (res.status === 404) return null; // unknown slug is a valid "no data" result, not an error
+      if (!res.ok) throw new Error(`dataset ${res.status}`);
+      return (await res.json()) as DatasetDetail; // unchecked cast — the payload is not validated
+    },
+    enabled: Boolean(slug),
+    staleTime: DAY,
+  });
+}
+
+export interface UseConversationsArgs {
+  slug: string | null;
+  search?: string;
+  limit?: number;
+  offset?: number;
+  sort?: ConversationSort;
+}
+
+/** Paginated conversation list for a dataset (counts only). Resolves null on 404; disabled until slug is truthy. */
+export function useDatasetConversations({
+  slug,
+  search = '',
+  limit = 50,
+  offset = 0,
+  sort = 'tokens',
+}: UseConversationsArgs) {
+  return useQuery({
+    queryKey: ['dataset-conversations', slug, search, limit, offset, sort] as const,
+    queryFn: async ({ signal }) => {
+      const qs = new URLSearchParams({
+        limit: String(limit),
+        offset: String(offset),
+        sort,
+      });
+      if (search) qs.set('search', search); // an empty search omits the param entirely
+      const res = await fetch(`/api/v1/datasets/${slug}/conversations?${qs.toString()}`, {
+        signal,
+      });
+      if (res.status === 404) return null; // unknown dataset
+      if (!res.ok) throw new Error(`dataset-conversations ${res.status}`);
+      return (await res.json()) as ConversationList; // unchecked cast — the payload is not validated
+    },
+    enabled: Boolean(slug),
+    placeholderData: keepPreviousData, // keep the previous page visible while the next one loads
+    staleTime: DAY,
+  });
+}
+
+/** One conversation's flamegraph structure. Resolves null on 404; disabled until slug and convId are both truthy. */
+export function useDatasetConversation(slug: string | null, convId: string | null) {
+  return useQuery({
+    queryKey: ['dataset-conversation', slug, convId] as const,
+    queryFn: async ({ signal }) => {
+      const res = await fetch(
+        `/api/v1/datasets/${slug}/conversations/${encodeURIComponent(convId ?? '')}`,
+        { signal },
+      );
+      if (res.status === 404) return null; // missing dataset or conversation
+      if (!res.ok) throw new Error(`dataset-conversation ${res.status}`);
+      return (await res.json()) as ConversationDetail; // unchecked cast — the payload is not validated
+    },
+    enabled: Boolean(slug) && Boolean(convId),
+    staleTime: DAY,
+  });
+}
diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts
new file mode 100644
index 00000000..89c6ca5e
--- /dev/null
+++ b/packages/db/src/queries/datasets.ts
@@ -0,0 +1,209 @@
+/**
+ * Read queries for the agentic-benchmark source datasets (the HF cc-traces-weka
+ * corpora ingested by ingest-weka-dataset.ts). Back the /datasets area:
+ *   - listDatasets      → registry cards (no per-conversation rows)
+ *   - getDataset        → one dataset incl. precomputed chart_data
+ *   - listConversations → paginated conversation list (counts only, no structure)
+ *   - getConversation   → one conversation's flamegraph structure
+ */
+
+import type { DbClient } from '../connection.js';
+import type { ConversationStructure } from '../etl/weka-structure.js';
+
+export interface DatasetSummary {
+  blockSize?: number;
+  hashIdScope?: string | null;
+  totalIn?: number;
+  totalOut?: number;
+  totalCached?: number;
+  cachedPct?: number;
+  mainTurns?: number;
+  subagentGroups?: number;
+  subagentTurns?: number;
+  modelMix?: Record<string, number>;
+  [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: DatasetSummary;
+  ingested_at: string;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+  /** Precomputed distribution bins + stats keyed by metric (see ingest buildChartData). */
+  chart_data: Record<string, unknown>;
+}
+
+export interface ConversationListItem {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+}
+
+export interface ConversationList {
+  total: number;
+  items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: ConversationStructure;
+}
+
+/** All ingested datasets, newest first (ties by slug), without chart_data; conversation_count coerced to number. */
+export async function listDatasets(sql: DbClient): Promise<DatasetRecord[]> {
+  const rows = (await sql`
+    select id, slug, label, variant, description, hf_url, license,
+           conversation_count, summary, ingested_at::text
+    from datasets
+    order by ingested_at desc, slug asc
+  `) as unknown as DatasetRecord[];
+  return rows.map((r) => ({ ...r, conversation_count: Number(r.conversation_count) }));
+}
+
+/** One dataset by slug, including chart_data (conversation_count coerced to number). Null if not found. */
+export async function getDataset(sql: DbClient, slug: string): Promise<DatasetDetail | null> {
+  const rows = (await sql`
+    select id, slug, label, variant, description, hf_url, license,
+           conversation_count, summary, chart_data, ingested_at::text
+    from datasets
+    where slug = ${slug}
+  `) as unknown as DatasetDetail[];
+  const row = rows[0]; // only the first match is used
+  if (!row) return null;
+  return { ...row, conversation_count: Number(row.conversation_count) };
+}
+
+export interface ListConversationsOpts {
+  search?: string; // trimmed, then matched as a case-insensitive substring of conv_id
+  limit?: number; // default 50, clamped to 1..MAX_LIMIT
+  offset?: number; // default 0, negative values clamp to 0
+  /** 'tokens' (total_in), 'turns' (num_turns), 'subagents' (num_subagent_groups) — all desc; 'id' (conv_id asc). */
+  sort?: 'tokens' | 'turns' | 'subagents' | 'id';
+}
+
+const MAX_LIMIT = 200;
+
+/**
+ * Paginated conversation list for a dataset (by slug); null if the slug is unknown.
+ * Returns counts only — the per-conversation `structure` blob is fetched separately by
+ * getConversation. `search` is a literal, case-insensitive substring match on conv_id.
+ */
+export async function listConversations(
+  sql: DbClient,
+  slug: string,
+  opts: ListConversationsOpts = {},
+): Promise<ConversationList | null> {
+  const ds = (await sql`select id from datasets where slug = ${slug}`) as unknown as {
+    id: string;
+  }[];
+  const datasetId = ds[0]?.id;
+  if (!datasetId) return null;
+
+  const limit = Math.min(MAX_LIMIT, Math.max(1, opts.limit ?? 50));
+  const offset = Math.max(0, opts.offset ?? 0);
+  const search = opts.search?.trim();
+  const like = search ? `%${search.replace(/[\\%_]/g, '\\$&')}%` : null; // escape LIKE wildcards
+
+  const totalRows = (await sql`
+    select count(*)::int as n
+    from dataset_conversations
+    where dataset_id = ${datasetId}
+      and (${like}::text is null or conv_id ilike ${like})
+  `) as unknown as { n: number }[];
+  const total = totalRows[0]?.n ?? 0;
+
+  // Separate queries per sort (literal ORDER BY) — the neon HTTP driver doesn't
+  // compose nested sql fragments the way postgres.js does, so we can't splice an
+  // order-by fragment. The sort key is an enum, never raw user input.
+  const sort = opts.sort ?? 'tokens';
+  let items: ConversationListItem[];
+  if (sort === 'turns') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by num_turns desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else if (sort === 'subagents') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by num_subagent_groups desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else if (sort === 'id') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by total_in desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  }
+
+  return {
+    total,
+    items: items.map((r) => ({
+      ...r,
+      num_turns: Number(r.num_turns),
+      num_subagent_groups: Number(r.num_subagent_groups),
+      total_in: Number(r.total_in),
+      total_out: Number(r.total_out),
+      total_cached: Number(r.total_cached),
+    })),
+  };
+}
+
+/** One conversation's full flamegraph structure, count columns coerced to numbers. Null if dataset/conv missing. */
+export async function getConversation(
+  sql: DbClient,
+  slug: string,
+  convId: string,
+): Promise<ConversationDetail | null> {
+  const rows = (await sql`
+    select dc.conv_id, dc.models, dc.num_turns, dc.num_subagent_groups,
+           dc.total_in, dc.total_out, dc.total_cached, dc.structure
+    from dataset_conversations dc
+    join datasets d on d.id = dc.dataset_id
+    where d.slug = ${slug} and dc.conv_id = ${convId}
+  `) as unknown as ConversationDetail[];
+  const row = rows[0]; // only the first match is used
+  if (!row) return null; // no row for this slug + conv_id pair
+  return {
+    ...row,
+    num_turns: Number(row.num_turns),
+    num_subagent_groups: Number(row.num_subagent_groups),
+    total_in: Number(row.total_in),
+    total_out: Number(row.total_out),
+    total_cached: Number(row.total_cached),
+  };
+}

From 574dfcc8a832fe081167cfd55af586463a29e546 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:16:54 -0500
Subject: [PATCH 079/111] feat(datasets): /datasets pages, distribution cards,
 flamegraph, nav
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- /datasets: methodology prose + dataset registry cards (DatasetList)
- /datasets/[slug]: summary stats, model mix, 5 precomputed-histogram
  distribution cards (DistributionCard, log/linear), and a
  searchable/sortable/paginated conversation table
- /datasets/[slug]/conversations/[convId]: per-conversation TraceFlamegraph —
  one bar per turn (cached prefix + uncached input + output), subagent groups
  collapsible (collapsed by default) with expand/collapse-all
- header nav 'Datasets' link
- query-layer test (mock DbClient): not-found paths + numeric coercion

Verified end-to-end against the live branch DB: both datasets list with real
stats, distributions render, flamegraph shows the prefix-reuse signature
(turn 2 fully uncached, later turns mostly cached), expand-all surfaces
subagent subturns. Zero console errors.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../[slug]/conversations/[convId]/page.tsx    |  32 ++
 packages/app/src/app/datasets/[slug]/page.tsx |  32 ++
 packages/app/src/app/datasets/page.tsx        |  99 ++++++
 .../components/datasets/conversation-view.tsx | 101 ++++++
 .../components/datasets/dataset-detail.tsx    | 305 ++++++++++++++++++
 .../src/components/datasets/dataset-list.tsx  |  85 +++++
 .../components/datasets/distribution-card.tsx | 220 +++++++++++++
 .../components/datasets/trace-flamegraph.tsx  | 273 ++++++++++++++++
 packages/app/src/components/header/header.tsx |   6 +
 packages/db/src/queries/datasets.test.ts      | 102 ++++++
 10 files changed, 1255 insertions(+)
 create mode 100644 packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
 create mode 100644 packages/app/src/app/datasets/[slug]/page.tsx
 create mode 100644 packages/app/src/app/datasets/page.tsx
 create mode 100644 packages/app/src/components/datasets/conversation-view.tsx
 create mode 100644 packages/app/src/components/datasets/dataset-detail.tsx
 create mode 100644 packages/app/src/components/datasets/dataset-list.tsx
 create mode 100644 packages/app/src/components/datasets/distribution-card.tsx
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph.tsx
 create mode 100644 packages/db/src/queries/datasets.test.ts

diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
new file mode 100644
index 00000000..75702c1b
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -0,0 +1,32 @@
+import type { Metadata } from 'next';
+
+import { ConversationView } from '@/components/datasets/conversation-view';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+interface Props {
+  params: Promise<{ slug: string; convId: string }>;
+}
+
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug, convId } = await params;
+  const short = convId.slice(0, 12);
+  const title = `Conversation ${short} | ${slug}`;
+  const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`;
+  return {
+    title,
+    description,
+    alternates: { canonical: `${SITE_URL}/datasets/${slug}/conversations/${convId}` },
+    robots: { index: false }, // per-conversation pages are too numerous to index
+  };
+}
+
+export default async function ConversationPage({ params }: Props) {
+  const { slug, convId } = await params;
+  return (
+    <main className="relative">
+      <div className="container mx-auto px-4 pb-8 lg:px-8">
+        <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx
new file mode 100644
index 00000000..f32e3fa6
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/page.tsx
@@ -0,0 +1,32 @@
+import type { Metadata } from 'next';
+
+import { DatasetDetail } from '@/components/datasets/dataset-detail';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+interface Props {
+  params: Promise<{ slug: string }>;
+}
+
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug } = await params;
+  const title = `${slug} | Agentic Datasets`;
+  const description = `Distributions, token statistics, and per-conversation flamegraphs for the ${slug} agentic trace dataset.`;
+  return {
+    title,
+    description,
+    alternates: { canonical: `${SITE_URL}/datasets/${slug}` },
+    openGraph: { title: `${title} | InferenceX`, description, url: `${SITE_URL}/datasets/${slug}` },
+    twitter: { title: `${title} | InferenceX`, description },
+  };
+}
+
+export default async function DatasetDetailPage({ params }: Props) {
+  const { slug } = await params;
+  return (
+    <main className="relative">
+      <div className="container mx-auto px-4 pb-8 lg:px-8">
+        <DatasetDetail slug={slug} />
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx
new file mode 100644
index 00000000..7fe46b93
--- /dev/null
+++ b/packages/app/src/app/datasets/page.tsx
@@ -0,0 +1,99 @@
+import type { Metadata } from 'next';
+
+import { Card } from '@/components/ui/card';
+import { JsonLd } from '@/components/json-ld';
+import { DatasetList } from '@/components/datasets/dataset-list';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+const DESCRIPTION =
+  'The real Claude Code agentic conversation traces that the InferenceX agentic benchmark replays — methodology, distributions, and per-conversation flamegraphs.';
+
+export const metadata: Metadata = {
+  title: 'Agentic Datasets',
+  description: DESCRIPTION,
+  alternates: { canonical: `${SITE_URL}/datasets` },
+  openGraph: {
+    title: 'Agentic Datasets | InferenceX',
+    description: DESCRIPTION,
+    url: `${SITE_URL}/datasets`,
+  },
+  twitter: { title: 'Agentic Datasets | InferenceX', description: DESCRIPTION },
+};
+
+const jsonLd = {
+  '@context': 'https://schema.org',
+  '@type': 'CollectionPage',
+  name: 'InferenceX Agentic Datasets',
+  description: DESCRIPTION,
+  url: `${SITE_URL}/datasets`,
+};
+
+export default function DatasetsPage() {
+  return (
+    <main className="relative">
+      <JsonLd data={jsonLd} />
+      <div className="container mx-auto flex flex-col gap-6 px-4 pb-8 lg:px-8">
+        <section>
+          <Card>
+            <h1 className="mb-2 text-xl font-semibold text-foreground">
+              Agentic Benchmark Datasets
+            </h1>
+            <p className="mb-3 text-sm text-muted-foreground">
+              InferenceX&apos;s agentic benchmark doesn&apos;t replay synthetic prompts — it replays
+              real Claude Code coding sessions captured as <strong>conversation traces</strong>.
+              Each trace is a full multi-turn session: the main agent&apos;s turns plus any
+              subagents it spawned, with per-turn input/output token counts and the 64-token
+              KV-cache block hashes needed to reconstruct prefix-cache reuse. The traces are
+              published openly on HuggingFace under <code>semianalysisai/cc-traces-weka-*</code>{' '}
+              (apache-2.0).
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">
+              How traces are captured
+            </h2>
+            <p className="mb-3 text-sm text-muted-foreground">
+              Production Claude Code sessions are recorded through a logging proxy that captures
+              every API request: its input and output token counts, the model used, timing (TTFT,
+              inter-token latency), and a list of <code>hash_ids</code> — one per 64-token KV block
+              of the request&apos;s input. Subagent invocations are grouped under their parent turn.
+              No prompt or completion text is stored; only token counts and block hashes, so the
+              corpus is shareable while remaining a faithful workload for replay.
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">
+              Cached prefix vs uncached suffix
+            </h2>
+            <p className="mb-3 text-sm text-muted-foreground">
+              Agentic workloads are dominated by prefix reuse: each turn resends the growing
+              conversation, so most of its input is already in the KV cache from prior turns. We
+              reconstruct this exactly. Walking a conversation in order under an idealized infinite
+              cache, a turn&apos;s <strong>cached prefix</strong> is its longest run of leading{' '}
+              <code>hash_ids</code> already seen; the rest is the <strong>uncached suffix</strong>{' '}
+              that must be (re)computed. Blocks are 64 tokens; the split is clamped so cached +
+              uncached equals the turn&apos;s effective input even on a partial final block.
+              Subagents run against a snapshot of the parent cache at spawn (their context is
+              separate and is not folded back into the parent).
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">Dataset variants</h2>
+            <ul className="mb-1 list-disc space-y-1 pl-5 text-sm text-muted-foreground">
+              <li>
+                <strong>full</strong> — every captured request, unmodified.
+              </li>
+              <li>
+                <strong>256k</strong> — requests whose input + output exceeds 256,000 tokens are
+                dropped so every turn fits a 256k context window (used when benchmarking engines
+                configured for a 256k max context).
+              </li>
+            </ul>
+          </Card>
+        </section>
+
+        <section className="flex flex-col gap-3">
+          <h2 className="text-lg font-semibold text-foreground">Datasets</h2>
+          <DatasetList />
+        </section>
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
new file mode 100644
index 00000000..43992c41
--- /dev/null
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -0,0 +1,101 @@
+'use client';
+
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import { useDatasetConversation } from '@/hooks/api/use-datasets';
+
+function compact(n: number): string {
+  const magnitude = Math.abs(n); // thresholds use |n|; the sign survives by dividing n itself
+  const scaled = (divisor: number, suffix: string) => `${(n / divisor).toFixed(1)}${suffix}`;
+  if (magnitude >= 1e9) return scaled(1e9, 'B');
+  if (magnitude >= 1e6) return scaled(1e6, 'M');
+  return magnitude >= 1e3 ? scaled(1e3, 'k') : String(Math.round(n));
+}
+
+export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
+  const { data, isLoading, isError } = useDatasetConversation(slug, convId); // null data = 404
+
+  if (isLoading) {
+    return (
+      <div className="py-12 text-center text-sm text-muted-foreground">Loading conversation…</div>
+    );
+  }
+  if (isError || !data) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">
+        {isError ? 'Failed to load conversation.' : 'Conversation not found.'}{' '}
+        <Link href={`/datasets/${slug}`} className="text-primary underline">
+          Back to dataset
+        </Link>
+      </div>
+    );
+  }
+
+  const cachedPct =
+    data.total_in > 0 ? `${((data.total_cached / data.total_in) * 100).toFixed(0)}%` : '—';
+
+  return (
+    <div className="flex flex-col gap-6">
+      <div>
+        <div className="mb-1 flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
+          <Link href="/datasets" className="hover:text-foreground">
+            Datasets
+          </Link>
+          <span>/</span>
+          <Link href={`/datasets/${slug}`} className="hover:text-foreground">
+            {slug}
+          </Link>
+          <span>/</span>
+          <span className="text-foreground">conversation</span>
+        </div>
+        <h1 className="break-all font-mono text-lg font-semibold text-foreground">
+          {data.conv_id}
+        </h1>
+        {data.models.length > 0 && (
+          <div className="mt-2 flex flex-wrap gap-2">
+            {data.models.map((m) => (
+              <span
+                key={m}
+                className="rounded-md border border-border/40 px-2 py-0.5 text-xs text-foreground"
+              >
+                {m}
+              </span>
+            ))}
+          </div>
+        )}
+      </div>
+
+      <Card className="p-4">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-6">
+          <Stat label="Main turns" value={String(data.num_turns)} />
+          <Stat label="Subagent groups" value={String(data.num_subagent_groups)} />
+          <Stat label="Input" value={`${compact(data.total_in)} tok`} />
+          <Stat label="Output" value={`${compact(data.total_out)} tok`} />
+          <Stat label="Cached" value={`${compact(data.total_cached)} tok`} />
+          <Stat label="Cached %" value={cachedPct} />
+        </dl>
+      </Card>
+
+      <Card className="p-4">
+        <h2 className="mb-3 text-lg font-semibold text-foreground">Flamegraph</h2>
+        <p className="mb-4 text-xs text-muted-foreground">
+          One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
+          click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
+          plus generated output.
+        </p>
+        <TraceFlamegraph structure={data.structure} />
+      </Card>
+    </div>
+  );
+}
+
+function Stat({ label, value }: { label: string; value: string }) {
+  return (
+    <div>
+      <dt className="text-xs text-muted-foreground">{label}</dt>
+      <dd className="text-lg font-semibold tabular-nums text-foreground">{value}</dd>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
new file mode 100644
index 00000000..57c50649
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -0,0 +1,305 @@
+'use client';
+
+import { useState } from 'react';
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import {
+  useDataset,
+  useDatasetConversations,
+  type ConversationSort,
+} from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+
+/** Compact count, e.g. 1234 → "1.2k"; cut-offs sit at the rounding edge (999_999 → "1.0M"). */
+function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (abs >= 999_950_000) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 999_950) return `${(n / 1e6).toFixed(1)}M`;
+  return abs >= 999.5 ? `${(n / 1e3).toFixed(1)}k` : String(Math.round(n));
+}
+
+const PAGE = 50; // Conversations requested per page (`limit`); also the stride used for `offset`.
+
+const SORTS: { value: ConversationSort; label: string }[] = [ // sort dropdown options, in display order
+  { value: 'tokens', label: 'Total input ↓' }, // matches the initial `sort` state in DatasetDetail
+  { value: 'turns', label: 'Turns ↓' },
+  { value: 'subagents', label: 'Subagent groups ↓' },
+  { value: 'id', label: 'Conversation ID' },
+];
+
+/** Detail view for one dataset: summary stats, distributions, and a paged conversation table. */
+export function DatasetDetail({ slug }: { slug: string }) {
+  const { data: dataset, isLoading, isError } = useDataset(slug);
+  const [search, setSearch] = useState('');
+  const [sort, setSort] = useState<ConversationSort>('tokens');
+  const [page, setPage] = useState(0);
+  const { data: convs, isFetching } = useDatasetConversations({
+    slug,
+    search,
+    sort,
+    limit: PAGE,
+    offset: page * PAGE, // `page` is zero-based
+  });
+
+  if (isLoading) {
+    return <div className="py-12 text-center text-sm text-muted-foreground">Loading dataset…</div>;
+  }
+  if (isError || !dataset) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">
+        Dataset not found.{' '}
+        <Link href="/datasets" className="text-primary underline">
+          Back to datasets
+        </Link>
+      </div>
+    );
+  }
+
+  const s = dataset.summary ?? {};
+  const cd = dataset.chart_data ?? {};
+  const total = convs?.total ?? 0;
+  const pageCount = Math.ceil(total / PAGE);
+
+  return (
+    <div className="flex flex-col gap-6">
+      {/* header: back link, title, variant badge, optional HuggingFace link and description */}
+      <div>
+        <div className="mb-1 flex items-center gap-2">
+          <Link href="/datasets" className="text-xs text-muted-foreground hover:text-foreground">
+            ← Datasets
+          </Link>
+        </div>
+        <div className="flex flex-wrap items-baseline justify-between gap-2">
+          <h1 className="text-2xl font-semibold text-foreground">{dataset.label}</h1>
+          <div className="flex items-center gap-2 text-xs">
+            <span className="rounded-full border border-border/50 px-2 py-0.5 uppercase tracking-wide text-muted-foreground">
+              {dataset.variant}
+            </span>
+            {dataset.hf_url && (
+              <a
+                href={dataset.hf_url}
+                target="_blank"
+                rel="noopener noreferrer"
+                onClick={() => track('datasets_hf_link_clicked', { slug })}
+                className="text-primary hover:underline"
+              >
+                View on HuggingFace ↗
+              </a>
+            )}
+          </div>
+        </div>
+        {dataset.description && (
+          <p className="mt-2 max-w-3xl text-sm text-muted-foreground">{dataset.description}</p>
+        )}
+      </div>
+
+      {/* summary stats (missing summary fields fall back to 0 / "—") plus per-model turn counts */}
+      <Card className="p-4">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-6">
+          <Stat label="Conversations" value={dataset.conversation_count.toLocaleString()} />
+          <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
+          <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
+          <Stat label="Subagent turns" value={compact(s.subagentTurns ?? 0)} />
+          <Stat
+            label="Cached input"
+            value={typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—'}
+          />
+          <Stat label="Total tokens" value={compact((s.totalIn ?? 0) + (s.totalOut ?? 0))} />
+        </dl>
+        {s.modelMix && Object.keys(s.modelMix).length > 0 && (
+          <div className="mt-4 border-t border-border/40 pt-3">
+            <div className="mb-1.5 text-xs font-medium text-muted-foreground">
+              Model mix (turns)
+            </div>
+            <div className="flex flex-wrap gap-2">
+              {Object.entries(s.modelMix)
+                .toSorted((a, b) => b[1] - a[1])
+                .map(([model, count]) => (
+                  <span
+                    key={model}
+                    className="rounded-md border border-border/40 px-2 py-0.5 text-xs text-foreground"
+                  >
+                    {model} <span className="text-muted-foreground">{compact(count)}</span>
+                  </span>
+                ))}
+            </div>
+          </div>
+        )}
+      </Card>
+
+      {/* distribution cards — each histogram is read from dataset.chart_data */}
+      <section className="flex flex-col gap-3">
+        <h2 className="text-lg font-semibold text-foreground">Distributions</h2>
+        <div className="grid gap-4 lg:grid-cols-2">
+          <DistributionCard
+            title="Input tokens per turn"
+            unit="tokens"
+            scale="log"
+            distribution={cd.inputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Output tokens per turn"
+            unit="tokens"
+            scale="log"
+            distribution={cd.outputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Turns per conversation"
+            unit="turns"
+            distribution={cd.turnsPerConversation}
+          />
+          <DistributionCard
+            title="Subagent groups per conversation"
+            unit="groups"
+            distribution={cd.subagentGroupsPerConversation}
+          />
+          <DistributionCard
+            title="Cached fraction per turn"
+            unit=""
+            distribution={cd.cachedFractionPerTurn}
+            formatValue={(v) => `${(v * 100).toFixed(0)}%`}
+          />
+        </div>
+      </section>
+
+      {/* conversation list: search + sort controls, one page of rows, then the pager */}
+      <section className="flex flex-col gap-3">
+        <div className="flex flex-wrap items-center justify-between gap-3">
+          <h2 className="text-lg font-semibold text-foreground">
+            Conversations{' '}
+            <span className="text-sm font-normal text-muted-foreground">({total})</span>
+          </h2>
+          <div className="flex items-center gap-2">
+            <input
+              type="text"
+              value={search}
+              onChange={(e) => {
+                setSearch(e.target.value);
+                setPage(0);
+              }}
+              placeholder="Search by ID…"
+              className="h-8 w-40 rounded-md border border-border/40 bg-background px-2 text-xs outline-none focus:border-primary"
+            />
+            <Select
+              value={sort}
+              onValueChange={(v) => {
+                setSort(v as ConversationSort);
+                setPage(0);
+                track('datasets_conversations_sorted', { mode: v });
+              }}
+            >
+              <SelectTrigger className="h-8 w-[12rem] text-xs" aria-label="Sort conversations">
+                <SelectValue />
+              </SelectTrigger>
+              <SelectContent>
+                {SORTS.map((o) => (
+                  <SelectItem key={o.value} value={o.value} className="text-xs">
+                    {o.label}
+                  </SelectItem>
+                ))}
+              </SelectContent>
+            </Select>
+          </div>
+        </div>
+
+        <Card className="overflow-hidden p-0">
+          <table className="w-full text-sm">
+            <thead className="border-b border-border/40 bg-muted/30 text-xs text-muted-foreground">
+              <tr>
+                <th className="px-3 py-2 text-left font-medium">Conversation</th>
+                <th className="px-3 py-2 text-right font-medium">Turns</th>
+                <th className="px-3 py-2 text-right font-medium">Subagents</th>
+                <th className="px-3 py-2 text-right font-medium">Input</th>
+                <th className="px-3 py-2 text-right font-medium">Output</th>
+                <th className="px-3 py-2 text-right font-medium">Cached</th>
+              </tr>
+            </thead>
+            <tbody>
+              {(convs?.items ?? []).map((c) => {
+                const cachedPct =
+                  c.total_in > 0 ? `${((c.total_cached / c.total_in) * 100).toFixed(0)}%` : '—';
+                return (
+                  <tr
+                    key={c.conv_id}
+                    className="border-b border-border/20 last:border-0 hover:bg-accent/40"
+                  >
+                    <td className="px-3 py-2">
+                      <Link
+                        href={`/datasets/${slug}/conversations/${c.conv_id}`}
+                        onClick={() => track('datasets_conversation_clicked', { slug })}
+                        className="font-mono text-xs text-primary hover:underline"
+                      >
+                        {c.conv_id.length > 20 ? `${c.conv_id.slice(0, 20)}…` : c.conv_id}
+                      </Link>
+                      {c.models.length > 0 && (
+                        <span className="ml-2 text-[11px] text-muted-foreground">
+                          {c.models.length} model{c.models.length === 1 ? '' : 's'}
+                        </span>
+                      )}
+                    </td>
+                    <td className="px-3 py-2 text-right tabular-nums">{c.num_turns}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{c.num_subagent_groups}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{compact(c.total_in)}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{compact(c.total_out)}</td>
+                    <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">
+                      {cachedPct}
+                    </td>
+                  </tr>
+                );
+              })}
+              {!isFetching && (convs?.items.length ?? 0) === 0 && (
+                <tr>
+                  <td colSpan={6} className="px-3 py-8 text-center text-xs text-muted-foreground">
+                    No conversations match.
+                  </td>
+                </tr>
+              )}
+            </tbody>
+          </table>
+        </Card>
+
+        {pageCount > 1 && (
+          <div className="flex items-center justify-center gap-3 text-xs">
+            <button
+              type="button"
+              disabled={page === 0}
+              onClick={() => setPage((p) => Math.max(0, p - 1))}
+              className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+            >
+              ← Prev
+            </button>
+            <span className="text-muted-foreground">
+              Page {page + 1} of {pageCount}
+            </span>
+            <button
+              type="button"
+              disabled={page >= pageCount - 1}
+              onClick={() => setPage((p) => Math.min(pageCount - 1, p + 1))}
+              className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+            >
+              Next →
+            </button>
+          </div>
+        )}
+      </section>
+    </div>
+  );
+}
+
+/** One stat tile: a small muted caption (<dt>) followed by a bold tabular figure (<dd>). */
+function Stat(props: { label: string; value: string }) {
+  const { label, value } = props;
+  const caption = <dt className="text-xs text-muted-foreground">{label}</dt>;
+  const figure = <dd className="text-lg font-semibold tabular-nums text-foreground">{value}</dd>;
+  // The wrapping <div> keeps the pair together as one cell of the surrounding <dl> grid.
+  return <div>{caption}{figure}</div>;
+}
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
new file mode 100644
index 00000000..5fcc0dfe
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -0,0 +1,85 @@
+'use client';
+
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+
+/** Compact count, e.g. 1234 → "1.2k"; cut-offs sit at the rounding edge (999_999 → "1.0M"). */
+function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (abs >= 999_950_000) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 999_950) return `${(n / 1e6).toFixed(1)}M`;
+  return abs >= 999.5 ? `${(n / 1e3).toFixed(1)}k` : String(Math.round(n));
+}
+
+function DatasetCard({ d }: { d: DatasetRecord }) { // One dataset rendered as a clickable summary card.
+  const s = d.summary ?? {}; // tolerate a missing summary; each stat below supplies its own fallback
+  const cachedPct = typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—';
+  return (
+    <Link
+      href={`/datasets/${d.slug}`}
+      onClick={() => track('datasets_card_clicked', { slug: d.slug })}
+      className="block transition-colors hover:[&_*]:border-primary/40"
+    >
+      <Card className="h-full p-4 transition-colors hover:border-primary/40">
+        <div className="mb-1 flex items-baseline justify-between gap-2">
+          <h3 className="text-base font-semibold text-foreground">{d.label}</h3>
+          <span className="rounded-full border border-border/50 px-2 py-0.5 text-[10px] uppercase tracking-wide text-muted-foreground">
+            {d.variant}
+          </span>
+        </div>
+        {d.description && (
+          <p className="mb-3 line-clamp-2 text-xs text-muted-foreground">{d.description}</p>
+        )}
+        <dl className="grid grid-cols-2 gap-x-4 gap-y-1.5 text-xs">
+          <Stat label="Conversations" value={d.conversation_count.toLocaleString()} />
+          <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
+          <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
+          <Stat label="Cached input" value={cachedPct} />
+          <Stat label="Total input" value={`${compact(s.totalIn ?? 0)} tok`} />
+          <Stat label="Total output" value={`${compact(s.totalOut ?? 0)} tok`} />
+        </dl>
+        <div className="mt-3 text-xs font-medium text-primary">View dataset →</div>
+      </Card>
+    </Link>
+  );
+}
+
+/** Inline stat row: label and value pushed to opposite ends of a baseline-aligned flex row. */
+function Stat(props: { label: string; value: string }) {
+  const { label, value } = props;
+  const caption = <dt className="text-muted-foreground">{label}</dt>;
+  const figure = <dd className="tabular-nums font-medium text-foreground">{value}</dd>;
+  const rowClass = 'flex items-baseline justify-between gap-2';
+  return <div className={rowClass}>{caption}{figure}</div>;
+}
+
+/** Card grid of every ingested dataset, with loading, error, and empty placeholders. */
+export function DatasetList() {
+  const query = useDatasets();
+
+  // The three placeholder states share one element shape; only classes and copy differ.
+  const notice = (className: string, text: string) => <div className={className}>{text}</div>;
+
+  // Order matters: loading first, then failure, then the empty list.
+  if (query.isLoading) {
+    return notice('py-12 text-center text-sm text-muted-foreground', 'Loading datasets…');
+  }
+
+  // A settled query without data is treated the same as an explicit error.
+  const datasets = query.data;
+  if (query.isError || !datasets) {
+    return notice('py-12 text-center text-sm text-destructive', 'Failed to load datasets.');
+  }
+
+  if (datasets.length === 0) {
+    return notice('py-12 text-center text-sm text-muted-foreground', 'No datasets ingested yet.');
+  }
+
+  const cards = datasets.map((d) => <DatasetCard key={d.id} d={d} />);
+
+  // Single column by default, two columns from the `sm` breakpoint up.
+  return <div className="grid gap-4 sm:grid-cols-2">{cards}</div>;
+}
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
new file mode 100644
index 00000000..7abc367f
--- /dev/null
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -0,0 +1,220 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { Card } from '@/components/ui/card';
+import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover';
+import type { Distribution } from '@/hooks/api/use-datasets';
+
+/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M"; |n| < 1 keeps 2 decimals. */
+function compact(n: number): string {
+  const abs = Math.abs(n);
+  // Cut-offs sit where toFixed(1) would round up into the next unit (no "1000.0k").
+  if (abs >= 999_950_000) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 999_950) return `${(n / 1e6).toFixed(1)}M`;
+  if (abs >= 999.5) return `${(n / 1e3).toFixed(1)}k`;
+  return abs > 0 && abs < 1 ? n.toFixed(2) : String(Math.round(n));
+}
+
+interface DistributionCardProps {
+  title: string; // card heading
+  subtitle?: string; // optional line rendered under the heading
+  unit: string; // appended to the stats line and hover range; also the x-axis caption
+  distribution?: Distribution; // when absent or without bins the card renders "No data"
+  scale?: 'log' | 'linear'; // only toggles the "log scale" badge; bars are laid out by position
+  /** Format the x value (defaults to compact). e.g. percent for cached fraction. */
+  formatValue?: (v: number) => string;
+}
+
+const W = 720; // chart coordinate width passed to ChartHover
+const H = 240; // chart coordinate height passed to ChartHover
+const PAD = { top: 12, right: 16, bottom: 48, left: 52 }; // plot insets; bottom holds ticks + unit
+
+/**
+ * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a
+ * themeable bar chart with median/p90 guide lines and a hover tooltip. Bars are
+ * drawn at equal visual width, so `scale` only toggles the "log scale" badge;
+ * log-spaced bin edges are expected to arrive precomputed. Shows "No data" without bins.
+ */
+export function DistributionCard({
+  title,
+  subtitle,
+  unit,
+  distribution,
+  scale = 'linear',
+  formatValue = compact,
+}: DistributionCardProps) {
+  const computed = useMemo(() => {
+    const bins = distribution?.bins ?? [];
+    if (bins.length === 0) return null;
+    const maxCount = Math.max(1, ...bins.map((b) => b.count));
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const n = bins.length;
+    const barW = innerW / n;
+    // Map a data value to an x pixel by locating its bin (positional — works for
+    // both linear and log bins since the edges are precomputed at ingest).
+    const valueToX = (v: number): number | null => {
+      for (let i = 0; i < n; i++) {
+        if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) {
+          return PAD.left + (i + 0.5) * barW;
+        }
+      }
+      if (v <= bins[0].x0) return PAD.left + 0.5 * barW;
+      return PAD.left + (n - 0.5) * barW;
+    };
+    return { bins, maxCount, innerW, innerH, n, barW, valueToX };
+  }, [distribution]);
+
+  if (!computed) {
+    return (
+      <Card className="p-4">
+        <div className="mb-1 text-sm font-medium text-foreground">{title}</div>
+        <div className="grid h-[240px] place-items-center text-xs text-muted-foreground">
+          No data
+        </div>
+      </Card>
+    );
+  }
+
+  const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed;
+  const stats = distribution?.stats;
+
+  const guides = stats
+    ? ([
+        { label: 'median', value: stats.median, color: '#3b82f6' },
+        { label: 'p90', value: stats.p90, color: '#f59e0b' },
+      ] as const)
+    : [];
+
+  // X tick labels from a few bin edges; de-duplicated so charts with < 4 bins get unique keys.
+  const tickIdxs = Array.from(new Set([0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1]));
+
+  const resolve = (fraction: number) => {
+    const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n)));
+    const b = bins[i];
+    const items: HoverItem[] = [
+      {
+        color: 'currentColor',
+        label: 'Range',
+        value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`,
+      },
+      { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() },
+    ];
+    return { items };
+  };
+
+  return (
+    <Card className="p-4">
+      <div className="mb-0.5 flex items-baseline justify-between gap-2">
+        <span className="text-sm font-medium text-foreground">{title}</span>
+        {scale === 'log' && (
+          <span className="text-[10px] uppercase tracking-wide text-muted-foreground">
+            log scale
+          </span>
+        )}
+      </div>
+      {subtitle && <div className="mb-1 text-xs text-muted-foreground">{subtitle}</div>}
+      {stats && (
+        <div className="mb-2 text-xs text-muted-foreground">
+          n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '}
+          {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit}
+        </div>
+      )}
+      <div className="w-full text-muted-foreground">
+        <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+          {/* bars */}
+          {bins.map((b, i) => {
+            const h = (b.count / maxCount) * innerH;
+            const x = PAD.left + i * barW;
+            const y = PAD.top + (innerH - h);
+            return (
+              <rect
+                key={i}
+                x={x}
+                y={y}
+                width={Math.max(0, barW - 1)}
+                height={h}
+                className="fill-primary/55"
+              />
+            );
+          })}
+
+          {/* guide lines */}
+          {guides.map((g) => {
+            const x = valueToX(g.value);
+            if (x === null) return null;
+            return (
+              <line
+                key={g.label}
+                x1={x}
+                x2={x}
+                y1={PAD.top}
+                y2={PAD.top + innerH}
+                stroke={g.color}
+                strokeWidth={2}
+                strokeDasharray="5 3"
+                opacity={0.95}
+              />
+            );
+          })}
+
+          {/* x axis */}
+          <line
+            x1={PAD.left}
+            x2={PAD.left + innerW}
+            y1={PAD.top + innerH}
+            y2={PAD.top + innerH}
+            stroke="currentColor"
+            opacity={0.2}
+          />
+          {tickIdxs.map((i, k) => {
+            const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle';
+            const x = PAD.left + (i + 0.5) * barW;
+            return (
+              <text
+                key={i}
+                x={x}
+                y={PAD.top + innerH + 14}
+                fontSize={11}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor={anchor}
+              >
+                {formatValue(bins[i].x0)}
+              </text>
+            );
+          })}
+          <text
+            x={W / 2}
+            y={H - 16}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.55}
+            textAnchor="middle"
+          >
+            {unit}
+          </text>
+
+          {/* guide legend */}
+          {guides.map((g, i) => (
+            <g key={g.label} transform={`translate(${PAD.left + i * 110}, ${PAD.top})`}>
+              <line
+                x1={0}
+                x2={12}
+                y1={4}
+                y2={4}
+                stroke={g.color}
+                strokeWidth={2}
+                strokeDasharray="5 3"
+              />
+              <text x={16} y={7} fontSize={10} fill="currentColor" opacity={0.85}>
+                {g.label} {formatValue(g.value)}
+              </text>
+            </g>
+          ))}
+        </ChartHover>
+      </div>
+    </Card>
+  );
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
new file mode 100644
index 00000000..12588582
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -0,0 +1,273 @@
+'use client';
+
+import { useCallback, useMemo, useState } from 'react';
+
+import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
+
+/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M", 3e9 → "3.0B" (never "1000.0k"). */
+function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (abs >= 999_950_000) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 999_950) return `${(n / 1e6).toFixed(1)}M`;
+  return abs >= 999.5 ? `${(n / 1e3).toFixed(1)}k` : String(Math.round(n));
+}
+
+// Segment colors for the stacked bars, legend swatches, and tooltip labels. Fixed hex
+// values rather than theme tokens, so each meaning keeps the same hue in light/dark.
+const SEG = {
+  cached: '#10b981', // emerald-500 — input served from prefix cache
+  uncached: '#f59e0b', // amber-500 — input that must be (re)computed
+  output: '#8b5cf6', // violet-500 — generated tokens
+} as const;
+
+const LEGEND = [ // legend entries, in the same order the bar segments are stacked
+  { key: 'cached', label: 'Cached prefix', color: SEG.cached },
+  { key: 'uncached', label: 'Uncached input', color: SEG.uncached },
+  { key: 'output', label: 'Output', color: SEG.output },
+] as const;
+
+interface VisibleRow { // One rendered line: a turn, a subagent group header, or an expanded subturn.
+  key: string; // React key: `t-<i>`, `g-<i>`, or `g-<i>-c-<ci>`
+  label: string;
+  sublabel?: string; // model name for turns; "<n> turns · <s>s" for groups (shown in the tooltip)
+  cached: number;
+  uncached: number;
+  output: number;
+  total: number; // node.in + node.out
+  indent: number; // 0 for top-level rows, 1 for children of an expanded group
+  isGroup: boolean;
+  isExpanded: boolean; // always false on non-group rows
+  groupIndex?: number; // index into structure.nodes; set only on group rows
+}
+
+interface TooltipState { // Hover state backing the floating tooltip; null while nothing is hovered.
+  x: number; // pointer clientX at the last mouse move
+  y: number; // pointer clientY at the last mouse move
+  row: VisibleRow; // row whose bar is under the pointer
+}
+
+/**
+ * Per-conversation flamegraph driven by the precomputed `structure` JSONB.
+ * One row per turn; subagent groups render a collapsible header with indented
+ * children (collapsed by default). Each bar stacks cached-prefix + uncached input
+ * + output, scaled to the widest visible turn; group bars are clamped to that width.
+ */
+export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) {
+  const nodes = structure.nodes;
+
+  // Subagent groups collapsed by default.
+  const [expanded, setExpanded] = useState<Set<number>>(() => new Set());
+  const [tooltip, setTooltip] = useState<TooltipState | null>(null);
+
+  const groupIndexes = useMemo(() => {
+    const out: number[] = [];
+    nodes.forEach((node, i) => {
+      if (node.kind === 'subagent') out.push(i);
+    });
+    return out;
+  }, [nodes]);
+
+  const toggle = useCallback((i: number) => {
+    setExpanded((prev) => {
+      const next = new Set(prev);
+      if (next.has(i)) next.delete(i);
+      else next.add(i);
+      return next;
+    });
+  }, []);
+
+  const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]);
+  const collapseAll = useCallback(() => setExpanded(new Set()), []);
+
+  const rows = useMemo<VisibleRow[]>(() => {
+    const out: VisibleRow[] = [];
+    let turnNo = 0;
+    nodes.forEach((node: StructureNode, i) => {
+      if (node.kind === 'turn') {
+        turnNo += 1;
+        out.push({
+          key: `t-${i}`,
+          label: `Turn ${turnNo}`,
+          sublabel: node.model ?? undefined,
+          cached: node.cached,
+          uncached: node.uncached,
+          output: node.out,
+          total: node.in + node.out,
+          indent: 0,
+          isGroup: false,
+          isExpanded: false,
+        });
+      } else {
+        const isExpanded = expanded.has(i);
+        out.push({
+          key: `g-${i}`,
+          label: `${node.label}`,
+          sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${
+            node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : ''
+          }`,
+          cached: node.cached,
+          uncached: node.uncached,
+          output: node.out,
+          total: node.in + node.out,
+          indent: 0,
+          isGroup: true,
+          isExpanded,
+          groupIndex: i,
+        });
+        if (isExpanded) {
+          node.children.forEach((child, ci) => {
+            out.push({
+              key: `g-${i}-c-${ci}`,
+              label: `↳ subturn ${ci + 1}`,
+              sublabel: child.model ?? undefined,
+              cached: child.cached,
+              uncached: child.uncached,
+              output: child.out,
+              total: child.in + child.out,
+              indent: 1,
+              isGroup: false,
+              isExpanded: false,
+            });
+          });
+        }
+      }
+    });
+    return out;
+  }, [nodes, expanded]);
+
+  const maxTotal = useMemo(
+    () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)),
+    [rows],
+  );
+
+  const onMove = (e: React.MouseEvent, row: VisibleRow) => {
+    setTooltip({ x: e.clientX, y: e.clientY, row });
+  };
+
+  return (
+    <div className="relative">
+      <div className="mb-3 flex flex-wrap items-center justify-between gap-3">
+        <div className="flex items-center gap-3 text-xs">
+          {LEGEND.map((l) => (
+            <span key={l.key} className="inline-flex items-center gap-1.5">
+              <span
+                className="inline-block size-3 rounded-sm"
+                style={{ backgroundColor: l.color }}
+              />
+              <span className="text-muted-foreground">{l.label}</span>
+            </span>
+          ))}
+        </div>
+        {groupIndexes.length > 0 && (
+          <div className="flex items-center gap-1.5">
+            <button
+              type="button"
+              onClick={expandAll}
+              className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+            >
+              Expand all
+            </button>
+            <button
+              type="button"
+              onClick={collapseAll}
+              className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+            >
+              Collapse all
+            </button>
+          </div>
+        )}
+      </div>
+
+      <div className="flex flex-col gap-0.5">
+        {rows.map((row) => {
+          const barFrac = Math.min(1, row.total / maxTotal); // groups can exceed maxTotal
+          const pct = (v: number) => (row.total > 0 ? (v / row.total) * 100 : 0); // 0, not NaN%
+          const [cw, uw] = [pct(row.cached), pct(row.uncached)];
+          const ow = pct(row.output);
+          return (
+            <div
+              key={row.key}
+              className="flex items-center gap-2"
+              style={{ paddingLeft: row.indent * 20 }}
+            >
+              {/* label / group toggle */}
+              <div className="flex w-44 shrink-0 items-center gap-1 truncate">
+                {row.isGroup ? (
+                  <button
+                    type="button"
+                    onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
+                    className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+                  >
+                    <span className="inline-block w-3 text-muted-foreground">
+                      {row.isExpanded ? '▾' : '▸'}
+                    </span>
+                    <span className="truncate">{row.label}</span>
+                  </button>
+                ) : (
+                  <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
+                )}
+              </div>
+
+              {/* stacked bar */}
+              <div
+                className="relative h-5 flex-1 cursor-default"
+                onMouseMove={(e) => onMove(e, row)}
+                onMouseLeave={() => setTooltip(null)}
+              >
+                <div
+                  className={`flex h-full overflow-hidden rounded-sm ${
+                    row.isGroup ? 'opacity-70 ring-1 ring-border/50' : ''
+                  }`}
+                  style={{ width: `${Math.max(0.5, barFrac * 100)}%` }}
+                >
+                  <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
+                  <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
+                  <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                </div>
+              </div>
+
+              {/* total */}
+              <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
+                {compact(row.total)}
+              </div>
+            </div>
+          );
+        })}
+      </div>
+
+      {tooltip && (
+        <div
+          className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
+          style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
+        >
+          <div className="mb-1 font-medium text-foreground">
+            {tooltip.row.label}
+            {tooltip.row.sublabel ? (
+              <span className="ml-1 font-normal text-muted-foreground">{tooltip.row.sublabel}</span>
+            ) : null}
+          </div>
+          <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
+            <span style={{ color: SEG.cached }}>Cached prefix</span>
+            <span className="text-right tabular-nums text-foreground">
+              {compact(tooltip.row.cached)}
+            </span>
+            <span style={{ color: SEG.uncached }}>Uncached input</span>
+            <span className="text-right tabular-nums text-foreground">
+              {compact(tooltip.row.uncached)}
+            </span>
+            <span style={{ color: SEG.output }}>Output</span>
+            <span className="text-right tabular-nums text-foreground">
+              {compact(tooltip.row.output)}
+            </span>
+            <span>Cached %</span>
+            <span className="text-right tabular-nums text-foreground">
+              {tooltip.row.cached + tooltip.row.uncached > 0
+                ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
+                : '—'}
+            </span>
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx
index 57965518..5725d99f 100644
--- a/packages/app/src/components/header/header.tsx
+++ b/packages/app/src/components/header/header.tsx
@@ -46,6 +46,12 @@ const NAV_LINKS = [
     testId: 'nav-link-supporters',
     event: 'header_supporters_clicked',
   },
+  {
+    href: '/datasets',
+    label: 'Datasets',
+    testId: 'nav-link-datasets',
+    event: 'header_datasets_clicked',
+  },
   { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' },
   { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' },
 ] as const;
diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts
new file mode 100644
index 00000000..c1676445
--- /dev/null
+++ b/packages/db/src/queries/datasets.test.ts
@@ -0,0 +1,102 @@
+import { describe, expect, it } from 'vitest';
+
+import type { DbClient } from '../connection.js';
+import { getConversation, listConversations, listDatasets } from './datasets.js';
+
+/**
+ * Mock DbClient: returns canned result sets in call order. Each call to the
+ * tagged-template `sql` shifts the next queued rows array. The query text is
+ * ignored — these tests assert the JS-side shaping/coercion, not SQL.
+ */
+function mockSql(queue: unknown[][]): DbClient {
+  const responses = [...queue];
+  return (() => Promise.resolve(responses.shift() ?? [])) as unknown as DbClient;
+}
+
+describe('listDatasets', () => {
+  it('coerces conversation_count to a number', async () => {
+    const sql = mockSql([
+      [
+        {
+          id: 'a/b',
+          slug: 'b',
+          label: 'B',
+          variant: 'full',
+          conversation_count: '393',
+          summary: {},
+        },
+      ],
+    ]);
+    const out = await listDatasets(sql);
+    expect(out).toHaveLength(1);
+    expect(out[0].conversation_count).toBe(393);
+    expect(typeof out[0].conversation_count).toBe('number');
+  });
+});
+
+describe('listConversations', () => {
+  it('returns null when the dataset slug is unknown', async () => {
+    const sql = mockSql([[]]); // datasets lookup → no rows
+    expect(await listConversations(sql, 'missing')).toBeNull();
+  });
+
+  it('returns total + numerically-coerced items', async () => {
+    const sql = mockSql([
+      [{ id: 'ds-id' }], // datasets lookup
+      [{ n: 2 }], // count
+      [
+        {
+          conv_id: 'c1',
+          models: ['m'],
+          num_turns: '5',
+          num_subagent_groups: '1',
+          total_in: '1000',
+          total_out: '200',
+          total_cached: '900',
+        },
+      ], // items
+    ]);
+    const out = await listConversations(sql, 'b', { sort: 'tokens' });
+    expect(out).not.toBeNull();
+    expect(out!.total).toBe(2);
+    expect(out!.items[0]).toMatchObject({
+      conv_id: 'c1',
+      num_turns: 5,
+      num_subagent_groups: 1,
+      total_in: 1000,
+      total_out: 200,
+      total_cached: 900,
+    });
+    expect(typeof out!.items[0].total_in).toBe('number');
+  });
+});
+
+describe('getConversation', () => {
+  it('returns null when the conversation is missing', async () => {
+    const sql = mockSql([[]]);
+    expect(await getConversation(sql, 'b', 'nope')).toBeNull();
+  });
+
+  it('coerces counts and passes through the structure', async () => {
+    const structure = { blockSize: 64, nodes: [], totals: {} };
+    const sql = mockSql([
+      [
+        {
+          conv_id: 'c1',
+          models: ['m'],
+          num_turns: '3',
+          num_subagent_groups: '0',
+          total_in: '500',
+          total_out: '100',
+          total_cached: '450',
+          structure,
+        },
+      ],
+    ]);
+    const out = await getConversation(sql, 'b', 'c1');
+    expect(out).not.toBeNull();
+    expect(out!.num_turns).toBe(3);
+    expect(out!.total_cached).toBe(450);
+    expect(out!.structure).toBe(structure);
+  });
+});

From 0c50139594a99adcc43f558d0b80ae08870af20e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:18:38 -0500
Subject: [PATCH 080/111] docs(ingest): note the separate agentic-dataset
 ingest script

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/agents/ingest.md | 188 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 .claude/agents/ingest.md

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
new file mode 100644
index 00000000..aa0099ac
--- /dev/null
+++ b/.claude/agents/ingest.md
@@ -0,0 +1,188 @@
+---
+name: ingest
+description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL.
+tools: Bash, Read, Edit, Write
+---
+
+You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`.
+
+## Environment
+
+- **Repo root**: `/Users/quilicic/InferenceX-app`
+- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements:
+  - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname.
+  - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`.
+- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000)
+- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app`
+- **INVALIDATE_SECRET** lives in repo root `.env` under that key.
+- **GitHub auth**: `gh` calls use the CLI's stored login; the ingest script reads the `GITHUB_TOKEN` env var, so pass it as `GITHUB_TOKEN=$(gh auth token)`.
+
+## Standard ingest
+
+```bash
+cd /Users/quilicic/InferenceX-app/packages/db
+DATABASE_WRITE_URL='<provided direct non-pooled write URL>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts --download <RUN_ID> SemiAnalysisAI/InferenceX
+```
+
+Then refresh the materialized view (the script's auto-refresh sometimes races):
+`REFRESH MATERIALIZED VIEW latest_benchmarks;`
+
+## Cache purge (always do after any DB mutation)
+
+```bash
+SECRET=$(grep "^INVALIDATE_SECRET=" /Users/quilicic/InferenceX-app/.env | cut -d= -f2- | tr -d '"') # -f2- keeps any '=' inside the secret
+# Localhost (port 3002, NOT 3000)
+curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate
+# Preview (subshell, so the cwd is not left inside the directory removed below)
+mkdir -p /tmp/vp && (cd /tmp/vp \
+  && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \
+  && vercel curl /api/v1/invalidate \
+       --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \
+       --yes -- -sS -X POST -H "Authorization: Bearer $SECRET")
+rm -rf /tmp/vp
+```
+
+## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision))
+
+```sql
+BEGIN;
+DELETE FROM benchmark_results br USING configs c
+WHERE c.id = br.config_id
+  AND c.model = '<model>' AND c.hardware = '<hw>' AND c.framework = '<framework>'
+  AND c.precision = '<prec>' AND br.benchmark_type = '<bt>';
+DELETE FROM availability
+WHERE model = '<model>' AND hardware = '<hw>' AND framework = '<framework>'
+  AND precision = '<prec>' AND benchmark_type = '<bt>';
+COMMIT;
+```
+
+If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. Don't do this unless asked.
+
+## AIPerf tagging — DO NOT use by default
+
+AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision).
+
+Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`.
+
+<details>
+<summary>Explicit-request-only: how to tag a run as `spec_decoding='aiperf'`</summary>
+
+```bash
+RID=<run_id>
+WORKDIR=$(mktemp -d -t "aiperf-$RID-XXXX") # not TMPDIR: overriding it would send every child process's temp files here
+cd "$WORKDIR"
+
+# 1. Logical-name dedup + download
+gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \
+  --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \
+  | python3 -c "
+import sys, re, collections
+seen = collections.OrderedDict()
+for line in sys.stdin:
+    name, url, created = line.rstrip('\n').split('\t')
+    key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name)
+    if key not in seen or seen[key][2] < created:
+        seen[key] = (name, url, created)
+for _, (name, url, _) in seen.items():
+    print(f'{name}\t{url}')
+" > artifacts.tsv
+while IFS=$'\t' read -r name url; do
+  mkdir -p "$name"
+  gh api "$url" > "$name/a.zip" 2>/dev/null
+  unzip -oq "$name/a.zip" -d "$name" 2>/dev/null
+  rm "$name/a.zip"
+done < artifacts.tsv
+
+# 2. Patch every benchmark JSON to set spec_decoding=aiperf
+find "$WORKDIR" -name "*.json" | python3 -c "
+import sys, json
+for fn in (l.strip() for l in sys.stdin):
+    try:
+        with open(fn) as f: d = json.load(f)
+    except Exception: continue
+    rows = d if isinstance(d, list) else [d]
+    if not rows or not isinstance(rows[0], dict): continue
+    changed = False
+    for row in rows:
+        if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row):
+            row['spec_decoding'] = 'aiperf'
+            changed = True
+    if changed:
+        with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f)
+"
+
+# 3. Ingest in CI mode (reads INGEST_* env vars)
+cd /Users/quilicic/InferenceX-app/packages/db
+INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH="$WORKDIR" INGEST_REPO=SemiAnalysisAI/InferenceX \
+DATABASE_WRITE_URL='<provided direct non-pooled write URL>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts
+rm -rf "$WORKDIR"
+```
+
+The `spec_method` column has a lowercase check constraint — always lowercase.
+
+</details>
+
+## Don't auto-mention "AIPerf" in changelog entries
+
+Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`.
+
+## Adding a perf changelog entry
+
+Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `<model>-<precision>-<hw>-<framework>` (matches what the user actually sees in the filter chain).
+
+```sql
+INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link)
+SELECT id, date, '', '', ARRAY['<model>-<precision>-<hw>-<framework>'], '<description>', NULL
+FROM latest_workflow_runs WHERE github_run_id = <RUN_ID>
+RETURNING id, workflow_run_id, date::text, description;
+```
+
+Description convention from prior entries: `<HW upper> <Model> Ingest #<N> (<note>)` — e.g.
+
+- `B200 Kimi Ingest #1`
+- `MI355X Kimi Ingest #2`
+- `H200 Kimi Ingest #1 (mmap cache)`
+
+If the user doesn't specify a description, ask for one or derive it from the run name.
+
+## Common gotchas
+
+- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <wr_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`.
+- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT.
+- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites.
+- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the `_<runner>_<attempt>` suffix.
+- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection.
+- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with the prior full sweep via `UPDATE benchmark_results SET date = '<full-sweep-date>' WHERE workflow_run_id = <wr_id>` (always scope it to the re-run's workflow run) so the frontend's max-date-per-group dedup doesn't drop the older sweep.
+
+## Process
+
+1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<RID> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips).
+2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding.
+3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line.
+4. **Refresh materialized view**.
+5. **Add changelog entry** if the user asked or if the run is a "marker" worth surfacing.
+6. **Purge both caches** (localhost 3002 + preview).
+7. **Report** the row count, date, hardware, run id, and changelog id (if added).
+
+## Related: ingesting agentic _datasets_ (not benchmark runs)
+
+This agent ingests **benchmark runs**. The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow:
+
+```bash
+cd packages/db && DATABASE_WRITE_URL='<direct write url>' \
+  pnpm exec tsx src/ingest-weka-dataset.ts <hf-dataset-id> \
+  [--label "…"] [--variant full|256k] [--description "…"] [--limit N]
+```
+
+It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
+
+## Don't
+
+- Don't push to git unless the user asked.
+- Don't ingest without permission if it's a delete+reingest of existing data.
+- Don't hit port 3000 for cache purge — it's a different project.
+- Don't capitalize `spec_method` values (DB has a lowercase check constraint).

From 2ae6ebaab06b27bd65f0601aa6ae7905cbd01d79 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:24:01 -0500
Subject: [PATCH 081/111] fix(datasets): flamegraph scroll box + dual-scale
 group bars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wrap rows in a fixed-height (max-h-[520px]) vertically scrollable bordered box.
Subagent group headers carry aggregate token totals that dwarf any single turn,
which made their bars overflow the row (width >> 100%). Now turns/subturns use a
per-turn scale while group headers use a separate group-aggregate scale (slim
muted strips), both clamped to the track — groups stay comparable to each other
and nothing overflows.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/datasets/trace-flamegraph.tsx  | 111 ++++++++++--------
 1 file changed, 63 insertions(+), 48 deletions(-)

diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 12588582..12cc14ec 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -135,10 +135,19 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
     return out;
   }, [nodes, expanded]);
 
+  // Two scales: leaf turns/subturns share a per-turn axis (the primary signal —
+  // how cached/uncached evolves), while subagent group headers carry aggregates
+  // orders of magnitude larger, so they get their own axis to stay comparable to
+  // each other. Group bars render slim + muted, so the mixed scale reads as a
+  // distinct "group summary" track rather than a contradiction.
   const maxTotal = useMemo(
     () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)),
     [rows],
   );
+  const maxGroupTotal = useMemo(
+    () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)),
+    [rows],
+  );
 
   const onMove = (e: React.MouseEvent, row: VisibleRow) => {
     setTooltip({ x: e.clientX, y: e.clientY, row });
@@ -178,61 +187,67 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
         )}
       </div>
 
-      <div className="flex flex-col gap-0.5">
-        {rows.map((row) => {
-          const barFrac = row.total / maxTotal;
-          const cw = (row.cached / row.total) * 100;
-          const uw = (row.uncached / row.total) * 100;
-          const ow = (row.output / row.total) * 100;
-          return (
-            <div
-              key={row.key}
-              className="flex items-center gap-2"
-              style={{ paddingLeft: row.indent * 20 }}
-            >
-              {/* label / group toggle */}
-              <div className="flex w-44 shrink-0 items-center gap-1 truncate">
-                {row.isGroup ? (
-                  <button
-                    type="button"
-                    onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
-                    className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
-                  >
-                    <span className="inline-block w-3 text-muted-foreground">
-                      {row.isExpanded ? '▾' : '▸'}
-                    </span>
-                    <span className="truncate">{row.label}</span>
-                  </button>
-                ) : (
-                  <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
-                )}
-              </div>
-
-              {/* stacked bar */}
+      <div className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2">
+        <div className="flex flex-col gap-0.5">
+          {rows.map((row) => {
+            // Group headers use the group axis; turns/subturns use the per-turn
+            // axis. Clamp to the track width either way.
+            const denom = row.isGroup ? maxGroupTotal : maxTotal;
+            const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100));
+            const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0;
+            const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
+            const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
+            return (
               <div
-                className="relative h-5 flex-1 cursor-default"
-                onMouseMove={(e) => onMove(e, row)}
-                onMouseLeave={() => setTooltip(null)}
+                key={row.key}
+                className="flex items-center gap-2"
+                style={{ paddingLeft: row.indent * 20 }}
               >
+                {/* label / group toggle */}
+                <div className="flex w-44 shrink-0 items-center gap-1 truncate">
+                  {row.isGroup ? (
+                    <button
+                      type="button"
+                      onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
+                      className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+                    >
+                      <span className="inline-block w-3 text-muted-foreground">
+                        {row.isExpanded ? '▾' : '▸'}
+                      </span>
+                      <span className="truncate">{row.label}</span>
+                    </button>
+                  ) : (
+                    <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
+                  )}
+                </div>
+
+                {/* stacked bar — group headers render as a slim muted summary
+                    strip so they read as aggregates, not individual turns. */}
                 <div
-                  className={`flex h-full overflow-hidden rounded-sm ${
-                    row.isGroup ? 'opacity-70 ring-1 ring-border/50' : ''
-                  }`}
-                  style={{ width: `${Math.max(0.5, barFrac * 100)}%` }}
+                  className="relative flex h-5 flex-1 items-center"
+                  onMouseMove={(e) => onMove(e, row)}
+                  onMouseLeave={() => setTooltip(null)}
                 >
-                  <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
-                  <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
-                  <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                  <div
+                    className={`flex overflow-hidden rounded-sm ${
+                      row.isGroup ? 'h-2.5 opacity-80' : 'h-5'
+                    }`}
+                    style={{ width: `${widthPct}%` }}
+                  >
+                    <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
+                    <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
+                    <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                  </div>
                 </div>
-              </div>
 
-              {/* total */}
-              <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
-                {compact(row.total)}
+                {/* total */}
+                <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
+                  {compact(row.total)}
+                </div>
               </div>
-            </div>
-          );
-        })}
+            );
+          })}
+        </div>
       </div>
 
       {tooltip && (

From c749f8f271bcfa46293b1ce2ec29adac1907231d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:31:40 -0500
Subject: [PATCH 082/111] feat(datasets): link request timeline to
 source-dataset conversation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add run_datasets (workflow_run → dataset slug) mapping (migration 012) and
surface it through the benchmark-siblings sku. The agentic detail page's request
timeline now deep-links each request bar to its exact conversation in the
/datasets viewer — the request cid, stripped of any ::sa:/::fa: suffix, is the
dataset conv_id. Tooltip shows a 'click to view in dataset' hint; bars get a
pointer cursor only when a mapping exists. Backfilled workflow_run 27915787191
(the dsv4/b300/vllm run incl. point 422083) → cc-traces-weka-062126.

Verified: clicking a timeline bar on /inference/agentic/422083 navigates to the
matching /datasets/cc-traces-weka-062126/conversations/<conv_id>.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    |  5 ++-
 .../agentic-point/dataset-conv-id.test.ts     | 27 ++++++++++++
 .../agentic-point/request-timeline.tsx        | 43 +++++++++++++++++--
 .../src/hooks/api/use-benchmark-siblings.ts   |  1 +
 packages/db/migrations/012_run_datasets.sql   | 19 ++++++++
 packages/db/src/queries/benchmark-siblings.ts |  7 ++-
 6 files changed, 97 insertions(+), 5 deletions(-)
 create mode 100644 packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
 create mode 100644 packages/db/migrations/012_run_datasets.sql

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 278ad8f7..4a076955 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -225,7 +225,10 @@ export function AgenticPointDetail({ id }: Props) {
             Loading request timeline…
           </div>
         ) : timelineQuery.data ? (
-          <RequestTimelineView data={timelineQuery.data} />
+          <RequestTimelineView
+            data={timelineQuery.data}
+            datasetSlug={siblingsQuery.data?.sku.dataset_slug}
+          />
         ) : (
           <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
             No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
new file mode 100644
index 00000000..a7ebbd8c
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
@@ -0,0 +1,27 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetConvId } from './request-timeline';
+
+describe('datasetConvId', () => {
+  it('returns a plain conversation id unchanged', () => {
+    expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602')).toBe(
+      '002001296e8a8c38ad9d7cc436d691afc602',
+    );
+  });
+
+  it('strips a ::sa: subagent suffix to the parent conv id', () => {
+    expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe(
+      '002001296e8a8c38ad9d7cc436d691afc602',
+    );
+  });
+
+  it('strips a ::fa: forked-agent suffix', () => {
+    expect(datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBe(
+      '02bc0afb13f7a2d9efa86c28511261d85c0e',
+    );
+  });
+
+  it('strips at the first :: even with a trailing stream index', () => {
+    expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc');
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 7c5fdab0..655556fb 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -1,9 +1,21 @@
 'use client';
 
 import { useCallback, useMemo, useRef, useState } from 'react';
+import { useRouter } from 'next/navigation';
 
 import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+/**
+ * The dataset conversation id for a request: the cid cut at its first `::`,
+ * which drops any subagent/forked suffix (`::sa:…`, `::fa:…`); a cid with no
+ * `::` is returned as-is. Used as the conv_id in /datasets/<slug>/conversations/.
+ */
+export function datasetConvId(cid: string): string {
+  const i = cid.indexOf('::');
+  return i === -1 ? cid : cid.slice(0, i);
+}
 
 /**
  * Gantt-style request timeline for one agentic benchmark point.
@@ -317,7 +329,7 @@ interface TooltipData {
   req: RequestRecord;
 }
 
-function Tooltip({ data }: { data: TooltipData }) {
+function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) {
   const { row, req } = data;
   const totalMs = (req.end - req.start) / 1e6;
   const queueMs = (req.start - req.credit) / 1e6;
@@ -377,14 +389,37 @@ function Tooltip({ data }: { data: TooltipData }) {
       <div className="mt-1.5 pt-1 border-t border-border/40 text-[10px] text-muted-foreground">
         Started at {formatTickLabel(req.start)}
       </div>
+      {linkable && (
+        <div className="mt-1 text-[10px] font-medium text-primary">
+          Click to view this conversation in the dataset →
+        </div>
+      )}
     </div>
   );
 }
 
-export function RequestTimelineView({ data }: { data: RequestTimeline }) {
+export function RequestTimelineView({
+  data,
+  datasetSlug,
+}: {
+  data: RequestTimeline;
+  /** Source dataset slug for this run; enables click-to-conversation deep links. */
+  datasetSlug?: string | null;
+}) {
+  const router = useRouter();
   const [rowMode, setRowMode] = useState<RowMode>('conversation');
   const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+
+  const openConversation = useCallback(
+    (cid: string) => {
+      if (!datasetSlug) return;
+      const convId = datasetConvId(cid);
+      track('agentic_timeline_to_dataset', { slug: datasetSlug });
+      router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`);
+    },
+    [datasetSlug, router],
+  );
   // Which multi-stream subagents currently have their per-stream rows
   // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
   const [expandedSubagents, setExpandedSubagents] = useState<ReadonlySet<string>>(() => new Set());
@@ -798,6 +833,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                         key={`${req.cid}-${req.ti}-${req.start}`}
                         onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
                         onMouseLeave={() => setTooltip(null)}
+                        onClick={datasetSlug ? () => openConversation(req.cid) : undefined}
+                        style={datasetSlug ? { cursor: 'pointer' } : undefined}
                       >
                         {/* Queue lead-in (faint) — only drawn when noticeable. */}
                         {queueW >= 1 && (
@@ -910,7 +947,7 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
       )}
 
       {/* Tooltip */}
-      {tooltip && <Tooltip data={tooltip} />}
+      {tooltip && <Tooltip data={tooltip} linkable={Boolean(datasetSlug)} />}
     </div>
   );
 }
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
index 55720bdf..f8bef99e 100644
--- a/packages/app/src/hooks/api/use-benchmark-siblings.ts
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -31,6 +31,7 @@ export interface BenchmarkSku {
   benchmark_type: string;
   github_run_id: number;
   date: string;
+  dataset_slug: string | null;
 }
 
 export interface BenchmarkSiblings {
diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql
new file mode 100644
index 00000000..58dd9f88
--- /dev/null
+++ b/packages/db/migrations/012_run_datasets.sql
@@ -0,0 +1,19 @@
+-- Maps a benchmark workflow_run to the source dataset it replayed, so the
+-- agentic detail page can deep-link each request in the timeline to the exact
+-- conversation in the /datasets viewer (the request's conversation_id, with any
+-- ::sa:/::fa: suffix stripped, is the dataset conv_id).
+--
+-- One row per workflow_run (every benchmark in a run replays the same dataset).
+-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
+-- URL) rather than an FK, so the mapping can be recorded before, or independently
+-- of, the dataset being ingested; the UI degrades gracefully if the slug is absent.
+--
+-- Additive only. To revert:
+--   drop table if exists run_datasets;
+--   delete from schema_migrations where filename = '012_run_datasets.sql';
+
+create table run_datasets (
+  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
+  dataset_slug    text not null,
+  created_at      timestamptz not null default now()
+);
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
index c7e4a317..2d36eb22 100644
--- a/packages/db/src/queries/benchmark-siblings.ts
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -47,6 +47,8 @@ export interface BenchmarkSku {
   /** Human-readable workflow_run summary so the page header can hint at provenance. */
   github_run_id: number;
   date: string;
+  /** Slug of the source dataset this run replayed (run_datasets), or null. */
+  dataset_slug: string | null;
 }
 
 export interface BenchmarkSiblings {
@@ -63,10 +65,11 @@ export async function getBenchmarkSiblings(
     select
       c.hardware, c.framework, c.model, c.precision, c.spec_method,
       br.benchmark_type, br.workflow_run_id, br.date::text,
-      wr.github_run_id
+      wr.github_run_id, rd.dataset_slug
     from benchmark_results br
     join configs c on c.id = br.config_id
     join workflow_runs wr on wr.id = br.workflow_run_id
+    left join run_datasets rd on rd.workflow_run_id = br.workflow_run_id
     where br.id = ${benchmarkResultId}
   `) as unknown as {
     hardware: string;
@@ -78,6 +81,7 @@ export async function getBenchmarkSiblings(
     workflow_run_id: number;
     date: string;
     github_run_id: number;
+    dataset_slug: string | null;
   }[];
   const root = seed[0];
   if (!root) return null;
@@ -158,6 +162,7 @@ export async function getBenchmarkSiblings(
       benchmark_type: root.benchmark_type,
       github_run_id: Number(root.github_run_id),
       date: root.date,
+      dataset_slug: root.dataset_slug ?? null,
     },
     siblings,
   };

From 6b700a3ccbc53fbc7e109360a2e5baa582e588c9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 17:31:14 -0500
Subject: [PATCH 083/111] feat(datasets): deep-link request-timeline bar to the
 exact turn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The timeline link now carries ?turn=<ti> (and &sa=<agentId> for subagent
requests). The flamegraph resolves the target node — main turns by ordinal,
subagent turns by matching the group's agentId then the ti-th child — expands
the subagent group if needed, scrolls the row into view, and flashes a ring.

subagentIdOf strips the harness stream suffix (:s<n> and :aux:<n>) so the cid's
agent id matches the dataset SubagentNode.agentId. Verified end-to-end: clicking
a subagent bar on /inference/agentic/422083 opens the conversation, expands the
right group, and highlights the exact subturn.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/datasets/conversation-view.tsx | 18 +++++-
 .../components/datasets/trace-flamegraph.tsx  | 60 +++++++++++++++++--
 .../agentic-point/dataset-conv-id.test.ts     | 28 ++++++++-
 .../agentic-point/request-timeline.tsx        | 30 ++++++++--
 4 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index 43992c41..ba1d0532 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -1,5 +1,6 @@
 'use client';
 
+import { useState } from 'react';
 import Link from 'next/link';
 
 import { Card } from '@/components/ui/card';
@@ -17,6 +18,17 @@ function compact(n: number): string {
 export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
 
+  // Deep-link target from a request-timeline click: ?turn=<ti>[&sa=<agentId>].
+  // Read once from the URL on mount (matches the app's window-based url-state
+  // reads; avoids a Suspense boundary for useSearchParams).
+  const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => {
+    if (typeof window === 'undefined') return { turn: null, agent: null };
+    const p = new URLSearchParams(window.location.search);
+    const turnRaw = p.get('turn');
+    const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null;
+    return { turn, agent: p.get('sa') };
+  });
+
   if (isLoading) {
     return (
       <div className="py-12 text-center text-sm text-muted-foreground">Loading conversation…</div>
@@ -85,7 +97,11 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
           click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
           plus generated output.
         </p>
-        <TraceFlamegraph structure={data.structure} />
+        <TraceFlamegraph
+          structure={data.structure}
+          highlightTurn={highlight.turn}
+          highlightAgentId={highlight.agent}
+        />
       </Card>
     </div>
   );
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 12cc14ec..3995a9c5 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -1,6 +1,6 @@
 'use client';
 
-import { useCallback, useMemo, useState } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
 
@@ -52,12 +52,58 @@ interface TooltipState {
  * children (collapsed by default). Each bar stacks cached-prefix + uncached
  * input + output, scaled to the widest visible turn.
  */
-export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) {
+export function TraceFlamegraph({
+  structure,
+  highlightTurn,
+  highlightAgentId,
+}: {
+  structure: ConversationStructure;
+  /** Turn index to scroll to / highlight (from a request-timeline deep link). */
+  highlightTurn?: number | null;
+  /** Subagent id when the highlighted turn is inside a subagent group. */
+  highlightAgentId?: string | null;
+}) {
   const nodes = structure.nodes;
 
-  // Subagent groups collapsed by default.
-  const [expanded, setExpanded] = useState<Set<number>>(() => new Set());
+  // Resolve the deep-link target to a row key (+ the group that must be open to
+  // show it). Main turns match by their main-turn ordinal; subagent turns match
+  // the group by agentId, then the ti-th child.
+  const target = useMemo(() => {
+    if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null;
+    if (highlightAgentId) {
+      const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId);
+      if (gi === -1) return null;
+      const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+      if (highlightTurn >= group.children.length) return null;
+      return { rowKey: `g-${gi}-c-${highlightTurn}`, expandGroup: gi };
+    }
+    let ordinal = 0;
+    for (let i = 0; i < nodes.length; i++) {
+      if (nodes[i].kind === 'turn') {
+        if (ordinal === highlightTurn) return { rowKey: `t-${i}`, expandGroup: null };
+        ordinal += 1;
+      }
+    }
+    return null;
+  }, [nodes, highlightTurn, highlightAgentId]);
+
+  // Subagent groups collapsed by default — except the deep-link target's group.
+  const [expanded, setExpanded] = useState<Set<number>>(() =>
+    typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(),
+  );
   const [tooltip, setTooltip] = useState<TooltipState | null>(null);
+  const scrollRef = useRef<HTMLDivElement>(null);
+
+  // Scroll the target row into view and flash a highlight once it's rendered.
+  useEffect(() => {
+    if (!target) return;
+    const el = scrollRef.current?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`);
+    if (!el) return;
+    el.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    el.classList.add('ring-2', 'ring-primary', 'rounded-sm');
+    const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600);
+    return () => clearTimeout(t);
+  }, [target]);
 
   const groupIndexes = useMemo(() => {
     const out: number[] = [];
@@ -187,7 +233,10 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
         )}
       </div>
 
-      <div className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2">
+      <div
+        ref={scrollRef}
+        className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2"
+      >
         <div className="flex flex-col gap-0.5">
           {rows.map((row) => {
             // Group headers use the group axis; turns/subturns use the per-turn
@@ -200,6 +249,7 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
             return (
               <div
                 key={row.key}
+                data-rowkey={row.key}
                 className="flex items-center gap-2"
                 style={{ paddingLeft: row.indent * 20 }}
               >
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
index a7ebbd8c..f55d6131 100644
--- a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
+++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 
-import { datasetConvId } from './request-timeline';
+import { datasetConvId, subagentIdOf } from './request-timeline';
 
 describe('datasetConvId', () => {
   it('returns a plain conversation id unchanged', () => {
@@ -25,3 +25,29 @@ describe('datasetConvId', () => {
     expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc');
   });
 });
+
+describe('subagentIdOf', () => {
+  it('returns null for a main-conversation cid', () => {
+    expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602')).toBeNull();
+  });
+
+  it('extracts the subagent id from a ::sa: cid', () => {
+    expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe(
+      'subagent_004_27c95af7',
+    );
+  });
+
+  it('drops a trailing :s<stream> index from the subagent id', () => {
+    expect(subagentIdOf('abc::sa:subagent_001_f552fe6f:s3')).toBe('subagent_001_f552fe6f');
+  });
+
+  it('drops an :aux:<n> stream suffix from the subagent id', () => {
+    expect(subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011')).toBe(
+      'subagent_001_b00fdc12',
+    );
+  });
+
+  it('returns null for a ::fa: forked-agent cid (no matching subagent group)', () => {
+    expect(subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBeNull();
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 655556fb..baf3dc1f 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -17,6 +17,21 @@ export function datasetConvId(cid: string): string {
   return i === -1 ? cid : cid.slice(0, i);
 }
 
+/**
+ * The subagent id encoded in a cid (`…::sa:<agent_id>[:s<n>|:aux:<n>]`), or null
+ * when the cid has no `::sa:` marker (main-conversation or `::fa:` requests). The
+ * harness fans a single subagent into parallel streams with a `:s<n>` or `:aux:<n>`
+ * suffix; the dataset SubagentNode.agentId is the bare base (e.g. `subagent_001_b00fdc12`).
+ * Agent ids never contain a colon, so the base is everything up to the first one.
+ */
+export function subagentIdOf(cid: string): string | null {
+  const i = cid.indexOf('::sa:');
+  if (i === -1) return null;
+  const raw = cid.slice(i + '::sa:'.length);
+  const colon = raw.indexOf(':');
+  return colon === -1 ? raw : raw.slice(0, colon);
+}
+
 /**
  * Gantt-style request timeline for one agentic benchmark point.
  *
@@ -412,11 +427,18 @@ export function RequestTimelineView({
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
 
   const openConversation = useCallback(
-    (cid: string) => {
+    (req: RequestRecord) => {
       if (!datasetSlug) return;
-      const convId = datasetConvId(cid);
+      const convId = datasetConvId(req.cid);
+      // Carry the turn (and, for subagent requests, the subagent id) so the
+      // flamegraph can scroll to / highlight the exact node this bar maps to.
+      const params = new URLSearchParams({ turn: String(req.ti) });
+      const sa = subagentIdOf(req.cid);
+      if (sa) params.set('sa', sa);
       track('agentic_timeline_to_dataset', { slug: datasetSlug });
-      router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`);
+      router.push(
+        `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`,
+      );
     },
     [datasetSlug, router],
   );
@@ -833,7 +855,7 @@ export function RequestTimelineView({
                         key={`${req.cid}-${req.ti}-${req.start}`}
                         onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
                         onMouseLeave={() => setTooltip(null)}
-                        onClick={datasetSlug ? () => openConversation(req.cid) : undefined}
+                        onClick={datasetSlug ? () => openConversation(req) : undefined}
                         style={datasetSlug ? { cursor: 'pointer' } : undefined}
                       >
                         {/* Queue lead-in (faint) — only drawn when noticeable. */}

From 83fcd04e16649ca7a8fb3b1b78231c8588f274e8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 17:44:05 -0500
Subject: [PATCH 084/111] fix(datasets): visible turn highlight +
 pointer-tracking flamegraph tooltip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Deep-link highlight is now state-driven (bg-primary/20 + ring, fades over
  700ms) instead of fragile classList mutation, so it's clearly visible and
  survives re-renders. Subagent groups still auto-expand and scroll into view.
- Portal the hover tooltip to document.body so its position:fixed is
  viewport-relative — an ancestor transform was offsetting it away from the
  cursor. Now it sits at pointer+12px.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/datasets/trace-flamegraph.tsx  | 96 +++++++++++--------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 3995a9c5..53f13b6a 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -1,6 +1,7 @@
 'use client';
 
 import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import { createPortal } from 'react-dom';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
 
@@ -94,14 +95,23 @@ export function TraceFlamegraph({
   const [tooltip, setTooltip] = useState<TooltipState | null>(null);
   const scrollRef = useRef<HTMLDivElement>(null);
 
-  // Scroll the target row into view and flash a highlight once it's rendered.
+  // Portal target only exists after mount (the tooltip is portaled to body so
+  // its position:fixed is viewport-relative, immune to ancestor transforms).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
+  // The deep-link target row gets a state-driven highlight (ring + bg flash)
+  // that fades out — state-driven so a re-render can't clobber it, and so the
+  // fade is a real CSS transition rather than an abrupt classList removal.
+  const [highlightKey, setHighlightKey] = useState<string | null>(target?.rowKey ?? null);
+
+  // Scroll the target row into view once it's rendered, then fade the highlight.
   useEffect(() => {
     if (!target) return;
+    setHighlightKey(target.rowKey);
     const el = scrollRef.current?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`);
-    if (!el) return;
-    el.scrollIntoView({ block: 'center', behavior: 'smooth' });
-    el.classList.add('ring-2', 'ring-primary', 'rounded-sm');
-    const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600);
+    el?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    const t = setTimeout(() => setHighlightKey(null), 2200);
     return () => clearTimeout(t);
   }, [target]);
 
@@ -246,11 +256,14 @@ export function TraceFlamegraph({
             const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0;
             const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
             const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
+            const isHighlighted = row.key === highlightKey;
             return (
               <div
                 key={row.key}
                 data-rowkey={row.key}
-                className="flex items-center gap-2"
+                className={`flex items-center gap-2 rounded-sm transition-colors duration-700 ${
+                  isHighlighted ? 'bg-primary/20 ring-2 ring-primary' : 'ring-0'
+                }`}
                 style={{ paddingLeft: row.indent * 20 }}
               >
                 {/* label / group toggle */}
@@ -300,39 +313,44 @@ export function TraceFlamegraph({
         </div>
       </div>
 
-      {tooltip && (
-        <div
-          className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
-          style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
-        >
-          <div className="mb-1 font-medium text-foreground">
-            {tooltip.row.label}
-            {tooltip.row.sublabel ? (
-              <span className="ml-1 font-normal text-muted-foreground">{tooltip.row.sublabel}</span>
-            ) : null}
-          </div>
-          <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
-            <span style={{ color: SEG.cached }}>Cached prefix</span>
-            <span className="text-right tabular-nums text-foreground">
-              {compact(tooltip.row.cached)}
-            </span>
-            <span style={{ color: SEG.uncached }}>Uncached input</span>
-            <span className="text-right tabular-nums text-foreground">
-              {compact(tooltip.row.uncached)}
-            </span>
-            <span style={{ color: SEG.output }}>Output</span>
-            <span className="text-right tabular-nums text-foreground">
-              {compact(tooltip.row.output)}
-            </span>
-            <span>Cached %</span>
-            <span className="text-right tabular-nums text-foreground">
-              {tooltip.row.cached + tooltip.row.uncached > 0
-                ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
-                : '—'}
-            </span>
-          </div>
-        </div>
-      )}
+      {tooltip &&
+        mounted &&
+        createPortal(
+          <div
+            className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
+            style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
+          >
+            <div className="mb-1 font-medium text-foreground">
+              {tooltip.row.label}
+              {tooltip.row.sublabel ? (
+                <span className="ml-1 font-normal text-muted-foreground">
+                  {tooltip.row.sublabel}
+                </span>
+              ) : null}
+            </div>
+            <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
+              <span style={{ color: SEG.cached }}>Cached prefix</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.cached)}
+              </span>
+              <span style={{ color: SEG.uncached }}>Uncached input</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.uncached)}
+              </span>
+              <span style={{ color: SEG.output }}>Output</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.output)}
+              </span>
+              <span>Cached %</span>
+              <span className="text-right tabular-nums text-foreground">
+                {tooltip.row.cached + tooltip.row.uncached > 0
+                  ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
+                  : '—'}
+              </span>
+            </div>
+          </div>,
+          document.body,
+        )}
     </div>
   );
 }

From 3c40d31172cce46f5e150223bcfa092ff573288f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 17:58:47 -0500
Subject: [PATCH 085/111] fix(datasets): deep-link highlight fires on first
 navigation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The conversation page read ?turn/&sa from window.location.search in a useState
initializer, which captures stale/empty params during a client-side navigation —
so scroll+highlight+expand only worked after a manual reload. Switch to the
reactive useSearchParams (page wrapped in Suspense) so the params are present on
the first nav. Also make the flamegraph expand the target subagent group via an
effect (reacting to target changes), and defer the scroll one frame so the
just-expanded child row exists. Verified via a real timeline click — no reload.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../[slug]/conversations/[convId]/page.tsx    |  5 ++++-
 .../components/datasets/conversation-view.tsx | 19 ++++++++--------
 .../components/datasets/trace-flamegraph.tsx  | 22 +++++++++++++++----
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
index 75702c1b..83eb56a0 100644
--- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -1,3 +1,4 @@
+import { Suspense } from 'react';
 import type { Metadata } from 'next';
 
 import { ConversationView } from '@/components/datasets/conversation-view';
@@ -25,7 +26,9 @@ export default async function ConversationPage({ params }: Props) {
   return (
     <main className="relative">
       <div className="container mx-auto px-4 pb-8 lg:px-8">
-        <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+        <Suspense>
+          <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+        </Suspense>
       </div>
     </main>
   );
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index ba1d0532..739d3bb2 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -1,7 +1,7 @@
 'use client';
 
-import { useState } from 'react';
 import Link from 'next/link';
+import { useSearchParams } from 'next/navigation';
 
 import { Card } from '@/components/ui/card';
 import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
@@ -19,15 +19,14 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
 
   // Deep-link target from a request-timeline click: ?turn=<ti>[&sa=<agentId>].
-  // Read once from the URL on mount (matches the app's window-based url-state
-  // reads; avoids a Suspense boundary for useSearchParams).
-  const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => {
-    if (typeof window === 'undefined') return { turn: null, agent: null };
-    const p = new URLSearchParams(window.location.search);
-    const turnRaw = p.get('turn');
-    const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null;
-    return { turn, agent: p.get('sa') };
-  });
+  // useSearchParams (not a one-shot window.location read) so the params are
+  // present on the very first client-side navigation, not just after a reload.
+  const params = useSearchParams();
+  const turnRaw = params.get('turn');
+  const highlight = {
+    turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null,
+    agent: params.get('sa'),
+  };
 
   if (isLoading) {
     return (
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 53f13b6a..a577193b 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -105,14 +105,28 @@ export function TraceFlamegraph({
   // fade is a real CSS transition rather than an abrupt classList removal.
   const [highlightKey, setHighlightKey] = useState<string | null>(target?.rowKey ?? null);
 
-  // Scroll the target row into view once it's rendered, then fade the highlight.
+  // When the deep-link target resolves/changes: expand its subagent group, then
+  // (after the row renders) scroll it into view and flash the highlight. Runs on
+  // first load and on any later target change (e.g. clicking another bar into
+  // the same conversation). The row query/scroll is deferred to the next frame
+  // so the just-expanded child row exists in the DOM.
   useEffect(() => {
     if (!target) return;
+    if (typeof target.expandGroup === 'number') {
+      const gi = target.expandGroup;
+      setExpanded((prev) => (prev.has(gi) ? prev : new Set(prev).add(gi)));
+    }
     setHighlightKey(target.rowKey);
-    const el = scrollRef.current?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`);
-    el?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    const raf = requestAnimationFrame(() => {
+      scrollRef.current
+        ?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`)
+        ?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    });
     const t = setTimeout(() => setHighlightKey(null), 2200);
-    return () => clearTimeout(t);
+    return () => {
+      cancelAnimationFrame(raf);
+      clearTimeout(t);
+    };
   }, [target]);
 
   const groupIndexes = useMemo(() => {

From e460ea2300f57912eff46d92fbb6fb447fc435e4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 22:34:55 -0500
Subject: [PATCH 086/111] fix(high-contrast): stable line colors when
 deselecting legend items
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In HC mode the iwanthue palette is sized and indexed by the key set it's
generated over. ScatterGraph generated it from the *active* (selected) hw set,
so deselecting a line shrank the set, re-sized the palette, and shifted every
remaining line's hue — most visible on single-vendor agentic runs (which span
the full hue wheel since 2c06009), where deselecting B300 could recolor B200
from red to blue.

Pass the stable full set of hw-types-with-data as hcKeys so the palette and
per-key index are fixed; toggling now only hides/shows lines without recoloring
the rest. Adds a useThemeColors regression test asserting a line's HC color is
identical across active subsets when hcKeys is the full set.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  |  8 ++++++
 packages/app/src/hooks/useThemeColors.test.ts | 28 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 76231522..77770ec0 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -381,10 +381,18 @@ const ScatterGraph = React.memo(
       () => [...effectiveOfficialHwTypes],
       [effectiveOfficialHwTypes],
     );
+    // High-contrast palette is keyed off the FULL set of official hw types with
+    // data, not the active subset. Otherwise deselecting a line shrinks the key
+    // set, which re-sizes the iwanthue palette and shifts every remaining line's
+    // hue (most visible for single-vendor agentic runs that span the full wheel —
+    // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the
+    // stable full set fixes each hw's color so toggling only hides/shows lines.
+    const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]);
     const { resolveColor, getCssColor } = useThemeColors({
       highContrast,
       identifiers: activeHwKeys,
       activeKeys: activeOfficialKeys,
+      hcKeys: stableHcKeys,
     });
 
     // --- Changelog ---
diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts
index 7275e384..11050d19 100644
--- a/packages/app/src/hooks/useThemeColors.test.ts
+++ b/packages/app/src/hooks/useThemeColors.test.ts
@@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => {
     }
     unmountOn();
   });
+
+  // Regression: deselecting a legend line must not recolor the remaining lines.
+  // The HC palette is sized/indexed by the key set it's generated over, so when
+  // it was generated over the *active* subset (no hcKeys), shrinking the
+  // selection re-sized the palette and shifted every remaining line's hue (most
+  // visible on single-vendor agentic runs spanning the full wheel). Passing a
+  // stable `hcKeys` (the full set with data) fixes each line's color.
+  it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => {
+    const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison
+
+    const all = renderHook<UseThemeColorsResult>(() =>
+      useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }),
+    );
+    const b200WithBoth = all.result.current.resolveColor('b200');
+    const b300Color = all.result.current.resolveColor('b300');
+    all.unmount();
+
+    // b300 deselected → only b200 active, but hcKeys is still the full set.
+    const subset = renderHook<UseThemeColorsResult>(() =>
+      useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }),
+    );
+    const b200Alone = subset.result.current.resolveColor('b200');
+    subset.unmount();
+
+    expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu);
+    expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues
+    expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200
+  });
 });

From a912eab780a76ba015b21590d3c162e0fd4c37ea Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:04:28 -0500
Subject: [PATCH 087/111] chore(security): bump dompurify override to >=3.4.11
 (GHSA-cmwh-pvxp-8882)

---
 pnpm-lock.yaml      | 52 ++++++++++++++++++++++++++++++++-------------
 pnpm-workspace.yaml |  2 +-
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index cdd8a01d..bb7bb824 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -5,7 +5,7 @@ settings:
   excludeLinksFromLockfile: false
 
 overrides:
-  dompurify@<3.4.9: '>=3.4.9'
+  dompurify@<=3.4.10: '>=3.4.11'
   esbuild@>=0.27.3 <0.28.1: '>=0.28.1'
   form-data@>=4.0.0 <4.0.6: '>=4.0.6'
   hono@<4.12.21: '>=4.12.21'
@@ -20,7 +20,7 @@ importers:
     devDependencies:
       '@babel/core':
         specifier: ^7.29.6
-        version: 7.29.7
+        version: 7.29.7(supports-color@8.1.1)
       audit-ci:
         specifier: ^7.1.0
         version: 7.1.0
@@ -2994,9 +2994,6 @@ packages:
     resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==}
     engines: {node: '>=8'}
 
-  dompurify@3.4.10:
-    resolution: {integrity: sha512-0xzNv0e7oYC6yyuOGZIABPM4qtg3QxLFniDNPP4ZP90wR8Yq3zgwpRbrNiT4N3IKqDbbYFEJLV+JWEs19aZ//w==}
-
   dompurify@3.4.11:
     resolution: {integrity: sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==}
 
@@ -5538,7 +5535,27 @@ snapshots:
       '@babel/helpers': 7.29.7
       '@babel/parser': 7.29.7
       '@babel/template': 7.29.7
-      '@babel/traverse': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
+      '@babel/types': 7.29.7
+      '@jridgewell/remapping': 2.3.5
+      convert-source-map: 2.0.0
+      debug: 4.4.3(supports-color@8.1.1)
+      gensync: 1.0.0-beta.2
+      json5: 2.2.3
+      semver: 6.3.1
+    transitivePeerDependencies:
+      - supports-color
+
+  '@babel/core@7.29.7(supports-color@8.1.1)':
+    dependencies:
+      '@babel/code-frame': 7.29.7
+      '@babel/generator': 7.29.7
+      '@babel/helper-compilation-targets': 7.29.7
+      '@babel/helper-module-transforms': 7.29.7(@babel/core@7.29.7(supports-color@8.1.1))
+      '@babel/helpers': 7.29.7
+      '@babel/parser': 7.29.7
+      '@babel/template': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
       '@babel/types': 7.29.7
       '@jridgewell/remapping': 2.3.5
       convert-source-map: 2.0.0
@@ -5569,17 +5586,26 @@ snapshots:
 
   '@babel/helper-module-imports@7.29.7':
     dependencies:
-      '@babel/traverse': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
       '@babel/types': 7.29.7
     transitivePeerDependencies:
       - supports-color
 
+  '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7(supports-color@8.1.1))':
+    dependencies:
+      '@babel/core': 7.29.7(supports-color@8.1.1)
+      '@babel/helper-module-imports': 7.29.7
+      '@babel/helper-validator-identifier': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
+    transitivePeerDependencies:
+      - supports-color
+
   '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7)':
     dependencies:
       '@babel/core': 7.29.7
       '@babel/helper-module-imports': 7.29.7
       '@babel/helper-validator-identifier': 7.29.7
-      '@babel/traverse': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
     transitivePeerDependencies:
       - supports-color
 
@@ -5621,7 +5647,7 @@ snapshots:
       '@babel/parser': 7.29.7
       '@babel/types': 7.29.7
 
-  '@babel/traverse@7.29.7':
+  '@babel/traverse@7.29.7(supports-color@8.1.1)':
     dependencies:
       '@babel/code-frame': 7.29.7
       '@babel/generator': 7.29.7
@@ -7981,10 +8007,6 @@ snapshots:
     dependencies:
       path-type: 4.0.0
 
-  dompurify@3.4.10:
-    optionalDependencies:
-      '@types/trusted-types': 2.0.7
-
   dompurify@3.4.11:
     optionalDependencies:
       '@types/trusted-types': 2.0.7
@@ -8812,7 +8834,7 @@ snapshots:
 
   jest-worker@27.5.1:
     dependencies:
-      '@types/node': 25.9.3
+      '@types/node': 26.0.0
       merge-stream: 2.0.0
       supports-color: 8.1.1
 
@@ -9790,7 +9812,7 @@ snapshots:
       '@posthog/core': 1.35.3
       '@posthog/types': 1.390.2
       core-js: 3.49.0
-      dompurify: 3.4.10
+      dompurify: 3.4.11
       fflate: 0.4.8
       preact: 10.29.2
       query-selector-shadow-dom: 1.0.1
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
index c6ea723c..361059bb 100644
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@@ -25,7 +25,7 @@ auditConfig:
     - GHSA-h67p-54hq-rp68
 
 overrides:
-  dompurify@<3.4.9: '>=3.4.9'
+  dompurify@<=3.4.10: '>=3.4.11'
   esbuild@>=0.27.3 <0.28.1: '>=0.28.1'
   form-data@>=4.0.0 <4.0.6: '>=4.0.6'
   hono@<4.12.21: '>=4.12.21'

From ba6bc1ce6cedce56d45c8fcd96a74c3cd53879dc Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:28:18 -0500
Subject: [PATCH 088/111] test(e2e): align selector testid with
 scenario-selector rename; rewrite x-axis toggle test for single-chart mode
 buttons

---
 .../app/cypress/e2e/dropdown-switching.cy.ts  |  4 +-
 .../app/cypress/e2e/historical-trends.cy.ts   |  4 +-
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  | 64 +++++++++----------
 packages/app/cypress/e2e/url-params.cy.ts     | 10 +--
 4 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts
index ac88dc84..4bc8b695 100644
--- a/packages/app/cypress/e2e/dropdown-switching.cy.ts
+++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts
@@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => {
     cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true');
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
 
-    cy.get('[data-testid="sequence-selector"]').click();
+    cy.get('[data-testid="scenario-selector"]').click();
 
     cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false');
-    cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true');
+    cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true');
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
   });
 
diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts
index f0a70a56..55b0e274 100644
--- a/packages/app/cypress/e2e/historical-trends.cy.ts
+++ b/packages/app/cypress/e2e/historical-trends.cy.ts
@@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => {
       delete doc.body.dataset.scrollLocked;
       doc.body.style.removeProperty('pointer-events');
     });
-    cy.get('[data-testid="sequence-selector"]').should('be.visible');
-    cy.get('[data-testid="sequence-selector"]').click();
+    cy.get('[data-testid="scenario-selector"]').should('be.visible');
+    cy.get('[data-testid="scenario-selector"]').click();
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
     cy.get('body').type('{esc}');
   });
diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index e17a4aff..636a7ccf 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -1,46 +1,42 @@
-describe('TTFT X-Axis Toggle (E2E chart)', () => {
+describe('X-Axis Mode Toggle (inference chart)', () => {
   before(() => {
-    cy.window().then((win) => {
-      win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+    cy.visit('/inference', {
+      onBeforeLoad(win) {
+        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+      },
     });
-    cy.visit('/inference');
-    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2);
+    cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
+    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
   });
 
-  it('shows the x-axis dropdown in the e2e chart heading', () => {
-    cy.get('[data-testid="chart-figure"]')
-      .eq(1)
-      .find('h2 button')
-      .should('contain.text', 'vs.')
-      .and('contain.text', 'Latency');
+  it('shows the x-axis mode buttons with Interactivity active by default', () => {
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-interactivity"]')
+      .should('be.visible')
+      .and('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
   });
 
-  it('opens popover with three x-axis options', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').within(() => {
-      cy.contains('End-to-end Latency').should('exist');
-      cy.contains('P99 TTFT').should('exist');
-      cy.contains('Median TTFT').should('exist');
-    });
-  });
-
-  it('switches x-axis to P99 TTFT and updates the heading', () => {
-    cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click();
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT');
+  it('switches the x-axis to TTFT and updates the heading', () => {
+    cy.get('[data-testid="x-axis-mode-ttft"]').click();
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token');
   });
 
-  it('switches x-axis to Median TTFT and updates the heading', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').contains('Median TTFT').click();
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT');
+  it('switches the x-axis to E2E Latency and updates the heading', () => {
+    cy.get('[data-testid="x-axis-mode-e2e"]').click();
+    cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency');
   });
 
-  it('switches back to End-to-end Latency', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click();
-    cy.get('[data-testid="chart-figure"]')
-      .eq(1)
-      .find('h2')
-      .should('contain.text', 'End-to-end Latency');
+  it('switches back to Interactivity', () => {
+    cy.get('[data-testid="x-axis-mode-interactivity"]').click();
+    cy.get('[data-testid="x-axis-mode-interactivity"]').should(
+      'have.attr',
+      'aria-selected',
+      'true',
+    );
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
   });
 });
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 3c480686..927aee5f 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => {
 };
 
 const assertNoHydrationMismatch = () => {
-  cy.get('[data-testid="sequence-selector"]').should('be.visible');
+  cy.get('[data-testid="scenario-selector"]').should('be.visible');
   cy.get('@consoleError').then((spy) => {
     const calls = (spy as unknown as { args: unknown[][] }).args;
     const hydration = calls.filter((args) =>
@@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => {
 
     it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => {
       visitWithErrorSpy('/inference?i_seq=1k/1k');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       assertNoHydrationMismatch();
     });
 
@@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => {
       // Visit the canonical model-prefixed slug so the assertion is directly
       // about the rendered page, not about a bare-slug redirect interleaving.
       visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       assertNoHydrationMismatch();
     });
 
     it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => {
       visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk');
-      cy.get('[data-testid="sequence-selector"]')
+      cy.get('[data-testid="scenario-selector"]')
         .invoke('text')
         .should('not.contain', 'junk')
         .and('match', /[18]K . [18]K/u);
@@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => {
       // `effectivePrecisions` intersects the selection with available precisions
       // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported.
       visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek');
       cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8');
       assertNoHydrationMismatch();

From ada19b54e41ea3ad87cdfc22dd3d27e1a3d7df44 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:41:03 -0500
Subject: [PATCH 089/111] test(datasets): component tests for distribution
 card, trace flamegraph (incl deep-link), and dataset list states

---
 .../app/cypress/component/dataset-list.cy.tsx | 93 +++++++++++++++++++
 .../component/distribution-card.cy.tsx        | 45 +++++++++
 .../cypress/component/trace-flamegraph.cy.tsx | 86 +++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 packages/app/cypress/component/dataset-list.cy.tsx
 create mode 100644 packages/app/cypress/component/distribution-card.cy.tsx
 create mode 100644 packages/app/cypress/component/trace-flamegraph.cy.tsx

diff --git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx
new file mode 100644
index 00000000..f7cfcb9a
--- /dev/null
+++ b/packages/app/cypress/component/dataset-list.cy.tsx
@@ -0,0 +1,93 @@
+import { QueryClient, QueryClientProvider } from '@tanstack/react-query';
+import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime';
+
+import { DatasetList } from '@/components/datasets/dataset-list';
+import type { DatasetRecord } from '@/hooks/api/use-datasets';
+
+const datasets: DatasetRecord[] = [
+  {
+    id: 'ds-1',
+    slug: 'cc-traces-weka-full',
+    label: 'cc-traces-weka (full)',
+    variant: 'full',
+    description: 'Every captured request, unmodified.',
+    hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full',
+    license: 'apache-2.0',
+    conversation_count: 1234,
+    summary: {
+      totalIn: 5_000_000,
+      totalOut: 250_000,
+      cachedPct: 0.82,
+      mainTurns: 9800,
+      subagentGroups: 540,
+    },
+    ingested_at: '2026-06-20T00:00:00Z',
+  },
+  {
+    id: 'ds-2',
+    slug: 'cc-traces-weka-256k',
+    label: 'cc-traces-weka (256k)',
+    variant: '256k',
+    description: 'Turns trimmed to a 256k context window.',
+    hf_url: null,
+    license: 'apache-2.0',
+    conversation_count: 980,
+    summary: {
+      totalIn: 3_200_000,
+      totalOut: 180_000,
+      cachedPct: 0.79,
+      mainTurns: 7600,
+      subagentGroups: 410,
+    },
+    ingested_at: '2026-06-19T00:00:00Z',
+  },
+];
+
+function createMockRouter() {
+  return {
+    push: cy.stub(),
+    replace: cy.stub(),
+    refresh: cy.stub(),
+    back: cy.stub(),
+    forward: cy.stub(),
+    prefetch: cy.stub().resolves(),
+  };
+}
+
+function mountList() {
+  const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } });
+  cy.mount(
+    <AppRouterContext.Provider value={createMockRouter()}>
+      <QueryClientProvider client={queryClient}>
+        <DatasetList />
+      </QueryClientProvider>
+    </AppRouterContext.Provider>,
+  );
+}
+
+describe('DatasetList', () => {
+  it('renders a card per dataset with its summary stats', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list');
+    mountList();
+    cy.wait('@list');
+    cy.contains('cc-traces-weka (full)').should('be.visible');
+    cy.contains('cc-traces-weka (256k)').should('be.visible');
+    cy.contains('1,234').should('be.visible'); // conversation_count, localized
+    cy.contains('82%').should('be.visible'); // cachedPct
+    cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist');
+  });
+
+  it('shows the empty state when no datasets are ingested', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty');
+    mountList();
+    cy.wait('@empty');
+    cy.contains('No datasets ingested yet.').should('be.visible');
+  });
+
+  it('shows the error state when the request fails', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err');
+    mountList();
+    cy.wait('@err');
+    cy.contains('Failed to load datasets.').should('be.visible');
+  });
+});
diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx
new file mode 100644
index 00000000..fb7e5461
--- /dev/null
+++ b/packages/app/cypress/component/distribution-card.cy.tsx
@@ -0,0 +1,45 @@
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import type { Distribution } from '@/hooks/api/use-datasets';
+
+const distribution: Distribution = {
+  bins: [
+    { x0: 0, x1: 100, count: 5 },
+    { x0: 100, x1: 200, count: 20 },
+    { x0: 200, x1: 300, count: 12 },
+    { x0: 300, x1: 400, count: 3 },
+  ],
+  stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 },
+};
+
+describe('DistributionCard', () => {
+  it('renders the title, summary stats, and one bar per bin', () => {
+    cy.mount(
+      <DistributionCard title="Input tokens per turn" unit="tok" distribution={distribution} />,
+    );
+    cy.contains('Input tokens per turn').should('be.visible');
+    cy.contains('n=40').should('be.visible');
+    cy.contains('median 175').should('be.visible');
+    cy.contains('p90 320').should('be.visible');
+    // One filled bar rect per bin (ChartHover may add a transparent overlay rect).
+    cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length);
+  });
+
+  it('shows a "No data" placeholder when no distribution is provided', () => {
+    cy.mount(<DistributionCard title="Empty metric" unit="tok" />);
+    cy.contains('Empty metric').should('be.visible');
+    cy.contains('No data').should('be.visible');
+    cy.get('rect[class*="fill-primary"]').should('not.exist');
+  });
+
+  it('marks the chart as log scale when scale="log"', () => {
+    cy.mount(
+      <DistributionCard
+        title="Output tokens per turn"
+        unit="tok"
+        scale="log"
+        distribution={distribution}
+      />,
+    );
+    cy.contains('log scale').should('be.visible');
+  });
+});
diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx
new file mode 100644
index 00000000..1be90e0c
--- /dev/null
+++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx
@@ -0,0 +1,86 @@
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import type { ConversationStructure } from '@/hooks/api/use-datasets';
+
+// Two main turns followed by one subagent group with two child turns.
+// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`).
+const structure: ConversationStructure = {
+  blockSize: 64,
+  nodes: [
+    { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 },
+    {
+      kind: 'turn',
+      turnIndex: 1,
+      model: 'claude',
+      in: 2000,
+      out: 300,
+      cached: 1500,
+      uncached: 500,
+    },
+    {
+      kind: 'subagent',
+      label: 'Subagent: search',
+      agentId: 'agent-1',
+      durationMs: 12000,
+      in: 5000,
+      out: 800,
+      cached: 3000,
+      uncached: 2000,
+      children: [
+        {
+          kind: 'turn',
+          turnIndex: 0,
+          model: 'claude',
+          in: 2500,
+          out: 400,
+          cached: 1500,
+          uncached: 1000,
+        },
+        {
+          kind: 'turn',
+          turnIndex: 1,
+          model: 'claude',
+          in: 2500,
+          out: 400,
+          cached: 1500,
+          uncached: 1000,
+        },
+      ],
+    },
+  ],
+  totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 },
+};
+
+describe('TraceFlamegraph', () => {
+  it('renders the legend, main-turn rows, and the subagent group header', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.contains('Cached prefix').should('be.visible');
+    cy.contains('Uncached input').should('be.visible');
+    cy.contains('Output').should('be.visible');
+    cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1');
+    cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2');
+    cy.contains('Subagent: search').should('be.visible');
+  });
+
+  it('keeps subagent children collapsed until the group is expanded', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+    cy.contains('button', 'Subagent: search').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+    cy.get('[data-rowkey="g-2-c-1"]').should('be.visible');
+  });
+
+  it('expand all / collapse all toggles every subagent group', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.contains('button', 'Expand all').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+    cy.contains('button', 'Collapse all').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+  });
+
+  it('auto-expands and highlights the target group child for a request-timeline deep link', () => {
+    cy.mount(
+      <TraceFlamegraph structure={structure} highlightAgentId="agent-1" highlightTurn={1} />,
+    );
+    cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary');
+  });
+});

From 1c61ee3f597e22d33e891b73f7f95511a73844d3 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:47:02 -0500
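Subject: [PATCH 090/111] refactor(datasets): extract shared compact()
 formatter, dedupe 5 local copies

The shared formatter is the former distribution-card copy; the other
four local copies were narrower, so this is not purely mechanical:

- conversation-view, dataset-detail, dataset-list: had no sub-1 branch,
  so values with 0 < |n| < 1 now render with two decimals (0.82 ->
  "0.82") instead of being rounded to an integer.
- trace-flamegraph: additionally had no "B" tier, so values >= 1e9 now
  render as e.g. "3.2B" instead of "3200.0M".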
---
 .../src/components/datasets/conversation-view.tsx    |  9 +--------
 .../app/src/components/datasets/dataset-detail.tsx   |  9 +--------
 .../app/src/components/datasets/dataset-list.tsx     |  9 +--------
 .../src/components/datasets/distribution-card.tsx    | 11 +----------
 packages/app/src/components/datasets/format.ts       | 12 ++++++++++++
 .../app/src/components/datasets/trace-flamegraph.tsx |  9 +--------
 6 files changed, 17 insertions(+), 42 deletions(-)
 create mode 100644 packages/app/src/components/datasets/format.ts

diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index 739d3bb2..d39b83d9 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -6,14 +6,7 @@ import { useSearchParams } from 'next/navigation';
 import { Card } from '@/components/ui/card';
 import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
 import { useDatasetConversation } from '@/hooks/api/use-datasets';
-
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index 57c50649..9410a505 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -18,14 +18,7 @@ import {
   type ConversationSort,
 } from '@/hooks/api/use-datasets';
 import { track } from '@/lib/analytics';
-
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 const PAGE = 50;
 
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
index 5fcc0dfe..84b279db 100644
--- a/packages/app/src/components/datasets/dataset-list.tsx
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -5,14 +5,7 @@ import Link from 'next/link';
 import { Card } from '@/components/ui/card';
 import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
 import { track } from '@/lib/analytics';
-
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 function DatasetCard({ d }: { d: DatasetRecord }) {
   const s = d.summary ?? {};
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
index 7abc367f..d0c0f166 100644
--- a/packages/app/src/components/datasets/distribution-card.tsx
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -5,16 +5,7 @@ import { useMemo } from 'react';
 import { Card } from '@/components/ui/card';
 import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover';
 import type { Distribution } from '@/hooks/api/use-datasets';
-
-/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  if (abs > 0 && abs < 1) return n.toFixed(2);
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 interface DistributionCardProps {
   title: string;
diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts
new file mode 100644
index 00000000..f6f5530c
--- /dev/null
+++ b/packages/app/src/components/datasets/format.ts
@@ -0,0 +1,12 @@
+/**
+ * Compact number formatter for dataset token/count displays:
+ * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82".
+ */
+export function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
+  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
+  if (abs > 0 && abs < 1) return n.toFixed(2);
+  return String(Math.round(n));
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index a577193b..12ecb4a4 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -4,14 +4,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
 import { createPortal } from 'react-dom';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
-
-/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 // Stacked-bar segment colors. Cached prefix vs uncached input vs output —
 // fixed hues (theme-independent) so the meaning is stable in light/dark.

From e2e5424e7071d380d05b7c1bcfddfc5bccfc3c5b Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 10:26:34 -0500
Subject: [PATCH 091/111] refactor(db): squash agentic migrations into
 007_agentic.sql so numbering doesn't collide with master

---
 .claude/agents/ingest.md                      |   2 +-
 .../db/migrations/002_agentic_scenario.sql    |  30 --
 .../migrations/003_agentic_availability.sql   |  21 --
 packages/db/migrations/004_offload_mode.sql   |  42 ---
 .../migrations/006_agentic_trace_replay.sql   |  34 --
 packages/db/migrations/007_agentic.sql        | 326 ++++++++++++++++++
 .../007_agentic_trace_server_metrics_json.sql |  17 -
 .../008_agentic_aggregate_stats.sql           |  18 -
 .../migrations/009_agentic_chart_series.sql   |  19 -
 .../010_agentic_request_timeline.sql          |  15 -
 packages/db/migrations/011_datasets.sql       |  55 ---
 packages/db/migrations/012_run_datasets.sql   |  19 -
 12 files changed, 327 insertions(+), 271 deletions(-)
 delete mode 100644 packages/db/migrations/002_agentic_scenario.sql
 delete mode 100644 packages/db/migrations/003_agentic_availability.sql
 delete mode 100644 packages/db/migrations/004_offload_mode.sql
 delete mode 100644 packages/db/migrations/006_agentic_trace_replay.sql
 create mode 100644 packages/db/migrations/007_agentic.sql
 delete mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql
 delete mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql
 delete mode 100644 packages/db/migrations/009_agentic_chart_series.sql
 delete mode 100644 packages/db/migrations/010_agentic_request_timeline.sql
 delete mode 100644 packages/db/migrations/011_datasets.sql
 delete mode 100644 packages/db/migrations/012_run_datasets.sql

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
index aa0099ac..4ecbc1dd 100644
--- a/.claude/agents/ingest.md
+++ b/.claude/agents/ingest.md
@@ -178,7 +178,7 @@ cd packages/db && DATABASE_WRITE_URL='<direct write url>' \
   [--label "…"] [--variant full|256k] [--description "…"] [--limit N]
 ```
 
-It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
+It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
 
 ## Don't
 
diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql
deleted file mode 100644
index c143914e..00000000
--- a/packages/db/migrations/002_agentic_scenario.sql
+++ /dev/null
@@ -1,30 +0,0 @@
--- Support agentic scenarios in benchmark_results.
---
--- Scenarios are discriminated by benchmark_type:
---   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
---   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
---
--- conc retains its meaning (concurrent users/requests) for both.
-
--- 1) isl/osl become nullable for agentic rows
-alter table benchmark_results
-  alter column isl drop not null,
-  alter column osl drop not null;
-
--- 2) CHECK constraints: positive-or-null
-alter table benchmark_results
-  drop constraint benchmark_results_isl_positive,
-  drop constraint benchmark_results_osl_positive;
-
-alter table benchmark_results
-  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
-  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
-
--- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
---    can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
-alter table benchmark_results
-  drop constraint benchmark_results_unique;
-
-alter table benchmark_results
-  add constraint benchmark_results_unique unique nulls not distinct
-    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql
deleted file mode 100644
index e96cbd50..00000000
--- a/packages/db/migrations/003_agentic_availability.sql
+++ /dev/null
@@ -1,21 +0,0 @@
--- Extend the availability table to cover agentic scenarios.
---
--- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
--- for availability and add benchmark_type so the frontend can enumerate
--- agentic vs single_turn scenarios per model/date.
---
--- Postgres primary keys require every column to be NOT NULL, so we drop the PK
--- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
--- equivalent except it allows isl/osl to be NULL for agentic rows.
-
-alter table availability
-  drop constraint availability_pkey;
-
-alter table availability
-  alter column isl drop not null,
-  alter column osl drop not null,
-  add column benchmark_type text not null default 'single_turn';
-
-alter table availability
-  add constraint availability_natural_key unique nulls not distinct
-    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql
deleted file mode 100644
index 24b617f1..00000000
--- a/packages/db/migrations/004_offload_mode.sql
+++ /dev/null
@@ -1,42 +0,0 @@
--- Add offload_mode as a first-class dimension on benchmark_results.
---
--- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
--- runs: a single run may emit two rows for the same (config, isl, osl, conc)
--- — one with offload disabled, one enabled. The pre-existing unique key
--- collapsed those into one row, forcing the ingest to skip variants.
---
--- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
--- assumption baked into the existing 5,500+ rows.
-
-alter table benchmark_results
-  add column offload_mode text not null default 'off';
-
--- Backfill agentic rows from the offload_mode value already living in metrics
--- JSONB (set during the earlier agentic ingest backfill).
-update benchmark_results
-   set offload_mode = metrics->>'offload_mode'
- where benchmark_type = 'agentic_traces'
-   and metrics ? 'offload_mode';
-
--- Replace the unique constraint so on/off variants can coexist.
-alter table benchmark_results
-  drop constraint benchmark_results_unique;
-
-alter table benchmark_results
-  add constraint benchmark_results_unique unique nulls not distinct
-    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
-
--- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
-drop materialized view if exists latest_benchmarks cascade;
-
-create materialized view latest_benchmarks as
-select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
-  br.*
-from benchmark_results br
-join latest_workflow_runs wr on wr.id = br.workflow_run_id
-where br.error is null
-order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
-
-create unique index latest_benchmarks_pk
-  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
-create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql
deleted file mode 100644
index 398bc725..00000000
--- a/packages/db/migrations/006_agentic_trace_replay.sql
+++ /dev/null
@@ -1,34 +0,0 @@
--- Capture raw aiperf trace files per agentic benchmark point.
---
--- The aiperf harness produces two per-point export files inside each
--- `agentic_<suffix>` artifact:
---   - profile_export.jsonl         (~2 MB raw, per-request data)
---   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
---
--- We persist them so the dashboard can later show per-request distributions,
--- KV cache utilization over time, and conversation traces without needing to
--- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
--- ~500 KB per point post-gzip the total fits comfortably without a separate
--- blob service.
---
--- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
--- column on benchmark_results). Older, non-aiperf agentic runs simply have a
--- NULL `trace_replay_id`.
-
-create table agentic_trace_replay (
-  id                                bigserial   primary key,
-  -- gzip(profile_export.jsonl); null when only the server metrics file existed
-  profile_export_jsonl_gz           bytea,
-  profile_export_uncompressed_size  bigint,
-  -- raw csv bytes; null when only the profile file existed
-  server_metrics_csv                bytea,
-  server_metrics_csv_size           bigint,
-  created_at                        timestamptz not null default now()
-);
-
-alter table benchmark_results
-  add column trace_replay_id bigint references agentic_trace_replay(id);
-
-create index benchmark_results_trace_replay_idx
-  on benchmark_results (trace_replay_id)
-  where trace_replay_id is not null;
diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/007_agentic.sql
new file mode 100644
index 00000000..eceea82e
--- /dev/null
+++ b/packages/db/migrations/007_agentic.sql
@@ -0,0 +1,326 @@
+-- 007_agentic.sql
+--
+-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx
+-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts
+-- after master's highest migration (006_benchmark_results_workers), so the
+-- branch's numbering no longer collides with master's 002-006. None of the
+-- collapsed migrations had been applied to any deployed database.
+--
+-- Statement order is preserved exactly. The latest_benchmarks recreate uses
+-- 'select br.*', which is expanded at creation time: the view keeps every column
+-- added earlier (including master's 'workers' from 006), not columns added later in this file, and is re-keyed on offload_mode.
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 002_agentic_scenario.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+--   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+--   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+  alter column isl drop not null,
+  alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+  drop constraint benchmark_results_isl_positive,
+  drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
+--    can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 003_agentic_availability.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The section above (was 002) relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
+-- equivalent except it allows isl/osl to be NULL for agentic rows.
+
+alter table availability
+  drop constraint availability_pkey;
+
+alter table availability
+  alter column isl drop not null,
+  alter column osl drop not null,
+  add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+  add constraint availability_natural_key unique nulls not distinct
+    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 004_offload_mode.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+  add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value already living in metrics
+-- JSONB (set during the earlier agentic ingest backfill).
+update benchmark_results
+   set offload_mode = metrics->>'offload_mode'
+ where benchmark_type = 'agentic_traces'
+   and metrics ? 'offload_mode';
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+  br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 006_agentic_trace_replay.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_<suffix>` artifact:
+--   - profile_export.jsonl         (~2 MB raw, per-request data)
+--   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+  id                                bigserial   primary key,
+  -- gzip(profile_export.jsonl); null when only the server metrics file existed
+  profile_export_jsonl_gz           bytea,
+  profile_export_uncompressed_size  bigint,
+  -- raw csv bytes; null when only the profile file existed
+  server_metrics_csv                bytea,
+  server_metrics_csv_size           bigint,
+  created_at                        timestamptz not null default now()
+);
+
+alter table benchmark_results
+  add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+  on benchmark_results (trace_replay_id)
+  where trace_replay_id is not null;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 007_agentic_trace_server_metrics_json.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+  add column server_metrics_json_gz bytea,
+  add column server_metrics_json_uncompressed_size bigint;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 008_agentic_aggregate_stats.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+  add column aggregate_stats jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 009_agentic_chart_series.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (was migration 008, above): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+  add column chart_series jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 010_agentic_request_timeline.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (was 008) and `chart_series` (was 009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+  add column request_timeline jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 011_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
+-- the agentic benchmarks replay) + their per-conversation trace structure.
+--
+-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
+-- not the source traces. These two tables back the new /datasets area: a
+-- registry of ingested dataset versions with precomputed summary + chart data,
+-- and one row per conversation holding a flamegraph-ready `structure` (turns +
+-- subagent groups with input split into cached-prefix vs uncached-suffix). The
+-- raw hash_ids are NOT stored — they're only needed at ingest to derive the
+-- cached/uncached split, so the runtime read is a single small JSONB.
+--
+-- Additive only. To revert this migration:
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   (and see the run_datasets revert below; this is all one migration now:
+--    delete from schema_migrations where filename = '007_agentic.sql';)
+
+create table datasets (
+  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
+  id          text primary key,
+  -- URL key, e.g. 'cc-traces-weka-062126'.
+  slug        text not null unique,
+  label       text not null,
+  -- 'full' | '256k' | 'no-subagents' (the published variants).
+  variant     text not null default 'full',
+  description text,
+  hf_url      text,
+  license     text,
+  conversation_count integer not null default 0,
+  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
+  summary     jsonb not null default '{}'::jsonb,
+  -- Precomputed distributions for the dataset-detail cards (input/output length,
+  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
+  chart_data  jsonb not null default '{}'::jsonb,
+  dataset_version integer not null default 1,
+  ingested_at timestamptz not null default now()
+);
+
+create table dataset_conversations (
+  id          bigserial primary key,
+  dataset_id  text not null references datasets(id) on delete cascade,
+  -- The conversation id from the dataset record (trace id).
+  conv_id     text not null,
+  models      text[] not null default '{}',
+  num_turns           integer not null default 0,
+  num_subagent_groups integer not null default 0,
+  total_in    bigint not null default 0,
+  total_out   bigint not null default 0,
+  total_cached bigint not null default 0,
+  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
+  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
+  structure   jsonb not null,
+  unique (dataset_id, conv_id)
+);
+
+create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 012_run_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Maps a benchmark workflow_run to the source dataset it replayed, so the
+-- agentic detail page can deep-link each request in the timeline to the exact
+-- conversation in the /datasets viewer (the request's conversation_id, with any
+-- ::sa:/::fa: suffix stripped, is the dataset conv_id).
+--
+-- One row per workflow_run (every benchmark in a run replays the same dataset).
+-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
+-- URL) rather than an FK, so the mapping can be recorded before/independent of
+-- the dataset being ingested; the UI degrades gracefully if the slug is absent.
+--
+-- This section is additive only (earlier sections are not). To revert this whole squashed migration:
+--   drop table if exists run_datasets;
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   drop table if exists agentic_trace_replay cascade;
+--   (plus the benchmark_results/availability column + constraint changes above)
+--   delete from schema_migrations where filename = '007_agentic.sql';
+
+create table run_datasets (
+  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
+  dataset_slug    text not null,
+  created_at      timestamptz not null default now()
+);
diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
deleted file mode 100644
index ba7bd095..00000000
--- a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
+++ /dev/null
@@ -1,17 +0,0 @@
--- Add the full server-metrics time-series JSON to agentic_trace_replay.
---
--- The existing `server_metrics_csv` column holds aiperf's summary export —
--- one row per metric with avg/min/max/std/p1..p99 across the entire run.
--- That's enough for the cumulative cache-hit number but not for any
--- "metric over time" view (KV cache utilization curve, queue depth, prefix
--- hit rate per interval, cumulative prefill token source).
---
--- The harness also writes `server_metrics_export.json` which contains the
--- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
--- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
--- to ~6 MB gzipped (text with repeated metric names + numeric values).
--- That's the file we store here for any future time-series chart.
-
-alter table agentic_trace_replay
-  add column server_metrics_json_gz bytea,
-  add column server_metrics_json_uncompressed_size bigint;
diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql
deleted file mode 100644
index d55533b9..00000000
--- a/packages/db/migrations/008_agentic_aggregate_stats.sql
+++ /dev/null
@@ -1,18 +0,0 @@
--- Pre-computed aggregate stats for each agentic_trace_replay row.
---
--- Previously the agentic detail page parsed the (huge) profile_export.jsonl
--- and server_metrics_json blobs on every request to compute distribution
--- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
--- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
--- worst rows (high-conc TP+EP server_metrics blobs that decompress past
--- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
---
--- This column holds the computed stats so the API serves the page from a
--- single SQL row read. Shape mirrors the existing benchmark_results.metrics
--- JSONB convention; an inner `version` field lets the backfill script
--- detect rows whose stats were computed by an older algorithm and
--- recompute them. Null when stats haven't been computed yet (existing
--- rows pre-backfill; the API has a slow-path fallback for that case).
-
-alter table agentic_trace_replay
-  add column aggregate_stats jsonb;
diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql
deleted file mode 100644
index b42718b9..00000000
--- a/packages/db/migrations/009_agentic_chart_series.sql
+++ /dev/null
@@ -1,19 +0,0 @@
--- Pre-computed time-series for the agentic detail page chart.
---
--- Sibling to `aggregate_stats` (migration 008): that column stores
--- per-row percentile/derived *summaries*, this one stores the full
--- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
--- queueDepth, prefillTps, decodeTps, promptTokensBySource).
---
--- Without this, the detail page parsed the entire `server_metrics_json_gz`
--- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
--- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
--- With pre-computed series the page is a single SQL row read.
---
--- Shape includes an inner `version` field so the backfill script can
--- recompute rows whose stored series were produced by an older algorithm.
--- Null when the series haven't been computed yet; the API has a slow-path
--- fallback (with stream-parse for oversized blobs) for that case.
-
-alter table agentic_trace_replay
-  add column chart_series jsonb;
diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql
deleted file mode 100644
index 756b775e..00000000
--- a/packages/db/migrations/010_agentic_request_timeline.sql
+++ /dev/null
@@ -1,15 +0,0 @@
--- Pre-computed per-request timeline for the agentic detail page.
---
--- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
--- holds a thin per-request array extracted from `profile_export_jsonl_gz`
--- so the detail page can render a Gantt-style swimlane of every request
--- (one bar per conversation turn) without re-parsing the JSONL on every
--- page load.
---
--- Shape includes an inner `version` field so the backfill script can
--- recompute rows whose stored timeline was produced by an older
--- algorithm. Null when the timeline hasn't been computed yet; the API
--- falls back to parsing the blob in that case.
-
-alter table agentic_trace_replay
-  add column request_timeline jsonb;
diff --git a/packages/db/migrations/011_datasets.sql b/packages/db/migrations/011_datasets.sql
deleted file mode 100644
index 7a70d83f..00000000
--- a/packages/db/migrations/011_datasets.sql
+++ /dev/null
@@ -1,55 +0,0 @@
--- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
--- the agentic benchmarks replay) + their per-conversation trace structure.
---
--- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
--- not the source traces. These two tables back the new /datasets area: a
--- registry of ingested dataset versions with precomputed summary + chart data,
--- and one row per conversation holding a flamegraph-ready `structure` (turns +
--- subagent groups with input split into cached-prefix vs uncached-suffix). The
--- raw hash_ids are NOT stored — they're only needed at ingest to derive the
--- cached/uncached split, so the runtime read is a single small JSONB.
---
--- Additive only. To revert this migration:
---   drop table if exists dataset_conversations;
---   drop table if exists datasets;
---   delete from schema_migrations where filename = '011_datasets.sql';
-
-create table datasets (
-  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
-  id          text primary key,
-  -- URL key, e.g. 'cc-traces-weka-062126'.
-  slug        text not null unique,
-  label       text not null,
-  -- 'full' | '256k' | 'no-subagents' (the published variants).
-  variant     text not null default 'full',
-  description text,
-  hf_url      text,
-  license     text,
-  conversation_count integer not null default 0,
-  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
-  summary     jsonb not null default '{}'::jsonb,
-  -- Precomputed distributions for the dataset-detail cards (input/output length,
-  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
-  chart_data  jsonb not null default '{}'::jsonb,
-  dataset_version integer not null default 1,
-  ingested_at timestamptz not null default now()
-);
-
-create table dataset_conversations (
-  id          bigserial primary key,
-  dataset_id  text not null references datasets(id) on delete cascade,
-  -- The conversation id from the dataset record (trace id).
-  conv_id     text not null,
-  models      text[] not null default '{}',
-  num_turns           integer not null default 0,
-  num_subagent_groups integer not null default 0,
-  total_in    bigint not null default 0,
-  total_out   bigint not null default 0,
-  total_cached bigint not null default 0,
-  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
-  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
-  structure   jsonb not null,
-  unique (dataset_id, conv_id)
-);
-
-create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);
diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql
deleted file mode 100644
index 58dd9f88..00000000
--- a/packages/db/migrations/012_run_datasets.sql
+++ /dev/null
@@ -1,19 +0,0 @@
--- Maps a benchmark workflow_run to the source dataset it replayed, so the
--- agentic detail page can deep-link each request in the timeline to the exact
--- conversation in the /datasets viewer (the request's conversation_id, with any
--- ::sa:/::fa: suffix stripped, is the dataset conv_id).
---
--- One row per workflow_run (every benchmark in a run replays the same dataset).
--- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
--- URL) rather than an FK, so the mapping can be recorded before/independent of
--- the dataset being ingested; the UI degrades gracefully if the slug is absent.
---
--- Additive only. To revert:
---   drop table if exists run_datasets;
---   delete from schema_migrations where filename = '012_run_datasets.sql';
-
-create table run_datasets (
-  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
-  dataset_slug    text not null,
-  created_at      timestamptz not null default now()
-);

From 772dfef5cde7a79d02963a9f151cb43b6592920e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 10:57:37 -0500
Subject: [PATCH 092/111] add agentic time-series and dataset timing

---
 .../e2e/agentic-point-time-series.cy.ts       | 98 +++++++++++++++++++
 .../e2e/datasets-flamegraph-time.cy.ts        | 85 ++++++++++++++++
 .../components/datasets/conversation-view.tsx |  3 +-
 .../datasets/trace-flamegraph.test.ts         | 16 +++
 .../components/datasets/trace-flamegraph.tsx  | 35 +++++++
 .../agentic-point/agentic-point-detail.tsx    | 97 +++++++++++++++++-
 .../agentic-point/expandable-chart.tsx        | 30 ++++--
 .../agentic-point/time-series-chart.test.ts   | 73 +++++++++++++-
 .../agentic-point/time-series-chart.tsx       | 60 ++++++++++++
 .../app/src/hooks/api/use-request-timeline.ts |  2 +
 .../src/etl/compute-request-timeline.test.ts  | 25 ++++-
 .../db/src/etl/compute-request-timeline.ts    | 12 ++-
 packages/db/src/etl/weka-structure.test.ts    | 28 +++++-
 packages/db/src/etl/weka-structure.ts         | 40 ++++++++
 14 files changed, 586 insertions(+), 18 deletions(-)
 create mode 100644 packages/app/cypress/e2e/agentic-point-time-series.cy.ts
 create mode 100644 packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph.test.ts

diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
new file mode 100644
index 00000000..b0cfb60d
--- /dev/null
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -0,0 +1,98 @@
+const timelineRequest = ( // builds one mocked request-timeline record for the intercept below
+  index: number,
+  ttftMs: number,
+  tpotMs: number,
+  overrides: Record<string, unknown> = {},
+) => ({
+  cid: 'conversation-1',
+  ti: index,
+  wid: 'worker-1',
+  ad: 0,
+  phase: 'profiling',
+  credit: index * 1_000_000_000,
+  start: index * 1_000_000_000,
+  ack: null,
+  end: (index + 1) * 1_000_000_000, // each mocked request ends 1e9 after its start
+  ttftMs,
+  tpotMs,
+  isl: 1024,
+  osl: 128,
+  cancelled: false,
+  ...overrides, // spread last so a test can replace any default (e.g. phase, cancelled)
+});
+
+describe('Agentic point request metric time series', () => { // e2e for the TTFT / interactivity charts
+  before(() => {
+    cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); // stub the other detail endpoints
+    cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null });
+    cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/request-timeline*', {
+      body: {
+        version: 3,
+        startNs: 0,
+        endNs: 7_000_000_000,
+        durationS: 7,
+        requests: [
+          timelineRequest(0, 100, 10),
+          timelineRequest(1, 200, 20),
+          timelineRequest(2, 400, 25),
+          timelineRequest(3, 800, 40),
+          timelineRequest(4, 1600, 80),
+          timelineRequest(5, 3200, 160, { phase: 'warmup' }), // not a profiling request
+          timelineRequest(6, 6400, 320, { cancelled: true }), // cancelled request
+        ],
+      },
+    });
+    cy.visit('/inference/agentic/206885');
+  });
+
+  it('renders rolling P75 interactivity and TTFT using profiling requests only', () => {
+    cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+      cy.contains('h2', 'Interactivity over time').should('be.visible');
+      cy.get('[data-testid="interactivity-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P75');
+      cy.get('svg circle').should('have.length', 5); // 7 mocked requests minus warmup and cancelled
+      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
+      cy.get('svg').should('contain.text', '1 / cumulative mean TPOT');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); // #ef4444 = cumulative series
+    });
+
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.contains('h2', 'TTFT over time').should('be.visible');
+      cy.get('svg circle').should('have.length', 5);
+      cy.get('svg').should('contain.text', 'TTFT (s)');
+      cy.get('svg').should('contain.text', 'Cumulative mean TTFT');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+  });
+
+  it('switches each chart independently from P75 to P90', () => {
+    cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+      cy.contains('svg', 'P75 (rolling 50 req)')
+        .find('path')
+        .first()
+        .invoke('attr', 'd')
+        .as('p75Path'); // remember the P75 path data to compare after switching
+      cy.contains('button', 'P90').click();
+      cy.get('[data-testid="interactivity-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P90');
+      cy.contains('svg', 'P90 (rolling 50 req)')
+        .find('path')
+        .first()
+        .invoke('attr', 'd')
+        .then(function (p90Path) { // function (not arrow) so the alias is readable as this.p75Path
+          expect(p90Path).not.to.equal(this.p75Path);
+        });
+    });
+
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.get('[data-testid="ttft-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P75'); // still P75: the interactivity click above must not leak here
+      cy.contains('button', 'P90').click();
+      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+    });
+  });
+});
diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
new file mode 100644
index 00000000..672675a3
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
@@ -0,0 +1,85 @@
+describe('Dataset conversation flamegraph timing', () => { // e2e for the flamegraph time column
+  before(() => {
+    cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', {
+      body: {
+        conv_id: 'conversation-1',
+        models: ['model-a'],
+        num_turns: 2,
+        num_subagent_groups: 1,
+        total_in: 1000,
+        total_out: 100,
+        total_cached: 500,
+        structure: {
+          blockSize: 64,
+          totals: {
+            in: 1000,
+            out: 100,
+            cached: 500,
+            uncached: 500,
+            numTurns: 2,
+            numSubagentGroups: 1,
+          },
+          nodes: [
+            {
+              kind: 'turn',
+              turnIndex: 0,
+              startS: 0,
+              model: 'model-a',
+              in: 100,
+              out: 10,
+              cached: 0,
+              uncached: 100,
+            },
+            {
+              kind: 'subagent',
+              label: 'Explore',
+              agentId: 'agent-1',
+              startS: 3661.2, // past one hour, so the label uses the H:MM:SS form
+              endS: 3782.6,
+              durationMs: 121_400,
+              in: 800,
+              out: 80,
+              cached: 500,
+              uncached: 300,
+              children: [
+                {
+                  kind: 'turn',
+                  turnIndex: 1,
+                  startS: 3661.2,
+                  model: 'model-a',
+                  in: 800,
+                  out: 80,
+                  cached: 500,
+                  uncached: 300,
+                },
+              ],
+            },
+            {
+              kind: 'turn',
+              turnIndex: 2,
+              startS: 65.4, // below one hour, so the label uses the MM:SS form
+              model: 'model-a',
+              in: 100,
+              out: 10,
+              cached: 0,
+              uncached: 100,
+            },
+          ],
+        },
+      },
+    });
+    cy.visit('/datasets/test-dataset/conversations/conversation-1');
+  });
+
+  it('shows turn offsets and a collapsed subagent time range', () => {
+    cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00');
+    cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05'); // 65.4 s rounds to 65 s
+    cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03'); // 3661 s to 3783 s
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist'); // no child row while collapsed
+  });
+
+  it('shows subturn offsets when the subagent group is expanded', () => {
+    cy.contains('button', 'Explore').click(); // the group header toggles expansion
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01');
+  });
+});
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index d39b83d9..57aaa0c3 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -87,7 +87,8 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
         <p className="mb-4 text-xs text-muted-foreground">
           One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
           click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
-          plus generated output.
+          plus generated output. Timestamps are elapsed from conversation start; subagent headers
+          show their full active range.
         </p>
         <TraceFlamegraph
           structure={data.structure}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts
new file mode 100644
index 00000000..00293c00
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts
@@ -0,0 +1,16 @@
+import { describe, expect, it } from 'vitest';
+
+import { formatElapsedTime } from './trace-flamegraph';
+
+describe('formatElapsedTime', () => { // unit tests for the flamegraph elapsed-time formatter
+  it('formats elapsed seconds below and above one hour', () => {
+    expect(formatElapsedTime(0)).toBe('00:00');
+    expect(formatElapsedTime(65.4)).toBe('01:05'); // rounds down to 65 s
+    expect(formatElapsedTime(3661.6)).toBe('1:01:02'); // rounds up to 3662 s; hours are unpadded
+    expect(formatElapsedTime(86_541.149)).toBe('24:02:21'); // hours keep counting past 24
+  });
+
+  it('clamps negative offsets to the conversation origin', () => {
+    expect(formatElapsedTime(-5)).toBe('00:00');
+  });
+});
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 12ecb4a4..d0bbb01f 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -24,6 +24,7 @@ interface VisibleRow {
   key: string;
   label: string;
   sublabel?: string;
+  timeLabel?: string;
   cached: number;
   uncached: number;
   output: number;
@@ -34,6 +35,24 @@ interface VisibleRow {
   groupIndex?: number;
 }
 
+/** Format seconds from conversation start as `MM:SS` (or `H:MM:SS` from one hour up), rounded to whole seconds; negative or non-finite input yields `00:00`. */
+export function formatElapsedTime(seconds: number): string {
+  const total = Number.isFinite(seconds) ? Math.max(0, Math.round(seconds)) : 0;
+  const hours = Math.floor(total / 3600);
+  const minutes = Math.floor((total % 3600) / 60);
+  const secs = total % 60;
+  const mm = String(minutes).padStart(2, '0');
+  const ss = String(secs).padStart(2, '0');
+  return hours > 0 ? `${hours}:${mm}:${ss}` : `${mm}:${ss}`; // hours are unpadded and omitted below 1h
+}
+
+function timeLabel(startS?: number, endS?: number): string | undefined { // "+start" or "+start–end"
+  if (startS === undefined || !Number.isFinite(startS)) return undefined; // no usable start: no label
+  const start = formatElapsedTime(startS);
+  if (endS === undefined || !Number.isFinite(endS) || endS <= startS) return `+${start}`; // instant only
+  return `+${start}–${formatElapsedTime(endS)}`;
+}
+
 interface TooltipState {
   x: number;
   y: number;
@@ -152,6 +171,7 @@ export function TraceFlamegraph({
           key: `t-${i}`,
           label: `Turn ${turnNo}`,
           sublabel: node.model ?? undefined,
+          timeLabel: timeLabel(node.startS),
           cached: node.cached,
           uncached: node.uncached,
           output: node.out,
@@ -168,6 +188,7 @@ export function TraceFlamegraph({
           sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${
             node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : ''
           }`,
+          timeLabel: timeLabel(node.startS, node.endS),
           cached: node.cached,
           uncached: node.uncached,
           output: node.out,
@@ -183,6 +204,7 @@ export function TraceFlamegraph({
               key: `g-${i}-c-${ci}`,
               label: `↳ subturn ${ci + 1}`,
               sublabel: child.model ?? undefined,
+              timeLabel: timeLabel(child.startS),
               cached: child.cached,
               uncached: child.uncached,
               output: child.out,
@@ -291,6 +313,15 @@ export function TraceFlamegraph({
                   )}
                 </div>
 
+                {/* Offset from conversation start. Group rows span the full
+                    subagent lifetime; leaf rows show their start instant. */}
+                <div
+                  className="w-36 shrink-0 text-[11px] tabular-nums text-muted-foreground"
+                  data-testid={`flamegraph-time-${row.key}`}
+                >
+                  {row.timeLabel ?? '—'}
+                </div>
+
                 {/* stacked bar — group headers render as a slim muted summary
                     strip so they read as aggregates, not individual turns. */}
                 <div
@@ -354,6 +385,10 @@ export function TraceFlamegraph({
                   ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
                   : '—'}
               </span>
+              <span>From start</span>
+              <span className="text-right tabular-nums text-foreground">
+                {tooltip.row.timeLabel ?? '—'}
+              </span>
             </div>
           </div>,
           document.body,
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 4a076955..e24b7e6b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -6,7 +6,7 @@ import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
-import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
+import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-request-timeline';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -16,6 +16,7 @@ import {
 } from '@/hooks/api/use-trace-server-metrics';
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
 
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
@@ -30,8 +31,11 @@ import {
   cumulativeUniqueInputTokens,
   inflightUniqueTokens,
   rollingAverage,
+  rollingRequestMetric,
   sumSeries,
   timeRollingAverage,
+  type RequestMetric,
+  type RequestPercentile,
 } from './time-series-chart';
 
 interface Props {
@@ -114,6 +118,83 @@ const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
 ];
 
+const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption<RequestPercentile>[] = [ // toggle choices
+  { value: 'p75', label: 'P75' },
+  { value: 'p90', label: 'P90' },
+];
+
+// Expandable chart of one per-request metric (TTFT or interactivity) over the run, with its own
+// P75/P90 toggle. Unofficial-run overlays cannot open this persisted point-detail route (they have
+// no benchmark_results id or stored request timeline), so these charts only cover DB-backed points.
+function RequestMetricOverTime({
+  title,
+  metric,
+  timeline,
+  isLoading,
+}: {
+  title: string;
+  metric: RequestMetric;
+  timeline: RequestTimeline | null | undefined;
+  isLoading: boolean;
+}) {
+  const [percentile, setPercentile] = useState<RequestPercentile>('p75'); // per-chart state
+  const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null;
+  const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity';
+  const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4';
+
+  const controls = (
+    <SegmentedToggle
+      value={percentile}
+      options={REQUEST_PERCENTILE_OPTIONS}
+      onValueChange={(value) => {
+        setPercentile(value); // update the chart, then report the change to analytics
+        track('inference_agentic_percentile_changed', { metric, percentile: value });
+      }}
+      ariaLabel={`${metricLabel} percentile`}
+      testId={`${metric}-percentile-toggle`}
+    />
+  );
+
+  return (
+    <ExpandableChart
+      title={title}
+      controls={controls}
+      testId={`${metric}-over-time-chart`}
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!timeline) return isLoading ? <Skeleton /> : <Empty />; // no data yet vs. no data at all
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: `${percentile.toUpperCase()} (rolling 50 req)`, // 50 = window passed above
+                data: result?.trend ?? [],
+                rawData: result?.raw,
+                color,
+                strokeWidth: 2.5,
+              },
+              {
+                name: metric === 'ttft' ? 'Cumulative mean TTFT' : '1 / cumulative mean TPOT',
+                data: result?.cumulative ?? [],
+                color: '#ef4444',
+                strokeWidth: 3,
+              },
+            ]}
+            durationS={timeline.durationS}
+            yFmt={
+              metric === 'ttft'
+                ? (value) => `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s`
+                : (value) => `${value.toFixed(0)}`
+            }
+            yAxisLabel={metric === 'ttft' ? 'TTFT (s)' : 'Interactivity (tok/s/user)'}
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
 /** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */
 function toAggPoint(
   sibling: { id: number; label: string },
@@ -254,6 +335,20 @@ export function AgenticPointDetail({ id }: Props) {
             }}
           />
 
+          <RequestMetricOverTime
+            title="Interactivity over time"
+            metric="interactivity"
+            timeline={timelineQuery.data}
+            isLoading={timelineQuery.isLoading}
+          />
+
+          <RequestMetricOverTime
+            title="TTFT over time"
+            metric="ttft"
+            timeline={timelineQuery.data}
+            isLoading={timelineQuery.isLoading}
+          />
+
           <ExpandableChart
             title="KV cache utilization over time"
             render={(expanded) => {
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
index 7c8e4538..cb5987ec 100644
--- a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -13,30 +13,40 @@ import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/u
 export function ExpandableChart({
   title,
   render,
+  controls,
+  testId,
 }: {
   title: string;
   render: (expanded: boolean) => ReactNode;
+  controls?: ReactNode;
+  testId?: string;
 }) {
   const [open, setOpen] = useState(false);
 
   return (
-    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4" data-testid={testId}>
       <div className="flex items-start justify-between mb-3 gap-2">
         <h2 className="text-sm font-semibold text-foreground">{title}</h2>
-        <button
-          type="button"
-          aria-label="Expand chart"
-          onClick={() => setOpen(true)}
-          className="text-muted-foreground hover:text-foreground transition-colors"
-        >
-          <Maximize2 className="size-4" />
-        </button>
+        <div className="flex items-center gap-2">
+          {controls}
+          <button
+            type="button"
+            aria-label="Expand chart"
+            onClick={() => setOpen(true)}
+            className="text-muted-foreground hover:text-foreground transition-colors"
+          >
+            <Maximize2 className="size-4" />
+          </button>
+        </div>
       </div>
       {render(false)}
       <Dialog open={open} onOpenChange={setOpen}>
         <DialogContent className="max-w-[min(96vw,1400px)] w-[min(96vw,1400px)]">
           <DialogHeader>
-            <DialogTitle>{title}</DialogTitle>
+            <div className="flex items-center justify-between gap-3 pr-8">
+              <DialogTitle>{title}</DialogTitle>
+              {controls}
+            </div>
           </DialogHeader>
           <div className="w-full">{render(true)}</div>
         </DialogContent>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
index 64deace4..926772db 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
@@ -1,6 +1,77 @@
 import { describe, expect, it } from 'vitest';
 
-import { cumulativeUniqueInputTokens } from './time-series-chart';
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import { cumulativeUniqueInputTokens, rollingRequestMetric } from './time-series-chart';
+
+const request = ( // fixture: a profiling, non-cancelled RequestRecord ending at `endS` seconds
+  endS: number,
+  ttftMs: number | null,
+  tpotMs: number | null,
+  overrides: Partial<RequestRecord> = {},
+): RequestRecord => ({
+  cid: 'conversation',
+  ti: endS,
+  wid: 'worker',
+  ad: 0,
+  phase: 'profiling',
+  credit: 0,
+  start: 0,
+  ack: null,
+  end: endS * 1e9, // RequestRecord.end is a nanosecond offset
+  ttftMs,
+  tpotMs,
+  isl: 100,
+  osl: 10,
+  cancelled: false,
+  ...overrides, // spread last so a test can replace any default
+});
+
+describe('rollingRequestMetric', () => { // unit tests for the raw / trend / cumulative series
+  it('computes a trailing P75 TTFT over the requested window', () => {
+    const result = rollingRequestMetric(
+      [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)],
+      'ttft',
+      'p75',
+      3,
+    );
+
+    expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 }); // 400 ms reported as 0.4 s
+    expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]); // last: P75 of 200/300/400 ms
+    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]); // running mean
+  });
+
+  it('inverts the rolling TPOT percentile for interactivity', () => {
+    const result = rollingRequestMetric(
+      [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)],
+      'interactivity',
+      'p90',
+      3,
+    );
+
+    expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]); // 1000 / TPOT ms
+    expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8); // P90 of 10/20/30 ms is 28 ms
+    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]); // means 10/15/20 ms
+  });
+
+  it('drops warmup, cancelled, missing, and non-positive samples', () => {
+    const result = rollingRequestMetric(
+      [
+        request(1, 100, 10),
+        request(2, 200, 20, { phase: 'warmup' }),
+        request(3, 300, 30, { cancelled: true }),
+        request(4, null, null), // missing metrics
+        request(5, 0, 0), // non-positive latency
+      ],
+      'ttft',
+      'p90',
+    );
+
+    expect(result.raw).toEqual([{ t: 1, value: 0.1 }]); // only the first request survives
+    expect(result.trend).toEqual([{ t: 1, value: 0.1 }]);
+    expect(result.cumulative).toEqual([{ t: 1, value: 0.1 }]);
+  });
+});
 
 describe('cumulativeUniqueInputTokens', () => {
   it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => {
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 6b00b1e6..749a17e4 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -3,6 +3,7 @@
 import { useMemo } from 'react';
 
 import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
 
 import { ChartHover, type HoverItem } from './chart-hover';
 
@@ -32,6 +33,65 @@ interface TimeSeriesChartProps {
   height?: number;
 }
 
+export type RequestMetric = 'interactivity' | 'ttft';
+export type RequestPercentile = 'p75' | 'p90';
+
+/** Linear-interpolated quantile `q` (0..1) of an ascending-sorted array; expects at least one element (matches numpy's default method). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q; // fractional index of the requested quantile
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!; // lands exactly on an element: nothing to interpolate
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+/**
+ * Build raw per-request samples, a trailing `windowSize`-request percentile
+ * trend, and a cumulative-mean series, each keyed by request end time (s).
+ * Percentiles and means are computed in latency space (ms). TTFT is reported
+ * in seconds; interactivity inverts TPOT (1000 / ms), matching the aggregate
+ * chart convention: P90 interactivity = 1 / P90 TPOT (a conservative view).
+ */
+export function rollingRequestMetric(
+  requests: readonly RequestRecord[],
+  metric: RequestMetric,
+  percentile: RequestPercentile,
+  windowSize = 50,
+): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } {
+  const q = percentile === 'p75' ? 0.75 : 0.9;
+  const samples = requests
+    .filter((request) => request.phase === 'profiling' && !request.cancelled)
+    .flatMap((request) => {
+      const latencyMs = metric === 'ttft' ? request.ttftMs : request.tpotMs;
+      if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return []; // skip
+      return [{ t: request.end / 1e9, latencyMs }]; // ns offset -> seconds
+    })
+    .toSorted((a, b) => a.t - b.t); // chronological by end time, so "trailing" means earlier-ending
+
+  const raw = samples.map(({ t, latencyMs }) => ({
+    t,
+    value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs,
+  }));
+  const trend = samples.map(({ t }, i) => {
+    const start = Math.max(0, i - Math.max(1, windowSize) + 1); // window is at least one sample
+    const sorted = samples
+      .slice(start, i + 1)
+      .map((sample) => sample.latencyMs)
+      .toSorted((a, b) => a - b);
+    const latencyMs = quantile(sorted, q); // slice always includes sample i, so it is non-empty
+    return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs };
+  });
+  let latencySumMs = 0;
+  const cumulative = samples.map(({ t, latencyMs }, i) => {
+    latencySumMs += latencyMs; // running total over all samples up to and including i
+    const meanLatencyMs = latencySumMs / (i + 1);
+    return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs };
+  });
+
+  return { raw, trend, cumulative };
+}
+
 /**
  * Time-weighted rolling average over a `windowS`-second trailing window.
  * Treats the input as a step function (value held constant between
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
index d3ceaab8..094d2230 100644
--- a/packages/app/src/hooks/api/use-request-timeline.ts
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -20,6 +20,8 @@ export interface RequestRecord {
   /** ns offset from timeline.startNs. Last byte received. */
   end: number;
   ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
   isl: number | null;
   osl: number | null;
   cancelled: boolean;
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
index 64512aca..61e69fe8 100644
--- a/packages/db/src/etl/compute-request-timeline.test.ts
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -15,6 +15,8 @@ interface SyntheticRequest {
   end: number;
   ack?: number | null;
   ttftMs?: number | null;
+  tpotMs?: number | null;
+  tpotKey?: 'inter_token_latency' | 'time_per_output_token';
   isl?: number | null;
   osl?: number | null;
   cancelled?: boolean;
@@ -37,6 +39,8 @@ function makeBlob(requests: SyntheticRequest[]) {
       },
       metrics: {
         time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' },
+        [r.tpotKey ?? 'inter_token_latency']:
+          r.tpotMs === null ? null : { value: r.tpotMs ?? 10, unit: 'ms' },
         input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' },
         output_sequence_length: { value: r.osl ?? 10, unit: 'tokens' },
       },
@@ -115,7 +119,7 @@ describe('computeRequestTimeline', () => {
     expect(r.phase).toBe('profiling');
   });
 
-  it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => {
+  it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => {
     const tl = computeRequestTimeline(
       makeBlob([
         {
@@ -125,6 +129,7 @@ describe('computeRequestTimeline', () => {
           start: 10,
           end: 100,
           ttftMs: 25.5,
+          tpotMs: 12.5,
           isl: 1024,
           osl: 256,
           cancelled: true,
@@ -134,10 +139,28 @@ describe('computeRequestTimeline', () => {
     const r = tl?.requests[0]!;
     expect(r.cancelled).toBe(true);
     expect(r.ttftMs).toBeCloseTo(25.5, 6);
+    expect(r.tpotMs).toBeCloseTo(12.5, 6);
     expect(r.isl).toBe(1024);
     expect(r.osl).toBe(256);
   });
 
+  it('accepts time_per_output_token as a TPOT alias', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          tpotMs: 8.25,
+          tpotKey: 'time_per_output_token',
+        },
+      ]),
+    );
+    expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6);
+  });
+
   it('skips records missing both credit_issued_ns and request_start_ns', () => {
     // Build a record with only request_end_ns — the helper rejects it.
     const broken = gzipSync(
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
index a1134f7a..707e8c54 100644
--- a/packages/db/src/etl/compute-request-timeline.ts
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -14,7 +14,7 @@
 import { gunzipSync } from 'node:zlib';
 
 /** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const REQUEST_TIMELINE_VERSION = 1;
+export const REQUEST_TIMELINE_VERSION = 3;
 
 export interface RequestRecord {
   /** Conversation id (groups turns of one agent session). */
@@ -37,6 +37,8 @@ export interface RequestRecord {
   end: number;
   /** Time-to-first-token in ms. */
   ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
   /** Input sequence length (tokens). */
   isl: number | null;
   /** Output sequence length (tokens). */
@@ -76,6 +78,8 @@ interface RawRecord {
   metadata?: RawMetadata;
   metrics?: {
     time_to_first_token?: RawMetricValue | number;
+    time_per_output_token?: RawMetricValue | number;
+    inter_token_latency?: RawMetricValue | number;
     input_sequence_length?: RawMetricValue | number;
     output_sequence_length?: RawMetricValue | number;
   };
@@ -108,6 +112,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
   const raw: {
     meta: RawMetadata;
     ttftMs: number | null;
+    tpotMs: number | null;
     isl: number | null;
     osl: number | null;
   }[] = [];
@@ -135,6 +140,10 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
     raw.push({
       meta,
       ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null,
+      tpotMs:
+        readNum(rec.metrics?.time_per_output_token) ??
+        readNum(rec.metrics?.inter_token_latency) ??
+        null,
       isl: readNum(rec.metrics?.input_sequence_length) ?? null,
       osl: readNum(rec.metrics?.output_sequence_length) ?? null,
     });
@@ -163,6 +172,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
       ack,
       end,
       ttftMs: r.ttftMs,
+      tpotMs: r.tpotMs,
       isl: r.isl,
       osl: r.osl,
       cancelled: m.was_cancelled === true,
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index 95bfef38..5287b682 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -86,17 +86,18 @@ describe('buildConversationStructure', () => {
       id: 'c4',
       block_size: 64,
       requests: [
-        { type: 'n', model: 'main', in: 64, out: 10, hash_ids: [1] },
+        { type: 'n', model: 'main', t: 0, in: 64, out: 10, hash_ids: [1] },
         {
           type: 'subagent',
           agent_id: 'a1',
           subagent_type: 'Explore',
+          t: 12.5,
           duration_ms: 1234,
           requests: [
             // sees parent block 1 (snapshot at spawn) → 1 block cached
-            { type: 'n', model: 'sub', in: 128, out: 7, hash_ids: [1, 5] },
+            { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] },
             // now block 5 is also seen within the subagent → 2 cached
-            { type: 'n', model: 'sub', in: 128, out: 3, hash_ids: [1, 5] },
+            { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] },
           ],
         },
         // Parent turn after subagent: block 5 must NOT be cached (subagent
@@ -113,7 +114,10 @@ describe('buildConversationStructure', () => {
     expect(sub.label).toBe('Explore');
     expect(sub.agentId).toBe('a1');
     expect(sub.durationMs).toBe(1234);
+    expect(sub.startS).toBe(12.5);
+    expect(sub.endS).toBeCloseTo(13.734, 6);
     expect(sub.children).toHaveLength(2);
+    expect(sub.children.map((child) => child.startS)).toEqual([12.5, 13.1]);
     expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
     expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
     expect(sub.in).toBe(256);
@@ -132,6 +136,24 @@ describe('buildConversationStructure', () => {
     expect(s.blockSize).toBe(64);
     expect((s.nodes[0] as SubagentNode).label).toBe('Subagent');
   });
+
+  it('derives a subagent time range from child timings when group timing is absent', () => {
+    const conv: RawWekaConversation = {
+      id: 'c6',
+      requests: [
+        {
+          type: 'subagent',
+          requests: [
+            { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 },
+            { type: 'n', t: 9, api_time: 3, in: 10, out: 1 },
+          ],
+        },
+      ],
+    };
+    const sub = buildConversationStructure(conv).nodes[0] as SubagentNode;
+    expect(sub.startS).toBe(5);
+    expect(sub.endS).toBe(12);
+  });
 });
 
 describe('histograms', () => {
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index e4113c68..33e222b4 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -48,6 +48,8 @@ export interface RawWekaConversation {
 export interface TurnNode {
   kind: 'turn';
   turnIndex: number;
+  /** Seconds from the start of the conversation. */
+  startS?: number;
   model?: string;
   in: number;
   out: number;
@@ -61,6 +63,10 @@ export interface SubagentNode {
   kind: 'subagent';
   label: string;
   agentId?: string;
+  /** Seconds from the start of the conversation. */
+  startS?: number;
+  /** Seconds from the start of the conversation. */
+  endS?: number;
   durationMs?: number;
   in: number;
   out: number;
@@ -130,6 +136,35 @@ function subagentLabel(s: RawWekaSubagent): string {
   return base && base.length > 0 ? base : 'Subagent';
 }
 
+function finiteTime(value: number | undefined): number | undefined {
+  return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined;
+}
+
+function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } {
+  const children = entry.requests ?? [];
+  const childStarts = children
+    .map((child) => finiteTime(child.t))
+    .filter((value): value is number => value !== undefined);
+  const startS =
+    finiteTime(entry.t) ?? (childStarts.length > 0 ? Math.min(...childStarts) : undefined);
+  const durationMs = finiteTime(entry.duration_ms);
+  if (startS !== undefined && durationMs !== undefined) {
+    return { startS, endS: startS + durationMs / 1000 };
+  }
+
+  const childEnds = children
+    .map((child) => {
+      const childStart = finiteTime(child.t);
+      if (childStart === undefined) return undefined;
+      return childStart + (finiteTime(child.api_time) ?? 0);
+    })
+    .filter((value): value is number => value !== undefined);
+  return {
+    startS,
+    endS: childEnds.length > 0 ? Math.max(...childEnds) : startS,
+  };
+}
+
 /**
  * Build the flamegraph structure for one conversation. Main turns share a single
  * accumulating prefix-cache `seen` set; each subagent group runs against a
@@ -153,6 +188,7 @@ export function buildConversationStructure(
 
   for (const entry of conv.requests ?? []) {
     if (isSubagent(entry)) {
+      const { startS, endS } = subagentTimeRange(entry);
       const childSeen = new Set(seen); // snapshot at spawn; not merged back
       const children: TurnNode[] = [];
       let gin = 0;
@@ -165,6 +201,7 @@ export function buildConversationStructure(
         children.push({
           kind: 'turn',
           turnIndex: turnIndex++,
+          startS: finiteTime(inner.t),
           model: inner.model,
           in: split.in,
           out,
@@ -180,6 +217,8 @@ export function buildConversationStructure(
         kind: 'subagent',
         label: subagentLabel(entry),
         agentId: entry.agent_id,
+        startS,
+        endS,
         durationMs: entry.duration_ms,
         in: gin,
         out: gout,
@@ -198,6 +237,7 @@ export function buildConversationStructure(
       nodes.push({
         kind: 'turn',
         turnIndex: turnIndex++,
+        startS: finiteTime(entry.t),
         model: entry.model,
         in: split.in,
         out,

From 13471d75072d574d42be008a462dbfce9467c95d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 13:44:55 -0500
Subject: [PATCH 093/111] add dataset percentile distributions

---
 .../component/distribution-card.cy.tsx        | 41 ++++++++-
 .../cypress/e2e/datasets-distributions.cy.ts  | 90 +++++++++++++++++++
 .../components/datasets/dataset-detail.tsx    |  6 ++
 .../components/datasets/distribution-card.tsx | 23 +++--
 packages/app/src/hooks/api/use-datasets.ts    |  5 ++
 packages/db/src/etl/weka-structure.test.ts    | 18 ++++
 packages/db/src/etl/weka-structure.ts         | 46 ++++++++++
 packages/db/src/ingest-weka-dataset.ts        | 50 ++++-------
 8 files changed, 235 insertions(+), 44 deletions(-)
 create mode 100644 packages/app/cypress/e2e/datasets-distributions.cy.ts

diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx
index fb7e5461..511505b9 100644
--- a/packages/app/cypress/component/distribution-card.cy.tsx
+++ b/packages/app/cypress/component/distribution-card.cy.tsx
@@ -8,7 +8,16 @@ const distribution: Distribution = {
     { x0: 200, x1: 300, count: 12 },
     { x0: 300, x1: 400, count: 3 },
   ],
-  stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 },
+  stats: {
+    count: 40,
+    min: 10,
+    max: 390,
+    mean: 180,
+    median: 175,
+    p75: 250,
+    p90: 320,
+    p95: 360,
+  },
 };
 
 describe('DistributionCard', () => {
@@ -18,8 +27,13 @@ describe('DistributionCard', () => {
     );
     cy.contains('Input tokens per turn').should('be.visible');
     cy.contains('n=40').should('be.visible');
-    cy.contains('median 175').should('be.visible');
+    cy.contains('p50 175').should('be.visible');
+    cy.contains('p75 250').should('be.visible');
     cy.contains('p90 320').should('be.visible');
+    cy.contains('p95 360').should('be.visible');
+    cy.get(
+      'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]',
+    ).should('have.length', 8);
     // One filled bar rect per bin (ChartHover may add a transparent overlay rect).
     cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length);
   });
@@ -42,4 +56,27 @@ describe('DistributionCard', () => {
     );
     cy.contains('log scale').should('be.visible');
   });
+
+  it('renders older v1 stats without unavailable percentile guides', () => {
+    cy.mount(
+      <DistributionCard
+        title="Legacy metric"
+        unit="tok"
+        distribution={{
+          bins: distribution.bins,
+          stats: {
+            count: 40,
+            min: 10,
+            max: 390,
+            mean: 180,
+            median: 175,
+            p90: 320,
+          },
+        }}
+      />,
+    );
+    cy.contains('p50 175').should('be.visible');
+    cy.contains('p90 320').should('be.visible');
+    cy.contains('NaN').should('not.exist');
+  });
 });
diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts
new file mode 100644
index 00000000..7edda341
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts
@@ -0,0 +1,90 @@
+const distribution = (values: {
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+  max: number;
+}) => ({
+  bins: [
+    { x0: 0, x1: 10, count: 5 },
+    { x0: 10, x1: 100, count: 15 },
+  ],
+  stats: {
+    count: 20,
+    min: 0,
+    mean: 40,
+    ...values,
+  },
+});
+
+describe('Dataset distribution percentiles', () => {
+  before(() => {
+    cy.intercept('GET', '/api/v1/datasets/test-dataset', {
+      body: {
+        id: 'test-dataset',
+        slug: 'test-dataset',
+        label: 'Test dataset',
+        variant: 'full',
+        description: null,
+        hf_url: null,
+        license: 'apache-2.0',
+        conversation_count: 1,
+        summary: {
+          mainTurns: 20,
+          subagentGroups: 0,
+          subagentTurns: 0,
+          cachedPct: 0.5,
+          totalIn: 1000,
+          totalOut: 200,
+        },
+        chart_data: {
+          version: 2,
+          inputTokensPerTurn: distribution({
+            median: 100,
+            p75: 200,
+            p90: 300,
+            p95: 400,
+            max: 500,
+          }),
+          outputTokensPerTurn: distribution({
+            median: 10,
+            p75: 20,
+            p90: 30,
+            p95: 40,
+            max: 50,
+          }),
+          uncachedInputTokensPerTurn: distribution({
+            median: 0,
+            p75: 64,
+            p90: 128,
+            p95: 256,
+            max: 512,
+          }),
+        },
+        ingested_at: '2026-06-23T00:00:00Z',
+      },
+    });
+    cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', {
+      body: { total: 0, items: [] },
+    });
+    cy.visit('/datasets/test-dataset');
+  });
+
+  it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => {
+    const expected = [
+      ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+      ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']],
+      ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']],
+    ] as const;
+
+    for (const [title, percentiles] of expected) {
+      cy.contains('[data-slot="card"]', title).within(() => {
+        for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+        cy.get('svg line[stroke="#3b82f6"]').should('exist');
+        cy.get('svg line[stroke="#22c55e"]').should('exist');
+        cy.get('svg line[stroke="#f59e0b"]').should('exist');
+        cy.get('svg line[stroke="#ef4444"]').should('exist');
+      });
+    }
+  });
+});
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index 9410a505..ac8b2de5 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -145,6 +145,12 @@ export function DatasetDetail({ slug }: { slug: string }) {
             scale="log"
             distribution={cd.outputTokensPerTurn}
           />
+          <DistributionCard
+            title="Uncached input tokens per request"
+            unit="tokens"
+            scale="log"
+            distribution={cd.uncachedInputTokensPerTurn}
+          />
           <DistributionCard
             title="Turns per conversation"
             unit="turns"
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
index d0c0f166..3d0e45d7 100644
--- a/packages/app/src/components/datasets/distribution-card.tsx
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -23,7 +23,7 @@ const PAD = { top: 12, right: 16, bottom: 48, left: 52 };
 
 /**
  * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a
- * themeable bar chart with median/p90 guide lines and a hover tooltip. Bars are
+ * themeable bar chart with p50/p75/p90/p95 guide lines and a hover tooltip. Bars are
  * drawn at equal visual width; for log-scaled bins the edge labels are already
  * log-spaced so the shape reads as a log histogram.
  */
@@ -71,11 +71,17 @@ export function DistributionCard({
   const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed;
   const stats = distribution?.stats;
 
-  const guides = stats
-    ? ([
-        { label: 'median', value: stats.median, color: '#3b82f6' },
+  const guides: { label: string; value: number; color: string }[] = stats
+    ? [
+        { label: 'p50', value: stats.median, color: '#3b82f6' },
+        ...(typeof stats.p75 === 'number'
+          ? [{ label: 'p75', value: stats.p75, color: '#22c55e' }]
+          : []),
         { label: 'p90', value: stats.p90, color: '#f59e0b' },
-      ] as const)
+        ...(typeof stats.p95 === 'number'
+          ? [{ label: 'p95', value: stats.p95, color: '#ef4444' }]
+          : []),
+      ]
     : [];
 
   // X tick labels from a few bin edges.
@@ -108,8 +114,11 @@ export function DistributionCard({
       {subtitle && <div className="mb-1 text-xs text-muted-foreground">{subtitle}</div>}
       {stats && (
         <div className="mb-2 text-xs text-muted-foreground">
-          n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '}
-          {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit}
+          n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)}
+          {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}</>} · p90{' '}
+          {formatValue(stats.p90)}
+          {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}</>} · max{' '}
+          {formatValue(stats.max)} {unit}
         </div>
       )}
       <div className="w-full text-muted-foreground">
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
index 3ce61a85..96b0f59f 100644
--- a/packages/app/src/hooks/api/use-datasets.ts
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -46,7 +46,11 @@ export interface DistributionStats {
   max: number;
   mean: number;
   median: number;
+  /** Added in chart_data v2. */
+  p75?: number;
   p90: number;
+  /** Added in chart_data v2. */
+  p95?: number;
 }
 
 export interface Distribution {
@@ -57,6 +61,7 @@ export interface Distribution {
 export interface DatasetChartData {
   version?: number;
   inputTokensPerTurn?: Distribution;
+  uncachedInputTokensPerTurn?: Distribution;
   outputTokensPerTurn?: Distribution;
   turnsPerConversation?: Distribution;
   subagentGroupsPerConversation?: Distribution;
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index 5287b682..4debf1ae 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -4,6 +4,8 @@ import {
   buildConversationStructure,
   linearHistogram,
   logHistogram,
+  logHistogramWithZero,
+  summarizeValues,
   type RawWekaConversation,
   type SubagentNode,
   type TurnNode,
@@ -177,4 +179,20 @@ describe('histograms', () => {
     expect(linearHistogram([])).toEqual([]);
     expect(logHistogram([])).toEqual([]);
   });
+
+  it('preserves zero-valued samples in a dedicated log histogram bin', () => {
+    const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4);
+    expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 });
+    expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5);
+  });
+});
+
+describe('summarizeValues', () => {
+  it('computes the same linearly-interpolated percentile set as request distributions', () => {
+    const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1));
+    expect(summary.median).toBeCloseTo(50.5, 6);
+    expect(summary.p75).toBeCloseTo(75.25, 6);
+    expect(summary.p90).toBeCloseTo(90.1, 6);
+    expect(summary.p95).toBeCloseTo(95.05, 6);
+  });
 });
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index 33e222b4..ac7a6eab 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -274,6 +274,42 @@ export interface HistogramBin {
   count: number;
 }
 
+export interface NumberSummary {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+}
+
+/** Summary stats with linear-interpolated percentiles; empty input yields an all-zero summary. */
+export function summarizeValues(values: readonly number[]): NumberSummary {
+  if (values.length === 0) {
+    return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 };
+  }
+  const sorted = values.toSorted((a, b) => a - b); // toSorted already copies; input is untouched
+  const quantile = (q: number): number => {
+    const pos = (sorted.length - 1) * q; // fractional rank between the two neighbouring samples
+    const lo = Math.floor(pos);
+    const hi = Math.ceil(pos);
+    if (lo === hi) return sorted[lo]!;
+    return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+  };
+  return {
+    count: sorted.length,
+    min: sorted[0]!,
+    max: sorted.at(-1)!,
+    mean: sorted.reduce((sum, value) => sum + value, 0) / sorted.length,
+    median: quantile(0.5),
+    p75: quantile(0.75),
+    p90: quantile(0.9),
+    p95: quantile(0.95),
+  };
+}
+
 /** Linear-width histogram over [0, max]. Empty input → []. */
 export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] {
   if (values.length === 0) return [];
@@ -313,3 +349,13 @@ export function logHistogram(values: readonly number[], bins = 40): HistogramBin
   }
   return out;
 }
+
+/** Log-width histogram with zeros kept in a dedicated first bin; negative/NaN values are dropped. */
+export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] {
+  const zeroCount = values.filter((value) => value === 0).length;
+  const positive = values.filter((value) => value > 0);
+  if (zeroCount === 0) return logHistogram(positive, bins);
+  if (positive.length === 0) return [{ x0: 0, x1: 1, count: zeroCount }];
+  const positiveBins = logHistogram(positive, Math.max(1, bins - 1));
+  return [{ x0: 0, x1: positiveBins[0]?.x0 ?? 1, count: zeroCount }, ...positiveBins];
+}
diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
index 22069419..e00471d7 100644
--- a/packages/db/src/ingest-weka-dataset.ts
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -24,6 +24,8 @@ import {
   buildConversationStructure,
   linearHistogram,
   logHistogram,
+  logHistogramWithZero,
+  summarizeValues,
   type ConversationStructure,
   type RawWekaConversation,
   type TurnNode,
@@ -140,6 +142,7 @@ async function* iterRows(
 
 interface Accumulator {
   inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children)
+  uncachedInputPerTurn: number[];
   outputPerTurn: number[];
   cachedFractionPerTurn: number[]; // cached/in, for turns with in>0
   turnsPerConv: number[]; // main (top-level) turns
@@ -157,6 +160,7 @@ interface Accumulator {
 function newAccumulator(): Accumulator {
   return {
     inputPerTurn: [],
+    uncachedInputPerTurn: [],
     outputPerTurn: [],
     cachedFractionPerTurn: [],
     turnsPerConv: [],
@@ -174,6 +178,7 @@ function newAccumulator(): Accumulator {
 
 function recordTurn(acc: Accumulator, t: TurnNode): void {
   acc.inputPerTurn.push(t.in);
+  acc.uncachedInputPerTurn.push(t.uncached);
   acc.outputPerTurn.push(t.out);
   if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in);
   if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 0) + 1;
@@ -198,57 +203,32 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void {
   }
 }
 
-interface NumberSummary {
-  count: number;
-  min: number;
-  max: number;
-  mean: number;
-  median: number;
-  p90: number;
-}
-
-function summarize(values: number[]): NumberSummary {
-  if (values.length === 0) {
-    return { count: 0, min: 0, max: 0, mean: 0, median: 0, p90: 0 };
-  }
-  const sorted = [...values].toSorted((a, b) => a - b);
-  const n = sorted.length;
-  // Quantile by position; q(0)=min, q(1)=max — avoids array-tail indexing that
-  // the linter rewrites to `.at(-1)` (which widens the type to `| undefined`).
-  const q = (p: number) => sorted[Math.min(n - 1, Math.max(0, Math.floor(p * (n - 1))))];
-  const sum = sorted.reduce((a, b) => a + b, 0);
-  return {
-    count: n,
-    min: q(0),
-    max: q(1),
-    mean: sum / n,
-    median: q(0.5),
-    p90: q(0.9),
-  };
-}
-
 function buildChartData(acc: Accumulator) {
   return {
-    version: 1,
+    version: 2,
     inputTokensPerTurn: {
       bins: logHistogram(acc.inputPerTurn),
-      stats: summarize(acc.inputPerTurn),
+      stats: summarizeValues(acc.inputPerTurn),
+    },
+    uncachedInputTokensPerTurn: {
+      bins: logHistogramWithZero(acc.uncachedInputPerTurn),
+      stats: summarizeValues(acc.uncachedInputPerTurn),
     },
     outputTokensPerTurn: {
       bins: logHistogram(acc.outputPerTurn),
-      stats: summarize(acc.outputPerTurn),
+      stats: summarizeValues(acc.outputPerTurn),
     },
     turnsPerConversation: {
       bins: linearHistogram(acc.turnsPerConv),
-      stats: summarize(acc.turnsPerConv),
+      stats: summarizeValues(acc.turnsPerConv),
     },
     subagentGroupsPerConversation: {
       bins: linearHistogram(acc.subagentGroupsPerConv),
-      stats: summarize(acc.subagentGroupsPerConv),
+      stats: summarizeValues(acc.subagentGroupsPerConv),
     },
     cachedFractionPerTurn: {
       bins: linearHistogram(acc.cachedFractionPerTurn, 20),
-      stats: summarize(acc.cachedFractionPerTurn),
+      stats: summarizeValues(acc.cachedFractionPerTurn),
     },
   };
 }

From 8bfe66408d6b8514031e47af1b94ede19c369d97 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 16:10:02 -0500
Subject: [PATCH 094/111] use cumulative percentiles for agentic charts

---
 .../e2e/agentic-point-time-series.cy.ts       | 34 ++++++++++---------
 .../agentic-point/agentic-point-detail.tsx    |  7 ++--
 .../agentic-point/time-series-chart.test.ts   |  4 +--
 .../agentic-point/time-series-chart.tsx       | 20 ++++++++---
 4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
index b0cfb60d..db59dda2 100644
--- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -46,15 +46,15 @@ describe('Agentic point request metric time series', () => {
     cy.visit('/inference/agentic/206885');
   });
 
-  it('renders rolling P75 interactivity and TTFT using profiling requests only', () => {
+  it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => {
     cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
       cy.contains('h2', 'Interactivity over time').should('be.visible');
       cy.get('[data-testid="interactivity-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
-        .should('have.text', 'P75');
+        .should('have.text', 'P90');
       cy.get('svg circle').should('have.length', 5);
-      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
-      cy.get('svg').should('contain.text', '1 / cumulative mean TPOT');
+      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+      cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT');
       cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
     });
 
@@ -62,37 +62,39 @@ describe('Agentic point request metric time series', () => {
       cy.contains('h2', 'TTFT over time').should('be.visible');
       cy.get('svg circle').should('have.length', 5);
       cy.get('svg').should('contain.text', 'TTFT (s)');
-      cy.get('svg').should('contain.text', 'Cumulative mean TTFT');
+      cy.get('svg').should('contain.text', 'Cumulative P90 TTFT');
       cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
     });
   });
 
-  it('switches each chart independently from P75 to P90', () => {
+  it('switches each chart independently from P90 to P75', () => {
     cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
-      cy.contains('svg', 'P75 (rolling 50 req)')
+      cy.contains('svg', 'P90 (rolling 50 req)')
         .find('path')
         .first()
         .invoke('attr', 'd')
-        .as('p75Path');
-      cy.contains('button', 'P90').click();
+        .as('p90Path');
+      cy.contains('button', 'P75').click();
       cy.get('[data-testid="interactivity-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
-        .should('have.text', 'P90');
-      cy.contains('svg', 'P90 (rolling 50 req)')
+        .should('have.text', 'P75');
+      cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT');
+      cy.contains('svg', 'P75 (rolling 50 req)')
         .find('path')
         .first()
         .invoke('attr', 'd')
-        .then(function (p90Path) {
-          expect(p90Path).not.to.equal(this.p75Path);
+        .then(function (p75Path) {
+          expect(p75Path).not.to.equal(this.p90Path);
         });
     });
 
     cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
       cy.get('[data-testid="ttft-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
-        .should('have.text', 'P75');
-      cy.contains('button', 'P90').click();
-      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+        .should('have.text', 'P90');
+      cy.contains('button', 'P75').click();
+      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
+      cy.get('svg').should('contain.text', 'Cumulative P75 TTFT');
     });
   });
 });
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index e24b7e6b..e1bc1524 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -137,7 +137,7 @@ function RequestMetricOverTime({
   timeline: RequestTimeline | null | undefined;
   isLoading: boolean;
 }) {
-  const [percentile, setPercentile] = useState<RequestPercentile>('p75');
+  const [percentile, setPercentile] = useState<RequestPercentile>('p90');
   const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null;
   const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity';
   const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4';
@@ -174,7 +174,10 @@ function RequestMetricOverTime({
                 strokeWidth: 2.5,
               },
               {
-                name: metric === 'ttft' ? 'Cumulative mean TTFT' : '1 / cumulative mean TPOT',
+                name:
+                  metric === 'ttft'
+                    ? `Cumulative ${percentile.toUpperCase()} TTFT`
+                    : `1 / cumulative ${percentile.toUpperCase()} TPOT`,
                 data: result?.cumulative ?? [],
                 color: '#ef4444',
                 strokeWidth: 3,
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
index 926772db..3506ff45 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
@@ -38,7 +38,7 @@ describe('rollingRequestMetric', () => {
 
     expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 });
     expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]);
-    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]);
+    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.325]);
   });
 
   it('inverts the rolling TPOT percentile for interactivity', () => {
@@ -51,7 +51,7 @@ describe('rollingRequestMetric', () => {
 
     expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]);
     expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8);
-    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]);
+    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]);
   });
 
   it('drops warmup, cancelled, missing, and non-positive samples', () => {
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 749a17e4..0c0b5739 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -82,11 +82,21 @@ export function rollingRequestMetric(
     const latencyMs = quantile(sorted, q);
     return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs };
   });
-  let latencySumMs = 0;
-  const cumulative = samples.map(({ t, latencyMs }, i) => {
-    latencySumMs += latencyMs;
-    const meanLatencyMs = latencySumMs / (i + 1);
-    return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs };
+  const prefixLatencies: number[] = [];
+  const cumulative = samples.map(({ t, latencyMs }) => {
+    let lo = 0;
+    let hi = prefixLatencies.length;
+    while (lo < hi) {
+      const mid = (lo + hi) >> 1;
+      if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1;
+      else hi = mid;
+    }
+    prefixLatencies.splice(lo, 0, latencyMs);
+    const cumulativeLatencyMs = quantile(prefixLatencies, q);
+    return {
+      t,
+      value: metric === 'ttft' ? cumulativeLatencyMs / 1000 : 1000 / cumulativeLatencyMs,
+    };
   });
 
   return { raw, trend, cumulative };

From e3e0bf43ddec5dd8c1d4f21e1c3f9baff469f8f9 Mon Sep 17 00:00:00 2001
From: Alec Ibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 18:34:16 -0500
Subject: [PATCH 095/111] fix(db): build each chart line from a single run, no
 cross-run/date stitching (#491)

---
 ..._latest_benchmarks_single_run_per_line.sql |  49 +++++
 .../src/json-provider.line-single-run.test.ts | 203 ++++++++++++++++++
 packages/db/src/json-provider.ts              |  50 +++--
 packages/db/src/queries/benchmarks.ts         |  58 +++--
 4 files changed, 323 insertions(+), 37 deletions(-)
 create mode 100644 packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
 create mode 100644 packages/db/src/json-provider.line-single-run.test.ts

diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
new file mode 100644
index 00000000..039dfe09
--- /dev/null
+++ b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
@@ -0,0 +1,49 @@
+-- ============================================================
+-- LATEST_BENCHMARKS — one run per line (no cross-run stitching)
+-- ============================================================
+--
+-- Previously the view did `distinct on (config_id, conc, isl, osl)` ordered by
+-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run
+-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it
+-- skipped fell back to an older run that did measure them, and a single chart line
+-- ended up stitched from points produced by different runs on different dates.
+--
+-- A line is one config + sequence + offload mode
+-- (config_id, benchmark_type, isl, osl, offload_mode) plotted
+-- across concurrencies, and it must come from a SINGLE workflow run. We pick the
+-- newest run per line (newest date, then latest sweep by run_started_at, then
+-- highest workflow_run_id so exactly one run wins even on a same-day / null tie),
+-- then keep EVERY concurrency that one run measured. A partial re-sweep therefore
+-- truncates the line to its own concurrencies rather than borrowing an older run's.
+
+drop materialized view if exists latest_benchmarks;
+
+create materialized view latest_benchmarks as
+with winners as (  -- exactly one winning workflow run per line key
+  select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+         br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+         br.workflow_run_id as winning_run_id
+  from benchmark_results br
+  join latest_workflow_runs wr on wr.id = br.workflow_run_id
+  where br.error is null
+  order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+           br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc
+)
+select br.*
+from benchmark_results br
+join winners w
+  on  w.config_id      = br.config_id
+  and w.benchmark_type = br.benchmark_type
+  and w.isl is not distinct from br.isl
+  and w.osl is not distinct from br.osl
+  and w.offload_mode is not distinct from br.offload_mode  -- null-safe, same as isl/osl
+  and w.winning_run_id = br.workflow_run_id
+where br.error is null;
+
+-- Unique key now includes benchmark_type (part of the line key). One run per line
+-- guarantees one row per concurrency, so this stays unique and keeps
+-- REFRESH MATERIALIZED VIEW CONCURRENTLY working.
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode)
+  nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/src/json-provider.line-single-run.test.ts b/packages/db/src/json-provider.line-single-run.test.ts
new file mode 100644
index 00000000..b75fa26a
--- /dev/null
+++ b/packages/db/src/json-provider.line-single-run.test.ts
@@ -0,0 +1,203 @@
+import { mkdtempSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+
+import type { getLatestBenchmarks as GetLatestBenchmarks } from './json-provider.js';
+
+/**
+ * A chart line is one config + sequence + offload mode
+ * (config_id, benchmark_type, isl, osl, offload_mode) plotted across concurrencies, and it must
+ * come from a SINGLE workflow run. getLatestBenchmarks picks the
+ * newest run per line (date, then run_started_at, then workflow_run_id) and returns EVERY
+ * concurrency that one run measured — never stitching skipped concurrencies from an older run.
+ *
+ * These fixtures exercise the multi-concurrency cases the as-of test can't (it is single-conc):
+ * a partial re-sweep that must truncate the line, per-sequence line independence, and the
+ * same-day workflow_run_id tiebreak.
+ */
+
+const cfg = (id: number) => ({
+  id,
+  hardware: 'h100',
+  framework: 'vllm',
+  model: 'testm',
+  precision: 'fp8',
+  spec_method: 'none',
+  disagg: false,
+  is_multinode: false,
+  prefill_tp: 1,
+  prefill_ep: 1,
+  prefill_dp_attention: false,
+  prefill_num_workers: 1,
+  decode_tp: 1,
+  decode_ep: 1,
+  decode_dp_attention: false,
+  decode_num_workers: 1,
+  num_prefill_gpu: 0,
+  num_decode_gpu: 8,
+});
+
+const run = (id: number, githubId: number, startedAt: string | null, date: string) => ({
+  id,
+  github_run_id: githubId,
+  run_attempt: 1,
+  name: `run ${githubId}`,
+  status: 'completed',
+  conclusion: 'success',
+  head_sha: 'sha',
+  head_branch: 'main',
+  html_url: `https://github.com/x/runs/${githubId}`,
+  created_at: startedAt ?? `${date}T00:00:00Z`,
+  run_started_at: startedAt,
+  date,
+});
+
+let nextResultId = 1000;
+const result = (
+  runDbId: number,
+  configId: number,
+  date: string,
+  conc: number,
+  tpot: number,
+  isl = 1024,
+  osl = 1024,
+  offloadMode = 'off',
+) => ({
+  id: nextResultId++,
+  workflow_run_id: runDbId,
+  config_id: configId,
+  benchmark_type: 'latency',
+  date,
+  isl,
+  osl,
+  conc,
+  offload_mode: offloadMode,
+  image: null,
+  metrics: { median_tpot: tpot },
+  error: null,
+  server_log_id: null,
+});
+
+const OLD = '2026-06-10';
+const NEW = '2026-06-14';
+let getLatestBenchmarks: typeof GetLatestBenchmarks;
+
+beforeAll(async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'infx-line-'));
+  writeFileSync(join(dir, 'configs.json'), JSON.stringify([cfg(1), cfg(2)]));
+  writeFileSync(
+    join(dir, 'workflow_runs.json'),
+    JSON.stringify([
+      run(10, 100, `${OLD}T04:00:00Z`, OLD), // run A: older full sweep
+      run(11, 101, `${NEW}T05:00:00Z`, NEW), // run B: newer partial re-sweep
+      run(20, 200, `${NEW}T07:00:00Z`, NEW), // run E: same-day, lower run id
+      run(21, 201, `${NEW}T07:00:00Z`, NEW), // run F: same-day, SAME timestamp, higher run id
+    ]),
+  );
+  writeFileSync(
+    join(dir, 'benchmark_results.json'),
+    JSON.stringify([
+      // config 1, seq (1024,1024): run A full sweep, run B partial re-sweep.
+      result(10, 1, OLD, 1, 0.1),
+      result(10, 1, OLD, 8, 0.18),
+      result(10, 1, OLD, 64, 0.5),
+      result(11, 1, NEW, 1, 0.09),
+      result(11, 1, NEW, 8, 0.16),
+      // config 1, seq (8192,1024): only run A measured it (run B skipped this sequence).
+      result(10, 1, OLD, 1, 0.2, 8192, 1024),
+      result(10, 1, OLD, 8, 0.3, 8192, 1024),
+      // Offload mode is an independent line dimension. A newer off-mode run must not hide
+      // the older on-mode line for the same config and sequence.
+      result(10, 1, OLD, 4, 0.25, 4096, 4096, 'on'),
+      result(11, 1, NEW, 4, 0.2, 4096, 4096, 'off'),
+      // config 2, seq (1024,1024): two same-day runs with identical run_started_at.
+      result(20, 2, NEW, 1, 0.5),
+      result(20, 2, NEW, 8, 0.6),
+      result(20, 2, NEW, 64, 0.7),
+      result(21, 2, NEW, 1, 0.4),
+      result(21, 2, NEW, 8, 0.45),
+    ]),
+  );
+  process.env.DUMP_DIR = dir;
+  const mod = await import('./json-provider.js');
+  getLatestBenchmarks = mod.getLatestBenchmarks;
+});
+
+afterAll(() => {
+  delete process.env.DUMP_DIR;
+});
+
+/** One line's points (conc + run url): rows matching isl/osl and configRunUrlRe, by conc. */
+function line(
+  rows: { isl: number | null; osl: number | null; conc: number; run_url: string | null }[],
+  configRunUrlRe: RegExp,
+  isl: number,
+  osl: number,
+) {
+  return rows
+    .filter((r) => r.isl === isl && r.osl === osl && r.run_url?.match(configRunUrlRe))
+    .toSorted((a, b) => a.conc - b.conc)
+    .map((r) => ({ conc: r.conc, runUrl: r.run_url }));
+}
+
+describe('getLatestBenchmarks — one run per line', () => {
+  it('truncates a line to the newest run: a partial re-sweep hides the older run’s extra concs', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false);
+    // config 1 / seq (1024,1024): run B (101) measured only conc 1 & 8. conc 64 from run A is gone.
+    const seq = line(rows, /runs\/(?:100|101)\//u, 1024, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/101/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/101/attempts/1' },
+    ]);
+    expect(seq.some((p) => p.conc === 64)).toBe(false);
+  });
+
+  it('keeps a different sequence of the same config on its own winning run', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false);
+    // seq (8192,1024) was only in run A; run B winning the other sequence must not erase it.
+    const seq = line(rows, /runs\/100\//u, 8192, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+
+  it('selects winning runs independently for each offload mode', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false).filter(
+      (r) => r.isl === 4096 && r.osl === 4096,
+    );
+
+    expect(
+      rows
+        .map((r) => ({ offloadMode: r.offload_mode, runUrl: r.run_url }))
+        .toSorted((a, b) => a.offloadMode.localeCompare(b.offloadMode)),
+    ).toEqual([
+      { offloadMode: 'off', runUrl: 'https://github.com/x/runs/101/attempts/1' },
+      { offloadMode: 'on', runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+
+  it('breaks a same-day, same-timestamp tie by workflow_run_id (higher id wins the whole line)', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false);
+    // config 2: run E (200, id 20) and run F (201, id 21) share run_started_at; F wins by id.
+    const seq = line(rows, /runs\/(?:200|201)\//u, 1024, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/201/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/201/attempts/1' },
+    ]);
+    // run E's extra conc 64 must not bleed into run F's line.
+    expect(seq.some((p) => p.conc === 64)).toBe(false);
+  });
+
+  it('as of the older run, shows that run’s full sweep (no truncation by a later run)', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false, '100');
+    const seq = line(rows, /runs\/100\//u, 1024, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+      { conc: 64, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+});
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index c23e5f48..4e548efe 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -72,6 +72,8 @@ interface RawBenchmarkResult {
   isl: number;
   osl: number;
   conc: number;
+  /** Added by the AgentX schema; older dumps omit it and are treated as off. */
+  offload_mode?: string;
   image: string | null;
   metrics: Record<string, number>;
   /** Added in migration 006; older dumps omit this field — surfaced as undefined. */
@@ -333,12 +335,11 @@ const STRIP_HISTORY_KEYS = new Set([
 ]);
 
 /**
- * Comparator for DISTINCT ON (config, conc, isl, osl) selection: latest calendar
- * day first, then — for sweeps on the same day — the latest workflow run first by
- * `run_started_at` (NULLS LAST). Mirrors the SQL date-filtered query and the
- * `latest_benchmarks` view (migration 003): a calendar day alone ties two same-day
- * sweeps, so without this an older run's points can shadow a same-day re-sweep.
- * `run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically.
+ * Run-recency comparator used to pick the newest run per line: latest calendar day first,
+ * then — for sweeps on the same day — the latest workflow run first by `run_started_at`
+ * (NULLS LAST). Mirrors the `br.date DESC, wr.run_started_at DESC NULLS LAST` portion of the
+ * SQL ORDER BY; callers apply a `workflow_run_id` DESC final tiebreak on top so exactly one
+ * run wins. `run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically.
  * Exported so the same-day tiebreak is unit-tested in parity with the SQL.
  */
 export function compareBenchmarkRecency(
@@ -355,6 +356,10 @@ export function compareBenchmarkRecency(
   return bStarted.localeCompare(aStarted);
 }
 
+/** Chart-line key: config, benchmark type, isl/osl, offload mode (missing → 'off'). */
+const lineKey = (br: RawBenchmarkResult): string =>
+  `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`;
+
 export function getLatestBenchmarks(
   modelKey: string | string[],
   date?: string,
@@ -390,27 +395,32 @@ export function getLatestBenchmarks(
     return true;
   });
 
-  // DISTINCT ON (config_id, conc, isl, osl) — keep the one with the latest date,
-  // tiebreaking same-day runs by run_started_at so the latest sweep wins.
-  const seen = new Map<string, RawBenchmarkResult>();
-  candidates.sort((a, b) =>
-    compareBenchmarkRecency(
+  // Single run per LINE (config_id, benchmark_type, isl, osl, offload_mode): pick the newest run that
+  // produced data for the line, then keep EVERY concurrency that one run measured. Sort by
+  // recency (date, then run_started_at) with a final workflow_run_id DESC tiebreak so exactly
+  // one run wins even when run_started_at is equal/null — matching the SQL ORDER BY.
+  candidates.sort((a, b) => {
+    const recency = compareBenchmarkRecency(
       toDateString(a.date),
       toDateString(b.date),
       s.latestRunsById.get(a.workflow_run_id)?.run_started_at ?? null,
       s.latestRunsById.get(b.workflow_run_id)?.run_started_at ?? null,
-    ),
-  );
+    );
+    return recency === 0 ? b.workflow_run_id - a.workflow_run_id : recency;
+  });
+  const winningRun = new Map<string, number>();
   for (const br of candidates) {
-    const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`;
-    if (!seen.has(key)) seen.set(key, br);
+    const key = lineKey(br);
+    if (!winningRun.has(key)) winningRun.set(key, br.workflow_run_id);
   }
 
-  return [...seen.values()].map((br) => {
-    const c = s.configs.get(br.config_id)!;
-    const wr = s.latestRunsById.get(br.workflow_run_id)!;
-    return toBenchmarkRow(br, c, wr);
-  });
+  return candidates
+    .filter((br) => winningRun.get(lineKey(br)) === br.workflow_run_id)
+    .map((br) => {
+      const c = s.configs.get(br.config_id)!;
+      const wr = s.latestRunsById.get(br.workflow_run_id)!;
+      return toBenchmarkRow(br, c, wr);
+    });
 }
 
 /** In-memory mirror of {@link import('./queries/benchmarks.js').getBenchmarksForRun}. */
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 6833756a..37301e2b 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -51,9 +51,14 @@ export interface BenchmarkRow {
 /**
  * Fetch the latest benchmark results for one or more model DB keys across ALL sequences,
  * up to a given date. Multiple keys support point-release grouping — e.g. passing
- * `['glm5', 'glm5.1']` unions both buckets under the one display. Returns the most recent
- * result per (config, concurrency, isl, osl) — so every GPU/framework + sequence combo
- * that has been benchmarked appears, with the newest data winning.
+ * `['glm5', 'glm5.1']` unions both buckets under the one display.
+ *
+ * Selection unit is the LINE, not the point: for each line
+ * `(config_id, benchmark_type, isl, osl, offload_mode)` we pick the single newest workflow run that
+ * produced data for it (newest date, then latest sweep, then highest run id) and return
+ * EVERY concurrency that one run measured — and nothing from any other run. A partial
+ * re-sweep therefore truncates the line to its own concurrencies rather than stitching the
+ * skipped ones from an older run. This guarantees a line never mixes runs/dates.
  *
  * The frontend filters by sequence client-side. This eliminates API round-trips when
  * switching sequences — the data is already cached by React Query.
@@ -74,13 +79,8 @@ export async function getLatestBenchmarks(
 ): Promise<BenchmarkRow[]> {
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   if (date) {
-    // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest)
-    // exact=true: only return data from this exact date (for GPU comparison)
-    // exact=false (default): return latest data as of this date (for main chart)
-    // Same-day tiebreak by wr.run_started_at (latest sweep wins), mirroring the
-    // latest_benchmarks view (migration 003). br.date is a calendar day, so two
-    // sweeps on the same day tie on date alone and Postgres would otherwise pick
-    // an arbitrary one — leaving an older run's points shadowing a same-day re-sweep.
+    // Date-filtered: use the base table (the view only has the absolute latest).
+    // exact=true: only this exact date (GPU comparison); exact=false (default): as of this date.
     const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`;
     // "As of run" filter (main chart only): keep results whose run started no later
     // than the selected run. run_started_at is an absolute timestamp, so this also
@@ -97,8 +97,29 @@ export async function getLatestBenchmarks(
             )
           )`
         : sql``;
+    // winners: the single newest run per LINE
+    // (config_id, benchmark_type, isl, osl, offload_mode) under the
+    // date/run cutoff. br.date is a calendar day, so two same-day sweeps tie on date — break
+    // by wr.run_started_at (latest sweep wins), then br.workflow_run_id so exactly one run wins
+    // even when run_started_at is equal/null. The outer join then pulls EVERY concurrency that
+    // winning run measured for the line, so the line is built from one run only (no carry-forward
+    // of concurrencies a partial re-sweep skipped).
     const rows = await sql`
-      SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+      WITH winners AS (
+        SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+          br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+          br.workflow_run_id AS winning_run_id
+        FROM benchmark_results br
+        JOIN configs c ON c.id = br.config_id
+        JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+        WHERE c.model = ANY(${modelKeys})
+          AND br.error IS NULL
+          AND ${dateFilter}
+          ${runFilter}
+        ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+                 br.date DESC, wr.run_started_at DESC NULLS LAST, br.workflow_run_id DESC
+      )
+      SELECT
         br.id,
         c.hardware,
         c.framework,
@@ -130,12 +151,15 @@ export async function getLatestBenchmarks(
       FROM benchmark_results br
       JOIN configs c ON c.id = br.config_id
       JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
-      WHERE c.model = ANY(${modelKeys})
-        AND br.error IS NULL
-        AND ${dateFilter}
-        ${runFilter}
-      ORDER BY br.config_id, br.conc, br.isl, br.osl,
-               br.date DESC, wr.run_started_at DESC NULLS LAST
+      JOIN winners w
+        ON w.config_id = br.config_id
+        AND w.benchmark_type = br.benchmark_type
+        AND w.isl IS NOT DISTINCT FROM br.isl
+        AND w.osl IS NOT DISTINCT FROM br.osl
+        AND w.offload_mode = br.offload_mode
+        AND w.winning_run_id = br.workflow_run_id
+      WHERE br.error IS NULL
+      ORDER BY br.config_id, br.conc, br.isl, br.osl
     `;
     return rows as unknown as BenchmarkRow[];
   }

From 2c3bb6dcaaff6c04ec56928cc08843b267c464bb Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 23:08:36 -0500
Subject: [PATCH 096/111] Default agentic charts to interactivity

---
 packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts          | 7 ++++---
 packages/app/src/components/inference/InferenceContext.tsx | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index 636a7ccf..df199b81 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -9,13 +9,14 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
     cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
   });
 
-  it('shows the x-axis mode buttons with Interactivity active by default', () => {
+  it('shows Interactivity by default for the agentic view', () => {
+    cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces');
     cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
     cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
     cy.get('[data-testid="x-axis-mode-interactivity"]')
       .should('be.visible')
       .and('have.attr', 'aria-selected', 'true');
-    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 
   it('switches the x-axis to TTFT and updates the heading', () => {
@@ -37,6 +38,6 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
       'aria-selected',
       'true',
     );
-    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 });
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 839afeed..ddb923b8 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -533,7 +533,7 @@ export function InferenceProvider({
 
   // Reconcile the x-axis mode with the scenario kind:
   //  - On mount with no `i_xmode` URL param: snap to the kind's natural default
-  //    (agentic → ttft, fixed → interactivity). The state itself was initialized
+  //    (interactivity for both agentic and fixed-sequence scenarios). The state was initialized
   //    to a SSR-stable constant so server and client render the same DOM; this
   //    effect fixes it up after hydration.
   //  - When the user later switches sequence kinds: snap to the new kind's
@@ -565,7 +565,7 @@ export function InferenceProvider({
       // — fall through to the default snap below.
       return;
     }
-    handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
+    handleSetXAxisMode('interactivity');
   }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
 
   // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or

From 28d007f28df8dfa3a1f826fd0f04876722f0e324 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 25 Jun 2026 15:55:08 -0500
Subject: [PATCH 097/111] feat(datasets): bracket grouping for parallel
 requests in flamegraph

Replace the per-row P# badges with a colored left-gutter bracket that
groups requests in the same main-agent or subagent scope whose original
execution intervals overlapped (ran in parallel). Non-transitive overlap
chains get their own side-by-side lanes; the gutter only renders when an
overlap group exists, so non-parallel traces have no extra whitespace.

Legend swatch and conversation-view copy updated to describe the bracket;
e2e assertions check data-overlap-group on bracket segments.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../e2e/datasets-flamegraph-time.cy.ts        |  56 ++-
 .../components/datasets/conversation-view.tsx |   4 +-
 .../components/datasets/trace-flamegraph.tsx  | 405 +++++++++++++++---
 3 files changed, 407 insertions(+), 58 deletions(-)

diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
index 672675a3..58d95c27 100644
--- a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
+++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
@@ -24,6 +24,7 @@ describe('Dataset conversation flamegraph timing', () => {
               kind: 'turn',
               turnIndex: 0,
               startS: 0,
+              endS: 1.2,
               model: 'model-a',
               in: 100,
               out: 10,
@@ -46,11 +47,34 @@ describe('Dataset conversation flamegraph timing', () => {
                   kind: 'turn',
                   turnIndex: 1,
                   startS: 3661.2,
+                  endS: 3668.2,
                   model: 'model-a',
-                  in: 800,
-                  out: 80,
-                  cached: 500,
-                  uncached: 300,
+                  in: 300,
+                  out: 30,
+                  cached: 150,
+                  uncached: 150,
+                },
+                {
+                  kind: 'turn',
+                  turnIndex: 2,
+                  startS: 3665.2,
+                  endS: 3671.2,
+                  model: 'model-a',
+                  in: 300,
+                  out: 30,
+                  cached: 200,
+                  uncached: 100,
+                },
+                {
+                  kind: 'turn',
+                  turnIndex: 3,
+                  startS: 3670.2,
+                  endS: 3675.2,
+                  model: 'model-a',
+                  in: 200,
+                  out: 20,
+                  cached: 150,
+                  uncached: 50,
                 },
               ],
             },
@@ -58,6 +82,7 @@ describe('Dataset conversation flamegraph timing', () => {
               kind: 'turn',
               turnIndex: 2,
               startS: 65.4,
+              endS: 67.4,
               model: 'model-a',
               in: 100,
               out: 10,
@@ -72,14 +97,31 @@ describe('Dataset conversation flamegraph timing', () => {
   });
 
   it('shows turn offsets and a collapsed subagent time range', () => {
-    cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00');
-    cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05');
+    cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00–00:01');
+    cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05–01:07');
     cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03');
     cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist');
   });
 
   it('shows subturn offsets when the subagent group is expanded', () => {
     cy.contains('button', 'Explore').click();
-    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01');
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01–1:01:08');
+    // Parallel groups render as left-gutter brackets; each member row carries
+    // one bracket segment per group it belongs to (non-transitive chains keep
+    // their own segments/lanes).
+    cy.get('[data-testid="flamegraph-overlap-g-1-c-0"]')
+      .should('have.length', 1)
+      .and('have.attr', 'data-overlap-group', 'subagent-1-1');
+    cy.get('[data-testid="flamegraph-overlap-g-1-c-1"]')
+      .should('have.length', 2)
+      .then(($segs) => {
+        expect([...$segs].map((seg) => seg.dataset.overlapGroup).toSorted()).to.deep.equal([
+          'subagent-1-1',
+          'subagent-1-2',
+        ]);
+      });
+    cy.get('[data-testid="flamegraph-overlap-g-1-c-2"]')
+      .should('have.length', 1)
+      .and('have.attr', 'data-overlap-group', 'subagent-1-2');
   });
 });
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index 57aaa0c3..ce10241a 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -88,7 +88,9 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
           One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
           click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
           plus generated output. Timestamps are elapsed from conversation start; subagent headers
-          show their full active range.
+          show their full active range. A colored bracket on the left groups requests in the same
+          main-agent or subagent scope whose original execution intervals overlapped (ran in
+          parallel).
         </p>
         <TraceFlamegraph
           structure={data.structure}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index d0bbb01f..1af65216 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -4,6 +4,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
 import { createPortal } from 'react-dom';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
 import { compact } from './format';
 
 // Stacked-bar segment colors. Cached prefix vs uncached input vs output —
@@ -20,6 +21,96 @@ const LEGEND = [
   { key: 'output', label: 'Output', color: SEG.output },
 ] as const;
 
+// Kept distinct from token-segment colors. A row can carry multiple rails when
+// it overlaps different requests during different parts of its lifetime.
+const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] as const;
+
+// Width (px) of one parallel-group bracket lane in the left gutter. Overlapping
+// groups (non-transitive chains) get their own lane so their brackets sit
+// side-by-side instead of stacking visually.
+const LANE_W = 14;
+
+export interface TimedRequest {
+  key: string;
+  startS?: number;
+  endS?: number;
+}
+
+export interface RequestOverlapGroup {
+  id: string;
+  requestKeys: string[];
+  startS: number;
+  endS: number;
+}
+
+/**
+ * Find maximal sets of requests that were simultaneously in flight. Requests
+ * without a finite, positive-length interval are ignored. Intervals are
+ * half-open, so one request ending exactly when another begins is serialized
+ * rather than parallel. Subset groups are dropped so a nested A/B pair never
+ * duplicates an A/B/C marker; A/B and B/C at different times stay separate.
+ */
+export function findRequestOverlapGroups(
+  requests: TimedRequest[],
+  scopeKey = 'scope',
+): RequestOverlapGroup[] {
+  const valid = requests.filter(
+    (request): request is TimedRequest & { startS: number; endS: number } =>
+      Number.isFinite(request.startS) &&
+      Number.isFinite(request.endS) &&
+      request.endS! > request.startS!,
+  );
+  const boundaries = [
+    ...new Set(valid.flatMap((request) => [request.startS, request.endS])),
+  ].toSorted((a, b) => a - b);
+  const candidates = new Map<string, Omit<RequestOverlapGroup, 'id'>>();
+
+  for (let i = 0; i < boundaries.length - 1; i++) {
+    const startS = boundaries[i]!;
+    const endS = boundaries[i + 1]!;
+    if (endS <= startS) continue;
+    const requestKeys = valid
+      .filter((request) => request.startS <= startS && request.endS >= endS)
+      .map((request) => request.key)
+      .toSorted();
+    if (requestKeys.length < 2) continue;
+    const key = requestKeys.join('\u0000');
+    const existing = candidates.get(key);
+    candidates.set(key, {
+      requestKeys,
+      startS: existing ? Math.min(existing.startS, startS) : startS,
+      endS: existing ? Math.max(existing.endS, endS) : endS,
+    });
+  }
+
+  const maximal = [...candidates.values()].filter(
+    (candidate, _, all) =>
+      !all.some(
+        (other) =>
+          other.requestKeys.length > candidate.requestKeys.length &&
+          candidate.requestKeys.every((key) => other.requestKeys.includes(key)),
+      ),
+  );
+
+  return maximal
+    .toSorted(
+      (a, b) =>
+        a.startS - b.startS ||
+        a.endS - b.endS ||
+        a.requestKeys.join(',').localeCompare(b.requestKeys.join(',')),
+    )
+    .map((group, index) => ({ ...group, id: `${scopeKey}-${index + 1}` }));
+}
+
+interface RowOverlap {
+  id: string;
+  label: string;
+  color: string;
+  startS: number;
+  endS: number;
+  peerCount: number;
+}
+
 interface VisibleRow {
   key: string;
   label: string;
@@ -33,6 +124,7 @@ interface VisibleRow {
   isGroup: boolean;
   isExpanded: boolean;
   groupIndex?: number;
+  overlaps: RowOverlap[];
 }
 
 /** Format seconds from conversation start as a compact elapsed timestamp. */
@@ -161,6 +253,42 @@ export function TraceFlamegraph({
   const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]);
   const collapseAll = useCallback(() => setExpanded(new Set()), []);
 
+  const overlapsByRow = useMemo(() => {
+    const mainGroups = findRequestOverlapGroups(
+      nodes.flatMap((node, i) =>
+        node.kind === 'turn' ? [{ key: `t-${i}`, startS: node.startS, endS: node.endS }] : [],
+      ),
+      'main',
+    );
+    const subagentGroups = nodes.flatMap((node, i) =>
+      node.kind === 'subagent'
+        ? findRequestOverlapGroups(
+            node.children.map((child, ci) => ({
+              key: `g-${i}-c-${ci}`,
+              startS: child.startS,
+              endS: child.endS,
+            })),
+            `subagent-${i}`,
+          )
+        : [],
+    );
+    const groups: RequestOverlapGroup[] = [...mainGroups, ...subagentGroups];
+
+    const byRow = new Map<string, RowOverlap[]>();
+    groups.forEach((group, groupIndex) => {
+      const overlap = {
+        id: group.id,
+        label: `P${groupIndex + 1}`,
+        color: OVERLAP_COLORS[groupIndex % OVERLAP_COLORS.length]!,
+        startS: group.startS,
+        endS: group.endS,
+        peerCount: group.requestKeys.length - 1,
+      };
+      group.requestKeys.forEach((key) => byRow.set(key, [...(byRow.get(key) ?? []), overlap]));
+    });
+    return byRow;
+  }, [nodes]);
+
   const rows = useMemo<VisibleRow[]>(() => {
     const out: VisibleRow[] = [];
     let turnNo = 0;
@@ -171,7 +299,7 @@ export function TraceFlamegraph({
           key: `t-${i}`,
           label: `Turn ${turnNo}`,
           sublabel: node.model ?? undefined,
-          timeLabel: timeLabel(node.startS),
+          timeLabel: timeLabel(node.startS, node.endS),
           cached: node.cached,
           uncached: node.uncached,
           output: node.out,
@@ -179,6 +307,7 @@ export function TraceFlamegraph({
           indent: 0,
           isGroup: false,
           isExpanded: false,
+          overlaps: overlapsByRow.get(`t-${i}`) ?? [],
         });
       } else {
         const isExpanded = expanded.has(i);
@@ -197,6 +326,7 @@ export function TraceFlamegraph({
           isGroup: true,
           isExpanded,
           groupIndex: i,
+          overlaps: [],
         });
         if (isExpanded) {
           node.children.forEach((child, ci) => {
@@ -204,7 +334,7 @@ export function TraceFlamegraph({
               key: `g-${i}-c-${ci}`,
               label: `↳ subturn ${ci + 1}`,
               sublabel: child.model ?? undefined,
-              timeLabel: timeLabel(child.startS),
+              timeLabel: timeLabel(child.startS, child.endS),
               cached: child.cached,
               uncached: child.uncached,
               output: child.out,
@@ -212,13 +342,14 @@ export function TraceFlamegraph({
               indent: 1,
               isGroup: false,
               isExpanded: false,
+              overlaps: overlapsByRow.get(`g-${i}-c-${ci}`) ?? [],
             });
           });
         }
       }
     });
     return out;
-  }, [nodes, expanded]);
+  }, [nodes, expanded, overlapsByRow]);
 
   // Two scales: leaf turns/subturns share a per-turn axis (the primary signal —
   // how cached/uncached evolves), while subagent group headers carry aggregates
@@ -234,6 +365,90 @@ export function TraceFlamegraph({
     [rows],
   );
 
+  // Geometry for the parallel-group brackets drawn in the left gutter. Each
+  // overlap group becomes a vertical bracket spanning from its first to its last
+  // visible member row, with a right-pointing tick on the exact member rows.
+  // Non-transitive chains (a row in two groups) get separate lanes so their
+  // brackets sit side by side. `through` = a row inside a group's span that is
+  // NOT itself a member (the aux-stream edge case) — drawn as a faint connector
+  // with no tick.
+  const braces = useMemo(() => {
+    interface Seg {
+      role: 'first' | 'middle' | 'last' | 'through';
+      isMember: boolean;
+      color: string;
+      groupId: string;
+      peerCount: number;
+      startS: number;
+      endS: number;
+    }
+    const groupMap = new Map<
+      string,
+      { id: string; color: string; peerCount: number; startS: number; endS: number; idxs: number[] }
+    >();
+    rows.forEach((r, idx) => {
+      for (const ov of r.overlaps) {
+        const g = groupMap.get(ov.id) ?? {
+          id: ov.id,
+          color: ov.color,
+          peerCount: ov.peerCount,
+          startS: ov.startS,
+          endS: ov.endS,
+          idxs: [],
+        };
+        g.idxs.push(idx);
+        groupMap.set(ov.id, g);
+      }
+    });
+    const groups = [...groupMap.values()]
+      .filter((g) => g.idxs.length >= 2) // need ≥2 visible members to bracket
+      .map((g) => ({
+        ...g,
+        min: Math.min(...g.idxs),
+        max: Math.max(...g.idxs),
+        members: new Set(g.idxs),
+      }))
+      .toSorted((a, b) => a.min - b.min || a.max - b.max);
+
+    // Greedy lane assignment: a group reuses a lane whose previous group ended
+    // before this one starts.
+    const laneEnd: number[] = [];
+    const laneOf = new Map<string, number>();
+    for (const g of groups) {
+      let lane = laneEnd.findIndex((end) => end < g.min);
+      if (lane === -1) {
+        lane = laneEnd.length;
+        laneEnd.push(g.max);
+      } else {
+        laneEnd[lane] = g.max;
+      }
+      laneOf.set(g.id, lane);
+    }
+    const laneCount = laneEnd.length;
+
+    const rowSegs: (Seg | null)[][] = rows.map(() =>
+      Array.from({ length: laneCount }, () => null as Seg | null),
+    );
+    for (const g of groups) {
+      const lane = laneOf.get(g.id)!;
+      for (let idx = g.min; idx <= g.max; idx++) {
+        const isMember = g.members.has(idx);
+        const role =
+          idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through';
+        rowSegs[idx]![lane] = {
+          role,
+          isMember,
+          color: g.color,
+          groupId: g.id,
+          peerCount: g.peerCount,
+          startS: g.startS,
+          endS: g.endS,
+        };
+      }
+    }
+    return { laneCount, rowSegs };
+  }, [rows]);
+
   const onMove = (e: React.MouseEvent, row: VisibleRow) => {
     setTooltip({ x: e.clientX, y: e.clientY, row });
   };
@@ -251,19 +466,32 @@ export function TraceFlamegraph({
               <span className="text-muted-foreground">{l.label}</span>
             </span>
           ))}
+          <span className="inline-flex items-center gap-1.5">
+            <span
+              className="inline-block h-4 w-2 rounded-l-sm border-y-2 border-l-2"
+              style={{ borderColor: OVERLAP_COLORS[0] }}
+            />
+            <span className="text-muted-foreground">Bracketed rows ran in parallel</span>
+          </span>
         </div>
         {groupIndexes.length > 0 && (
           <div className="flex items-center gap-1.5">
             <button
               type="button"
-              onClick={expandAll}
+              onClick={() => {
+                track('datasets_flamegraph_expand_all');
+                expandAll();
+              }}
               className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
             >
               Expand all
             </button>
             <button
               type="button"
-              onClick={collapseAll}
+              onClick={() => {
+                track('datasets_flamegraph_collapse_all');
+                collapseAll();
+              }}
               className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
             >
               Collapse all
@@ -276,8 +504,10 @@ export function TraceFlamegraph({
         ref={scrollRef}
         className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2"
       >
-        <div className="flex flex-col gap-0.5">
-          {rows.map((row) => {
+        {/* gap-0 so the per-row bracket segments connect into a continuous
+            vertical rail across the rows of a parallel group. */}
+        <div className="flex flex-col gap-0">
+          {rows.map((row, idx) => {
             // Group headers use the group axis; turns/subturns use the per-turn
             // axis. Clamp to the track width either way.
             const denom = row.isGroup ? maxGroupTotal : maxTotal;
@@ -286,64 +516,139 @@ export function TraceFlamegraph({
             const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
             const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
             const isHighlighted = row.key === highlightKey;
+            const segs = braces.rowSegs[idx]!;
             return (
               <div
                 key={row.key}
                 data-rowkey={row.key}
-                className={`flex items-center gap-2 rounded-sm transition-colors duration-700 ${
+                className={`flex items-stretch rounded-sm transition-colors duration-700 ${
                   isHighlighted ? 'bg-primary/20 ring-2 ring-primary' : 'ring-0'
                 }`}
-                style={{ paddingLeft: row.indent * 20 }}
               >
-                {/* label / group toggle */}
-                <div className="flex w-44 shrink-0 items-center gap-1 truncate">
-                  {row.isGroup ? (
-                    <button
-                      type="button"
-                      onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
-                      className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
-                    >
-                      <span className="inline-block w-3 text-muted-foreground">
-                        {row.isExpanded ? '▾' : '▸'}
-                      </span>
-                      <span className="truncate">{row.label}</span>
-                    </button>
-                  ) : (
-                    <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
-                  )}
-                </div>
+                {/* Parallel-group bracket gutter (only rendered when the
+                    conversation has any overlaps, so non-overlap traces keep a
+                    flush-left layout with no dead space). */}
+                {braces.laneCount > 0 && (
+                  <div
+                    className="flex shrink-0 self-stretch"
+                    style={{ width: braces.laneCount * LANE_W }}
+                  >
+                    {segs.map((seg, lane) => {
+                      if (!seg) return <div key={lane} style={{ width: LANE_W }} />;
+                      const top = seg.role === 'first' ? '50%' : '0';
+                      const bottom = seg.role === 'last' ? '50%' : '0';
+                      return (
+                        <div
+                          key={lane}
+                          className="relative"
+                          style={{ width: LANE_W }}
+                          {...(seg.isMember
+                            ? {
+                                'data-testid': `flamegraph-overlap-${row.key}`,
+                                'data-overlap-group': seg.groupId,
+                              }
+                            : {})}
+                          title={
+                            seg.isMember
+                              ? `Ran in parallel with ${seg.peerCount} other request${
+                                  seg.peerCount === 1 ? '' : 's'
+                                } (+${formatElapsedTime(seg.startS)}–${formatElapsedTime(seg.endS)})`
+                              : undefined
+                          }
+                        >
+                          {/* vertical rail */}
+                          <div
+                            className="absolute"
+                            style={{
+                              left: 5,
+                              width: 2,
+                              top,
+                              bottom,
+                              backgroundColor: seg.color,
+                              opacity: seg.isMember ? 0.95 : 0.3,
+                              borderTopLeftRadius: seg.role === 'first' ? 3 : 0,
+                              borderBottomLeftRadius: seg.role === 'last' ? 3 : 0,
+                            }}
+                          />
+                          {/* right-pointing tick marking an actual member row */}
+                          {seg.isMember && (
+                            <div
+                              className="absolute"
+                              style={{
+                                left: 5,
+                                top: '50%',
+                                height: 2,
+                                width: LANE_W - 7,
+                                transform: 'translateY(-1px)',
+                                backgroundColor: seg.color,
+                              }}
+                            />
+                          )}
+                        </div>
+                      );
+                    })}
+                  </div>
+                )}
 
-                {/* Offset from conversation start. Group rows span the full
-                    subagent lifetime; leaf rows show their start instant. */}
+                {/* row content (indented for subagent children) */}
                 <div
-                  className="w-36 shrink-0 text-[11px] tabular-nums text-muted-foreground"
-                  data-testid={`flamegraph-time-${row.key}`}
+                  className="flex flex-1 items-center gap-2 py-0.5"
+                  style={{ paddingLeft: row.indent * 20 }}
                 >
-                  {row.timeLabel ?? '—'}
-                </div>
+                  {/* label / group toggle */}
+                  <div className="flex w-52 shrink-0 items-center overflow-hidden">
+                    {row.isGroup ? (
+                      <button
+                        type="button"
+                        onClick={() => {
+                          track('datasets_flamegraph_group_toggled', {
+                            expanded: !row.isExpanded,
+                          });
+                          if (row.groupIndex !== undefined) toggle(row.groupIndex);
+                        }}
+                        className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+                      >
+                        <span className="inline-block w-3 text-muted-foreground">
+                          {row.isExpanded ? '▾' : '▸'}
+                        </span>
+                        <span className="truncate">{row.label}</span>
+                      </button>
+                    ) : (
+                      <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
+                    )}
+                  </div>
 
-                {/* stacked bar — group headers render as a slim muted summary
-                    strip so they read as aggregates, not individual turns. */}
-                <div
-                  className="relative flex h-5 flex-1 items-center"
-                  onMouseMove={(e) => onMove(e, row)}
-                  onMouseLeave={() => setTooltip(null)}
-                >
+                  {/* Original interval, measured from conversation start. */}
                   <div
-                    className={`flex overflow-hidden rounded-sm ${
-                      row.isGroup ? 'h-2.5 opacity-80' : 'h-5'
-                    }`}
-                    style={{ width: `${widthPct}%` }}
+                    className="w-36 shrink-0 text-[11px] tabular-nums text-muted-foreground"
+                    data-testid={`flamegraph-time-${row.key}`}
                   >
-                    <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
-                    <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
-                    <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                    {row.timeLabel ?? '—'}
                   </div>
-                </div>
 
-                {/* total */}
-                <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
-                  {compact(row.total)}
+                  {/* stacked bar — group headers render as a slim muted summary
+                      strip so they read as aggregates, not individual turns. */}
+                  <div
+                    className="relative flex h-5 flex-1 items-center"
+                    onMouseMove={(e) => onMove(e, row)}
+                    onMouseLeave={() => setTooltip(null)}
+                  >
+                    <div
+                      className={`flex overflow-hidden rounded-sm ${
+                        row.isGroup ? 'h-2.5 opacity-80' : 'h-5'
+                      }`}
+                      style={{ width: `${widthPct}%` }}
+                    >
+                      <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
+                      <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
+                      <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                    </div>
+                  </div>
+
+                  {/* total */}
+                  <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
+                    {compact(row.total)}
+                  </div>
                 </div>
               </div>
             );

From f7f82d40fda392c3b1dfa8ebe0de6227e2e5c6a4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 25 Jun 2026 16:04:21 -0500
Subject: [PATCH 098/111] fix(datasets): bound flamegraph bracket gutter for
 high-parallelism traces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A pathological conversation (1621 turns, a subagent fanning out into 622
children with 17-way concurrency) produced 49 bracket lanes — a 686px
gutter that pushed the bars off-screen, plus one DOM node per lane per
row (~110k empty divs, 157k total nodes on Expand all).

Cap displayed lanes at MAX_LANES (6): overflow groups fold into the last
"dense" lane, so every parallel row still carries a marker but the gutter
width stays bounded. Render the gutter sparsely (only lanes a row touches,
absolutely positioned) instead of a dense lane-per-row matrix. A subtle
note surfaces when lanes are capped so the fold isn't silent.

Outlier now: gutter 686px -> 84px, DOM on Expand all 157k -> 35k nodes.
Normal multi-lane traces are unchanged (<=6 lanes hit the identity path).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../components/datasets/trace-flamegraph.tsx  | 61 ++++++++++++++-----
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 1af65216..158c03c3 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -30,6 +30,15 @@ const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] a
 // side-by-side instead of stacking visually.
 const LANE_W = 14;
 
+// Cap on simultaneously-drawn bracket lanes. A pathological conversation (e.g. a
+// long-running session whose subagent fans out into hundreds of children with
+// 15+ concurrent requests) can require dozens of lanes; left unbounded the
+// gutter grows wide enough to push the bars off-screen AND emits one DOM node
+// per lane per row (tens of thousands of empty divs). We bound it: lanes beyond
+// the cap fold into the last "dense" lane, which stays readable for the common
+// case (≤6 concurrent) and degrades gracefully for the outliers.
+const MAX_LANES = 6;
+
 export interface TimedRequest {
   key: string;
   startS?: number;
@@ -424,18 +433,25 @@ export function TraceFlamegraph({
       }
       laneOf.set(g.id, lane);
     }
-    const laneCount = laneEnd.length;
-
-    const rowSegs: (Seg | null)[][] = rows.map(() =>
-      Array.from({ length: laneCount }, () => null as Seg | null),
-    );
+    const rawLaneCount = laneEnd.length;
+    // Bound the gutter (see MAX_LANES). Lanes past the cap collapse onto the last
+    // visible lane, so every parallel row still carries a marker but the gutter
+    // width and DOM-node count stay bounded regardless of how parallel the
+    // conversation is.
+    const laneCount = Math.min(rawLaneCount, MAX_LANES);
+    const displayLane = (lane: number) => Math.min(lane, laneCount - 1);
+
+    // Sparse per-row segments: only lanes that actually carry a bracket on a row
+    // are stored (and later rendered). The previous dense matrix emitted one DOM
+    // node per lane per row — catastrophic at 49 lanes × 2k rows.
+    const rowSegs: { lane: number; seg: Seg }[][] = rows.map(() => []);
     for (const g of groups) {
-      const lane = laneOf.get(g.id)!;
+      const lane = displayLane(laneOf.get(g.id)!);
       for (let idx = g.min; idx <= g.max; idx++) {
         const isMember = g.members.has(idx);
         const role =
           idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through';
-        rowSegs[idx]![lane] = {
+        const seg: Seg = {
           role,
           isMember,
           color: g.color,
@@ -444,9 +460,15 @@ export function TraceFlamegraph({
           startS: g.startS,
           endS: g.endS,
         };
+        const cell = rowSegs[idx]!;
+        const existing = cell.find((c) => c.lane === lane);
+        // Collisions only happen in the folded overflow lane. Prefer a real
+        // member marker over a faint pass-through connector.
+        if (!existing) cell.push({ lane, seg });
+        else if (seg.isMember && !existing.seg.isMember) existing.seg = seg;
       }
     }
-    return { laneCount, rowSegs };
+    return { laneCount, overflowLanes: rawLaneCount - laneCount, rowSegs };
   }, [rows]);
 
   const onMove = (e: React.MouseEvent, row: VisibleRow) => {
@@ -500,6 +522,14 @@ export function TraceFlamegraph({
         )}
       </div>
 
+      {braces.overflowLanes > 0 && (
+        <p className="mb-2 text-[11px] text-muted-foreground">
+          Dense parallel region — bracket lanes capped at {MAX_LANES}; {braces.overflowLanes}{' '}
+          further overlapping {braces.overflowLanes === 1 ? 'group is' : 'groups are'} folded into
+          the last lane.
+        </p>
+      )}
+
       <div
         ref={scrollRef}
         className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2"
@@ -527,21 +557,22 @@ export function TraceFlamegraph({
               >
                 {/* Parallel-group bracket gutter (only rendered when the
                     conversation has any overlaps, so non-overlap traces keep a
-                    flush-left layout with no dead space). */}
+                    flush-left layout with no dead space). Segments are sparse and
+                    absolutely positioned per lane so a row only pays for the
+                    lanes it actually touches. */}
                 {braces.laneCount > 0 && (
                   <div
-                    className="flex shrink-0 self-stretch"
+                    className="relative shrink-0 self-stretch"
                     style={{ width: braces.laneCount * LANE_W }}
                   >
-                    {segs.map((seg, lane) => {
-                      if (!seg) return <div key={lane} style={{ width: LANE_W }} />;
+                    {segs.map(({ lane, seg }) => {
                       const top = seg.role === 'first' ? '50%' : '0';
                       const bottom = seg.role === 'last' ? '50%' : '0';
                       return (
                         <div
-                          key={lane}
-                          className="relative"
-                          style={{ width: LANE_W }}
+                          key={`${lane}-${seg.groupId}`}
+                          className="absolute top-0 bottom-0"
+                          style={{ left: lane * LANE_W, width: LANE_W }}
                           {...(seg.isMember
                             ? {
                                 'data-testid': `flamegraph-overlap-${row.key}`,

From 95d7f0110102bbb763b61dc64942b0a9b1ae60c8 Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Fri, 26 Jun 2026 04:13:55 +0000
Subject: [PATCH 099/111] fix(db): add endS to TurnNode so flamegraph timing
 typechecks

Co-authored-by: Alec Ibarra <adibarra@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 packages/db/src/etl/weka-structure.ts | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index ac7a6eab..26cc8da1 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -50,6 +50,8 @@ export interface TurnNode {
   turnIndex: number;
   /** Seconds from the start of the conversation. */
   startS?: number;
+  /** Seconds from the start of the conversation (startS + api_time). */
+  endS?: number;
   model?: string;
   in: number;
   out: number;
@@ -140,6 +142,13 @@ function finiteTime(value: number | undefined): number | undefined {
   return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined;
 }
 
+/** End of a turn = its start plus the request's api_time (seconds). */
+function turnEndS(req: RawWekaRequest): number | undefined {
+  const startS = finiteTime(req.t);
+  if (startS === undefined) return undefined;
+  return startS + (finiteTime(req.api_time) ?? 0);
+}
+
 function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } {
   const children = entry.requests ?? [];
   const childStarts = children
@@ -153,11 +162,7 @@ function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: nu
   }
 
   const childEnds = children
-    .map((child) => {
-      const childStart = finiteTime(child.t);
-      if (childStart === undefined) return undefined;
-      return childStart + (finiteTime(child.api_time) ?? 0);
-    })
+    .map((child) => turnEndS(child))
     .filter((value): value is number => value !== undefined);
   return {
     startS,
@@ -202,6 +207,7 @@ export function buildConversationStructure(
           kind: 'turn',
           turnIndex: turnIndex++,
           startS: finiteTime(inner.t),
+          endS: turnEndS(inner),
           model: inner.model,
           in: split.in,
           out,
@@ -238,6 +244,7 @@ export function buildConversationStructure(
         kind: 'turn',
         turnIndex: turnIndex++,
         startS: finiteTime(entry.t),
+        endS: turnEndS(entry),
         model: entry.model,
         in: split.in,
         out,

From e3a6d41d92349ac824196aae503ec4ed02d0e21e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 26 Jun 2026 01:27:30 -0500
Subject: [PATCH 100/111] fix(agentic): enforce slow-tail interactivity (intvty
 = 1/itl) end-to-end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Agentic artifacts ship *_intvty under two harness definitions: slow-tail
1/p(ITL) (what the charts assume) vs fast-tail p(1/ITL), which inverts
percentile order (p90 lands at ~1/p10(ITL)). Ingest stored the artifact
value verbatim and the frontend only filled intvty when missing, so newer
"timing fix" runs landed with the wrong definition — e.g. p90 reading 23.9
instead of 11.2 — silently contaminating cross-run Pareto comparisons.

Enforce the invariant in every path:
- ingest mapper: derive agentic mean/median/p75/p90/p95/p99 *_intvty from
  *_itl, discarding the artifact value (self-correcting ingest).
- frontend agenticAliases: always derive intvty = 1/itl (override, not
  fill-if-missing) so overlay / ?unofficialrun= rows match.
- backfill-agentic-intvty script: one-time fix for stored rows (already run
  against the DB: 164 rows / 656 values rewritten, 0 contaminated after).
- ingest agent doc: note the invariant + the backfill escape hatch.

std_intvty is intentionally left alone (reciprocal of a std is meaningless;
the API strips it). Unit tests added on both the mapper and the transform.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .claude/agents/ingest.md                      |   3 +
 .../app/src/lib/benchmark-transform.test.ts   |  32 ++++++
 packages/app/src/lib/benchmark-transform.ts   |  15 ++-
 packages/db/package.json                      |   2 +
 packages/db/src/backfill-agentic-intvty.ts    | 107 ++++++++++++++++++
 packages/db/src/etl/benchmark-mapper.test.ts  |  44 +++++++
 packages/db/src/etl/benchmark-mapper.ts       |  19 ++++
 7 files changed, 217 insertions(+), 5 deletions(-)
 create mode 100644 packages/db/src/backfill-agentic-intvty.ts

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
index 4ecbc1dd..59045378 100644
--- a/.claude/agents/ingest.md
+++ b/.claude/agents/ingest.md
@@ -157,6 +157,7 @@ If user doesn't specify a description, ask for one OR derive from the run name.
 - **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the `_<runner>_<attempt>` suffix.
 - **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection.
 - **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with prior full sweep via `UPDATE benchmark_results.date = '<full-sweep-date>'` so the frontend's max-date-per-group dedup doesn't drop the older sweep.
+- **Agentic interactivity normalization (`*_intvty`)**: for `agentic_traces` runs, interactivity MUST be the slow-tail reciprocal of the ITL percentile — `*_intvty = 1/*_itl` (so `p90_intvty = 1/p90_itl`). Some harness versions emit `*_intvty` as `p(1/ITL)` instead (fast-tail — inverts percentile order, e.g. p90 shows ~`1/p10(ITL)`), which silently contaminates cross-run Pareto comparisons. The ingest mapper (`benchmark-mapper.ts`) now **derives `*_intvty` from `*_itl` and discards the artifact's value** for agentic rows, so a normal ingest is self-correcting — no manual step needed. The frontend `agenticAliases` does the same for overlay / `?unofficialrun=` rows. If you ever load agentic data through a path that bypasses the mapper, run `pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes` (idempotent; rewrites `mean/p75/p90/p95 _intvty = 1/_itl`) then refresh the MV + purge cache. `std_intvty` is intentionally left alone (the reciprocal of a std is meaningless; the API strips it anyway).
 
 ## Process
 
@@ -180,6 +181,8 @@ cd packages/db && DATABASE_WRITE_URL='<direct write url>' \
 
 It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
 
+New agentic benchmark artifacts preserve AIPerf's `metadata.dataset` provenance as a top-level `dataset` object. Standard benchmark ingest automatically derives the dataset slug from `dataset.hf_dataset_name` and upserts `run_datasets`; do not manually backfill that mapping for new-format runs. Manual mapping is only needed for legacy artifacts that do not contain dataset provenance.
+
 ## Don't
 
 - Don't push to git unless the user asked.
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 88fb6a8b..648ebaae 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -854,3 +854,35 @@ describe('mergeRunScopedRows', () => {
     expect(mergeRunScopedRows([], baseRows)).toBe(baseRows);
   });
 });
+
+describe('rowToAggDataEntry — agentic interactivity invariant', () => {
+  // Agentic artifacts have shipped *_intvty under two definitions across harness
+  // versions (slow-tail 1/p(ITL) vs fast-tail p(1/ITL)). The chart's
+  // interactivity selector is slow-tail, so we always derive intvty = 1/itl and
+  // discard the artifact value. Mirrors the ingest mapper + backfill.
+  const agentic = (metrics: Record<string, number>) =>
+    rowToAggDataEntry(makeRow({ benchmark_type: 'agentic_traces', isl: null, osl: null, metrics }));
+
+  it('overrides an artifact-supplied (fast-tail) *_intvty with 1/*_itl', () => {
+    const entry = agentic({
+      p90_itl: 0.0893, // slow-tail 1/itl ≈ 11.198
+      p90_intvty: 23.91, // fast-tail contamination — must be discarded
+      p75_itl: 0.0692,
+      p75_intvty: 19, // must be discarded
+    });
+    expect(entry.p90_intvty).toBeCloseTo(1 / 0.0893, 6);
+    expect(entry.p75_intvty).toBeCloseTo(1 / 0.0692, 6);
+    expect(entry.p90_intvty).not.toBeCloseTo(23.91, 1);
+  });
+
+  it('derives intvty from itl when the artifact omits intvty entirely', () => {
+    const entry = agentic({ p90_itl: 0.1, p95_itl: 0.2 });
+    expect(entry.p90_intvty).toBeCloseTo(10, 6);
+    expect(entry.p95_intvty).toBeCloseTo(5, 6);
+  });
+
+  it('does not invert interactivity for single_turn rows', () => {
+    const entry = rowToAggDataEntry(makeRow({ metrics: { p90_itl: 0.05, p90_intvty: 999 } }));
+    expect(entry.p90_intvty).toBe(999);
+  });
+});
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index a1c86776..cb8e3ceb 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -21,18 +21,23 @@ import type { BenchmarkRow } from '@/lib/api';
  *   e2el   ≡ ttlt   (time-to-last-token == end-to-end latency)
  *   tpot   ≡ itl    (time-per-output-token == inter-token-latency for single-output)
  *   intvty ≡ 1/itl  (tok/s from the user's perspective)
- * Existing fields win if present; we only fill in the gaps.
+ *
+ * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS derived from
+ * itl, overriding any artifact-supplied value: the harness definition of
+ * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile
+ * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. This
+ * matches the ingest mapper + backfill-agentic-intvty for official rows; doing it
+ * here keeps overlay / `?unofficialrun=` rows (transformed live from raw
+ * artifacts, never through the DB) on the same definition.
  */
 function agenticAliases(m: Record<string, number>): Record<string, number> {
   const out: Record<string, number> = {};
-  for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
+  for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
     const itl = m[`${suffix}_itl`];
     const ttlt = m[`${suffix}_ttlt`];
     if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
     if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl;
-    if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) {
-      out[`${suffix}_intvty`] = 1 / itl;
-    }
+    if (itl !== undefined && itl > 0) out[`${suffix}_intvty`] = 1 / itl;
   }
   return out;
 }
diff --git a/packages/db/package.json b/packages/db/package.json
index 8b97c2c3..17d6f627 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,8 +19,10 @@
     "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+    "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+    "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts",
     "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
diff --git a/packages/db/src/backfill-agentic-intvty.ts b/packages/db/src/backfill-agentic-intvty.ts
new file mode 100644
index 00000000..a8eebdba
--- /dev/null
+++ b/packages/db/src/backfill-agentic-intvty.ts
@@ -0,0 +1,107 @@
+/**
+ * Backfill: enforce the slow-tail interactivity invariant on agentic rows.
+ *
+ * Agentic trace-replay artifacts emit both `*_itl` and `*_intvty`. Historically
+ * the harness wrote `*_intvty = 1/p(ITL)` (slow-tail — "interactivity at the
+ * p-th latency"), which is what the inference chart's interactivity selector
+ * and the detail time-series both assume. A later "timing fix" harness started
+ * emitting `*_intvty = p(1/ITL)` instead (fast-tail — equivalent to
+ * `1/p(100-x)(ITL)`), because taking the reciprocal reverses percentile order.
+ * Ingest stores every metric verbatim, so those runs landed in the DB with the
+ * opposite definition — e.g. p90 reading 23.9 instead of 11.2 for the same
+ * point — contaminating cross-run Pareto comparisons.
+ *
+ * This rewrites `<k>_intvty = 1/<k>_itl` (k = mean, p75, p90, p95) on every
+ * agentic row that already stores both keys with `<k>_itl > 0`, so the stored
+ * value matches the slow-tail definition the charts use. It is idempotent: rows
+ * already correct are left untouched (relative-deviation guard). `std_intvty`
+ * is intentionally NOT touched — the reciprocal of a standard deviation is
+ * meaningless, and the API strips it. The prior fast-tail value is discarded on
+ * purpose (p10_itl isn't stored, so it isn't recoverable anyway, and per project
+ * policy fast-tail must not back a slow-tail selector).
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js';
+
+// Percentile-style keys whose interactivity is the reciprocal of the matching
+// ITL percentile. `std` is excluded by design (not a reciprocal); `median`/`p99`
+// are absent from agentic artifacts so they never appear here.
+const KEYS = ['mean', 'p75', 'p90', 'p95'] as const;
+
+// Relative tolerance: skip rows already within 1e-6 of 1/itl so correct rows
+// keep their original full-precision value and the change counts are accurate.
+const REL_TOL = 1e-6;
+
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+async function contaminationCounts(): Promise<Record<string, number>> {
+  const out: Record<string, number> = {};
+  for (const k of KEYS) {
+    const rows = await sql.unsafe(`
+      SELECT count(*)::int AS n
+      FROM benchmark_results
+      WHERE benchmark_type = 'agentic_traces'
+        AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
+        AND metrics ? '${k}_intvty'
+        AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
+            > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
+    `);
+    out[k] = (rows[0] as unknown as { n: number }).n;
+  }
+  return out;
+}
+
+async function main(): Promise<void> {
+  const total = await sql<{ n: number }[]>`
+    SELECT count(*)::int AS n FROM benchmark_results WHERE benchmark_type = 'agentic_traces'
+  `;
+  console.log(`Agentic rows: ${total[0]!.n}`);
+
+  const before = await contaminationCounts();
+  console.log('Contaminated (intvty != 1/itl) before:', JSON.stringify(before));
+  if (KEYS.every((k) => before[k] === 0)) {
+    console.log('Nothing to backfill — all agentic rows already satisfy intvty = 1/itl.');
+    await sql.end();
+    return;
+  }
+
+  if (!hasYesFlag() && !(await confirm('Rewrite *_intvty = 1/*_itl for these rows? (y/N) '))) {
+    await sql.end();
+    return;
+  }
+
+  let totalUpdated = 0;
+  for (const k of KEYS) {
+    // keys are from a fixed trusted const — safe to interpolate.
+    const res = await sql.unsafe(`
+      UPDATE benchmark_results
+      SET metrics = jsonb_set(metrics, '{${k}_intvty}', to_jsonb(1.0 / (metrics->>'${k}_itl')::numeric))
+      WHERE benchmark_type = 'agentic_traces'
+        AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0
+        AND metrics ? '${k}_intvty'
+        AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric)
+            > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric)
+    `);
+    console.log(`  ${k}_intvty: updated ${res.count} row(s)`);
+    totalUpdated += res.count;
+  }
+
+  const after = await contaminationCounts();
+  console.log('Contaminated after:', JSON.stringify(after));
+  if (!KEYS.every((k) => after[k] === 0)) {
+    throw new Error('Backfill incomplete — some rows still deviate. Aborting before MV refresh.');
+  }
+
+  await refreshLatestBenchmarks(sql);
+  console.log(`Done. Rewrote ${totalUpdated} metric value(s) across agentic rows.`);
+  await sql.end();
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts
index 65fb3e39..5fe9ffde 100644
--- a/packages/db/src/etl/benchmark-mapper.test.ts
+++ b/packages/db/src/etl/benchmark-mapper.test.ts
@@ -22,6 +22,20 @@ function makeV1Row(overrides: Record<string, any> = {}): Record<string, any> {
   };
 }
 
+/** Minimal valid agentic row: scenario_type triggers the agentic path; `users` → conc. */
+function makeAgenticRow(overrides: Record<string, any> = {}): Record<string, any> {
+  return {
+    infmax_model_prefix: 'dsv4',
+    hw: 'b200-nv',
+    framework: 'vllm',
+    precision: 'fp4',
+    scenario_type: 'agentic-coding',
+    users: 72,
+    tput_per_gpu: 20000,
+    ...overrides,
+  };
+}
+
 /** Minimal valid v2 benchmark row (disaggregated prefill/decode parallelism). */
 function makeV2Row(overrides: Record<string, any> = {}): Record<string, any> {
   return {
@@ -570,3 +584,33 @@ describe('extractWorkers', () => {
     expect(extractWorkers([null, 'bad', 0, undefined])).toBeUndefined();
   });
 });
+
+describe('mapBenchmarkRow — agentic interactivity normalization', () => {
+  it('derives *_intvty from 1/*_itl, discarding the artifact value', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeAgenticRow({
+        p90_itl: 0.0893,
+        p90_intvty: 23.91, // fast-tail contamination — must be overwritten
+        p75_itl: 0.0692,
+        p75_intvty: 19,
+      }),
+      tracker,
+    );
+    expect(result!.benchmarkType).toBe('agentic_traces');
+    expect(result!.metrics.p90_intvty).toBeCloseTo(1 / 0.0893, 6);
+    expect(result!.metrics.p75_intvty).toBeCloseTo(1 / 0.0692, 6);
+  });
+
+  it('derives *_intvty even when the artifact omits it', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0.1 }), tracker);
+    expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6);
+  });
+
+  it('does not touch *_intvty for single_turn rows', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker);
+    expect(result!.metrics.p90_intvty).toBe(999);
+  });
+});
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 258a5ecc..5ec3343c 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -63,6 +63,9 @@ const NON_METRIC_KEYS = new Set([
   'offload_mode',
   'num_requests_total',
   'num_requests_successful',
+  // Public-dataset provenance emitted by aiperf. The ingest runner uses this
+  // object to populate run_datasets; it is not a benchmark metric.
+  'dataset',
   // per-worker measured-power array (not a numeric scalar). Surfaced as a
   // sibling of the metrics JSONB by mapBenchmarkRow so the metrics column
   // stays Record<string, number> for the index signature on BenchmarkRow.
@@ -266,6 +269,22 @@ export function mapBenchmarkRow(
     (metrics as Record<string, unknown>).offload_mode = offloadModeRaw;
   }
 
+  // Slow-tail interactivity invariant. Agentic artifacts ship `*_intvty`, but the
+  // definition has drifted across harness versions: some emit `1/p(ITL)`
+  // (slow-tail), others `p(1/ITL)` — which inverts percentile order, so p90 comes
+  // out as ~1/p10(ITL) instead. The inference chart's interactivity selector and
+  // the detail time-series both treat interactivity as the reciprocal of the ITL
+  // percentile, so we derive it from `*_itl` here rather than trust the artifact,
+  // keeping every agentic row on one definition. `std` is excluded — the
+  // reciprocal of a standard deviation is meaningless. Mirrored in the frontend
+  // overlay path (agenticAliases) and the one-time backfill-agentic-intvty script.
+  if (isAgentic) {
+    for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
+      const itl = metrics[`${k}_itl`];
+      if (typeof itl === 'number' && itl > 0) metrics[`${k}_intvty`] = 1 / itl;
+    }
+  }
+
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
   const image = row.image ? String(row.image).replaceAll('#', '/') : null;
 

From 3ab43e6443a42aefdc21e505ad5673e018b9dc2c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 26 Jun 2026 17:51:17 -0500
Subject: [PATCH 101/111] feat(agentic): agentic-point detail, datasets, and
 trace-replay metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Frontend (packages/app): agentic-point detail page (server-metrics time
series, derived metrics, request timeline, histograms, aggregates),
datasets list/detail, and supporting hooks/charts/utilities.

Backing (packages/db, packages/constants): trace-replay ingest + ETL
(server-metrics adapters, trace-artifact discovery, dataset provenance,
chart-series/aggregate-stats compute), queries (derived-agentic-metrics,
trace-histograms, trace-server-metrics, request-timeline, datasets,
agentic-aggregates), migration 009, and shared agentic constants.

Committed together because the frontend API routes import the db query
functions — a frontend-only commit would not build.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/data-pipeline.md                         |  12 +
 .../kv-cache-hit-rate-anomaly.md              | 113 +++++
 .../e2e/agentic-point-time-series.cy.ts       | 220 ++++++++
 .../cypress/e2e/datasets-distributions.cy.ts  |  43 ++
 .../e2e/gpu-compare-agentic-detail.cy.ts      |  54 ++
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  |  47 ++
 .../api/v1/derived-agentic-metrics/route.ts   |   6 +-
 .../components/datasets/dataset-detail.tsx    |  34 +-
 .../src/components/datasets/dataset-list.tsx  |  10 +-
 .../app/src/components/datasets/format.ts     |   6 +
 .../datasets/trace-flamegraph.test.ts         |  41 +-
 .../components/inference/InferenceContext.tsx |  13 +-
 .../agentic-point/agentic-point-detail.tsx    | 475 +++++++++++++++---
 .../agentic-point/request-timeline.test.ts    | 101 ++++
 .../agentic-point/request-timeline.tsx        | 169 +++++--
 .../agentic-point/time-series-chart.test.ts   | 175 ++++++-
 .../agentic-point/time-series-chart.tsx       | 169 ++++++-
 .../inference/hooks/useChartData.ts           |   8 +-
 .../app/src/components/inference/types.ts     |  11 +-
 .../components/inference/ui/ChartDisplay.tsx  |  81 ++-
 .../src/components/inference/ui/GPUGraph.tsx  |  47 ++
 .../components/inference/ui/ScatterGraph.tsx  |   8 +-
 .../src/components/inference/utils.test.ts    |  21 +-
 .../app/src/components/inference/utils.ts     |  14 +
 .../inference/utils/tooltip-utils.test.ts     |  32 ++
 .../inference/utils/tooltipUtils.ts           |  34 +-
 packages/app/src/hooks/api/use-datasets.ts    |   6 +
 .../api/use-derived-agentic-metrics.test.ts   |  13 +
 .../hooks/api/use-derived-agentic-metrics.ts  |  26 +-
 .../src/hooks/api/use-trace-server-metrics.ts |  28 ++
 .../d3-chart/layers/scatter-points.test.ts    |  50 +-
 .../src/lib/d3-chart/layers/scatter-points.ts |  17 +-
 packages/constants/src/agentic.ts             |   2 +
 packages/constants/src/index.ts               |   1 +
 .../migrations/009_dataset_request_stats.sql  |  55 ++
 packages/db/src/backfill-aggregate-stats.ts   |  33 +-
 packages/db/src/backfill-chart-series.ts      |  27 +-
 packages/db/src/backfill-dataset-stats.ts     | 115 +++++
 .../src/etl/compute-aggregate-stats.test.ts   |  31 +-
 .../db/src/etl/compute-aggregate-stats.ts     |  27 +
 .../db/src/etl/compute-chart-series.test.ts   |  89 ++++
 packages/db/src/etl/compute-chart-series.ts   | 105 +++-
 .../db/src/etl/dataset-provenance.test.ts     |  40 ++
 packages/db/src/etl/dataset-provenance.ts     |  30 ++
 .../db/src/etl/server-metrics-adapters.ts     | 100 ++++
 .../src/etl/trace-artifact-discovery.test.ts  |  66 +++
 .../db/src/etl/trace-artifact-discovery.ts    |  89 ++++
 packages/db/src/etl/trace-replay-ingest.ts    |   6 +-
 packages/db/src/etl/weka-structure.test.ts    |  46 +-
 packages/db/src/etl/weka-structure.ts         |  51 +-
 packages/db/src/ingest-ci-run.ts              |  77 ++-
 packages/db/src/ingest-weka-dataset.ts        |  31 +-
 packages/db/src/queries/agentic-aggregates.ts |   4 +-
 packages/db/src/queries/datasets.ts           |   4 +
 .../queries/derived-agentic-metrics.test.ts   |  15 +
 .../db/src/queries/derived-agentic-metrics.ts |  40 +-
 .../db/src/queries/request-timeline.test.ts   |  45 ++
 packages/db/src/queries/request-timeline.ts   |  28 +-
 .../db/src/queries/trace-histograms.test.ts   |  78 +++
 packages/db/src/queries/trace-histograms.ts   |  67 ++-
 .../src/queries/trace-server-metrics.test.ts  | 104 ++++
 .../db/src/queries/trace-server-metrics.ts    |  36 +-
 62 files changed, 3231 insertions(+), 295 deletions(-)
 create mode 100644 docs/investigations/kv-cache-hit-rate-anomaly.md
 create mode 100644 packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.test.ts
 create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
 create mode 100644 packages/constants/src/agentic.ts
 create mode 100644 packages/db/migrations/009_dataset_request_stats.sql
 create mode 100644 packages/db/src/backfill-dataset-stats.ts
 create mode 100644 packages/db/src/etl/dataset-provenance.test.ts
 create mode 100644 packages/db/src/etl/dataset-provenance.ts
 create mode 100644 packages/db/src/etl/server-metrics-adapters.ts
 create mode 100644 packages/db/src/etl/trace-artifact-discovery.test.ts
 create mode 100644 packages/db/src/etl/trace-artifact-discovery.ts
 create mode 100644 packages/db/src/queries/request-timeline.test.ts
 create mode 100644 packages/db/src/queries/trace-histograms.test.ts
 create mode 100644 packages/db/src/queries/trace-server-metrics.test.ts

diff --git a/docs/data-pipeline.md b/docs/data-pipeline.md
index 38e7d471..bc439e8a 100644
--- a/docs/data-pipeline.md
+++ b/docs/data-pipeline.md
@@ -62,6 +62,18 @@ Configs are preloaded into an in-memory Map at ingest start. `getOrCreateConfig(
 
 Unmapped models/hardware are tracked (not silently dropped) so operators can see what new GPU or model names appeared in CI artifacts. This is how new GPUs get added to the system — the skip tracker acts as a change detection mechanism.
 
+### Server-Metric Orchestrator Adapters
+
+AIPerf defines the `server_metrics_export.json` envelope, but labels such as worker role and rank belong to the serving orchestrator. The chart-series ETL therefore normalizes raw series through an orchestrator-specific adapter before exposing per-worker metrics. For example, the Dynamo adapter maps `dynamo_component=prefill|backend` to canonical `prefill|decode` roles and uses the endpoint, worker ID, DP rank, and engine together as the source identity.
+
+Adapters are selected from the benchmark's canonical framework, and per-worker series are only emitted for disaggregated configs with a recognized adapter. Unknown orchestrators and non-disaggregated configs retain their aggregate-only series; roles are never guessed from ports or metric names. The frontend only consumes the canonical source identity and never interprets orchestrator-native labels.
+
+### Agentic Dataset Provenance
+
+AIPerf exports public-dataset provenance in `metadata.dataset`, including the Hugging Face dataset ID. InferenceX preserves that object as `dataset` on each agentic aggregate benchmark row. During benchmark ingest, `ingest-ci-run.ts` derives the dashboard slug from `hf_dataset_name` (for example, `semianalysisai/cc-traces-weka-062126` becomes `cc-traces-weka-062126`) and upserts `run_datasets` for the workflow run.
+
+Legacy artifacts without provenance leave any existing mapping untouched. A workflow run can map to only one dataset; conflicting dataset IDs fail ingest rather than silently linking the run to an arbitrary dataset.
+
 ## Frontend Transform Pipeline
 
 ### Why transformBenchmarkRows Exists
diff --git a/docs/investigations/kv-cache-hit-rate-anomaly.md b/docs/investigations/kv-cache-hit-rate-anomaly.md
new file mode 100644
index 00000000..61ffee42
--- /dev/null
+++ b/docs/investigations/kv-cache-hit-rate-anomaly.md
@@ -0,0 +1,113 @@
+# KV cache hit-rate anomaly on agentic benchmarks (dsv4, b200, vllm)
+
+## Core issue
+
+vLLM's prefix cache should be hitting at ~98% on multi-turn agentic conversation replay (each turn extends the prior turn's context). It isn't. Something in the **dataset definition** or **aiperf replay** is producing requests whose token streams aren't actually prefix-compatible turn-to-turn.
+
+| Concurrency | Theoretical max hit % | vLLM actual hit % |
+| ----------: | --------------------: | ----------------: |
+|           1 |                97.45% |            83.15% |
+|           2 |                98.34% |            46.78% |
+|           4 |                97.99% |            12.43% |
+
+This is **not** a capacity problem. KV cache is sized at 3.29M tokens (12,868 blocks × 256). The conc=4 workload's unique-content footprint is **~1.11M DSV4 tokens**, which would fit at ~34% utilization. Observed peak utilization is 49.8%, so the cache is holding more blocks than the workload needs, yet vLLM can't find them on lookup.
+
+## Data sources
+
+- **Benchmark points**:
+  - http://localhost:3002/inference/agentic/206252 (conc=1)
+  - http://localhost:3002/inference/agentic/206245 (conc=2)
+  - http://localhost:3002/inference/agentic/206247 (conc=4)
+- **Neon DB**: project `silent-pond-29172997`, branch `br-cold-sky-ai0c09cy` (agentx-dev). Connection via `DATABASE_WRITE_URL` in `.env`. Console: https://console.neon.tech/app/projects/silent-pond-29172997/branches/br-cold-sky-ai0c09cy
+  - `agentic_trace_replay.profile_export_jsonl_gz` — gzipped aiperf per-request records
+  - `agentic_trace_replay.server_metrics_json_gz` — gzipped vllm per-scrape prometheus metrics
+  - `agentic_trace_replay.request_timeline` (jsonb) — pre-computed per-request timeline used by the simulation
+- **Trace replay dataset** (the source-of-truth for "what should be cacheable"): https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-051926. Each row has pre-computed 64-token block `hash_ids` per turn; `hash_id_scope: 'local'` (per-conversation).
+
+## Theoretical max simulation
+
+For each replayed request, look up the matching turn in the HF dataset and walk a per-conversation trie of 64-token block hash IDs. Hits = longest contiguous prefix from block 0 that has appeared in any prior request (mirrors vLLM's chained-hash semantics).
+
+Confirms: the workload IS prefix-cacheable end-to-end. Theoretical max ≈ 98% across all three concurrency levels — same dataset, same conversations, just different dispatch order.
+
+## Why this points at the dataset/replay, not vLLM
+
+- **Capacity is not the bottleneck.** Cache holds ~3× the unique content of the workload. Cache util tops out below capacity.
+- **The metric isn't lying.** vLLM's own counters cross-check: `prefill_kv_computed_tokens + prefix_cache_hits ≈ request_prompt_tokens` (67.85M + 9.61M ≈ 77.47M for conc=4).
+- **It's not a tokenizer artifact.** DSV4 tokens are ~54% the count of Claude tokens, but BPE is left-monotonic on stable text — hit-rate ratio is invariant to tokenizer choice for prefix-growth workloads.
+- **It's not the multi-engine DP bug** we found earlier (commit `f2618f4`) — this deployment has 1 engine.
+
+What's left: the bytes that vLLM actually receives turn-to-turn are not the same prefix + delta that the dataset's `hash_ids` describe. Most likely culprits:
+
+1. **aiperf isn't sending the cumulative chat history** the way the dataset assumes — each turn is being assembled differently than the previous, breaking the byte-level prefix.
+2. **Something in the request payload varies per request** (timestamps, request IDs, tool result serialization order, etc.) — invalidates block 0's hash, cascades to every subsequent block via vLLM's chained hashing.
+3. **BPE re-merging across message boundaries** when aiperf re-tokenizes the full history each turn instead of appending tokens.
+
+## Root cause: `ConversationReconstructor` strips the prev user's `partial_tail` every turn
+
+The bug is in `utils/aiperf/src/aiperf/dataset/loader/weka_synth_buf.py` — specifically the **boundary case** in `truncate_synth_buf_at_block` (lines 453–464) combined with `turn_delta`'s reset logic (lines 354–360).
+
+What happens turn-to-turn:
+
+1. `init_turn_0` builds a trailing user segment whose `tokens` = `[block_aligned_tokens] + [partial_tail_tokens]` where `partial_tail_n = in_tokens % bs`. The wire prompt for turn 0 includes these tail tokens.
+2. `advance_turn` computes `lcp = longest_common_prefix(prev_hash_ids, curr_hash_ids)`. When the LCP equals the prev turn's total block count (the normal append-only case), `truncate_synth_buf_at_block` hits its boundary branch: `cursor + seg.block_count == target_blocks`.
+3. That branch **strips `prev_partial_tail` tokens off the trailing user segment in place** and re-decodes its `content`. This sets `_last_disturbance_at = i` (the index of the prev trailing user segment).
+4. New `assistant` + `user` segments are appended.
+5. `turn_delta` sees `_last_disturbance_at < _emitted_segment_count` and forces `reset_context=True`, re-emitting **the whole conversation** with the now-stripped trailing user.
+
+The endpoint (`utils/aiperf/src/aiperf/endpoints/base_endpoint.py:110-140`) honors `reset_context=True` via `messages = list(turn.raw_messages)` instead of `messages.extend(...)`.
+
+Result: every turn sends the full chat history, but the bytes of the prev user message differ from what was sent the turn before — the trailing `partial_tail` chars are missing. vLLM tokenizes the new prompt, hashes 256-token blocks, and the chained-hash invariant breaks at the first block containing the trimmed boundary. That block + every subsequent block of the new turn miss the cache.
+
+### Empirical confirmation
+
+Reproducer at `/tmp/test-reconstructor.py` instantiates `ConversationReconstructor` with mock decoders and walks a synthetic 3-turn conversation:
+
+```
+=== Turn 0 ===
+  delta msgs: 2, reset=False
+  wire len: 21683
+
+=== Turn 1 ===
+  delta msgs: 4, reset=True            ← every turn resets
+  wire len: 25307
+
+=== DIFF turn 0 vs turn 1 (wire-level) ===
+  common prefix chars: 21549 / wire0 21683 (99.4%)
+  wire0[...] = '... 983406 12 1 133 184 16 57 71 155 37 '     ← partial_tail decoded
+  wire1[...] = '... 983406<|im_end|>\n<|im_start|>assista'    ← stripped, template marker next
+  turn0 user content len: 19812, turn1 user[0] content len: 19711   ← 101 chars stripped
+```
+
+Across the conc=1 run (point 206252), **280/280 (100%)** consecutive turn-pairs have `prev_in_tokens % bs != 0` — i.e., every single turn hits this boundary disturbance.
+
+### Why the gap widens with concurrency
+
+At conc=1 the gap (97.45% − 83.15% ≈ 14.3pp) is roughly the fraction of each turn's blocks lost to the trimmed-tail invalidation (last user block + chat-template delta). At higher conc:
+
+- `reset_context=True` makes every request re-send the **entire** conversation prompt, so wire bandwidth + prefill work scale superlinearly per turn.
+- Concurrent conversations all do this simultaneously; each writes long sequences of "new" blocks past their respective divergence points, evicting other conversations' usable prefix blocks even though aggregate unique content (1.11M tokens) fits comfortably in the 3.29M-token cache.
+
+### Fix sketch
+
+The boundary-cut strip exists to keep the next turn's `assistant` segment block-aligned. Two viable fixes:
+
+1. **Don't mutate the prev trailing user segment.** Leave its `partial_tail` tokens intact; append the new asst+user as strict-append (no reset_context). The wire-prefix becomes byte-stable turn-to-turn. Cost: the new asst content's block_start no longer aligns to the prev_hash_ids tail, so hash_id accounting for asst blocks loses 1 block of fidelity per turn.
+2. **Track `partial_tail` separately** from the prev user segment so the segment's emitted content stays byte-stable, and only the trailing tail (which is regenerated each turn anyway) is allowed to vary.
+
+Option 1 is the minimal change. Validate with the reproducer above — remove the strip in `truncate_synth_buf_at_block`'s boundary case and re-run; turn N+1's wire prefix should equal turn N's wire byte-for-byte up to the end of the prev assistant template.
+
+## Re-running the simulation
+
+```bash
+# 1. dump request timelines from DB
+pnpm --filter @semianalysisai/inferencex-db exec dotenv -e ../../.env -- tsx /tmp/dump-rt-multi.ts
+
+# 2. run analysis (needs `pip3 install --break-system-packages --user datasets`)
+python3 /tmp/cache-sim-multi.py
+
+# 3. reproduce the partial_tail strip
+python3 /tmp/test-reconstructor.py
+```
+
+These scripts were left in `/tmp/` during the investigation session and may no longer exist; if they are missing, recreate them from the inline code in the previous version of this doc.
diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
index db59dda2..4a450f7c 100644
--- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -40,6 +40,26 @@ describe('Agentic point request metric time series', () => {
           timelineRequest(4, 1600, 80),
           timelineRequest(5, 3200, 160, { phase: 'warmup' }),
           timelineRequest(6, 6400, 320, { cancelled: true }),
+          timelineRequest(7, 0, 0, {
+            cid: 'conversation-1::sa:subagent_001_abcd',
+            credit: 1_100_000_000,
+            start: 1_100_000_000,
+            end: 1_900_000_000,
+            ttftMs: null,
+            tpotMs: null,
+            isl: null,
+            osl: null,
+          }),
+          timelineRequest(8, 0, 0, {
+            cid: 'conversation-1::sa:subagent_001_abcd:aux:011',
+            credit: 1_200_000_000,
+            start: 1_200_000_000,
+            end: 1_800_000_000,
+            ttftMs: null,
+            tpotMs: null,
+            isl: null,
+            osl: null,
+          }),
         ],
       },
     });
@@ -52,6 +72,7 @@ describe('Agentic point request metric time series', () => {
       cy.get('[data-testid="interactivity-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
         .should('have.text', 'P90');
+      cy.get('[data-testid="interactivity-point-count"]').should('have.text', '5 points');
       cy.get('svg circle').should('have.length', 5);
       cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
       cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT');
@@ -60,6 +81,7 @@ describe('Agentic point request metric time series', () => {
 
     cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
       cy.contains('h2', 'TTFT over time').should('be.visible');
+      cy.get('[data-testid="ttft-point-count"]').should('have.text', '5 points');
       cy.get('svg circle').should('have.length', 5);
       cy.get('svg').should('contain.text', 'TTFT (s)');
       cy.get('svg').should('contain.text', 'Cumulative P90 TTFT');
@@ -67,6 +89,34 @@ describe('Agentic point request metric time series', () => {
     });
   });
 
+  it('switches ISL and OSL cards from distributions to in-flight averages', () => {
+    cy.get('[data-testid="isl-metric-chart"]').within(() => {
+      cy.get('[data-testid="isl-metric-inflight"]').click();
+      cy.contains('h2', 'Average ISL in flight').should('be.visible');
+      cy.get('svg').should('contain.text', 'Average ISL in flight (30s avg)');
+    });
+    cy.get('[data-testid="osl-metric-chart"]').within(() => {
+      cy.get('[data-testid="osl-metric-inflight"]').click();
+      cy.contains('h2', 'Average OSL in flight').should('be.visible');
+      cy.contains('Retrospective: final observed OSL').should('be.visible');
+      cy.get('svg').should('contain.text', 'Average OSL in flight (30s avg)');
+    });
+  });
+
+  it('switches the TTFT chart to E2E request latency over time', () => {
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.get('[data-testid="latency-metric-e2e"]').click();
+      cy.contains('h2', 'E2E latency over time').should('be.visible');
+      cy.get('[data-testid="e2e-point-count"]').should('have.text', '7 points');
+      cy.get('svg circle').should('have.length', 7);
+      cy.get('svg').should('contain.text', 'E2E latency (s)');
+      cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency');
+
+      cy.get('[data-testid="latency-metric-ttft"]').click();
+      cy.contains('h2', 'TTFT over time').should('be.visible');
+    });
+  });
+
   it('switches each chart independently from P90 to P75', () => {
     cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
       cy.contains('svg', 'P90 (rolling 50 req)')
@@ -97,4 +147,174 @@ describe('Agentic point request metric time series', () => {
       cy.get('svg').should('contain.text', 'Cumulative P75 TTFT');
     });
   });
+
+  it('switches the request activity card from queue depth to cumulative completions', () => {
+    cy.get('[data-testid="request-activity-chart"]').within(() => {
+      cy.contains('h2', 'Request queue depth').should('be.visible');
+      cy.get('[data-testid="request-activity-completed"]').click();
+      cy.contains('h2', 'Cumulative completed requests').should('be.visible');
+      cy.get('svg').should('contain.text', 'Completed requests');
+      cy.get('svg').should('contain.text', 'Requests');
+      cy.get('[data-testid="request-activity-queue"]').click();
+      cy.contains('h2', 'Request queue depth').should('be.visible');
+    });
+  });
+
+  it('shows total time with no requests in flight on the request timeline', () => {
+    cy.get('[data-testid="detail-view-timeline"]').click();
+    cy.location('search').should('contain', 'view=timeline');
+    cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 1.00s (14.3%)');
+    cy.get('[data-timeline-row-kind="aux"]')
+      .should('have.css', 'padding-left', '24px')
+      .and('contain.text', 'aux 011 · parallel');
+  });
+
+  it('restores the request timeline view after browser Back from a dataset route', () => {
+    cy.window().then((win) => {
+      win.history.pushState({}, '', '/datasets/test-dataset/conversations/conversation-1');
+    });
+    cy.go('back');
+    cy.location('pathname').should('eq', '/inference/agentic/206885');
+    cy.location('search').should('contain', 'view=timeline');
+    cy.get('[data-testid="detail-view-timeline"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="timeline-total-idle-time"]').should('be.visible');
+  });
+
+  it('shows a cumulative average for unique input tokens in flight', () => {
+    cy.get('[data-testid="detail-view-point"]').click();
+    cy.get('[data-testid="unique-input-inflight-chart"]').within(() => {
+      cy.get('svg').should('contain.text', 'Cumulative average');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+  });
+});
+
+const pointMeta = {
+  id: 206885,
+  hardware: 'gb200',
+  framework: 'dynamo-vllm',
+  model: 'deepseek-r1-0528',
+  precision: 'fp8',
+  spec_method: 'none',
+  disagg: true,
+  conc: 128,
+  offload_mode: 'off',
+  isl: null,
+  osl: null,
+  benchmark_type: 'agentic_traces',
+  date: '2026-06-23',
+  run_url: null,
+  server_gpu_cache_hit_rate: 0.5,
+  server_cpu_cache_hit_rate: null,
+};
+
+const sourceSeries = (source: Record<string, unknown>, prompt: number, generation: number) => ({
+  source,
+  kvCacheUsage: [
+    { t: 0, value: 0.25 },
+    { t: 1, value: 0.5 },
+  ],
+  prefixCacheHitRate: [{ t: 0, value: 0.5 }],
+  queueDepth: [{ t: 0, running: 2, waiting: 1, total: 3 }],
+  promptTokensBySource: { miss: [{ t: 0, value: prompt }] },
+  promptTps: [{ t: 0, value: prompt }],
+  generationTps: [{ t: 0, value: generation }],
+  prefixCacheHitsTps: [{ t: 0, value: prompt / 2 }],
+  hostKvCacheUsage: [],
+  kvCacheUsageByEngine: [],
+});
+
+describe('Agentic point orchestrator metric sources', () => {
+  beforeEach(() => {
+    const prefill = sourceSeries(
+      {
+        id: 'dynamo|prefill|10.30.1.56:7500|prefill-a|0|0',
+        adapter: 'dynamo',
+        role: 'prefill',
+        endpointUrl: '10.30.1.56:7500',
+        nativeRole: 'prefill',
+        workerId: 'prefill-a',
+        dpRank: '0',
+        engine: '0',
+      },
+      100,
+      1,
+    );
+    const decode = sourceSeries(
+      {
+        id: 'dynamo|decode|10.30.1.206:7516|decode-a|0|0',
+        adapter: 'dynamo',
+        role: 'decode',
+        endpointUrl: '10.30.1.206:7516',
+        nativeRole: 'backend',
+        workerId: 'decode-a',
+        dpRank: '0',
+        engine: '0',
+      },
+      300,
+      400,
+    );
+    cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} });
+    cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/request-timeline*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/trace-server-metrics*', {
+      body: {
+        meta: pointMeta,
+        startNs: 0,
+        endNs: 2_000_000_000,
+        durationS: 2,
+        timeslicesCount: 2,
+        kvCacheUsage: prefill.kvCacheUsage,
+        prefixCacheHitRate: prefill.prefixCacheHitRate,
+        queueDepth: prefill.queueDepth,
+        promptTokensBySource: prefill.promptTokensBySource,
+        prefillTps: prefill.promptTps,
+        decodeTps: decode.generationTps,
+        prefixCacheHitsTps: prefill.prefixCacheHitsTps,
+        hostKvCacheUsage: [],
+        kvCacheUsageByEngine: [],
+        metricSources: [prefill, decode],
+      },
+    });
+    cy.visit('/inference/agentic/206885');
+  });
+
+  it('switches every server chart to an orchestrator-normalized worker', () => {
+    cy.get('[data-testid="metric-source-toolbar"]')
+      .should('have.css', 'position', 'sticky')
+      .and('have.css', 'top', '64px');
+    cy.get('[data-testid="metric-source-select"]').should('contain.text', 'All endpoints').click();
+    cy.contains('[role="option"]', 'Decode · decode-a').click();
+
+    cy.get('[data-testid="metric-source-select"]').should('contain.text', 'Decode · decode-a');
+    cy.contains('h2', 'Throughput · Decode · decode-a').should('be.visible');
+    cy.contains('svg', 'Decode (avg n=50)').should('be.visible');
+
+    cy.get('[data-testid="metric-source-select"]').click();
+    cy.contains('[role="option"]', 'Prefill · prefill-a').click();
+    cy.contains('h2', 'Throughput · Prefill · prefill-a').should('be.visible');
+  });
+
+  it('toggles input and decode independently while keeping one visible', () => {
+    cy.get('[data-testid="throughput-series-input"]')
+      .should('have.attr', 'aria-pressed', 'true')
+      .and('not.be.disabled');
+    cy.get('[data-testid="throughput-series-decode"]')
+      .should('have.attr', 'aria-pressed', 'true')
+      .and('not.be.disabled');
+    cy.contains('svg', 'Input (avg n=50)').should('be.visible');
+    cy.contains('svg', 'Decode (avg n=50)').should('be.visible');
+    cy.contains('svg', 'Total running avg (60s burn-in)').should('be.visible');
+
+    cy.get('[data-testid="throughput-series-input"]').click();
+    cy.get('[data-testid="throughput-series-input"]').should('have.attr', 'aria-pressed', 'false');
+    cy.get('[data-testid="throughput-series-decode"]').should('be.disabled');
+    cy.contains('svg', 'Input (avg n=50)').should('not.exist');
+    cy.contains('svg', 'Total running avg (60s burn-in)').should('not.exist');
+
+    cy.get('[data-testid="throughput-series-input"]').click();
+    cy.get('[data-testid="throughput-series-decode"]').click();
+    cy.get('[data-testid="throughput-series-input"]').should('be.disabled');
+    cy.get('[data-testid="throughput-series-decode"]').should('have.attr', 'aria-pressed', 'false');
+  });
 });
diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts
index 7edda341..6ce4bc34 100644
--- a/packages/app/cypress/e2e/datasets-distributions.cy.ts
+++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts
@@ -33,6 +33,10 @@ describe('Dataset distribution percentiles', () => {
           mainTurns: 20,
           subagentGroups: 0,
           subagentTurns: 0,
+          medianRequestsPerConversation: 12,
+          meanRequestsPerConversation: 14.6,
+          medianSubagentsPerTrace: 3,
+          meanSubagentsPerTrace: 4.8,
           cachedPct: 0.5,
           totalIn: 1000,
           totalOut: 200,
@@ -60,6 +64,20 @@ describe('Dataset distribution percentiles', () => {
             p95: 256,
             max: 512,
           }),
+          subagentInputTokensPerRequest: distribution({
+            median: 1000,
+            p75: 2000,
+            p90: 3000,
+            p95: 4000,
+            max: 5000,
+          }),
+          subagentOutputTokensPerRequest: distribution({
+            median: 100,
+            p75: 200,
+            p90: 300,
+            p95: 400,
+            max: 500,
+          }),
         },
         ingested_at: '2026-06-23T00:00:00Z',
       },
@@ -87,4 +105,29 @@ describe('Dataset distribution percentiles', () => {
       });
     }
   });
+
+  it('shows median and mean model requests per conversation', () => {
+    cy.contains('dt', 'Median requests / convo').next('dd').should('have.text', '12');
+    cy.contains('dt', 'Mean requests / convo').next('dd').should('have.text', '14.6');
+  });
+
+  it('summarizes subagents per trace instead of charting group counts', () => {
+    cy.contains('dt', 'Median subagents / trace').next('dd').should('have.text', '3');
+    cy.contains('dt', 'Mean subagents / trace').next('dd').should('have.text', '4.8');
+    cy.contains('Subagent groups per conversation').should('not.exist');
+  });
+
+  it('shows ISL and OSL distributions for inner subagent requests only', () => {
+    const expected = [
+      ['Subagent request ISL', ['p50 1.0k', 'p75 2.0k', 'p90 3.0k', 'p95 4.0k']],
+      ['Subagent request OSL', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+    ] as const;
+
+    for (const [title, percentiles] of expected) {
+      cy.contains('[data-slot="card"]', title).within(() => {
+        cy.contains('Inner subagent requests only').should('be.visible');
+        for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+      });
+    }
+  });
 });
diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
new file mode 100644
index 00000000..d574dd2a
--- /dev/null
+++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts
@@ -0,0 +1,54 @@
+describe('GPU comparison agentic point detail', () => {
+  it('exposes the per-point charts as a normal browser link', () => {
+    cy.intercept('GET', '/api/v1/trace-availability*', (request) => {
+      const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? [];
+      if (ids.length < 20) request.alias = 'gpuTraceAvailability';
+      request.continue();
+    });
+
+    cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', {
+      onBeforeLoad(win) {
+        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+      },
+    });
+
+    cy.get('[data-testid="gpu-multiselect"] [role="combobox"]').click({ force: true });
+    cy.get('[role="option"]').first().click();
+    cy.contains('button', 'Select date range').click();
+    cy.get('body').then(($body) => {
+      if ($body.text().includes('View anyway')) {
+        cy.contains('button', 'View anyway').click();
+      } else {
+        cy.contains('button', 'Max Range').click();
+        cy.contains('button', 'Apply').click();
+      }
+    });
+
+    cy.get('[data-testid="gpu-graph"]').first().should('be.visible');
+    cy.wait('@gpuTraceAvailability');
+    cy.wait(100);
+    cy.get('[data-testid="gpu-graph"]')
+      .first()
+      .find('svg .dot-group')
+      .should('have.length.greaterThan', 0)
+      .first()
+      .then(($point) => {
+        const point = $point[0] as unknown as SVGElement & {
+          __data__: { benchmark_type?: string; id?: number };
+        };
+        expect(point.__data__.benchmark_type).to.equal('agentic_traces');
+        expect(point.__data__.id).to.be.a('number');
+        cy.wrap($point).find('.visible-shape').click({ force: true });
+      });
+
+    cy.get('[data-chart-tooltip]:visible').should('have.length', 1);
+    cy.get('[data-chart-tooltip]:visible [data-action="view-charts"]')
+      .should('be.visible')
+      .then(($link) => {
+        expect($link).to.match('a');
+        expect($link).not.to.have.attr('target');
+        expect($link.attr('href')).to.match(/^\/inference\/agentic\/\d+$/u);
+      });
+    cy.location('pathname').should('eq', '/inference');
+  });
+});
diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index df199b81..924ff9a9 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -1,3 +1,23 @@
+const interceptDerivedMetrics = () => {
+  cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => {
+    const ids = new URL(request.url).searchParams.get('ids')?.split(',').filter(Boolean) ?? [];
+    request.reply({
+      body: Object.fromEntries(
+        ids.map((id, index) => [
+          id,
+          {
+            id: Number(id),
+            normalized_session_time_s: 60 + index,
+            p90_prefill_tps_per_user: 100 + index,
+            p75_normalized_e2e_400_s: 8 + index,
+            p90_normalized_e2e_400_s: 12 + index,
+          },
+        ]),
+      ),
+    });
+  }).as('derivedAgenticMetrics');
+};
+
 describe('X-Axis Mode Toggle (inference chart)', () => {
   before(() => {
     cy.visit('/inference', {
@@ -13,6 +33,7 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
     cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces');
     cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
     cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should('be.visible');
     cy.get('[data-testid="x-axis-mode-interactivity"]')
       .should('be.visible')
       .and('have.attr', 'aria-selected', 'true');
@@ -31,6 +52,32 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
     cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency');
   });
 
+  it('switches to request-level normalized E2E at 400 output tokens', () => {
+    interceptDerivedMetrics();
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click();
+    cy.wait('@derivedAgenticMetrics');
+    cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should(
+      'have.attr',
+      'aria-selected',
+      'true',
+    );
+    cy.get('[data-testid="chart-figure"] h2').should(
+      'contain.text',
+      'P90 Normalized E2E @ 400 output tokens',
+    );
+    cy.get('[data-testid="chart-figure"] svg').should(
+      'contain.text',
+      'P90 Normalized E2E @ 400 output tokens (s)',
+    );
+
+    cy.get('[data-testid="percentile-selector"]').click();
+    cy.contains('[role="option"]', 'p75').click();
+    cy.get('[data-testid="chart-figure"] h2').should(
+      'contain.text',
+      'P75 Normalized E2E @ 400 output tokens',
+    );
+  });
+
   it('switches back to Interactivity', () => {
     cy.get('[data-testid="x-axis-mode-interactivity"]').click();
     cy.get('[data-testid="x-axis-mode-interactivity"]').should(
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index 6ce7c017..6f7ab1ce 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -13,12 +13,12 @@ export const dynamic = 'force-dynamic';
 // blobOnly: the response is one entry per id with two numbers, but the
 // derivation work parses thousands of JSONL records per blob — cache the
 // computed result so a chart-refresh hits the warm path.
-// Bumped to v2 when mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user.
+// Bumped to v3 for per-request normalized-E2E @ 400 output tokens (absent from pre-v3 entries).
 // Stale v1 cache entries return undefined for the new field and silently
 // blank the chart with "No data available".
 const getCachedDerivedAgenticMetrics = cachedQuery(
   (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
-  'derived-agentic-metrics-v2',
+  'derived-agentic-metrics-v3',
   { blobOnly: true },
 );
 
@@ -33,6 +33,8 @@ const MAX_IDS_PER_REQUEST = 200;
  *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
  *  - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT)
  *    across every turn in every session.
+ *  - p75/p90_normalized_e2e_400_s: percentile of per-request
+ *    TTFT + 399 × observed ITL.
  *
  * Ids without a trace_replay blob or with unparseable records are omitted.
  */
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index ac8b2de5..573e2f6b 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -18,7 +18,7 @@ import {
   type ConversationSort,
 } from '@/hooks/api/use-datasets';
 import { track } from '@/lib/analytics';
-import { compact } from './format';
+import { compact, perConversation } from './format';
 
 const PAGE = 50;
 
@@ -97,11 +97,22 @@ export function DatasetDetail({ slug }: { slug: string }) {
 
       {/* summary stats */}
       <Card className="p-4">
-        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-6">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-4 lg:grid-cols-8">
           <Stat label="Conversations" value={dataset.conversation_count.toLocaleString()} />
+          <Stat
+            label="Median requests / convo"
+            value={perConversation(s.medianRequestsPerConversation)}
+          />
+          <Stat
+            label="Mean requests / convo"
+            value={perConversation(s.meanRequestsPerConversation)}
+          />
           <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
-          <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
-          <Stat label="Subagent turns" value={compact(s.subagentTurns ?? 0)} />
+          <Stat
+            label="Median subagents / trace"
+            value={perConversation(s.medianSubagentsPerTrace)}
+          />
+          <Stat label="Mean subagents / trace" value={perConversation(s.meanSubagentsPerTrace)} />
           <Stat
             label="Cached input"
             value={typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—'}
@@ -157,9 +168,18 @@ export function DatasetDetail({ slug }: { slug: string }) {
             distribution={cd.turnsPerConversation}
           />
           <DistributionCard
-            title="Subagent groups per conversation"
-            unit="groups"
-            distribution={cd.subagentGroupsPerConversation}
+            title="Subagent request ISL"
+            subtitle="Inner subagent requests only"
+            unit="tokens"
+            scale="log"
+            distribution={cd.subagentInputTokensPerRequest}
+          />
+          <DistributionCard
+            title="Subagent request OSL"
+            subtitle="Inner subagent requests only"
+            unit="tokens"
+            scale="log"
+            distribution={cd.subagentOutputTokensPerRequest}
           />
           <DistributionCard
             title="Cached fraction per turn"
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
index 84b279db..bda49311 100644
--- a/packages/app/src/components/datasets/dataset-list.tsx
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -5,7 +5,7 @@ import Link from 'next/link';
 import { Card } from '@/components/ui/card';
 import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
 import { track } from '@/lib/analytics';
-import { compact } from './format';
+import { compact, perConversation } from './format';
 
 function DatasetCard({ d }: { d: DatasetRecord }) {
   const s = d.summary ?? {};
@@ -28,6 +28,14 @@ function DatasetCard({ d }: { d: DatasetRecord }) {
         )}
         <dl className="grid grid-cols-2 gap-x-4 gap-y-1.5 text-xs">
           <Stat label="Conversations" value={d.conversation_count.toLocaleString()} />
+          <Stat
+            label="Median requests / convo"
+            value={perConversation(s.medianRequestsPerConversation)}
+          />
+          <Stat
+            label="Mean requests / convo"
+            value={perConversation(s.meanRequestsPerConversation)}
+          />
           <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
           <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
           <Stat label="Cached input" value={cachedPct} />
diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts
index f6f5530c..f42dceb6 100644
--- a/packages/app/src/components/datasets/format.ts
+++ b/packages/app/src/components/datasets/format.ts
@@ -10,3 +10,9 @@ export function compact(n: number): string {
   if (abs > 0 && abs < 1) return n.toFixed(2);
   return String(Math.round(n));
 }
+
+/** Format a per-conversation count to ≤1 decimal (keeps fractional means); '—' unless finite. */
+export function perConversation(n: number | undefined): string {
+  if (typeof n !== 'number' || !Number.isFinite(n)) return '—';
+  return n.toLocaleString(undefined, { maximumFractionDigits: 1 });
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts
index 00293c00..2ead726b 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.test.ts
+++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 
-import { formatElapsedTime } from './trace-flamegraph';
+import { findRequestOverlapGroups, formatElapsedTime } from './trace-flamegraph';
 
 describe('formatElapsedTime', () => {
   it('formats elapsed seconds below and above one hour', () => {
@@ -14,3 +14,42 @@ describe('formatElapsedTime', () => {
     expect(formatElapsedTime(-5)).toBe('00:00');
   });
 });
+
+describe('findRequestOverlapGroups', () => {
+  it('keeps non-transitive overlap chains as separate groups', () => {
+    const groups = findRequestOverlapGroups([
+      { key: 'A', startS: 1, endS: 8 },
+      { key: 'B', startS: 5, endS: 11 },
+      { key: 'C', startS: 9, endS: 15 },
+    ]);
+
+    expect(groups.map((group) => group.requestKeys)).toEqual([
+      ['A', 'B'],
+      ['B', 'C'],
+    ]);
+    expect(groups.map(({ startS, endS }) => [startS, endS])).toEqual([
+      [5, 8],
+      [9, 11],
+    ]);
+  });
+
+  it('does not consider touching or invalid intervals parallel', () => {
+    expect(
+      findRequestOverlapGroups([
+        { key: 'A', startS: 1, endS: 5 },
+        { key: 'B', startS: 5, endS: 8 },
+        { key: 'missing-end', startS: 3 },
+        { key: 'zero-duration', startS: 4, endS: 4 },
+      ]),
+    ).toEqual([]);
+  });
+
+  it('returns only the maximal simultaneous set for nested intervals', () => {
+    const groups = findRequestOverlapGroups([
+      { key: 'A', startS: 1, endS: 10 },
+      { key: 'B', startS: 2, endS: 8 },
+      { key: 'C', startS: 3, endS: 7 },
+    ]);
+    expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]);
+  });
+});
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index ddb923b8..3dddb5dd 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -159,10 +159,17 @@ export function InferenceProvider({
   // computing a kind-based default here would diverge between server and client
   // and cause a hydration mismatch. The scenario-kind default is applied in a
   // post-mount effect below (and a ref tracks whether the user has overridden).
-  type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  type XAxisMode =
+    | 'ttft'
+    | 'e2e'
+    | 'normalized-e2e'
+    | 'interactivity'
+    | 'session-time'
+    | 'prefill-tps';
   const VALID_X_MODES: XAxisMode[] = [
     'ttft',
     'e2e',
+    'normalized-e2e',
     'interactivity',
     'session-time',
     'prefill-tps',
@@ -544,7 +551,9 @@ export function InferenceProvider({
     const kind = sequenceKind(effectiveSequence);
     const isInitialMount = lastSeqKindRef.current === null;
     const isAgenticOnlyMode =
-      selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps';
+      selectedXAxisMode === 'normalized-e2e' ||
+      selectedXAxisMode === 'session-time' ||
+      selectedXAxisMode === 'prefill-tps';
     // On a stale render where kind hasn't changed, bail unless the current
     // mode is agentic-only and we just landed on a fixed-seq scenario — in
     // that case force the snap so the chart doesn't try to plot trace-derived
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index e1bc1524..77d87997 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -1,8 +1,8 @@
 'use client';
 
 import Link from 'next/link';
-import { useRouter } from 'next/navigation';
-import { useState } from 'react';
+import { usePathname, useRouter, useSearchParams } from 'next/navigation';
+import { useCallback, useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
@@ -10,12 +10,20 @@ import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-reques
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
+  type MetricSource,
   type PointMeta,
   type QueueDepthPoint,
   type TimeSeriesPoint,
 } from '@/hooks/api/use-trace-server-metrics';
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
 import { track } from '@/lib/analytics';
 
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
@@ -26,16 +34,20 @@ import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
   TimeSeriesChart,
-  cumulativeAverage,
+  averageSequenceLengthInFlight,
+  buildThroughputChartSeries,
+  cumulativeCompletedRequests,
   cumulativeDifferenceMonotonic,
+  cumulativeTimeAverage,
   cumulativeUniqueInputTokens,
   inflightUniqueTokens,
   rollingAverage,
   rollingRequestMetric,
-  sumSeries,
   timeRollingAverage,
+  toggleThroughputSeries,
   type RequestMetric,
   type RequestPercentile,
+  type ThroughputSeriesKey,
 } from './time-series-chart';
 
 interface Props {
@@ -112,17 +124,56 @@ const DP_RANK_PALETTE = [
 ];
 
 type DetailView = 'point' | 'timeline' | 'aggregates';
+type RequestActivityView = 'queue' | 'completed';
+type SequenceMetricView = 'distribution' | 'inflight';
 const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
   { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' },
   { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
 ];
 
+const isDetailView = (value: string | null): value is DetailView =>
+  value === 'point' || value === 'timeline' || value === 'aggregates';
+
 const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption<RequestPercentile>[] = [
   { value: 'p75', label: 'P75' },
   { value: 'p90', label: 'P90' },
 ];
 
+const LATENCY_METRIC_OPTIONS: SegmentedToggleOption<'ttft' | 'e2e'>[] = [
+  { value: 'ttft', label: 'TTFT', testId: 'latency-metric-ttft' },
+  { value: 'e2e', label: 'E2E', testId: 'latency-metric-e2e' },
+];
+
+const REQUEST_ACTIVITY_OPTIONS: SegmentedToggleOption<RequestActivityView>[] = [
+  { value: 'queue', label: 'Queue depth', testId: 'request-activity-queue' },
+  { value: 'completed', label: 'Completed', testId: 'request-activity-completed' },
+];
+
+const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption<SequenceMetricView>[] = [
+  { value: 'distribution', label: 'Distribution' },
+  { value: 'inflight', label: 'In-flight avg' },
+];
+
+const SOURCE_ROLE_LABEL: Record<MetricSource['role'], string> = {
+  router: 'Router',
+  prefill: 'Prefill',
+  decode: 'Decode',
+  combined: 'Combined',
+  unknown: 'Unknown',
+};
+
+export function metricSourceLabel(source: MetricSource): string {
+  const instance =
+    source.workerId ??
+    (source.dpRank ? `DP ${source.dpRank}` : null) ??
+    source.endpointUrl ??
+    (source.engine ? `engine ${source.engine}` : null);
+  return instance
+    ? `${SOURCE_ROLE_LABEL[source.role]} · ${instance}`
+    : SOURCE_ROLE_LABEL[source.role];
+}
+
 // Unofficial-run overlays cannot open this persisted point-detail route: they
 // have no benchmark_results id or stored request timeline. These charts are
 // therefore intentionally limited to DB-backed agentic points.
@@ -131,33 +182,68 @@ function RequestMetricOverTime({
   metric,
   timeline,
   isLoading,
+  latencySelector = false,
 }: {
   title: string;
   metric: RequestMetric;
   timeline: RequestTimeline | null | undefined;
   isLoading: boolean;
+  latencySelector?: boolean;
 }) {
   const [percentile, setPercentile] = useState<RequestPercentile>('p90');
-  const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null;
-  const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity';
-  const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4';
+  const [latencyMetric, setLatencyMetric] = useState<'ttft' | 'e2e'>('ttft');
+  const selectedMetric = latencySelector ? latencyMetric : metric;
+  const result = timeline
+    ? rollingRequestMetric(timeline.requests, selectedMetric, percentile, 50)
+    : null;
+  const metricLabel =
+    selectedMetric === 'ttft' ? 'TTFT' : selectedMetric === 'e2e' ? 'E2E latency' : 'Interactivity';
+  const color =
+    selectedMetric === 'ttft' ? '#f59e0b' : selectedMetric === 'e2e' ? '#a855f7' : '#06b6d4';
+  const pointCount = result?.raw.length;
+  const isLatency = selectedMetric !== 'interactivity';
 
   const controls = (
-    <SegmentedToggle
-      value={percentile}
-      options={REQUEST_PERCENTILE_OPTIONS}
-      onValueChange={(value) => {
-        setPercentile(value);
-        track('inference_agentic_percentile_changed', { metric, percentile: value });
-      }}
-      ariaLabel={`${metricLabel} percentile`}
-      testId={`${metric}-percentile-toggle`}
-    />
+    <div className="flex items-center gap-2">
+      {latencySelector && (
+        <SegmentedToggle
+          value={latencyMetric}
+          options={LATENCY_METRIC_OPTIONS}
+          onValueChange={(value) => {
+            setLatencyMetric(value);
+            track('inference_agentic_latency_metric_changed', { metric: value });
+          }}
+          ariaLabel="Latency metric"
+          testId="latency-metric-toggle"
+        />
+      )}
+      <span
+        className="text-xs tabular-nums text-muted-foreground"
+        data-testid={`${selectedMetric}-point-count`}
+      >
+        {pointCount === undefined
+          ? '— points'
+          : `${pointCount.toLocaleString()} ${pointCount === 1 ? 'point' : 'points'}`}
+      </span>
+      <SegmentedToggle
+        value={percentile}
+        options={REQUEST_PERCENTILE_OPTIONS}
+        onValueChange={(value) => {
+          setPercentile(value);
+          track('inference_agentic_percentile_changed', {
+            metric: selectedMetric,
+            percentile: value,
+          });
+        }}
+        ariaLabel={`${metricLabel} percentile`}
+        testId={`${selectedMetric}-percentile-toggle`}
+      />
+    </div>
   );
 
   return (
     <ExpandableChart
-      title={title}
+      title={latencySelector ? `${metricLabel} over time` : title}
       controls={controls}
       testId={`${metric}-over-time-chart`}
       render={(expanded) => {
@@ -174,10 +260,9 @@ function RequestMetricOverTime({
                 strokeWidth: 2.5,
               },
               {
-                name:
-                  metric === 'ttft'
-                    ? `Cumulative ${percentile.toUpperCase()} TTFT`
-                    : `1 / cumulative ${percentile.toUpperCase()} TPOT`,
+                name: isLatency
+                  ? `Cumulative ${percentile.toUpperCase()} ${metricLabel}`
+                  : `1 / cumulative ${percentile.toUpperCase()} TPOT`,
                 data: result?.cumulative ?? [],
                 color: '#ef4444',
                 strokeWidth: 3,
@@ -185,11 +270,11 @@ function RequestMetricOverTime({
             ]}
             durationS={timeline.durationS}
             yFmt={
-              metric === 'ttft'
+              isLatency
                 ? (value) => `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s`
                 : (value) => `${value.toFixed(0)}`
             }
-            yAxisLabel={metric === 'ttft' ? 'TTFT (s)' : 'Interactivity (tok/s/user)'}
+            yAxisLabel={isLatency ? `${metricLabel} (s)` : 'Interactivity (tok/s/user)'}
             {...size}
           />
         );
@@ -198,6 +283,79 @@ function RequestMetricOverTime({
   );
 }
 
+/** ISL/OSL card that flips between the length distribution and the in-flight average. */
+function SequenceMetricCard(props: {
+  metric: 'isl' | 'osl';
+  values: readonly number[] | undefined;
+  timeline: RequestTimeline | null | undefined;
+  histogramLoading: boolean;
+  timelineLoading: boolean;
+}) {
+  const { metric, values, timeline, histogramLoading, timelineLoading } = props;
+  const [view, setView] = useState<SequenceMetricView>('distribution');
+  const isInput = metric === 'isl';
+  const acronym = metric.toUpperCase();
+  const idPrefix = `${metric}-metric`;
+  const showingDistribution = view === 'distribution';
+  const heading = showingDistribution
+    ? `${isInput ? 'Input sequence length' : 'Output sequence length'} distribution`
+    : `Average ${acronym} in flight`;
+  const viewToggle = (
+    <SegmentedToggle
+      value={view}
+      options={SEQUENCE_METRIC_OPTIONS.map((entry) => ({
+        ...entry,
+        testId: `${idPrefix}-${entry.value}`,
+      }))}
+      onValueChange={(nextView) => {
+        setView(nextView);
+        track('inference_agentic_sequence_metric_view_changed', { metric, view: nextView });
+      }}
+      ariaLabel={`${acronym} chart view`}
+      testId={`${idPrefix}-toggle`}
+      buttonClassName="px-2 py-1 text-xs"
+    />
+  );
+  return (
+    <ExpandableChart
+      title={heading}
+      testId={`${idPrefix}-chart`}
+      controls={viewToggle}
+      render={(expanded) => {
+        const dims = CHART_SIZES[expanded ? 'expanded' : 'inline'];
+        if (showingDistribution) {
+          if (!values) return histogramLoading ? <Skeleton /> : <Empty />;
+          return <Distribution values={values} unit="tokens" {...dims} />;
+        }
+        if (!timeline) return timelineLoading ? <Skeleton /> : <Empty />;
+        const samples = averageSequenceLengthInFlight(timeline.requests, metric);
+        const inFlightSeries = {
+          name: `Average ${acronym} in flight (30s avg)`,
+          data: timeRollingAverage(samples, 30),
+          rawData: samples,
+          color: isInput ? '#3b82f6' : '#a855f7',
+          strokeWidth: 2.5,
+        };
+        return (
+          <div>
+            {!isInput && (
+              <p className="mb-2 text-xs text-muted-foreground">
+                Retrospective: final observed OSL is assigned across each request&apos;s lifetime.
+              </p>
+            )}
+            <TimeSeriesChart
+              series={[inFlightSeries]}
+              durationS={timeline.durationS}
+              yAxisLabel="Tokens / request"
+              {...dims}
+            />
+          </div>
+        );
+      }}
+    />
+  );
+}
+
 /** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */
 function toAggPoint(
   sibling: { id: number; label: string },
@@ -216,6 +374,8 @@ function toAggPoint(
 
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
+  const pathname = usePathname();
+  const searchParams = useSearchParams();
   const histQuery = useTraceHistograms([id], true);
   const metricsQuery = useTraceServerMetrics(id, true);
   const siblingsQuery = useBenchmarkSiblings(id);
@@ -224,7 +384,24 @@ export function AgenticPointDetail({ id }: Props) {
   const metrics = metricsQuery.data;
   const siblingsData = siblingsQuery.data;
 
-  const [view, setView] = useState<DetailView>('point');
+  const requestedView = searchParams.get('view');
+  const view: DetailView = isDetailView(requestedView) ? requestedView : 'point';
+  const setView = useCallback(
+    (nextView: DetailView) => {
+      const nextParams = new URLSearchParams(searchParams.toString());
+      if (nextView === 'point') nextParams.delete('view');
+      else nextParams.set('view', nextView);
+      const query = nextParams.toString();
+      router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false });
+      track('inference_agentic_detail_view_changed', { view: nextView });
+    },
+    [pathname, router, searchParams],
+  );
+  const [metricSourceId, setMetricSourceId] = useState('all');
+  const [requestActivityView, setRequestActivityView] = useState<RequestActivityView>('queue');
+  const [throughputSeries, setThroughputSeries] = useState<ReadonlySet<ThroughputSeriesKey>>(
+    () => new Set(['input', 'decode']),
+  );
   // Fetch aggregates only when the aggregates view is active. Uses the full
   // sibling set (across parallelism + concurrency configs) so each chart
   // shows how the metric varies across the SKU.
@@ -234,6 +411,21 @@ export function AgenticPointDetail({ id }: Props) {
   // "Unique input tokens in flight" chart, so fetch whenever we're on
   // either view.
   const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
+  const metricSources = metrics?.metricSources ?? [];
+  const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId);
+  const serverSeries = selectedMetricSource
+    ? {
+        kvCacheUsage: selectedMetricSource.kvCacheUsage,
+        prefixCacheHitRate: selectedMetricSource.prefixCacheHitRate,
+        queueDepth: selectedMetricSource.queueDepth,
+        promptTokensBySource: selectedMetricSource.promptTokensBySource,
+        prefillTps: selectedMetricSource.promptTps,
+        decodeTps: selectedMetricSource.generationTps,
+        prefixCacheHitsTps: selectedMetricSource.prefixCacheHitsTps,
+        hostKvCacheUsage: selectedMetricSource.hostKvCacheUsage,
+        kvCacheUsageByEngine: selectedMetricSource.kvCacheUsageByEngine,
+      }
+    : metrics;
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -297,6 +489,48 @@ export function AgenticPointDetail({ id }: Props) {
         )}
       </div>
 
+      {view === 'point' && metricSources.length > 1 && (
+        <div
+          className="sticky top-16 z-40 flex items-center justify-end gap-2 rounded-lg border border-border/40 bg-background/90 px-3 py-2 shadow-sm backdrop-blur"
+          data-testid="metric-source-toolbar"
+        >
+          <span className="text-xs text-muted-foreground">Server metrics</span>
+          <Select
+            value={selectedMetricSource?.source.id ?? 'all'}
+            onValueChange={(value) => {
+              setMetricSourceId(value);
+              const source = metricSources.find((entry) => entry.source.id === value)?.source;
+              track('inference_agentic_metric_source_changed', {
+                source: value,
+                role: source?.role ?? 'all',
+                adapter: source?.adapter ?? metrics?.meta.framework ?? 'unknown',
+              });
+            }}
+          >
+            <SelectTrigger
+              size="sm"
+              className="max-w-72"
+              aria-label="Server metrics source"
+              data-testid="metric-source-select"
+            >
+              <SelectValue />
+            </SelectTrigger>
+            <SelectContent>
+              <SelectItem value="all">All endpoints</SelectItem>
+              {metricSources.map(({ source }) => (
+                <SelectItem
+                  key={source.id}
+                  value={source.id}
+                  title={source.endpointUrl ?? undefined}
+                >
+                  {metricSourceLabel(source)}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
+      )}
+
       {view === 'aggregates' ? (
         <AggregatesGrid
           siblings={siblingsData?.siblings ?? []}
@@ -321,21 +555,19 @@ export function AgenticPointDetail({ id }: Props) {
         )
       ) : (
         <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-          <ExpandableChart
-            title="Input sequence length distribution"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
-              return histQuery.isLoading ? <Skeleton /> : <Empty />;
-            }}
+          <SequenceMetricCard
+            metric="isl"
+            values={hist?.isl}
+            timeline={timelineQuery.data}
+            histogramLoading={histQuery.isLoading}
+            timelineLoading={timelineQuery.isLoading}
           />
-          <ExpandableChart
-            title="Output sequence length distribution"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
-              return histQuery.isLoading ? <Skeleton /> : <Empty />;
-            }}
+          <SequenceMetricCard
+            metric="osl"
+            values={hist?.osl}
+            timeline={timelineQuery.data}
+            histogramLoading={histQuery.isLoading}
+            timelineLoading={timelineQuery.isLoading}
           />
 
           <RequestMetricOverTime
@@ -350,21 +582,22 @@ export function AgenticPointDetail({ id }: Props) {
             metric="ttft"
             timeline={timelineQuery.data}
             isLoading={timelineQuery.isLoading}
+            latencySelector
           />
 
           <ExpandableChart
             title="KV cache utilization over time"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics) return <Skeleton />;
+              if (!metrics || !serverSeries) return <Skeleton />;
               // For SGLang hicache rows we have both GPU (HBM) util and
               // host (CPU offload pool) util — overlay them as two lines.
-              const hasHost = metrics.hostKvCacheUsage.length > 0;
+              const hasHost = serverSeries.hostKvCacheUsage.length > 0;
               // DEP runs report one series per engine. When there's more
               // than one, draw one line per rank in distinct colors so
               // load skew is visible at a glance; cluster-average sits on
               // top in white so it stands out.
-              const perEngine = metrics.kvCacheUsageByEngine ?? [];
+              const perEngine = serverSeries.kvCacheUsageByEngine ?? [];
               const hasPerEngine = perEngine.length > 1;
               // Render order matters: per-engine first → average drawn on top.
               const series = [
@@ -385,10 +618,10 @@ export function AgenticPointDetail({ id }: Props) {
                     : hasPerEngine
                       ? 'Avg'
                       : 'GPU KV cache (avg n=50)',
-                  data: rollingAverage(metrics.kvCacheUsage, 50),
+                  data: rollingAverage(serverSeries.kvCacheUsage, 50),
                   // Skip raw scatter when per-engine overlay is on — the
                   // DP-rank lines already convey the spread, dots would be noise.
-                  rawData: hasPerEngine ? undefined : metrics.kvCacheUsage,
+                  rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage,
                   // Bold red Avg sits on top of the translucent per-DP lines.
                   // DP 1 in the palette is #ef4444 (lighter red); the darker
                   // #dc2626 here plus the heavier stroke keeps it distinct.
@@ -399,8 +632,8 @@ export function AgenticPointDetail({ id }: Props) {
                   ? [
                       {
                         name: 'CPU offload pool (avg n=50)',
-                        data: rollingAverage(metrics.hostKvCacheUsage, 50),
-                        rawData: metrics.hostKvCacheUsage,
+                        data: rollingAverage(serverSeries.hostKvCacheUsage, 50),
+                        rawData: serverSeries.hostKvCacheUsage,
                         color: '#f97316',
                         strokeWidth: 2,
                       },
@@ -421,17 +654,55 @@ export function AgenticPointDetail({ id }: Props) {
           />
 
           <ExpandableChart
-            title="Request queue depth"
+            title={
+              requestActivityView === 'queue'
+                ? 'Request queue depth'
+                : 'Cumulative completed requests'
+            }
+            testId="request-activity-chart"
+            controls={
+              <SegmentedToggle
+                value={requestActivityView}
+                options={REQUEST_ACTIVITY_OPTIONS}
+                onValueChange={(value) => {
+                  setRequestActivityView(value);
+                  track('inference_agentic_request_activity_changed', { view: value });
+                }}
+                ariaLabel="Request activity metric"
+                testId="request-activity-toggle"
+                buttonClassName="px-2 py-1 text-xs"
+              />
+            }
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics) return <Skeleton />;
+              if (requestActivityView === 'completed') {
+                if (!timelineQuery.data) {
+                  return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+                }
+                return (
+                  <TimeSeriesChart
+                    series={[
+                      {
+                        name: 'Completed requests',
+                        data: cumulativeCompletedRequests(timelineQuery.data.requests),
+                        color: '#3b82f6',
+                        strokeWidth: 2.5,
+                      },
+                    ]}
+                    durationS={timelineQuery.data.durationS}
+                    yAxisLabel="Requests"
+                    {...size}
+                  />
+                );
+              }
+              if (!metrics || !serverSeries) return <Skeleton />;
               return (
                 <TimeSeriesChart
                   series={[
                     {
                       name: 'Running (avg n=50)',
                       data: rollingAverage(
-                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
                           t: p.t,
                           value: p.running,
                         })),
@@ -443,7 +714,7 @@ export function AgenticPointDetail({ id }: Props) {
                     {
                       name: 'Waiting (avg n=50)',
                       data: rollingAverage(
-                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
                           t: p.t,
                           value: p.waiting,
                         })),
@@ -455,7 +726,7 @@ export function AgenticPointDetail({ id }: Props) {
                     {
                       name: 'Total (avg n=50)',
                       data: rollingAverage(
-                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
                           t: p.t,
                           value: p.total,
                         })),
@@ -477,14 +748,14 @@ export function AgenticPointDetail({ id }: Props) {
             title="Prefix cache hit rate per interval"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics) return <Skeleton />;
+              if (!metrics || !serverSeries) return <Skeleton />;
               return (
                 <TimeSeriesChart
                   series={[
                     {
                       name: 'GPU (HBM, avg n=50)',
-                      data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                      rawData: metrics.prefixCacheHitRate,
+                      data: rollingAverage(serverSeries.prefixCacheHitRate, 50),
+                      rawData: serverSeries.prefixCacheHitRate,
                       color: '#a855f7',
                       strokeWidth: 2,
                     },
@@ -500,33 +771,61 @@ export function AgenticPointDetail({ id }: Props) {
           />
 
           <ExpandableChart
-            title="Throughput (total & decode)"
+            title={
+              selectedMetricSource
+                ? `Throughput · ${metricSourceLabel(selectedMetricSource.source)}`
+                : 'Throughput (input & decode)'
+            }
+            controls={
+              <div className="flex items-center gap-1" data-testid="throughput-series-toggle">
+                {(
+                  [
+                    ['input', 'Input'],
+                    ['decode', 'Decode'],
+                  ] as const
+                ).map(([key, label]) => {
+                  const active = throughputSeries.has(key);
+                  const isOnlyActive = active && throughputSeries.size === 1;
+                  return (
+                    <button
+                      key={key}
+                      type="button"
+                      aria-pressed={active}
+                      disabled={isOnlyActive}
+                      data-testid={`throughput-series-${key}`}
+                      className={`rounded px-2 py-1 text-xs font-medium transition-colors ${
+                        active
+                          ? key === 'input'
+                            ? 'bg-blue-500/20 text-blue-600 dark:text-blue-300'
+                            : 'bg-orange-500/20 text-orange-600 dark:text-orange-300'
+                          : 'bg-muted text-muted-foreground hover:text-foreground'
+                      } disabled:cursor-not-allowed disabled:opacity-60`}
+                      onClick={() => {
+                        const next = toggleThroughputSeries(throughputSeries, key);
+                        if (next === throughputSeries) return;
+                        setThroughputSeries(next);
+                        track('inference_agentic_throughput_series_toggled', {
+                          series: key,
+                          enabled: next.has(key),
+                        });
+                      }}
+                    >
+                      {label}
+                    </button>
+                  );
+                })}
+              </div>
+            }
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics) return <Skeleton />;
-              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              if (!metrics || !serverSeries) return <Skeleton />;
               return (
                 <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'Total (avg n=50)',
-                      data: rollingAverage(total, 50),
-                      color: '#3b82f6',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Decode (avg n=50)',
-                      data: rollingAverage(metrics.decodeTps, 50),
-                      color: '#f97316',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Total running avg',
-                      data: cumulativeAverage(total),
-                      color: '#ef4444',
-                      strokeWidth: 3,
-                    },
-                  ]}
+                  series={buildThroughputChartSeries(
+                    serverSeries.prefillTps,
+                    serverSeries.decodeTps,
+                    throughputSeries,
+                  )}
                   durationS={metrics.durationS}
                   yAxisLabel="Tokens / sec"
                   {...size}
@@ -539,10 +838,10 @@ export function AgenticPointDetail({ id }: Props) {
             title="Cumulative prompt token source breakdown"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics) return <Skeleton />;
+              if (!metrics || !serverSeries) return <Skeleton />;
               return (
                 <StackedAreaChart
-                  sourceSeries={metrics.promptTokensBySource}
+                  sourceSeries={serverSeries.promptTokensBySource}
                   durationS={metrics.durationS}
                   {...size}
                 />
@@ -554,7 +853,7 @@ export function AgenticPointDetail({ id }: Props) {
             title="Total unique input tokens over time"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics) return <Skeleton />;
+              if (!metrics || !serverSeries) return <Skeleton />;
               // Unique = total prompt tokens received minus tokens served from
               // any cache tier — i.e. the freshly prefill-computed tokens. Prefer
               // the promptTokensBySource breakdown (its buckets sum to the real
@@ -564,11 +863,16 @@ export function AgenticPointDetail({ id }: Props) {
               // tokens across scheduler passes, so its cumulative can exceed the
               // prompt tokens received, driving the diff negative and freezing
               // the monotonic-clamped line after a few seconds.
-              const uniqueFromBreakdown = cumulativeUniqueInputTokens(metrics.promptTokensBySource);
+              const uniqueFromBreakdown = cumulativeUniqueInputTokens(
+                serverSeries.promptTokensBySource,
+              );
               const uniqueData =
                 uniqueFromBreakdown.length > 0
                   ? uniqueFromBreakdown
-                  : cumulativeDifferenceMonotonic(metrics.prefillTps, metrics.prefixCacheHitsTps);
+                  : cumulativeDifferenceMonotonic(
+                      serverSeries.prefillTps,
+                      serverSeries.prefixCacheHitsTps,
+                    );
               return (
                 <TimeSeriesChart
                   series={[
@@ -589,6 +893,7 @@ export function AgenticPointDetail({ id }: Props) {
 
           <ExpandableChart
             title="Unique input tokens in flight"
+            testId="unique-input-inflight-chart"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!timelineQuery.data) {
@@ -613,6 +918,12 @@ export function AgenticPointDetail({ id }: Props) {
                       color: '#a855f7',
                       strokeWidth: 2,
                     },
+                    {
+                      name: 'Cumulative average',
+                      data: cumulativeTimeAverage(raw),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
                   ]}
                   durationS={timelineQuery.data.durationS}
                   yAxisLabel="Tokens"
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
new file mode 100644
index 00000000..d15da878
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -0,0 +1,101 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import { buildRequestTimelineRows, requestIdleStats, splitTimelineCid } from './request-timeline';
+
+/** Fixture: profiling-phase record in flight over [start, end], with no measured metrics. */
+function request(start: number, end: number): RequestRecord {
+  // credit mirrors start; ack is never set.
+  const timing = { credit: start, start, ack: null, end };
+  const unmeasured = { ttftMs: null, tpotMs: null, isl: null, osl: null };
+  return {
+    cid: 'conversation',
+    ti: start,
+    wid: 'worker',
+    ad: 0,
+    phase: 'profiling',
+    ...timing,
+    ...unmeasured,
+    cancelled: false,
+  };
+}
+
+describe('requestIdleStats', () => {
+  // Each request(start, end) contributes one [start, end] in-flight interval.
+  it('sums only gaps where no requests overlap', () => {
+    // Busy windows merge to [0, 20], [30, 50] and [70, 80], leaving gaps of 10 and 20.
+    const timeline = [
+      request(0, 10),
+      request(5, 20),
+      request(30, 40),
+      request(35, 50),
+      request(70, 80),
+    ];
+    expect(requestIdleStats(timeline)).toEqual({ idleNs: 30, spanNs: 80 });
+  });
+
+  it('handles unsorted and nested requests without double-counting busy time', () => {
+    // The [0, 100] request envelops the other two, so the span is never idle.
+    const timeline = [request(20, 30), request(0, 100), request(10, 40)];
+    expect(requestIdleStats(timeline)).toEqual({ idleNs: 0, spanNs: 100 });
+  });
+
+  it('does not count time before the first start or after the final end', () => {
+    // Only the 200 -> 300 gap counts; the span is measured from 100, not from 0.
+    const timeline = [request(100, 200), request(300, 400)];
+    expect(requestIdleStats(timeline)).toEqual({ idleNs: 100, spanNs: 300 });
+  });
+
+  it('returns zeroes for an empty timeline', () => {
+    const timeline: RequestRecord[] = [];
+    expect(requestIdleStats(timeline)).toEqual({ idleNs: 0, spanNs: 0 });
+  });
+});
+
+describe('subagent timeline hierarchy', () => {
+  const SUBAGENT = 'conv::sa:subagent_001_abcd';
+  const withCid = (cid: string, start: number, end: number) => ({ ...request(start, end), cid });
+  it('parses aux lanes separately from their parent subagent id', () => {
+    expect(splitTimelineCid(`${SUBAGENT}:aux:011`)).toEqual({
+      parent: 'conv',
+      subagentBase: 'subagent_001_abcd',
+      stream: null,
+      aux: '011',
+    });
+  });
+
+  it('renders aux requests as always-visible children of their subagent', () => {
+    const records = [
+      withCid('conv', 0, 10),
+      withCid(SUBAGENT, 10, 30),
+      withCid(`${SUBAGENT}:aux:011`, 12, 20),
+      withCid(`${SUBAGENT}:aux:012`, 14, 24),
+      withCid('conv::sa:subagent_002_ef01', 40, 50),
+    ];
+    const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+    expect(rows.map((row) => ({ kind: row.kind, depth: row.depth }))).toEqual([
+      { kind: 'parent', depth: 0 },
+      { kind: 'subagent', depth: 1 },
+      { kind: 'aux', depth: 2 },
+      { kind: 'aux', depth: 2 },
+      { kind: 'subagent', depth: 1 },
+    ]);
+    expect(rows[1]!.requests.map((record) => record.cid)).toEqual([SUBAGENT]);
+    expect(rows[1]!.auxCount).toBe(2);
+    expect(rows[2]!.label).toBe('aux 011 · parallel');
+    expect(rows[3]!.label).toBe('aux 012 · parallel');
+  });
+
+  it('keeps aux lanes visible while primary streams remain collapsed', () => {
+    const records = [
+      withCid(`${SUBAGENT}:s0`, 10, 20),
+      withCid(`${SUBAGENT}:s1`, 12, 22),
+      withCid(`${SUBAGENT}:aux:001`, 14, 18),
+    ];
+    const rows = buildRequestTimelineRows(records, 'conversation', new Set());
+    expect(rows.map(({ kind }) => kind)).toEqual(['parent', 'subagent', 'aux']);
+    expect(rows[1]!.requests).toHaveLength(2);
+    expect(rows[2]!.requests).toHaveLength(1);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index baf3dc1f..bdf0a9b9 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -32,6 +32,35 @@ export function subagentIdOf(cid: string): string | null {
   return colon === -1 ? raw : raw.slice(0, colon);
 }
 
+export interface RequestIdleStats {
+  /** Summed gaps between merged [start, end] intervals: time with no request running. */
+  idleNs: number;
+  /** Wall-clock span from the earliest request start to the latest request end. */
+  spanNs: number;
+}
+
+/**
+ * Merge request intervals and sum the gaps between them. Queue time before a
+ * request starts is intentionally excluded: "in flight" means [start, end].
+ */
+export function requestIdleStats(requests: readonly RequestRecord[]): RequestIdleStats {
+  const spans: [from: number, to: number][] = [];
+  for (const { start, end } of requests) {
+    if (Number.isFinite(start) && Number.isFinite(end) && end >= start) spans.push([start, end]);
+  }
+  if (spans.length === 0) return { idleNs: 0, spanNs: 0 };
+
+  const ordered = spans.toSorted((a, b) => a[0] - b[0] || a[1] - b[1]);
+  const [origin, firstEnd] = ordered[0]!;
+  let busyUntil = firstEnd;
+  let gapTotal = 0;
+  for (const [from, to] of ordered.slice(1)) {
+    gapTotal += Math.max(0, from - busyUntil);
+    busyUntil = to > busyUntil ? to : busyUntil;
+  }
+  return { idleNs: gapTotal, spanNs: busyUntil - origin };
+}
+
 /**
  * Gantt-style request timeline for one agentic benchmark point.
  *
@@ -95,10 +124,12 @@ const PHASE_COLORS: Record<string, string> = {
  *                      when collapsed.
  *   stream           — one :sN stream of a multi-stream subagent (depth 2).
  *                      Hidden by default; toggled in via the parent's chevron.
+ *   aux              — one :aux:N parallel lane (depth 2). Always visible
+ *                      beneath its owning subagent.
  */
-type RowKind = 'parent' | 'worker' | 'subagent' | 'stream';
+type RowKind = 'parent' | 'worker' | 'subagent' | 'stream' | 'aux';
 
-interface Row {
+export interface RequestTimelineRow {
   key: string;
   label: string;
   color: string;
@@ -109,28 +140,40 @@ interface Row {
   streamCount?: number;
   /** For stream rows: the parent subagent's row key (drives expand/collapse). */
   parentRowKey?: string;
+  /** Number of always-visible auxiliary lanes under this subagent. */
+  auxCount?: number;
 }
 
 /**
  * Conversation ids for subagent calls look like
- *   <parent_cid>::sa:<agent_id>[:s<stream_idx>]
+ *   <parent_cid>::sa:<agent_id>[:s<stream_idx>|:aux:<aux_idx>]
  * The optional `:s<N>` suffix is set when the harness fans a single
  * subagent into multiple parallel "streams" (interval-graph
  * decomposition in weka_trace._pack_into_streams). We split it off so
- * we can group all streams of one subagent under a single header row.
+ * we can group every parallel lane under a single subagent header row.
  */
-function splitCid(cid: string): {
+export function splitTimelineCid(cid: string): {
   parent: string;
   subagentBase: string | null;
   stream: number | null;
+  aux: string | null;
 } {
   const sep = cid.indexOf('::sa:');
-  if (sep === -1) return { parent: cid, subagentBase: null, stream: null };
+  if (sep === -1) return { parent: cid, subagentBase: null, stream: null, aux: null };
   const parent = cid.slice(0, sep);
   const raw = cid.slice(sep + 5);
+  const auxMatch = /^(?<base>[^:]+):aux:(?<aux>.+)$/.exec(raw);
+  if (auxMatch) {
+    return {
+      parent,
+      subagentBase: auxMatch.groups!.base!,
+      stream: null,
+      aux: auxMatch.groups!.aux!,
+    };
+  }
   const m = /^(?<base>.*):s(?<stream>\d+)$/.exec(raw);
-  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) };
-  return { parent, subagentBase: raw, stream: null };
+  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]), aux: null };
+  return { parent, subagentBase: raw, stream: null, aux: null };
 }
 
 /**
@@ -139,6 +182,7 @@ function splitCid(cid: string): {
  *     subagent_001                  (collapsed by default, container)
  *       :s0                         (hidden unless expanded)
  *       :s1
+ *       aux 011 · parallel          (always visible)
  *     subagent_002
  *     ...
  *
@@ -147,11 +191,11 @@ function splitCid(cid: string): {
  * streams' requests — overlapping bars visually communicate the
  * stream-level parallelism without expanding.
  */
-function buildRows(
+export function buildRequestTimelineRows(
   requests: RequestRecord[],
   mode: RowMode,
   expandedSubagents: ReadonlySet<string>,
-): Row[] {
+): RequestTimelineRow[] {
   if (mode !== 'conversation') {
     // Worker mode: flat rows, sorted by first activity.
     const groups = new Map<string, RequestRecord[]>();
@@ -163,7 +207,7 @@ function buildRows(
       }
       list.push(r);
     }
-    const rows: Row[] = [];
+    const rows: RequestTimelineRow[] = [];
     let i = 0;
     for (const [key, list] of groups) {
       list.sort((a, b) => a.start - b.start);
@@ -181,17 +225,21 @@ function buildRows(
     return rows;
   }
 
-  // Conversation mode — tree: parent → subagent → stream.
+  // Conversation mode — tree: parent → subagent → stream/aux lane.
+  interface SubagentLanes {
+    streams: Map<number | null, RequestRecord[]>;
+    aux: Map<string, RequestRecord[]>;
+  }
   interface Tree {
     parentCid: string;
     parentReqs: RequestRecord[];
-    // subagentBase → (streamIndex|null → requests)
-    subagents: Map<string, Map<number | null, RequestRecord[]>>;
+    // subagentBase → primary streams + always-visible auxiliary lanes.
+    subagents: Map<string, SubagentLanes>;
     firstStart: number;
   }
   const trees = new Map<string, Tree>();
   for (const r of requests) {
-    const { parent, subagentBase, stream } = splitCid(r.cid);
+    const { parent, subagentBase, stream, aux } = splitTimelineCid(r.cid);
     let tree = trees.get(parent);
     if (!tree) {
       tree = {
@@ -205,20 +253,26 @@ function buildRows(
     if (subagentBase === null) {
       tree.parentReqs.push(r);
     } else {
-      let saMap = tree.subagents.get(subagentBase);
-      if (!saMap) {
-        saMap = new Map();
-        tree.subagents.set(subagentBase, saMap);
+      let lanes = tree.subagents.get(subagentBase);
+      if (!lanes) {
+        lanes = { streams: new Map(), aux: new Map() };
+        tree.subagents.set(subagentBase, lanes);
+      }
+      if (aux === null) {
+        const list = lanes.streams.get(stream);
+        if (list) list.push(r);
+        else lanes.streams.set(stream, [r]);
+      } else {
+        const list = lanes.aux.get(aux);
+        if (list) list.push(r);
+        else lanes.aux.set(aux, [r]);
       }
-      const list = saMap.get(stream);
-      if (list) list.push(r);
-      else saMap.set(stream, [r]);
     }
     if (r.start < tree.firstStart) tree.firstStart = r.start;
   }
 
   const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
-  const rows: Row[] = [];
+  const rows: RequestTimelineRow[] = [];
   let colorIdx = 0;
   for (const tree of sortedTrees) {
     const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
@@ -237,20 +291,25 @@ function buildRows(
     // One subagent row per base (which may contain N streams).
     const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => {
       const aStart = Math.min(
-        ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+        ...[...a[1].streams.values(), ...a[1].aux.values()].map(
+          (reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY,
+        ),
       );
       const bStart = Math.min(
-        ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+        ...[...b[1].streams.values(), ...b[1].aux.values()].map(
+          (reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY,
+        ),
       );
       return aStart - bStart;
     });
-    for (const [saBase, streams] of subagentEntries) {
+    for (const [saBase, lanes] of subagentEntries) {
       const subagentKey = `${tree.parentCid}::sa:${saBase}`;
-      // Union of all stream requests for collapsed-view bars.
+      // Union of primary stream requests for collapsed-view bars. Aux lanes
+      // stay separate so their overlap remains visible as parallel work.
       const allReqs: RequestRecord[] = [];
-      for (const reqs of streams.values()) allReqs.push(...reqs);
+      for (const reqs of lanes.streams.values()) allReqs.push(...reqs);
       allReqs.sort((a, b) => a.start - b.start);
-      const streamCount = streams.size;
+      const streamCount = lanes.streams.size;
       rows.push({
         key: subagentKey,
         label: `↳ ${formatSubagentLabel(saBase)}`,
@@ -259,12 +318,13 @@ function buildRows(
         depth: 1,
         kind: 'subagent',
         streamCount,
+        auxCount: lanes.aux.size,
       });
 
       // Stream children only when expanded AND there's more than one
       // stream (a single-stream subagent has nothing extra to show).
       if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
-        const streamEntries = [...streams.entries()].toSorted((a, b) => {
+        const streamEntries = [...lanes.streams.entries()].toSorted((a, b) => {
           // Sort by stream index (null first as the "default" stream)
           const ai = a[0] ?? -1;
           const bi = b[0] ?? -1;
@@ -283,6 +343,27 @@ function buildRows(
           });
         }
       }
+
+      // Aux lanes encode concurrent requests within the subagent. Keep them
+      // visible even when primary streams are collapsed so parallelism is not
+      // hidden behind an interaction.
+      const auxEntries = [...lanes.aux.entries()].toSorted(
+        (a, b) =>
+          (a[1][0]?.start ?? Number.POSITIVE_INFINITY) -
+          (b[1][0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      for (const [auxId, reqs] of auxEntries) {
+        reqs.sort((a, b) => a.start - b.start);
+        rows.push({
+          key: `${subagentKey}:aux:${auxId}`,
+          label: `aux ${auxId} · parallel`,
+          color,
+          requests: reqs,
+          depth: 2,
+          kind: 'aux',
+          parentRowKey: subagentKey,
+        });
+      }
     }
   }
   return rows;
@@ -340,7 +421,7 @@ function countLt(sorted: number[], target: number): number {
 interface TooltipData {
   x: number;
   y: number;
-  row: Row;
+  row: RequestTimelineRow;
   req: RequestRecord;
 }
 
@@ -475,9 +556,10 @@ export function RequestTimelineView({
     [data.requests, phaseFilter, hasWarmup],
   );
   const rows = useMemo(
-    () => buildRows(filtered, rowMode, expandedSubagents),
+    () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents),
     [filtered, rowMode, expandedSubagents],
   );
+  const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]);
 
   // Pre-sort the timestamp columns so the cursor-time stats popover can
   // count "running / waiting at time t" in O(log n). With a few hundred
@@ -669,7 +751,16 @@ export function RequestTimelineView({
         <span className="ml-auto text-xs text-muted-foreground">
           {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
           {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
-          {formatDuration((dataEnd - dataStart) / 1e6)}
+          {formatDuration((dataEnd - dataStart) / 1e6)} ·{' '}
+          <span
+            data-testid="timeline-total-idle-time"
+            title="Time between the first request start and final request end with no requests in flight"
+          >
+            idle {formatDuration(idleStats.idleNs / 1e6)}
+            {idleStats.spanNs > 0
+              ? ` (${((idleStats.idleNs / idleStats.spanNs) * 100).toFixed(1)}%)`
+              : ''}
+          </span>
           {isZoomed && (
             <>
               {' · '}
@@ -705,12 +796,13 @@ export function RequestTimelineView({
               </div>
               {rows.map((row) => {
                 const isSubagentRow = row.kind === 'subagent';
-                const isStreamRow = row.kind === 'stream';
+                const isChildRow = row.kind === 'stream' || row.kind === 'aux';
                 const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
                 const isExpanded = isExpandable && expandedSubagents.has(row.key);
                 return (
                   <div
                     key={row.key}
+                    data-timeline-row-kind={row.kind}
                     className="flex items-center gap-1 overflow-hidden pr-2"
                     style={{
                       height: ROW_HEIGHT + ROW_GAP,
@@ -734,20 +826,23 @@ export function RequestTimelineView({
                       className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
                       style={{
                         backgroundColor: row.color,
-                        opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                        opacity: isChildRow ? 0.4 : isSubagentRow ? 0.55 : 1,
                       }}
                     />
                     <span
                       className="text-[10px] font-mono truncate"
                       style={{
                         color: row.color,
-                        opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                        opacity: isChildRow ? 0.7 : isSubagentRow ? 0.85 : 1,
                       }}
                     >
                       {row.label}
                       {isExpandable && (
                         <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
                       )}
+                      {isSubagentRow && (row.auxCount ?? 0) > 0 && (
+                        <span className="text-muted-foreground ml-1">+{row.auxCount} aux</span>
+                      )}
                     </span>
                     <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
                       {row.requests.length > 0 ? row.requests.length : '—'}
@@ -881,7 +976,7 @@ export function RequestTimelineView({
                           opacity={
                             req.cancelled
                               ? 0.35
-                              : row.kind === 'stream'
+                              : row.kind === 'stream' || row.kind === 'aux'
                                 ? 0.5
                                 : row.kind === 'subagent'
                                   ? 0.6
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
index 3506ff45..a9ece859 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
@@ -2,7 +2,16 @@ import { describe, expect, it } from 'vitest';
 
 import type { RequestRecord } from '@/hooks/api/use-request-timeline';
 
-import { cumulativeUniqueInputTokens, rollingRequestMetric } from './time-series-chart';
+import {
+  averageSequenceLengthInFlight,
+  buildThroughputChartSeries,
+  cumulativeAverage,
+  cumulativeCompletedRequests,
+  cumulativeTimeAverage,
+  cumulativeUniqueInputTokens,
+  rollingRequestMetric,
+  toggleThroughputSeries,
+} from './time-series-chart';
 
 const request = (
   endS: number,
@@ -54,6 +63,22 @@ describe('rollingRequestMetric', () => {
     expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]);
   });
 
+  it('computes E2E latency from request start through request end', () => {
+    const result = rollingRequestMetric(
+      [request(2, 100, 10, { start: 500_000_000 }), request(4, 200, 20, { start: 1_000_000_000 })],
+      'e2e',
+      'p90',
+      50,
+    );
+
+    expect(result.raw).toEqual([
+      { t: 2, value: 1.5 },
+      { t: 4, value: 3 },
+    ]);
+    expect(result.trend.at(-1)?.value).toBeCloseTo(2.85, 8);
+    expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8);
+  });
+
   it('drops warmup, cancelled, missing, and non-positive samples', () => {
     const result = rollingRequestMetric(
       [
@@ -73,6 +98,154 @@ describe('rollingRequestMetric', () => {
   });
 });
 
+describe('cumulativeAverage', () => {
+  it('hides the startup interval without removing it from later averages', () => {
+    const result = cumulativeAverage(
+      [
+        { t: 0, value: 300 },
+        { t: 30, value: 0 },
+        { t: 60, value: 0 },
+        { t: 90, value: 100 },
+      ],
+      60, // burnInS: points earlier than first t + 60 are not emitted
+    );
+
+    expect(result).toEqual([
+      { t: 60, value: 100 }, // (300 + 0 + 0) / 3: the hidden samples still count
+      { t: 90, value: 100 }, // (300 + 0 + 0 + 100) / 4
+    ]);
+  });
+
+  it('preserves the original behavior when no burn-in is requested', () => {
+    expect(
+      cumulativeAverage([
+        { t: 0, value: 10 },
+        { t: 1, value: 20 },
+      ]),
+    ).toEqual([
+      { t: 0, value: 10 }, // 10 / 1
+      { t: 1, value: 15 }, // (10 + 20) / 2
+    ]);
+  });
+});
+
+describe('cumulativeTimeAverage', () => {
+  it('computes a run-to-date time-weighted average for a step series', () => {
+    expect(
+      cumulativeTimeAverage([
+        { t: 0, value: 100 },
+        { t: 1, value: 300 },
+        { t: 3, value: 100 },
+        { t: 4, value: 0 },
+      ]),
+    ).toEqual([
+      { t: 0, value: 100 }, // first point is returned as-is (no elapsed time)
+      { t: 1, value: 100 }, // 100 held for 1s, over 1s
+      { t: 3, value: 700 / 3 }, // (100 * 1 + 300 * 2) / 3
+      { t: 4, value: 200 }, // (100 * 1 + 300 * 2 + 100 * 1) / 4
+    ]);
+  });
+
+  it('coalesces same-time request events to their final step value', () => {
+    expect(
+      cumulativeTimeAverage([
+        { t: 0, value: 0 },
+        { t: 0, value: 100 },
+        { t: 2, value: 0 },
+      ]),
+    ).toEqual([
+      { t: 0, value: 100 }, // the later of the two t = 0 points wins
+      { t: 2, value: 100 }, // 100 held for 2s, over 2s
+    ]);
+  });
+});
+
+describe('cumulativeCompletedRequests', () => {
+  it('sorts profiling completions and excludes warmup and cancelled requests', () => {
+    expect(
+      cumulativeCompletedRequests([
+        request(4, 100, 10), // deliberately listed before the earlier completion
+        request(2, 100, 10),
+        request(1, 100, 10, { phase: 'warmup' }),
+        request(3, 100, 10, { cancelled: true }),
+      ]),
+    ).toEqual([
+      { t: 0, value: 0 }, // zero anchor prepended by the function
+      { t: 2, value: 1 },
+      { t: 4, value: 2 },
+    ]);
+  });
+
+  it('returns no series when there are no successful profiling completions', () => {
+    expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]);
+  });
+});
+
+describe('averageSequenceLengthInFlight', () => {
+  it('computes the event-time average across overlapping profiling requests', () => {
+    expect(
+      averageSequenceLengthInFlight(
+        [
+          request(4, 100, 10, { start: 0, end: 4_000_000_000, isl: 100 }),
+          request(3, 100, 10, { start: 1_000_000_000, end: 3_000_000_000, isl: 300 }),
+        ],
+        'isl',
+      ),
+    ).toEqual([
+      { t: 0, value: 100 }, // only the first request is active
+      { t: 1, value: 200 }, // both active: (100 + 300) / 2
+      { t: 3, value: 100 }, // second request ended
+      { t: 4, value: 0 }, // nothing in flight
+    ]);
+  });
+
+  it('excludes cancelled, warmup, and missing sequence lengths', () => {
+    expect(
+      averageSequenceLengthInFlight(
+        [
+          request(1, 100, 10, { osl: null }),
+          request(2, 100, 10, { osl: 20, cancelled: true }),
+          request(3, 100, 10, { osl: 30, phase: 'warmup' }),
+        ],
+        'osl',
+      ),
+    ).toEqual([]); // no request qualifies, so no events are recorded
+  });
+});
+
+describe('toggleThroughputSeries', () => {
+  it('allows either series to be hidden when both are selected', () => {
+    expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'input')]).toEqual(['decode']);
+    expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'decode')]).toEqual(['input']);
+  });
+
+  it('does not allow the final visible series to be hidden', () => {
+    const selected = new Set<'input' | 'decode'>(['decode']);
+    expect(toggleThroughputSeries(selected, 'decode')).toBe(selected);
+  });
+
+  it('allows the hidden series to be restored', () => {
+    expect([...toggleThroughputSeries(new Set(['decode']), 'input')]).toEqual(['decode', 'input']);
+  });
+});
+
+describe('buildThroughputChartSeries', () => {
+  it('only includes the total running average when both series are visible', () => {
+    const input = [{ t: 0, value: 10 }];
+    const decode = [{ t: 0, value: 20 }];
+    const names = (keys: ('input' | 'decode')[]) =>
+      buildThroughputChartSeries(input, decode, new Set(keys)).map(({ name }) => name);
+
+    expect(names(['input', 'decode'])).toEqual([
+      'Input (avg n=50)',
+      'Decode (avg n=50)',
+      'Total running avg (60s burn-in)',
+    ]);
+    expect(names(['input'])).toEqual(['Input (avg n=50)']);
+    expect(names(['decode'])).toEqual(['Decode (avg n=50)']);
+  });
+});
+
 describe('cumulativeUniqueInputTokens', () => {
   it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => {
     const out = cumulativeUniqueInputTokens({
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 0c0b5739..ab744286 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -33,8 +33,21 @@ interface TimeSeriesChartProps {
   height?: number;
 }
 
-export type RequestMetric = 'interactivity' | 'ttft';
+export type RequestMetric = 'interactivity' | 'ttft' | 'e2e';
 export type RequestPercentile = 'p75' | 'p90';
+export type ThroughputSeriesKey = 'input' | 'decode';
+
+/** Flip `key` in `selected`, refusing to hide the only series that is still visible. */
+export function toggleThroughputSeries(
+  selected: ReadonlySet<ThroughputSeriesKey>,
+  key: ThroughputSeriesKey,
+): ReadonlySet<ThroughputSeriesKey> {
+  const wasVisible = selected.has(key);
+  if (wasVisible && selected.size === 1) return selected;
+  return wasVisible
+    ? new Set([...selected].filter((visible) => visible !== key))
+    : new Set([...selected, key]);
+}
 
 /** Linear-interpolated percentile (matches numpy's default method). */
 function quantile(sortedAsc: number[], q: number): number {
@@ -47,7 +60,8 @@ function quantile(sortedAsc: number[], q: number): number {
 }
 
 /**
- * Build raw request samples plus a trailing request-count percentile.
+ * Build raw request samples plus a trailing request-count percentile. E2E
+ * latency is measured from HTTP request start through final response byte.
  *
  * The percentile is computed in latency space. Interactivity then inverts
  * the selected TPOT percentile, matching the aggregate chart convention:
@@ -63,7 +77,12 @@ export function rollingRequestMetric(
   const samples = requests
     .filter((request) => request.phase === 'profiling' && !request.cancelled)
     .flatMap((request) => {
-      const latencyMs = metric === 'ttft' ? request.ttftMs : request.tpotMs;
+      const latencyMs =
+        metric === 'ttft'
+          ? request.ttftMs
+          : metric === 'e2e'
+            ? (request.end - request.start) / 1e6
+            : request.tpotMs;
       if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return [];
       return [{ t: request.end / 1e9, latencyMs }];
     })
@@ -71,7 +90,7 @@ export function rollingRequestMetric(
 
   const raw = samples.map(({ t, latencyMs }) => ({
     t,
-    value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs,
+    value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000,
   }));
   const trend = samples.map(({ t }, i) => {
     const start = Math.max(0, i - Math.max(1, windowSize) + 1);
@@ -80,7 +99,7 @@ export function rollingRequestMetric(
       .map((sample) => sample.latencyMs)
       .toSorted((a, b) => a - b);
     const latencyMs = quantile(sorted, q);
-    return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs };
+    return { t, value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000 };
   });
   const prefixLatencies: number[] = [];
   const cumulative = samples.map(({ t, latencyMs }) => {
@@ -95,7 +114,7 @@ export function rollingRequestMetric(
     const cumulativeLatencyMs = quantile(prefixLatencies, q);
     return {
       t,
-      value: metric === 'ttft' ? cumulativeLatencyMs / 1000 : 1000 / cumulativeLatencyMs,
+      value: metric === 'interactivity' ? 1000 / cumulativeLatencyMs : cumulativeLatencyMs / 1000,
     };
   });
 
@@ -154,18 +173,60 @@ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): Tim
   return out;
 }
 
-/** Expanding-window cumulative mean from index 0..i. */
-export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+/**
+ * Expanding-window cumulative mean from index 0..i.
+ *
+ * `burnInS` suppresses rendering during the unstable startup interval while
+ * retaining those samples in every later average. This avoids visually
+ * promoting a single bursty counter bucket without changing the run-to-date
+ * meaning of the line once it appears.
+ */
+export function cumulativeAverage(data: TimeSeriesPoint[], burnInS = 0): TimeSeriesPoint[] {
   if (data.length === 0) return data;
-  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  const out: TimeSeriesPoint[] = [];
+  const firstT = data[0]!.t;
   let sum = 0;
   for (let i = 0; i < data.length; i++) {
     sum += data[i]!.value;
-    out[i] = { t: data[i]!.t, value: sum / (i + 1) };
+    if (data[i]!.t - firstT >= burnInS) {
+      out.push({ t: data[i]!.t, value: sum / (i + 1) });
+    }
   }
   return out;
 }
 
+/**
+ * Run-to-date time-weighted average of a step series.
+ *
+ * Non-finite points are dropped before sorting so a NaN timestamp cannot corrupt
+ * the ordering. Duplicate timestamps are coalesced to their final value (several
+ * start/end events can share an instant); each value is held until the next one.
+ */
+export function cumulativeTimeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  const finite = data.filter((p) => Number.isFinite(p.t) && Number.isFinite(p.value));
+  const points: TimeSeriesPoint[] = [];
+  // The sort is stable, so among equal timestamps the last input point wins below.
+  for (const point of finite.toSorted((a, b) => a.t - b.t)) {
+    const previous = points.at(-1);
+    if (previous?.t === point.t) previous.value = point.value;
+    else points.push({ ...point });
+  }
+  if (points.length === 0) return [];
+
+  const firstT = points[0]!.t;
+  let previousT = firstT;
+  let previousValue = points[0]!.value;
+  let area = 0; // integral of the step series from firstT up to previousT
+  return points.map((point, index) => {
+    if (index === 0) return { t: point.t, value: point.value }; // no elapsed time yet
+    area += previousValue * (point.t - previousT);
+    const duration = point.t - firstT;
+    previousT = point.t;
+    previousValue = point.value;
+    return { t: point.t, value: duration > 0 ? area / duration : point.value };
+  });
+}
+
 /**
  * Running cumulative sum of a per-interval rate series. Each output point
  * is the integral of the rate from start to that point, assuming the rate
@@ -183,6 +244,60 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/** Cumulative successful profiling completions by end time, anchored with a zero point at t=0. */
+export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] {
+  const doneAt: number[] = [];
+  for (const { phase, cancelled, end } of requests) {
+    if (phase === 'profiling' && !cancelled && Number.isFinite(end / 1e9)) doneAt.push(end / 1e9);
+  }
+  if (doneAt.length === 0) return [];
+  doneAt.sort((a, b) => a - b);
+  return [{ t: 0, value: 0 }, ...doneAt.map((t, index) => ({ t, value: index + 1 }))];
+}
+
+/**
+ * Retrospective average sequence length among requests active at each event.
+ * OSL uses the request's final observed length across its whole lifetime.
+ *
+ * Only non-cancelled profiling requests with a finite, non-negative length and
+ * finite, ordered start/end timestamps contribute. Emits one point per distinct
+ * event time (ascending); the value is 0 whenever nothing is in flight.
+ */
+export function averageSequenceLengthInFlight(
+  requests: readonly RequestRecord[],
+  metric: 'isl' | 'osl',
+): TimeSeriesPoint[] {
+  const events = new Map<number, { tokenDelta: number; countDelta: number }>();
+  const addEvent = (t: number, tokenDelta: number, countDelta: number) => {
+    const current = events.get(t) ?? { tokenDelta: 0, countDelta: 0 };
+    current.tokenDelta += tokenDelta;
+    current.countDelta += countDelta;
+    events.set(t, current);
+  };
+
+  for (const request of requests) {
+    const tokens = request[metric];
+    if (request.phase !== 'profiling' || request.cancelled || tokens === null) continue;
+    if (!Number.isFinite(tokens) || tokens < 0) continue;
+    // A NaN/±Infinity timestamp would become an event key and break the numeric sort
+    // below, so reject it together with inverted intervals.
+    if (!Number.isFinite(request.start) || !Number.isFinite(request.end)) continue;
+    if (request.end < request.start) continue;
+    addEvent(request.start / 1e9, tokens, 1);
+    addEvent(request.end / 1e9, -tokens, -1);
+  }
+
+  let tokensInFlight = 0;
+  let requestsInFlight = 0;
+  return [...events.entries()]
+    .toSorted((a, b) => a[0] - b[0])
+    .map(([t, event]) => {
+      tokensInFlight += event.tokenDelta;
+      requestsInFlight += event.countDelta;
+      return { t, value: requestsInFlight > 0 ? tokensInFlight / requestsInFlight : 0 };
+    });
+}
+
 // A promptTokensBySource bucket label denotes tokens served from some cache
 // tier (local prefix cache, offloaded/host KV, remote KV transfer) rather than
 // freshly computed. Matches vllm labels (`local_cache_hit`,
@@ -340,6 +455,40 @@ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSerie
   return out;
 }
 
+/**
+ * Build throughput lines for the selected input/decode signals; total only when both show.
+ */
+export function buildThroughputChartSeries(
+  input: TimeSeriesPoint[],
+  decode: TimeSeriesPoint[],
+  selected: ReadonlySet<ThroughputSeriesKey>,
+): Series[] {
+  // Single source of truth: these numbers also appear in the legend labels.
+  const windowSize = 50;
+  const burnInS = 60;
+  const showInput = selected.has('input');
+  const showDecode = selected.has('decode');
+  const series: Series[] = [];
+  const rolling = (name: string, data: TimeSeriesPoint[], color: string): Series => ({
+    name: `${name} (avg n=${windowSize})`,
+    data: rollingAverage(data, windowSize),
+    color,
+    strokeWidth: 1.6,
+  });
+  if (showInput) series.push(rolling('Input', input, '#3b82f6'));
+  if (showDecode) series.push(rolling('Decode', decode, '#f97316'));
+  // The summed line is only drawn while both signals are visible.
+  if (selected.size === 2) {
+    series.push({
+      name: `Total running avg (${burnInS}s burn-in)`,
+      data: cumulativeAverage(sumSeries(input, decode), burnInS),
+      color: '#ef4444',
+      strokeWidth: 3,
+    });
+  }
+  return series;
+}
+
 const fmtIntDefault = (n: number) =>
   n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
 
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 0f3eedc7..654dd1b9 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -43,7 +43,13 @@ import {
   type QuickFilters,
 } from '@/components/inference/utils/quickFilters';
 
-type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+type XAxisMode =
+  | 'ttft'
+  | 'e2e'
+  | 'normalized-e2e'
+  | 'interactivity'
+  | 'session-time'
+  | 'prefill-tps';
 
 /**
  * Resolve the percentile-prefixed e2e-latency field name for the given
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index e0f5ae1a..5d0981b8 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -728,13 +728,20 @@ export interface InferenceChartContextType {
    * at a time, picked by the big buttons above the chart.
    * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
    * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
+   * - 'normalized-e2e'→ agentic-only; x = per-request E2E normalized to 400 output tokens
    * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
    * - 'session-time'  → agentic-only; x = mean-normalized session time (live-computed from trace blobs)
    * - 'prefill-tps'   → agentic-only; x = mean of P90 prefill TPS/user per session
    */
-  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  selectedXAxisMode:
+    | 'ttft'
+    | 'e2e'
+    | 'normalized-e2e'
+    | 'interactivity'
+    | 'session-time'
+    | 'prefill-tps';
   setSelectedXAxisMode: (
-    mode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
+    mode: 'ttft' | 'e2e' | 'normalized-e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
   ) => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index caf713cc..9ad3d881 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,5 +1,8 @@
 'use client';
-import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants';
+import {
+  DISPLAY_MODEL_TO_DB,
+  NORMALIZED_E2E_OUTPUT_TOKENS,
+} from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
 import { useEffect, useMemo, useRef, useState } from 'react';
@@ -14,7 +17,10 @@ import type {
   OverlayData,
   TrendDataPoint,
 } from '@/components/inference/types';
-import { processOverlayChartData } from '@/components/inference/utils';
+import {
+  processOverlayChartData,
+  selectUnofficialOverlayForMode,
+} from '@/components/inference/utils';
 import {
   isRunComparisonEntry,
   makeRunComparisonEntry,
@@ -70,7 +76,13 @@ import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 
 type InferenceViewMode = 'chart' | 'table';
 
-type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+type XAxisMode =
+  | 'ttft'
+  | 'e2e'
+  | 'normalized-e2e'
+  | 'interactivity'
+  | 'session-time'
+  | 'prefill-tps';
 
 interface XAxisModeButton {
   value: XAxisMode;
@@ -81,6 +93,7 @@ interface XAxisModeButton {
 const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
   { value: 'ttft', label: 'TTFT' },
   { value: 'e2e', label: 'E2E Latency' },
+  { value: 'normalized-e2e', label: 'Normalized E2E', agenticOnly: true },
   { value: 'interactivity', label: 'Interactivity' },
   { value: 'session-time', label: 'Session Time', agenticOnly: true },
   { value: 'prefill-tps', label: 'Prefill TPS / user', agenticOnly: true },
@@ -378,7 +391,9 @@ export default function ChartDisplay() {
 
   const useDerived =
     sequenceKind(selectedSequence) === 'agentic' &&
-    (selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps');
+    (selectedXAxisMode === 'normalized-e2e' ||
+      selectedXAxisMode === 'session-time' ||
+      selectedXAxisMode === 'prefill-tps');
   const derivedTargetIds = useMemo(() => {
     if (!useDerived) return [] as number[];
     const ids = new Set<number>();
@@ -403,10 +418,14 @@ export default function ChartDisplay() {
     if (!useDerived) return visibleGraphs;
     if (!derivedMetrics) return visibleGraphs.map((graph) => ({ ...graph, data: [] }));
     const isSession = selectedXAxisMode === 'session-time';
+    const isNormalizedE2e = selectedXAxisMode === 'normalized-e2e';
+    const percentileLabel = selectedPercentile.toUpperCase();
     const xLabel = isSession
       ? 'Mean Normalized Session Time (min)'
-      : 'P90 Prefill TPS per user (tok/s)';
-    const rooflineCorner = isSession ? 'upper_right' : 'upper_left';
+      : isNormalizedE2e
+        ? `${percentileLabel} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens (s)`
+        : 'P90 Prefill TPS per user (tok/s)';
+    const rooflineCorner = isSession || isNormalizedE2e ? 'upper_right' : 'upper_left';
     return visibleGraphs.map((graph) => {
       const chartDefinition = {
         ...graph.chartDefinition,
@@ -420,14 +439,25 @@ export default function ChartDisplay() {
           const metrics = derivedMetrics[point.id];
           const raw = isSession
             ? metrics?.normalized_session_time_s
-            : metrics?.p90_prefill_tps_per_user;
+            : isNormalizedE2e
+              ? selectedPercentile === 'p75'
+                ? metrics?.p75_normalized_e2e_400_s
+                : metrics?.p90_normalized_e2e_400_s
+              : metrics?.p90_prefill_tps_per_user;
           if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
           return { ...point, x: isSession ? raw / 60 : raw };
         })
         .filter((point): point is NonNullable<typeof point> => point !== null);
       return { ...graph, chartDefinition, data };
     });
-  }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
+  }, [
+    useDerived,
+    visibleGraphs,
+    derivedMetrics,
+    selectedXAxisMode,
+    selectedYAxisMetric,
+    selectedPercentile,
+  ]);
 
   const displayGraphs =
     isFirstLoad || isDerivedLoading
@@ -488,10 +518,11 @@ export default function ChartDisplay() {
                       );
                       // Match warnings against the same series the chart annotates,
                       // including visible unofficial-run overlay series.
-                      const overlay =
-                        graph.chartDefinition.chartType === 'e2e'
-                          ? overlayDataByChartType.e2e
-                          : overlayDataByChartType.interactivity;
+                      const overlay = selectUnofficialOverlayForMode(
+                        selectedXAxisMode,
+                        graph.chartDefinition.chartType,
+                        overlayDataByChartType,
+                      );
                       const visibleOverlayRows = isTimelineMode
                         ? []
                         : (overlay?.data ?? []).filter(
@@ -551,6 +582,9 @@ export default function ChartDisplay() {
                                 if (selectedXAxisMode === 'prefill-tps') {
                                   return 'vs. P90 Prefill TPS / user';
                                 }
+                                if (selectedXAxisMode === 'normalized-e2e') {
+                                  return `vs. ${selectedPercentile.toUpperCase()} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens`;
+                                }
                                 const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                                 if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
                                   const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
@@ -597,15 +631,22 @@ export default function ChartDisplay() {
                             )}
                           </p>
                           <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          {isUnofficialRun && selectedXAxisMode === 'normalized-e2e' && (
+                            <p className="mb-2 text-xs text-muted-foreground">
+                              Normalized E2E requires persisted per-request traces, so
+                              unofficial-run overlays are unavailable for this experimental view.
+                            </p>
+                          )}
                           <UnofficialDomainNotice />
                         </>
                       );
 
                       if (getViewMode(graphIndex) === 'table') {
-                        const overlay =
-                          graph.chartDefinition.chartType === 'e2e'
-                            ? overlayDataByChartType.e2e
-                            : overlayDataByChartType.interactivity;
+                        const overlay = selectUnofficialOverlayForMode(
+                          selectedXAxisMode,
+                          graph.chartDefinition.chartType,
+                          overlayDataByChartType,
+                        );
                         const overlayRows = (overlay?.data ?? []).filter((p) =>
                           selectedPrecisions.includes(p.precision),
                         );
@@ -657,9 +698,11 @@ export default function ChartDisplay() {
                             chartDefinition={graph.chartDefinition}
                             caption={chartCaption}
                             overlayData={
-                              graph.chartDefinition.chartType === 'e2e'
-                                ? (overlayDataByChartType.e2e ?? undefined)
-                                : (overlayDataByChartType.interactivity ?? undefined)
+                              selectUnofficialOverlayForMode(
+                                selectedXAxisMode,
+                                graph.chartDefinition.chartType,
+                                overlayDataByChartType,
+                              ) ?? undefined
                             }
                           />
                           {selectedGPUs.length > 0 &&
diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index f9031489..a8cfed48 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -12,6 +12,7 @@ import { getChartWatermark } from '@/lib/data-mappings';
 import { generateGpuDateColors } from '@/lib/dynamic-colors';
 import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
 import { useThemeColors } from '@/hooks/useThemeColors';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
 import { D3Chart } from '@/lib/d3-chart/D3Chart';
 import type {
   CustomLayerConfig,
@@ -260,6 +261,20 @@ const GPUGraph = React.memo(
       return pts;
     }, [groupedData, activeDates, hideNonOptimal, optimalPointKeys]);
 
+    // GPU comparison currently renders official DB-backed points only. Unofficial
+    // overlays have no benchmark_results id or persisted trace, so they cannot
+    // open the dedicated per-point charts route.
+    const agenticIds = useMemo(
+      () =>
+        filteredData.flatMap((point) =>
+          point.benchmark_type === 'agentic_traces' && typeof point.id === 'number'
+            ? [point.id]
+            : [],
+        ),
+      [filteredData],
+    );
+    const { data: traceAvailability } = useTraceAvailability(agenticIds);
+
     // Warning annotations for visible series with known upstream issues —
     // same treatment the scatter view gets, applied to the date-comparison view.
     // Lines here are colored per (gpu, date) pair, so take the first active
@@ -799,6 +814,7 @@ const GPUGraph = React.memo(
               selectedYAxisMetric,
               hardwareConfig,
               runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+              hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
             }),
           getRulerX: (d, xScale) => (xScale as d3.ScaleLinear<number, number>)(d.x),
           getRulerY: (d, yScale) => (yScale as d3.ScaleLinear<number, number>)(d.y),
@@ -812,6 +828,37 @@ const GPUGraph = React.memo(
               sel.select('.visible-shape') as any,
               getShapeKeyForPrecision(d.precision, selectedPrecisions),
             ),
+          onPointClick: (d: InferenceData) => {
+            track('gpu_timeseries_data_point_clicked', {
+              id: d.id,
+              hw: String(d.hwKey),
+              x: d.x,
+              y: d.y,
+            });
+            const tooltipEl = chartRef.current?.getTooltipElement();
+            if (!tooltipEl) return;
+            const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+            if (!viewBtn || typeof d.id !== 'number') return;
+            viewBtn.addEventListener('click', (event) => {
+              event.stopPropagation();
+              track('gpu_timeseries_view_charts_opened', {
+                id: d.id,
+                hwKey: String(d.hwKey),
+                conc: d.conc,
+              });
+            });
+            // Pinning updates D3Chart's React state. GPU comparison rebuilds
+            // several inline layer configs on that render, whose cleanup can
+            // briefly hide the otherwise-pinned portal tooltip. Restore its
+            // pinned visibility after that render settles.
+            requestAnimationFrame(() => {
+              const pinnedTooltip = chartRef.current?.getTooltipElement();
+              if (!pinnedTooltip || chartRef.current?.getPinnedPoint() !== d) return;
+              pinnedTooltip.style.opacity = '1';
+              pinnedTooltip.style.display = 'block';
+              pinnedTooltip.style.pointerEvents = 'auto';
+            });
+          },
           attachToLayer: 1,
         }}
         onRender={(ctx: RenderContext) => {
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index d3e185d9..b7328acc 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -969,20 +969,16 @@ const ScatterGraph = React.memo(
             });
           }
 
-          // ── "View charts" → navigate to dedicated detail page ────────────
+          // ── "View charts" real link (supports browser open-in-new-tab) ───
           const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
           if (viewBtn && typeof d.id === 'number') {
-            const pointId = d.id;
             viewBtn.addEventListener('click', (btnEvent) => {
               btnEvent.stopPropagation();
               track('latency_view_charts_opened', {
-                id: pointId,
+                id: d.id,
                 hwKey: String(d.hwKey),
                 conc: d.conc,
               });
-              chartRef.current?.dismissTooltip();
-              chartRef.current?.hideTooltip();
-              window.location.assign(`/inference/agentic/${pointId}`);
             });
           }
         },
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 589ba580..7d5b1482 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -1,7 +1,26 @@
 import { describe, it, expect } from 'vitest';
 
 import type { ChartDefinition, InferenceData } from '@/components/inference/types';
-import { filterDataByCostLimit, processOverlayChartData } from '@/components/inference/utils';
+import {
+  filterDataByCostLimit,
+  processOverlayChartData,
+  selectUnofficialOverlayForMode,
+} from '@/components/inference/utils';
+
+describe('selectUnofficialOverlayForMode', () => {
+  const overlays = { e2e: { id: 'e2e' }, interactivity: { id: 'interactivity' } };
+
+  it('suppresses raw unofficial E2E data for normalized E2E mode', () => {
+    expect(selectUnofficialOverlayForMode('normalized-e2e', 'e2e', overlays)).toBeNull();
+  });
+
+  it('preserves matching unofficial overlays for supported modes', () => {
+    expect(selectUnofficialOverlayForMode('e2e', 'e2e', overlays)).toBe(overlays.e2e);
+    expect(selectUnofficialOverlayForMode('interactivity', 'interactivity', overlays)).toBe(
+      overlays.interactivity,
+    );
+  });
+});
 
 // ---------------------------------------------------------------------------
 // fixture factories
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4876c614..f6ebd0f8 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -8,6 +8,20 @@ import chartDefinitions from '@/components/inference/inference-chart-config.json
 
 import type { ChartDefinition, InferenceData, YAxisMetricKey } from './types';
 
+/**
+ * Select the unofficial-run overlay for `chartType`, or null when `xAxisMode` is
+ * 'normalized-e2e': unofficial benchmark rows do not include the persisted
+ * per-request trace needed to normalize before taking percentiles.
+ */
+export function selectUnofficialOverlayForMode<T>(
+  xAxisMode: string,
+  chartType: 'e2e' | 'interactivity',
+  overlays: { e2e: T | null; interactivity: T | null },
+): T | null {
+  if (xAxisMode === 'normalized-e2e') return null;
+  return overlays[chartType]; // may itself be null
+}
+
 /**
  * Filters data points based on cost limits defined in the chart definition.
  * Only applies filtering for cost-related metrics, and only filters based on
diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
index 5a5bd7e9..e4b9d31f 100644
--- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts
+++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts
@@ -150,6 +150,15 @@ describe('getPointLabel', () => {
 // generateTooltipContent
 // ===========================================================================
 describe('generateTooltipContent', () => {
+  it('renders View charts as a same-tab anchor so browsers offer open-in-new-tab', () => {
+    const html = generateTooltipContent(
+      tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+    );
+    expect(html).toContain('<a data-action="view-charts"');
+    expect(html).toContain('href="/inference/agentic/1"');
+    expect(html).not.toContain('data-action="view-charts" target=');
+  });
+
   it('includes hardware display label from config', () => {
     const html = generateTooltipContent(tooltipConfig());
     expect(html).toContain('H100');
@@ -365,4 +374,27 @@ describe('generateGPUGraphTooltipContent', () => {
     );
     expect(html).toContain('vllm-v0.6.0<br />abc123');
   });
+
+  it('shows View charts only for pinned points with stored trace data', () => {
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+      ),
+    ).toContain('data-action="view-charts"');
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }),
+      ),
+    ).toContain('href="/inference/agentic/1"');
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: false, hasTrace: true }),
+      ),
+    ).not.toContain('data-action="view-charts"');
+    expect(
+      generateGPUGraphTooltipContent(
+        tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: false }),
+      ),
+    ).not.toContain('data-action="view-charts"');
+  });
 });
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index ea039336..e3f0de6d 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -134,15 +134,19 @@ const generateAgenticHTML = (d: InferenceData): string => {
   return parts.join('');
 };
 
-/** "View charts" button — only visible when the tooltip is pinned and the
- *  point has stored trace data. Wired up by the ScatterGraph click handler. */
-const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string => {
-  if (!isPinned || !hasTraceData) return '';
-  return `<button data-action="view-charts" style="
-    margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
+/** "View charts" link — emitted only when the tooltip is pinned, the point has stored
+ *  trace data, and `pointId` is a number. The scatter/GPU click handlers add tracking. */
+const viewChartsButtonHTML = (
+  isPinned: boolean,
+  hasTraceData: boolean,
+  pointId: number | undefined,
+): string => {
+  if (!isPinned || !hasTraceData || typeof pointId !== 'number') return '';
+  return `<a data-action="view-charts" href="/inference/agentic/${pointId}" style="
+    display: block; margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
     border: 1px solid var(--border); border-radius: 6px; cursor: pointer;
-    background: var(--accent); color: var(--accent-foreground);
-  ">View charts &rarr;</button>`;
+    background: var(--accent); color: var(--accent-foreground); text-align: center; text-decoration: none;
+  ">View charts &rarr;</a>`;
+};
 
 const shortenSha = (image: string) =>
@@ -254,7 +258,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
-      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))}
+      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
@@ -323,7 +327,16 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
  * @returns HTML string for the tooltip content
  */
 export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => {
-  const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+  const {
+    data: d,
+    isPinned,
+    xLabel,
+    yLabel,
+    selectedYAxisMetric,
+    hardwareConfig,
+    runUrl,
+    hasTrace,
+  } = config;
 
   return `
     <div style="background: var(--popover); border: 1px solid var(--border); border-radius: 8px; padding: 12px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1); user-select: ${isPinned ? 'text' : 'none'};">
@@ -372,6 +385,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
+      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
     </div>
   `;
 };
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
index 96b0f59f..46491b4e 100644
--- a/packages/app/src/hooks/api/use-datasets.ts
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -17,6 +17,10 @@ export interface DatasetSummary {
   mainTurns?: number;
   subagentGroups?: number;
   subagentTurns?: number;
+  meanRequestsPerConversation?: number;
+  medianRequestsPerConversation?: number;
+  meanSubagentsPerTrace?: number;
+  medianSubagentsPerTrace?: number;
   modelMix?: Record<string, number>;
   [k: string]: unknown;
 }
@@ -63,6 +67,8 @@ export interface DatasetChartData {
   inputTokensPerTurn?: Distribution;
   uncachedInputTokensPerTurn?: Distribution;
   outputTokensPerTurn?: Distribution;
+  subagentInputTokensPerRequest?: Distribution;
+  subagentOutputTokensPerRequest?: Distribution;
   turnsPerConversation?: Distribution;
   subagentGroupsPerConversation?: Distribution;
   cachedFractionPerTurn?: Distribution;
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..2e54f418
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts
@@ -0,0 +1,13 @@
+import { describe, expect, it } from 'vitest';
+
+import { chunkDerivedAgenticMetricIds } from './use-derived-agentic-metrics';
+
+describe('chunkDerivedAgenticMetricIds', () => {
+  it('keeps every id while respecting the API limit', () => {
+    const ids = Array.from({ length: 401 }, (_, index) => index + 1);
+    const chunks = chunkDerivedAgenticMetricIds(ids);
+
+    expect(chunks.map((chunk) => chunk.length)).toEqual([200, 200, 1]);
+    expect(chunks.flat()).toEqual(ids);
+  });
+});
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
index 6bc7ae5e..c4f517f7 100644
--- a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -8,18 +8,38 @@ export interface DerivedAgenticMetric {
   /** P90 of per-turn ISL/TTFT across every turn in every session.
    *  Null when no prefill rates could be computed. */
   p90_prefill_tps_per_user: number | null;
+  /** P75 normalized per-request E2E at a fixed 400-token output length. */
+  p75_normalized_e2e_400_s: number | null;
+  /** P90 normalized per-request E2E at a fixed 400-token output length. */
+  p90_normalized_e2e_400_s: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
 
+const MAX_IDS_PER_REQUEST = 200; // largest id batch sent in one derived-agentic-metrics request
+/** Split `ids` into consecutive, order-preserving chunks of at most MAX_IDS_PER_REQUEST ids. */
+export function chunkDerivedAgenticMetricIds(ids: number[]): number[][] {
+  const chunks: number[][] = [];
+  for (let i = 0; i < ids.length; i += MAX_IDS_PER_REQUEST) {
+    chunks.push(ids.slice(i, i + MAX_IDS_PER_REQUEST));
+  }
+  return chunks;
+}
+
 async function fetchDerivedAgenticMetrics(
   ids: number[],
   signal?: AbortSignal,
 ): Promise<DerivedAgenticMetricMap> {
   if (ids.length === 0) return {};
-  const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal });
-  if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`);
-  return (await res.json()) as DerivedAgenticMetricMap;
+  const chunks = chunkDerivedAgenticMetricIds(ids); // one request per chunk, issued concurrently
+  const maps = await Promise.all(
+    chunks.map(async (chunk) => {
+      const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${chunk.join(',')}`, { signal });
+      if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`); // rejects the whole call
+      return (await res.json()) as DerivedAgenticMetricMap;
+    }),
+  );
+  return Object.assign({}, ...maps) as DerivedAgenticMetricMap; // merge the per-chunk maps
 }
 
 /**
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 11905aaa..a16be558 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -30,6 +30,32 @@ export interface PointMeta {
   server_cpu_cache_hit_rate: number | null;
 }
 
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+/** Identifies one metrics source: its adapter, role, and nullable endpoint/worker fields. */
+export interface MetricSource {
+  id: string;
+  adapter: string;
+  role: MetricSourceRole;
+  endpointUrl: string | null;
+  nativeRole: string | null;
+  workerId: string | null;
+  dpRank: string | null;
+  engine: string | null;
+}
+/** Time series belonging to a single `source`; element type of `TraceServerMetrics.metricSources`. */
+export interface MetricSourceSeries {
+  source: MetricSource;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  promptTps: TimeSeriesPoint[];
+  generationTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
 export interface TraceServerMetrics {
   meta: PointMeta;
   startNs: number;
@@ -51,6 +77,8 @@ export interface TraceServerMetrics {
    * the cluster-average `kvCacheUsage` line covers that case alone.
    */
   kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /** Orchestrator-normalized metrics grouped by endpoint/worker. */
+  metricSources: MetricSourceSeries[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
index debbb788..8b691ee4 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts
@@ -4,7 +4,7 @@ import { describe, expect, it } from 'vitest';
 
 import type { ShapeKey } from '@/lib/chart-rendering';
 
-import { renderScatterPoints, syncPointShape } from './scatter-points';
+import { computeTooltipPosition, renderScatterPoints, syncPointShape } from './scatter-points';
 
 interface TestPoint {
   hwKey: string;
@@ -163,3 +163,51 @@ describe('syncPointShape', () => {
     expect(g.selectAll('.visible-shape').size()).toBe(1);
   });
 });
+
+describe('computeTooltipPosition', () => {
+  it('keeps a tall pinned tooltip inside the visible viewport', () => {
+    const tooltipNode = document.createElement('div');
+    document.body.append(tooltipNode);
+    Object.defineProperty(tooltipNode, 'getBoundingClientRect', {
+      value: () => ({
+        width: 300,
+        height: 400,
+        left: 0,
+        top: 0,
+        right: 300,
+        bottom: 400,
+        x: 0,
+        y: 0,
+        toJSON: () => ({}),
+      }),
+    });
+
+    const container = document.createElement('div');
+    Object.defineProperties(container, {
+      clientWidth: { value: 800 },
+      clientHeight: { value: 600 },
+      getBoundingClientRect: {
+        value: () => ({
+          width: 800,
+          height: 600,
+          left: 100,
+          top: 600,
+          right: 900,
+          bottom: 1200,
+          x: 100,
+          y: 600,
+          toJSON: () => ({}),
+        }),
+      },
+    });
+    Object.defineProperties(document.documentElement, {
+      clientWidth: { configurable: true, value: 1280 },
+      clientHeight: { configurable: true, value: 720 },
+    });
+
+    expect(computeTooltipPosition(450, 100, d3.select(tooltipNode), container)).toEqual({
+      left: 560,
+      top: 316,
+    });
+  });
+});
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index c73f1302..433ed6d1 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -323,8 +323,9 @@ export function attachScatterTooltipHandlers<
  * whole problem; we just need the coordinates in viewport space.
  *
  * Strategy: pick preferred side (right/below cursor), flip if it overflows the
- * container, then clamp to container bounds. Tall tooltips that don't fit get
- * clamped to the container edges.
+ * container, then clamp the final fixed coordinates to the viewport. The
+ * viewport clamp matters when a chart continues below the fold: container-
+ * local coordinates can otherwise place a pinned tooltip's actions offscreen.
  */
 export function computeTooltipPosition(
   mx: number,
@@ -357,8 +358,16 @@ export function computeTooltipPosition(
   let top = my + offset + th <= ch ? my + offset : my - offset - th;
   top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
 
-  // Convert container-local coords → viewport coords for `position: fixed`.
-  return { left: left + rect.left, top: top + rect.top };
+  // Convert container-local coords → viewport coords for `position: fixed`,
+  // then keep the complete tooltip visible when its dimensions permit it.
+  const viewportWidth = document.documentElement.clientWidth || window.innerWidth;
+  const viewportHeight = document.documentElement.clientHeight || window.innerHeight;
+  left += rect.left;
+  top += rect.top;
+  left = Math.max(EDGE_PAD, Math.min(viewportWidth - tw - EDGE_PAD, left));
+  top = Math.max(EDGE_PAD, Math.min(viewportHeight - th - EDGE_PAD, top));
+
+  return { left, top };
 }
 
 /** Update scatter point positions on zoom. */
diff --git a/packages/constants/src/agentic.ts b/packages/constants/src/agentic.ts
new file mode 100644
index 00000000..42eab306
--- /dev/null
+++ b/packages/constants/src/agentic.ts
@@ -0,0 +1,2 @@
+/** Fixed output length used by the experimental normalized-E2E chart metric (mirrored in the `*_normalized_e2e_400_s` fields). */
+export const NORMALIZED_E2E_OUTPUT_TOKENS = 400;
diff --git a/packages/constants/src/index.ts b/packages/constants/src/index.ts
index e767e500..7d3d6783 100644
--- a/packages/constants/src/index.ts
+++ b/packages/constants/src/index.ts
@@ -1,3 +1,4 @@
+export * from './agentic';
 export * from './framework-aliases';
 export * from './github';
 export * from './gpu-keys';
diff --git a/packages/db/migrations/009_dataset_request_stats.sql b/packages/db/migrations/009_dataset_request_stats.sql
new file mode 100644
index 00000000..0b7c11bb
--- /dev/null
+++ b/packages/db/migrations/009_dataset_request_stats.sql
@@ -0,0 +1,55 @@
+-- Backfill dataset-level requests/conversation statistics.
+-- A request is one actual model call: each top-level turn plus each child turn
+-- inside a subagent group. The group container itself is not a request.
+-- Also writes mean/median subagent groups per conversation and sets summary.version to 3.
+with per_conversation as (
+  select
+    dc.dataset_id,
+    dc.num_subagent_groups,
+    (
+      dc.num_turns + coalesce((
+        select sum(jsonb_array_length(node.value->'children'))
+        from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) as node(value)
+        where node.value->>'kind' = 'subagent'
+      ), 0)
+    )::double precision as request_count
+  from dataset_conversations dc
+), request_stats as (
+  select
+    dataset_id,
+    avg(request_count) as mean_requests,
+    percentile_cont(0.5) within group (order by request_count) as median_requests,
+    avg(num_subagent_groups::double precision) as mean_subagents,
+    percentile_cont(0.5) within group (order by num_subagent_groups) as median_subagents
+  from per_conversation
+  group by dataset_id
+)
+update datasets d
+set summary = jsonb_set(
+  jsonb_set(
+    jsonb_set(
+      jsonb_set(
+        jsonb_set(
+          d.summary, -- NOTE(review): jsonb_set is strict, so a NULL summary stays NULL — confirm NOT NULL
+          '{meanRequestsPerConversation}',
+          to_jsonb(request_stats.mean_requests),
+          true
+        ),
+        '{medianRequestsPerConversation}',
+        to_jsonb(request_stats.median_requests),
+        true
+      ),
+      '{meanSubagentsPerTrace}',
+      to_jsonb(request_stats.mean_subagents),
+      true
+    ),
+    '{medianSubagentsPerTrace}',
+    to_jsonb(request_stats.median_subagents),
+    true
+  ),
+  '{version}',
+  '3'::jsonb,
+  true
+)
+from request_stats
+where d.id = request_stats.dataset_id;
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
index 8dd42dce..5bd760b7 100644
--- a/packages/db/src/backfill-aggregate-stats.ts
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -23,7 +23,12 @@
  */
 
 import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
-import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js';
+import {
+  computeAggregateStats,
+  mergeProfileStatsUpgrade,
+  STATS_VERSION,
+  type AggregateStats,
+} from './etl/compute-aggregate-stats.js';
 import { createAdminSql } from './etl/db-utils.js';
 
 interface CliFlags {
@@ -104,9 +109,9 @@ async function main(): Promise<void> {
     try {
       // Fetch one row at a time — the json_gz blob is the heavy field.
       const [row] = await sql<
-        { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[]
+        { profile_export_jsonl_gz: Buffer | null; aggregate_stats: AggregateStats | null }[]
       >`
-        select profile_export_jsonl_gz, server_metrics_json_gz
+        select profile_export_jsonl_gz, aggregate_stats
         from agentic_trace_replay
         where id = ${id}
       `;
@@ -115,10 +120,24 @@ async function main(): Promise<void> {
         continue;
       }
 
-      const stats = await computeAggregateStats({
-        profileBlob: row.profile_export_jsonl_gz,
-        serverBlob: row.server_metrics_json_gz,
-      });
+      let stats: AggregateStats;
+      if (row.aggregate_stats?.version === 3) {
+        const profileStats = await computeAggregateStats({
+          profileBlob: row.profile_export_jsonl_gz,
+          serverBlob: null,
+        });
+        stats = mergeProfileStatsUpgrade(row.aggregate_stats, profileStats);
+      } else {
+        const [serverRow] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+          select server_metrics_json_gz
+          from agentic_trace_replay
+          where id = ${id}
+        `;
+        stats = await computeAggregateStats({
+          profileBlob: row.profile_export_jsonl_gz,
+          serverBlob: serverRow?.server_metrics_json_gz ?? null,
+        });
+      }
 
       await sql`
         update agentic_trace_replay
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
index 66156b45..416904f9 100644
--- a/packages/db/src/backfill-chart-series.ts
+++ b/packages/db/src/backfill-chart-series.ts
@@ -108,17 +108,34 @@ async function main(): Promise<void> {
   for (const { id } of candidates) {
     const start = Date.now();
     try {
-      const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
-        select server_metrics_json_gz
-        from agentic_trace_replay
-        where id = ${id}
+      const [row] = await sql<
+        {
+          server_metrics_json_gz: Buffer | null;
+          framework: string | null;
+          disagg: boolean | null;
+        }[]
+      >`
+        select atr.server_metrics_json_gz, source.framework, source.disagg
+        from agentic_trace_replay atr
+        left join lateral (
+          select c.framework, c.disagg
+          from benchmark_results br
+          join configs c on c.id = br.config_id
+          where br.trace_replay_id = atr.id
+          order by br.id
+          limit 1
+        ) source on true
+        where atr.id = ${id}
       `;
       if (!row) {
         console.warn(`  id=${id}: row vanished, skipping`);
         continue;
       }
 
-      const series = await computeChartSeries(row.server_metrics_json_gz);
+      const series = await computeChartSeries(row.server_metrics_json_gz, {
+        framework: row.framework,
+        disagg: row.disagg ?? false,
+      });
 
       await sql`
         update agentic_trace_replay
diff --git a/packages/db/src/backfill-dataset-stats.ts b/packages/db/src/backfill-dataset-stats.ts
new file mode 100644
index 00000000..6dce6164
--- /dev/null
+++ b/packages/db/src/backfill-dataset-stats.ts
@@ -0,0 +1,115 @@
+/**
+ * Backfill dataset summary stats and subagent-only ISL/OSL distributions from
+ * the compact structures already stored in `dataset_conversations`.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-dataset-stats --yes
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { createAdminSql } from './etl/db-utils.js';
+import { logHistogram, summarizeValues } from './etl/weka-structure.js';
+
+interface DatasetRow {
+  id: string;
+  slug: string;
+  summary: Record<string, unknown>;
+  chart_data: Record<string, unknown>;
+}
+
+interface ConversationRow {
+  num_subagent_groups: number | string;
+  request_count: number | string;
+}
+
+interface SubagentRequestRow {
+  input_tokens: number | string;
+  output_tokens: number | string;
+}
+
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+async function main(): Promise<void> {
+  const datasets = await sql<DatasetRow[]>`
+    select id, slug, summary, chart_data
+    from datasets
+    order by slug
+  `;
+  if (datasets.length === 0) {
+    console.log('No datasets found.');
+    return;
+  }
+
+  console.log(`Backfill subagent dataset stats for ${datasets.length} dataset(s).`);
+  if (!hasYesFlag() && !(await confirm('Continue? (y/N) '))) return;
+
+  for (const dataset of datasets) {
+    const conversations = await sql<ConversationRow[]>`
+      select
+        num_subagent_groups,
+        (
+          num_turns + coalesce((
+            select sum(jsonb_array_length(node.value->'children'))
+            from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value)
+            where node.value->>'kind' = 'subagent'
+          ), 0)
+        ) as request_count
+      from dataset_conversations dc
+      where dataset_id = ${dataset.id}
+    `;
+    const requests = await sql<SubagentRequestRow[]>`
+      select
+        (child.value->>'in')::double precision as input_tokens,
+        (child.value->>'out')::double precision as output_tokens
+      from dataset_conversations dc
+      cross join lateral jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value)
+      cross join lateral jsonb_array_elements(coalesce(node.value->'children', '[]'::jsonb)) child(value)
+      where dc.dataset_id = ${dataset.id}
+        and node.value->>'kind' = 'subagent'
+    `;
+
+    const subagentsPerTrace = conversations.map((row) => Number(row.num_subagent_groups));
+    const requestsPerConversation = conversations.map((row) => Number(row.request_count));
+    const inputTokens = requests.map((row) => Number(row.input_tokens));
+    const outputTokens = requests.map((row) => Number(row.output_tokens));
+    const subagentStats = summarizeValues(subagentsPerTrace);
+    const requestStats = summarizeValues(requestsPerConversation);
+    const summary = {
+      ...dataset.summary,
+      version: 3,
+      meanSubagentsPerTrace: subagentStats.mean,
+      medianSubagentsPerTrace: subagentStats.median,
+      meanRequestsPerConversation: requestStats.mean,
+      medianRequestsPerConversation: requestStats.median,
+    };
+    const chartData = {
+      ...dataset.chart_data,
+      version: 3,
+      subagentInputTokensPerRequest: {
+        bins: logHistogram(inputTokens),
+        stats: summarizeValues(inputTokens),
+      },
+      subagentOutputTokensPerRequest: {
+        bins: logHistogram(outputTokens),
+        stats: summarizeValues(outputTokens),
+      },
+    };
+
+    await sql`
+      update datasets
+      set summary = ${sql.json(summary)},
+          chart_data = ${sql.json(structuredClone(chartData) as unknown as Parameters<typeof sql.json>[0])}
+      where id = ${dataset.id}
+    `;
+    console.log(
+      `  ${dataset.slug}: ${requests.length.toLocaleString()} inner requests, median ${subagentStats.median}, mean ${subagentStats.mean.toFixed(1)} subagents/trace`,
+    );
+  }
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-dataset-stats failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
index de0009de..7b745c09 100644
--- a/packages/db/src/etl/compute-aggregate-stats.test.ts
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -2,7 +2,11 @@ import { gzipSync } from 'node:zlib';
 
 import { describe, expect, it } from 'vitest';
 
-import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js';
+import {
+  STATS_VERSION,
+  computeAggregateStats,
+  mergeProfileStatsUpgrade,
+} from './compute-aggregate-stats.js';
 
 /** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
 function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
@@ -64,6 +68,7 @@ describe('computeAggregateStats', () => {
     expect(stats.prefixCacheHitRate).toBeNull();
     expect(stats.normalizedSessionTimeS).toBeNull();
     expect(stats.p90PrefillTpsPerUser).toBeNull();
+    expect(stats.normalizedE2e400).toBeNull();
   });
 
   it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
@@ -90,6 +95,8 @@ describe('computeAggregateStats', () => {
     //   scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
     //   mean ≈ 1.9653
     expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+    expect(stats.normalizedE2e400?.n).toBe(3);
+    expect(stats.normalizedE2e400?.p90).toBeGreaterThan(0);
   });
 
   it('computes KV util + prefix hit rate from the server blob alone', async () => {
@@ -107,6 +114,7 @@ describe('computeAggregateStats', () => {
     expect(stats.osl).toBeNull();
     expect(stats.normalizedSessionTimeS).toBeNull();
     expect(stats.p90PrefillTpsPerUser).toBeNull();
+    expect(stats.normalizedE2e400).toBeNull();
   });
 
   it('tolerates a malformed profile blob by leaving its metrics null', async () => {
@@ -117,7 +125,28 @@ describe('computeAggregateStats', () => {
     expect(stats.osl).toBeNull();
     expect(stats.normalizedSessionTimeS).toBeNull();
     expect(stats.p90PrefillTpsPerUser).toBeNull();
+    expect(stats.normalizedE2e400).toBeNull();
     // Version still set so the row is considered "computed".
     expect(stats.version).toBe(STATS_VERSION);
   });
 });
+
+describe('mergeProfileStatsUpgrade', () => {
+  it('updates profile metrics while preserving existing server distributions', async () => {
+    const existing = await computeAggregateStats({
+      profileBlob: null,
+      serverBlob: makeServerBlob(),
+    });
+    const profile = await computeAggregateStats({
+      profileBlob: makeProfileBlob([{ isl: 100, osl: 100, rl: 2080, ttft: 100 }]),
+      serverBlob: null,
+    });
+
+    const merged = mergeProfileStatsUpgrade(existing, profile);
+    expect(merged.version).toBe(STATS_VERSION);
+    expect(merged.isl?.mean).toBe(100);
+    expect(merged.normalizedE2e400?.p90).toBeGreaterThan(0);
+    expect(merged.kvCacheUtil).toEqual(existing.kvCacheUtil);
+    expect(merged.prefixCacheHitRate).toEqual(existing.prefixCacheHitRate);
+  });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
index a422cfec..15e5f1ba 100644
--- a/packages/db/src/etl/compute-aggregate-stats.ts
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -39,6 +39,30 @@ export interface AggregateStats {
   normalizedSessionTimeS: number | null;
   /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
   p90PrefillTpsPerUser: number | null;
+  /** Per-request normalized E2E distribution at a fixed 400-token OSL. */
+  normalizedE2e400: MetricPercentiles | null;
+}
+
+/**
+ * Upgrade an existing stats bundle when only profile-derived fields changed.
+ * This avoids re-reading and decompressing the much larger server-metrics blob
+ * while preserving its already-computed KV/cache distributions.
+ */
+export function mergeProfileStatsUpgrade(
+  existing: Omit<AggregateStats, 'normalizedE2e400'> & {
+    normalizedE2e400?: MetricPercentiles | null;
+  },
+  profile: AggregateStats,
+): AggregateStats {
+  // Server-side distributions are not recomputed here; carry them over untouched.
+  const { kvCacheUtil, prefixCacheHitRate } = existing;
+  // Profile-derived fields take the fresh value, falling back to the stored one when null.
+  const isl = profile.isl ?? existing.isl;
+  const osl = profile.osl ?? existing.osl;
+  const normalizedSessionTimeS = profile.normalizedSessionTimeS ?? existing.normalizedSessionTimeS;
+  const p90PrefillTpsPerUser = profile.p90PrefillTpsPerUser ?? existing.p90PrefillTpsPerUser;
+  const carried = { isl, osl, normalizedSessionTimeS, p90PrefillTpsPerUser };
+  return { ...profile, ...carried, kvCacheUtil, prefixCacheHitRate };
 }
 
 /** Metric subtrees we extract via stream-parse on oversized server blobs. */
@@ -93,6 +117,7 @@ export async function computeAggregateStats(args: {
   let oslPct: MetricPercentiles | null = null;
   let normalized: number | null = null;
   let prefillP90: number | null = null;
+  let normalizedE2e400: MetricPercentiles | null = null;
 
   if (args.profileBlob) {
     try {
@@ -103,6 +128,7 @@ export async function computeAggregateStats(args: {
       const derived = computeDerivedFromBlob(jsonl);
       normalized = derived.normalized_session_time_s;
       prefillP90 = derived.p90_prefill_tps_per_user;
+      normalizedE2e400 = derived.normalized_e2e_400;
     } catch {
       // ignore malformed blob — leave nulls
     }
@@ -143,5 +169,6 @@ export async function computeAggregateStats(args: {
     prefixCacheHitRate: prefixPct,
     normalizedSessionTimeS: normalized,
     p90PrefillTpsPerUser: prefillP90,
+    normalizedE2e400,
   };
 }
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
index 4c6f8791..7d292207 100644
--- a/packages/db/src/etl/compute-chart-series.test.ts
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -105,6 +105,20 @@ function buildEngineSeries(engineId: number, baseRunning: number) {
   };
 }
 
+function buildDynamoSeries(
+  endpoint_url: string,
+  dynamo_component: 'prefill' | 'backend',
+  worker_id: string,
+  value: number,
+  field: 'rate' | 'avg' = 'rate',
+) {
+  return {
+    endpoint_url,
+    labels: { dynamo_component, worker_id, dp_rank: '0', engine: '0' },
+    timeslices: [{ start_ns: 0, end_ns: 1e9, [field]: value }],
+  };
+}
+
 describe('computeChartSeries', () => {
   it('returns null when the blob is null', async () => {
     expect(await computeChartSeries(null)).toBeNull();
@@ -206,4 +220,79 @@ describe('computeChartSeries', () => {
       { t: 1, value: 300 },
     ]);
   });
+
+  it('uses the Dynamo adapter to preserve workers and canonical prefill/decode roles', async () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:prompt_tokens': {
+          series: [
+            buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 100),
+            buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 200),
+            buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 300),
+          ],
+        },
+        'vllm:generation_tokens': {
+          series: [
+            buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 1),
+            buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 2),
+            buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 400),
+          ],
+        },
+        'vllm:num_requests_running': {
+          series: [
+            buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 3, 'avg'),
+            buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 4, 'avg'),
+            buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 5, 'avg'),
+          ],
+        },
+      },
+    });
+
+    const blob = gzipSync(Buffer.from(json));
+    const result = await computeChartSeries(blob, {
+      framework: 'dynamo-vllm',
+      disagg: true,
+    });
+
+    expect(result?.metricSources).toHaveLength(3);
+    expect(result?.metricSources.map(({ source: s }) => [s.role, s.workerId, s.engine])).toEqual([
+      ['prefill', 'prefill-b', '0'],
+      ['prefill', 'prefill-a', '0'],
+      ['decode', 'decode-a', '0'],
+    ]);
+    const prefillA = result?.metricSources.find(({ source: s }) => s.workerId === 'prefill-a');
+    const decode = result?.metricSources.find(({ source: s }) => s.role === 'decode');
+    expect(prefillA?.promptTps).toEqual([{ t: 0, value: 100 }]);
+    expect(prefillA?.queueDepth).toEqual([{ t: 0, running: 3, waiting: 0, total: 3 }]);
+    expect(decode?.generationTps).toEqual([{ t: 0, value: 400 }]);
+
+    const nonDisagg = await computeChartSeries(blob, {
+      framework: 'dynamo-vllm',
+      disagg: false,
+    });
+    expect(nonDisagg?.metricSources).toEqual([]);
+  });
+
+  it('does not interpret Dynamo-native labels without selecting the Dynamo adapter', async () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:prompt_tokens': {
+          series: [
+            {
+              endpoint_url: '10.30.1.56:7500',
+              labels: { dynamo_component: 'prefill', worker_id: 'prefill-a', engine: '0' },
+              timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 100 }],
+            },
+          ],
+        },
+      },
+    });
+
+    const result = await computeChartSeries(gzipSync(Buffer.from(json)), {
+      framework: 'vllm',
+      disagg: true,
+    });
+
+    expect(result?.metricSources).toEqual([]);
+  });
 });
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 46600f7d..394a5826 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -17,6 +17,12 @@ import { parser } from 'stream-json';
 import { pick } from 'stream-json/filters/pick.js';
 import { streamObject } from 'stream-json/streamers/stream-object.js';
 
+import {
+  selectServerMetricsAdapter,
+  type MetricSource,
+  type ServerMetricsContext,
+} from './server-metrics-adapters';
+
 /**
  * Bump when the extraction algorithm changes — backfill recomputes anything
  * older.
@@ -49,8 +55,16 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average
  * line hides load skew on DEP configs; the detail page overlays the
  * per-rank lines so a hot rank is visible at a glance.
+ *
+ * v9: retain orchestrator-normalized per-source series. Dynamo labels are
+ * mapped to canonical router/prefill/decode roles, allowing the frontend to
+ * inspect individual workers without interpreting Dynamo-native labels.
+ *
+ * v10: only emit per-source series for disaggregated configs with a recognized
+ * orchestrator adapter. Non-disaggregated and unsupported configs retain the
+ * existing aggregate-only behavior.
  */
-export const CHART_SERIES_VERSION = 8;
+export const CHART_SERIES_VERSION = 10;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -103,6 +117,26 @@ export interface ChartSeries {
    * visible without changing the headline number.
    */
   kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * The same metrics grouped by normalized server source. Existing aggregate
+   * fields above remain the default and preserve compatibility with old rows.
+   */
+  metricSources: MetricSourceSeries[];
+}
+
+export interface MetricSourceSeries {
+  source: MetricSource;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  /** Raw prompt-token counter rate for this source. */
+  promptTps: TimeSeriesPoint[];
+  /** Raw generation-token counter rate for this source. */
+  generationTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -115,6 +149,7 @@ interface RawSlice {
 }
 
 interface RawSeries {
+  endpoint_url?: string;
   labels?: Record<string, string>;
   timeslices?: RawSlice[];
 }
@@ -204,7 +239,10 @@ async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
  * The math mirrors `getTraceServerMetrics` — this helper exists so ingest,
  * backfill, and the API path produce byte-identical results.
  */
-export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeries | null> {
+export async function computeChartSeries(
+  blob: Buffer | null,
+  context: ServerMetricsContext = {},
+): Promise<ChartSeries | null> {
   if (!blob) return null;
   let metrics: MetricsMap;
   try {
@@ -213,7 +251,7 @@ export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeri
     // Malformed blob → no series (caller treats null as "no data").
     return null;
   }
-  return buildSeriesFromMetrics(metrics);
+  return buildSeriesFromMetrics(metrics, context);
 }
 
 /**
@@ -249,7 +287,12 @@ function sortedEntries(m: Map<number, number>): [number, number][] {
   return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
 }
 
-function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
+function buildSeriesFromMetrics(
+  metrics: MetricsMap,
+  context: ServerMetricsContext,
+  includeMetricSources = true,
+  originStartNs?: number,
+): ChartSeries {
   // Timing reference: smallest start_ns and largest end_ns across every
   // timeslice we extracted. timeslicesCount is the length of any single
   // series (engines are scraped on the same cadence), so picking the max
@@ -269,7 +312,7 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }
   }
   if (!Number.isFinite(startNs)) startNs = 0;
-  const tOf = (ns: number) => (ns - startNs) / 1e9;
+  const tOf = (ns: number) => (ns - (originStartNs ?? startNs)) / 1e9;
 
   // Pick the first metric name whose series array has any data; fallback
   // chain lets the same code path serve both vllm:* and sglang:* blobs.
@@ -465,6 +508,57 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }
     if (arr.length > 0) promptTokensBySource[source] = arr;
   }
+
+  const metricSources: MetricSourceSeries[] = [];
+  const adapter = selectServerMetricsAdapter(context);
+  if (includeMetricSources && context.disagg && adapter.id !== 'generic') {
+    const grouped = new Map<string, { source: MetricSource; metrics: MetricsMap }>();
+    for (const [metricName, metric] of Object.entries(metrics)) {
+      for (const series of metric.series ?? []) {
+        const source = adapter.identifySource(series);
+        let group = grouped.get(source.id);
+        if (!group) {
+          group = { source, metrics: {} };
+          grouped.set(source.id, group);
+        }
+        const groupedMetric = (group.metrics[metricName] ??= { series: [] });
+        groupedMetric.series!.push(series);
+      }
+    }
+    for (const { source, metrics: sourceMetrics } of grouped.values()) {
+      const sourceSeries = buildSeriesFromMetrics(
+        sourceMetrics,
+        context,
+        false,
+        originStartNs ?? startNs,
+      );
+      metricSources.push({
+        source,
+        kvCacheUsage: sourceSeries.kvCacheUsage,
+        prefixCacheHitRate: sourceSeries.prefixCacheHitRate,
+        queueDepth: sourceSeries.queueDepth,
+        promptTokensBySource: sourceSeries.promptTokensBySource,
+        promptTps: sourceSeries.prefillTps,
+        generationTps: sourceSeries.decodeTps,
+        prefixCacheHitsTps: sourceSeries.prefixCacheHitsTps,
+        hostKvCacheUsage: sourceSeries.hostKvCacheUsage,
+        kvCacheUsageByEngine: sourceSeries.kvCacheUsageByEngine,
+      });
+    }
+    const roleOrder: Record<MetricSource['role'], number> = {
+      router: 0,
+      prefill: 1,
+      decode: 2,
+      combined: 3,
+      unknown: 4,
+    };
+    metricSources.sort(
+      (a, b) =>
+        roleOrder[a.source.role] - roleOrder[b.source.role] ||
+        (a.source.endpointUrl ?? '').localeCompare(b.source.endpointUrl ?? '') ||
+        a.source.id.localeCompare(b.source.id),
+    );
+  }
   return {
     version: CHART_SERIES_VERSION,
     startNs,
@@ -480,5 +574,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     prefixCacheHitsTps,
     hostKvCacheUsage,
     kvCacheUsageByEngine,
+    metricSources,
   };
 }
diff --git a/packages/db/src/etl/dataset-provenance.test.ts b/packages/db/src/etl/dataset-provenance.test.ts
new file mode 100644
index 00000000..4022546e
--- /dev/null
+++ b/packages/db/src/etl/dataset-provenance.test.ts
@@ -0,0 +1,40 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetSlugFromBenchmarkRow } from './dataset-provenance';
+
+describe('datasetSlugFromBenchmarkRow', () => {
+  it('maps aiperf public-dataset provenance to the dashboard dataset slug', () => {
+    expect(
+      datasetSlugFromBenchmarkRow({
+        dataset: {
+          source_type: 'public_dataset',
+          loader: 'semianalysis_cc_traces_weka_with_subagents',
+          hf_dataset_name: 'semianalysisai/cc-traces-weka-062126',
+          hf_split: 'train',
+          num_dataset_entries: 393,
+        },
+      }),
+    ).toBe('cc-traces-weka-062126');
+  });
+
+  it('supports an unnamespaced Hugging Face dataset id', () => {
+    expect(
+      datasetSlugFromBenchmarkRow({
+        dataset: {
+          source_type: 'public_dataset',
+          hf_dataset_name: 'cc-traces-weka-062126',
+        },
+      }),
+    ).toBe('cc-traces-weka-062126');
+  });
+
+  it.each([
+    {},
+    { dataset: null },
+    { dataset: { source_type: 'synthetic', hf_dataset_name: 'owner/data' } },
+    { dataset: { source_type: 'public_dataset', hf_dataset_name: '' } },
+    { dataset: { source_type: 'public_dataset' } },
+  ])('ignores rows without usable public-dataset provenance: %j', (row) => {
+    expect(datasetSlugFromBenchmarkRow(row)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/dataset-provenance.ts b/packages/db/src/etl/dataset-provenance.ts
new file mode 100644
index 00000000..7c30716c
--- /dev/null
+++ b/packages/db/src/etl/dataset-provenance.ts
@@ -0,0 +1,30 @@
+/** Dataset provenance emitted by aiperf and preserved in agentic benchmark rows. */
+export interface DatasetProvenance {
+  source_type?: unknown;
+  loader?: unknown;
+  hf_dataset_name?: unknown;
+  hf_split?: unknown;
+  hf_subset?: unknown;
+  num_dataset_entries?: unknown;
+}
+
+/**
+ * Resolve the dashboard dataset slug from a benchmark row's provenance.
+ *
+ * Dataset ingest uses the final path component of the Hugging Face dataset id
+ * as `datasets.slug`, so `semianalysisai/cc-traces-weka-062126` maps to
+ * `cc-traces-weka-062126` here as well.
+ */
+export function datasetSlugFromBenchmarkRow(row: Record<string, unknown>): string | null {
+  const candidate = row.dataset;
+  const isPlainObject =
+    typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate);
+  if (!isPlainObject) return null;
+  const { source_type: sourceType, hf_dataset_name: hfName } = candidate as DatasetProvenance;
+  if (sourceType !== 'public_dataset' || typeof hfName !== 'string') return null;
+
+  // Drop surrounding whitespace and trailing slashes, then keep the last path segment.
+  const trimmedId = hfName.trim().replace(/\/+$/u, '');
+  const lastSegment = trimmedId.slice(trimmedId.lastIndexOf('/') + 1);
+  return lastSegment === '' ? null : lastSegment;
+}
diff --git a/packages/db/src/etl/server-metrics-adapters.ts b/packages/db/src/etl/server-metrics-adapters.ts
new file mode 100644
index 00000000..f123d9f8
--- /dev/null
+++ b/packages/db/src/etl/server-metrics-adapters.ts
@@ -0,0 +1,100 @@
+/**
+ * Normalize orchestrator-specific server-metric labels into a stable source
+ * identity consumed by the API and frontend. AIPerf owns the export envelope;
+ * the serving orchestrator owns the meaning of labels inside each series.
+ */
+
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown';
+
+export interface RawMetricSourceSeries {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+}
+
+export interface ServerMetricsContext {
+  /** Canonical framework stored in configs, for example `dynamo-vllm`. */
+  framework?: string | null;
+  /** Per-worker role series are only meaningful for disaggregated configs. */
+  disagg?: boolean;
+}
+
+export interface MetricSource {
+  /** Stable key used to join this source across different metric names. */
+  id: string;
+  adapter: string;
+  role: MetricSourceRole;
+  endpointUrl: string | null;
+  nativeRole: string | null;
+  workerId: string | null;
+  dpRank: string | null;
+  engine: string | null;
+}
+
+interface ServerMetricsAdapter {
+  id: string;
+  matches: (context: ServerMetricsContext) => boolean;
+  identifySource: (series: RawMetricSourceSeries) => MetricSource;
+}
+
+function stableId(adapter: string, parts: (string | null | undefined)[]): string {
+  return parts.reduce<string>((key, part) => `${key}|${part ?? ''}`, adapter);
+}
+
+const dynamoAdapter: ServerMetricsAdapter = {
+  id: 'dynamo',
+  matches: ({ framework }) => framework?.startsWith('dynamo-') ?? false,
+  // Translate the `dynamo_component` label into a canonical role and a stable source key.
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const nativeRole = labels['dynamo_component'] ?? null;
+    let role: MetricSourceRole;
+    switch (nativeRole) {
+      case 'prefill':
+        role = 'prefill';
+        break;
+      case 'backend':
+        role = 'decode';
+        break;
+      case 'frontend':
+      case 'router':
+        role = 'router';
+        break;
+      default:
+        role = 'unknown';
+    }
+    const endpointUrl = series.endpoint_url ?? labels['dynamo_endpoint'] ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+    const id = stableId('dynamo', [role, endpointUrl, workerId, dpRank, engine]);
+    return { id, adapter: 'dynamo', role, endpointUrl, nativeRole, workerId, dpRank, engine };
+  },
+};
+
+const genericAdapter: ServerMetricsAdapter = {
+  id: 'generic',
+  matches: () => true,
+  // Fallback identity: no role label is interpreted, only endpoint, worker, DP rank and
+  // engine are read from the series.
+  identifySource(series) {
+    const labels = series.labels ?? {};
+    const endpointUrl = series.endpoint_url ?? null;
+    const workerId = labels['worker_id'] ?? null;
+    const dpRank = labels['dp_rank'] ?? null;
+    const engine = labels['engine'] ?? labels['engine_idx'] ?? null;
+    const identity = [endpointUrl, workerId, dpRank, engine];
+    // 'combined' is used only when none of the identifying fields is truthy; otherwise the
+    // role stays 'unknown' because this adapter does not read role labels.
+    const role: MetricSourceRole = identity.some(Boolean) ? 'unknown' : 'combined';
+    const id = stableId('generic', identity);
+    const nativeRole = null;
+
+    return { id, adapter: 'generic', role, endpointUrl, nativeRole, workerId, dpRank, engine };
+  },
+};
+
+const ADAPTERS: readonly ServerMetricsAdapter[] = [dynamoAdapter, genericAdapter];
+
+export function selectServerMetricsAdapter(context: ServerMetricsContext): ServerMetricsAdapter {
+  return ADAPTERS.find((adapter) => adapter.matches(context)) ?? genericAdapter;
+}
diff --git a/packages/db/src/etl/trace-artifact-discovery.test.ts b/packages/db/src/etl/trace-artifact-discovery.test.ts
new file mode 100644
index 00000000..2bb1d51b
--- /dev/null
+++ b/packages/db/src/etl/trace-artifact-discovery.test.ts
@@ -0,0 +1,66 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { afterEach, describe, expect, it } from 'vitest';
+
+import { discoverTraceReplayArtifacts } from './trace-artifact-discovery';
+
+const tempDirs: string[] = [];
+
+function tempDir(): string {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'trace-artifacts-test-'));
+  tempDirs.push(dir);
+  return dir;
+}
+
+function writeTraceFiles(dir: string): void {
+  fs.mkdirSync(path.join(dir, 'aiperf_artifacts'), { recursive: true });
+  fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'profile_export.jsonl'), '{}\n');
+  fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.csv'), 'x,y\n');
+  fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.json'), '{}');
+}
+
+afterEach(() => {
+  for (const dir of tempDirs.splice(0)) fs.rmSync(dir, { recursive: true, force: true });
+});
+
+describe('discoverTraceReplayArtifacts', () => {
+  it('discovers the existing single-node sibling layout', () => {
+    const root = tempDir();
+    writeTraceFiles(path.join(root, 'agentic_config-a'));
+
+    const found = discoverTraceReplayArtifacts(root);
+
+    expect(found.get('config-a')).toMatchObject({
+      profileJsonl: expect.stringContaining('profile_export.jsonl'),
+      serverMetricsCsv: expect.stringContaining('server_metrics_export.csv'),
+      serverMetricsJson: expect.stringContaining('server_metrics_export.json'),
+    });
+  });
+
+  it('extracts and indexes multinode traces by concurrency', () => {
+    const root = tempDir();
+    const artifactDir = path.join(root, 'multinode_server_logs_config-b');
+    const archiveSource = path.join(root, 'archive-source');
+    writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_96'));
+    writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_128'));
+    fs.mkdirSync(artifactDir, { recursive: true });
+    execFileSync('tar', [
+      '-czf',
+      path.join(artifactDir, 'multinode_server_logs.tar.gz'),
+      '-C',
+      archiveSource,
+      '.',
+    ]);
+    fs.rmSync(archiveSource, { recursive: true, force: true });
+
+    const found = discoverTraceReplayArtifacts(root);
+
+    expect([...found.keys()].toSorted()).toEqual(['config-b|128', 'config-b|96']);
+    expect(found.get('config-b|96')?.profileJsonl).toContain(
+      'multinode_server_logs/agentic/conc_96/aiperf_artifacts/profile_export.jsonl',
+    );
+  });
+});
diff --git a/packages/db/src/etl/trace-artifact-discovery.ts b/packages/db/src/etl/trace-artifact-discovery.ts
new file mode 100644
index 00000000..cea0269e
--- /dev/null
+++ b/packages/db/src/etl/trace-artifact-discovery.ts
@@ -0,0 +1,138 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+
+/** Paths to the trace-replay export files; `null` when a file was not found. */
+export interface TraceReplayArtifactPaths {
+  profileJsonl: string | null;
+  serverMetricsCsv: string | null;
+  serverMetricsJson: string | null;
+}
+
+/** Subdirectories probed for trace files, in priority order: the first hit for each file wins. */
+const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+
+/**
+ * Report whether `target` is a directory, following symlinks.
+ *
+ * Any stat failure (missing entry, dangling symlink, permission error) is
+ * reported as `false` so one unreadable entry cannot abort discovery.
+ */
+function isDirectory(target: string): boolean {
+  try {
+    return fs.statSync(target).isDirectory();
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Collect the trace exports under `dir`, probing each of `TRACE_SUBDIRS`.
+ *
+ * @param dir Directory expected to contain one of the trace subdirectories.
+ * @returns The resolved paths, or `null` when none of the three files exists.
+ */
+function traceFilesIn(dir: string): TraceReplayArtifactPaths | null {
+  let profileJsonl: string | null = null;
+  let serverMetricsCsv: string | null = null;
+  let serverMetricsJson: string | null = null;
+
+  for (const subdir of TRACE_SUBDIRS) {
+    const traceDir = path.join(dir, subdir);
+    if (!isDirectory(traceDir)) continue;
+
+    const profilePath = path.join(traceDir, 'profile_export.jsonl');
+    const csvPath = path.join(traceDir, 'server_metrics_export.csv');
+    const jsonPath = path.join(traceDir, 'server_metrics_export.json');
+    if (!profileJsonl && fs.existsSync(profilePath)) profileJsonl = profilePath;
+    if (!serverMetricsCsv && fs.existsSync(csvPath)) serverMetricsCsv = csvPath;
+    if (!serverMetricsJson && fs.existsSync(jsonPath)) serverMetricsJson = jsonPath;
+  }
+
+  if (!profileJsonl && !serverMetricsCsv && !serverMetricsJson) return null;
+  return { profileJsonl, serverMetricsCsv, serverMetricsJson };
+}
+
+/**
+ * Ensure `multinode_server_logs.tar.gz` inside `artifactDir` is unpacked.
+ *
+ * An existing `multinode_server_logs/` directory is reused. Otherwise the
+ * archive is unpacked into a scratch directory and renamed into place only
+ * after `tar` succeeds, so an interrupted or corrupt extraction never leaves
+ * a partial tree that a later run would mistake for a complete one.
+ *
+ * @param artifactDir A `multinode_server_logs_<suffix>` artifact directory.
+ * @returns The extracted directory, or `null` when there is no archive or it
+ *          could not be unpacked (a warning is logged in the latter case).
+ */
+function extractMultinodeArchive(artifactDir: string): string | null {
+  const archivePath = path.join(artifactDir, 'multinode_server_logs.tar.gz');
+  const extractedDir = path.join(artifactDir, 'multinode_server_logs');
+
+  if (isDirectory(extractedDir)) return extractedDir;
+  if (!fs.existsSync(archivePath)) return null;
+
+  let stagingDir: string | null = null;
+  try {
+    // Stage next to the destination so the final rename stays on one filesystem.
+    stagingDir = fs.mkdtempSync(path.join(artifactDir, '.multinode_server_logs-'));
+    execFileSync('tar', ['-xzf', archivePath, '-C', stagingDir], { stdio: 'ignore' });
+    fs.renameSync(stagingDir, extractedDir);
+    return extractedDir;
+  } catch (error) {
+    if (stagingDir) fs.rmSync(stagingDir, { recursive: true, force: true });
+    // Best effort: a bad archive skips this artifact rather than aborting discovery of the others.
+    const reason = error instanceof Error ? error.message : String(error);
+    console.warn(`  Skipping multinode trace archive ${archivePath}: ${reason}`);
+    return null;
+  }
+}
+
+/**
+ * Discover trace-replay siblings in both artifact layouts:
+ *
+ * - Single-node: `agentic_<suffix>/aiperf_artifacts/*`
+ * - Multinode: `multinode_server_logs_<suffix>/multinode_server_logs.tar.gz`,
+ *   containing `agentic/conc_<N>/aiperf_artifacts/*`
+ *
+ * Multinode keys include concurrency (`<suffix>|<N>`) because one artifact
+ * contains several points, each with a distinct trace payload.
+ *
+ * @param artifactsDir Root directory whose entries are scanned for artifacts.
+ * @returns Map from lookup key to trace file paths; empty when `artifactsDir`
+ *          is not a directory or holds no trace files.
+ */
+export function discoverTraceReplayArtifacts(
+  artifactsDir: string,
+): Map<string, TraceReplayArtifactPaths> {
+  const discovered = new Map<string, TraceReplayArtifactPaths>();
+  if (!isDirectory(artifactsDir)) return discovered;
+
+  for (const entry of fs.readdirSync(artifactsDir)) {
+    const artifactDir = path.join(artifactsDir, entry);
+    if (!isDirectory(artifactDir)) continue;
+
+    if (entry.startsWith('agentic_')) {
+      const trace = traceFilesIn(artifactDir);
+      if (trace) discovered.set(entry.replace(/^agentic_/u, ''), trace);
+      continue;
+    }
+
+    if (!entry.startsWith('multinode_server_logs_')) continue;
+    const extractedDir = extractMultinodeArchive(artifactDir);
+    if (!extractedDir) continue;
+
+    const agenticDir = path.join(extractedDir, 'agentic');
+    if (!isDirectory(agenticDir)) continue;
+
+    const suffix = entry.replace(/^multinode_server_logs_/u, '');
+    for (const concEntry of fs.readdirSync(agenticDir)) {
+      const match = concEntry.match(/^conc_(?<conc>\d+)$/u);
+      if (!match?.groups?.conc) continue;
+      const trace = traceFilesIn(path.join(agenticDir, concEntry));
+      if (trace) discovered.set(`${suffix}|${match.groups.conc}`, trace);
+    }
+  }
+
+  return discovered;
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index cb022ca9..b50168db 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -15,6 +15,7 @@ import type postgres from 'postgres';
 import { computeAggregateStats } from './compute-aggregate-stats.js';
 import { computeChartSeries } from './compute-chart-series.js';
 import { computeRequestTimeline } from './compute-request-timeline.js';
+import type { ServerMetricsContext } from './server-metrics-adapters';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -33,6 +34,8 @@ type Sql = ReturnType<typeof postgres>;
  * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` —
  *                            per-scrape time-series of every Prometheus metric.
  *                            Optional, gzipped before storage (~42x ratio).
+ * @param metricsContext      Canonical framework used to select the
+ *                            orchestrator-specific metric-label adapter.
  */
 export async function insertTraceReplay(
   sql: Sql,
@@ -40,6 +43,7 @@ export async function insertTraceReplay(
   profileExportJsonl: Buffer | null,
   serverMetricsCsv: Buffer | null,
   serverMetricsJson: Buffer | null = null,
+  metricsContext: ServerMetricsContext = {},
 ): Promise<void> {
   if (benchmarkResultIds.length === 0) return;
   if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
@@ -65,7 +69,7 @@ export async function insertTraceReplay(
   // a streaming parser for oversized server_metrics blobs.
   const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
     computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
-    computeChartSeries(metricsJsonGz),
+    computeChartSeries(metricsJsonGz, metricsContext),
     Promise.resolve(computeRequestTimeline(profileGz)),
   ]);
 
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index 4debf1ae..97e8759d 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -2,9 +2,11 @@ import { describe, it, expect } from 'vitest';
 import {
   countSeenPrefixBlocks,
   buildConversationStructure,
+  countConversationRequests,
   linearHistogram,
   logHistogram,
   logHistogramWithZero,
+  subagentRequestTurns,
   summarizeValues,
   type RawWekaConversation,
   type SubagentNode,
@@ -88,7 +90,7 @@ describe('buildConversationStructure', () => {
       id: 'c4',
       block_size: 64,
       requests: [
-        { type: 'n', model: 'main', t: 0, in: 64, out: 10, hash_ids: [1] },
+        { type: 'n', model: 'main', t: 0, api_time: 1, in: 64, out: 10, hash_ids: [1] },
         {
           type: 'subagent',
           agent_id: 'a1',
@@ -119,7 +121,12 @@ describe('buildConversationStructure', () => {
     expect(sub.startS).toBe(12.5);
     expect(sub.endS).toBeCloseTo(13.734, 6);
     expect(sub.children).toHaveLength(2);
-    expect(sub.children.map((child) => child.startS)).toEqual([12.5, 13.1]);
+    expect(countConversationRequests(s)).toBe(4);
+    expect(subagentRequestTurns(s).map((turn) => turn.model)).toEqual(['sub', 'sub']);
+    expect(sub.children.map((child) => [child.startS, child.endS])).toEqual([
+      [12.5, 12.5],
+      [13.1, 13.1],
+    ]);
     expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
     expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
     expect(sub.in).toBe(256);
@@ -127,6 +134,26 @@ describe('buildConversationStructure', () => {
 
     const afterSub = s.nodes[2] as TurnNode;
     expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back
+    expect((s.nodes[0] as TurnNode).endS).toBe(1);
+  });
+
+  it('counts top-level and subagent child turns as requests, but not subagent groups', () => {
+    const structure = buildConversationStructure({
+      id: 'request-count',
+      requests: [
+        { type: 'n', in: 1, out: 1 },
+        {
+          type: 'subagent',
+          requests: [
+            { type: 'n', in: 1, out: 1 },
+            { type: 'n', in: 1, out: 1 },
+          ],
+        },
+      ],
+    });
+
+    expect(countConversationRequests(structure)).toBe(3);
+    expect(subagentRequestTurns(structure)).toHaveLength(2);
   });
 
   it('falls back to the default block size and a generic subagent label', () => {
@@ -156,6 +183,21 @@ describe('buildConversationStructure', () => {
     expect(sub.startS).toBe(5);
     expect(sub.endS).toBe(12);
   });
+
+  it('normalizes legacy subagent-relative request intervals', () => {
+    const structure = buildConversationStructure({
+      id: 'legacy-relative',
+      requests: [
+        {
+          type: 'subagent',
+          t: 100,
+          requests: [{ type: 'n', t: 2, api_time: 3, in: 10, out: 1 }],
+        },
+      ],
+    });
+    const child = (structure.nodes[0] as SubagentNode).children[0]!;
+    expect(child).toMatchObject({ startS: 102, endS: 105 });
+  });
 });
 
 describe('histograms', () => {
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index 26cc8da1..f6cea1c1 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -50,7 +50,7 @@ export interface TurnNode {
   turnIndex: number;
   /** Seconds from the start of the conversation. */
   startS?: number;
-  /** Seconds from the start of the conversation (startS + api_time). */
+  /** End of the original request interval (`startS + api_time`). */
   endS?: number;
   model?: string;
   in: number;
@@ -92,6 +92,16 @@ export interface ConversationStructure {
   };
 }
 
+/** Actual model requests in a conversation: main turns plus subagent child turns. */
+export function countConversationRequests(structure: ConversationStructure): number {
+  return structure.totals.numTurns + subagentRequestTurns(structure).length;
+}
+
+/** Model requests issued by inner subagents, excluding all parent-agent turns. */
+export function subagentRequestTurns(structure: ConversationStructure): TurnNode[] {
+  return structure.nodes.flatMap((node) => (node.kind === 'subagent' ? node.children : []));
+}
+
 const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent =>
   (e as RawWekaSubagent).type === 'subagent';
 
@@ -142,17 +152,30 @@ function finiteTime(value: number | undefined): number | undefined {
   return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined;
 }
 
-/** End of a turn = its start plus the request's api_time (seconds). */
-function turnEndS(req: RawWekaRequest): number | undefined {
-  const startS = finiteTime(req.t);
+function requestEndS(startS: number | undefined, apiTime: number | undefined): number | undefined {
   if (startS === undefined) return undefined;
-  return startS + (finiteTime(req.api_time) ?? 0);
+  const duration = finiteTime(apiTime) ?? 0;
+  return startS + duration;
+}
+
+/** Mirror aiperf's legacy-relative/current-absolute subagent timestamp handling. */
+function subagentRequestStartS(
+  entry: RawWekaSubagent,
+  request: RawWekaRequest,
+): number | undefined {
+  const requestStart = finiteTime(request.t);
+  if (requestStart === undefined) return undefined;
+  const groupStart = finiteTime(entry.t);
+  if (groupStart !== undefined && requestStart + 1e-6 < groupStart) {
+    return groupStart + requestStart;
+  }
+  return requestStart;
 }
 
 function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } {
   const children = entry.requests ?? [];
   const childStarts = children
-    .map((child) => finiteTime(child.t))
+    .map((child) => subagentRequestStartS(entry, child))
     .filter((value): value is number => value !== undefined);
   const startS =
     finiteTime(entry.t) ?? (childStarts.length > 0 ? Math.min(...childStarts) : undefined);
@@ -162,7 +185,11 @@ function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: nu
   }
 
   const childEnds = children
-    .map((child) => turnEndS(child))
+    .map((child) => {
+      const childStart = subagentRequestStartS(entry, child);
+      if (childStart === undefined) return undefined;
+      return childStart + (finiteTime(child.api_time) ?? 0);
+    })
     .filter((value): value is number => value !== undefined);
   return {
     startS,
@@ -203,11 +230,12 @@ export function buildConversationStructure(
       for (const inner of entry.requests ?? []) {
         const split = splitInput(inner, childSeen, blockSize);
         const out = Math.max(0, Math.round(inner.out ?? 0));
+        const childStartS = subagentRequestStartS(entry, inner);
         children.push({
           kind: 'turn',
           turnIndex: turnIndex++,
-          startS: finiteTime(inner.t),
-          endS: turnEndS(inner),
+          startS: childStartS,
+          endS: requestEndS(childStartS, inner.api_time),
           model: inner.model,
           in: split.in,
           out,
@@ -240,11 +268,12 @@ export function buildConversationStructure(
     } else {
       const split = splitInput(entry, seen, blockSize);
       const out = Math.max(0, Math.round(entry.out ?? 0));
+      const startS = finiteTime(entry.t);
       nodes.push({
         kind: 'turn',
         turnIndex: turnIndex++,
-        startS: finiteTime(entry.t),
-        endS: turnEndS(entry),
+        startS,
+        endS: requestEndS(startS, entry.api_time),
         model: entry.model,
         in: split.in,
         out,
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 127522c8..2a5f15f0 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -46,6 +46,8 @@ import {
   insertServerLog,
 } from './etl/benchmark-ingest';
 import { insertTraceReplay } from './etl/trace-replay-ingest';
+import { discoverTraceReplayArtifacts } from './etl/trace-artifact-discovery';
+import { datasetSlugFromBenchmarkRow } from './etl/dataset-provenance';
 import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
 import { ingestEvalRow } from './etl/eval-ingest';
 import { mapEvalSamples } from './etl/eval-samples-mapper';
@@ -337,6 +339,7 @@ async function main(): Promise<void> {
   let totalSampleFiles = 0;
   let totalChangelogs = 0;
   let totalTraceReplayLinked = 0;
+  const datasetSlugs = new Set<string>();
 
   // ── Check for evals-only flag in changelog ────────────────────────────
   const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -397,46 +400,7 @@ async function main(): Promise<void> {
     // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current).
     // Older non-aiperf agentic runs don't ship this sibling. Key on the bare
     // suffix so both names map to the same Map entry.
-    const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
-    const traceReplayPaths = new Map<
-      string,
-      {
-        profileJsonl: string | null;
-        serverMetricsCsv: string | null;
-        serverMetricsJson: string | null;
-      }
-    >();
-    if (fs.existsSync(artifactsDir)) {
-      for (const d of fs.readdirSync(artifactsDir)) {
-        if (!d.startsWith('agentic_')) continue;
-        let profile: string | null = null;
-        let metrics: string | null = null;
-        let metricsJson: string | null = null;
-        for (const sub of TRACE_SUBDIRS) {
-          const dir = path.join(artifactsDir, d, sub);
-          if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue;
-          if (!profile) {
-            const p = path.join(dir, 'profile_export.jsonl');
-            if (fs.existsSync(p)) profile = p;
-          }
-          if (!metrics) {
-            const m = path.join(dir, 'server_metrics_export.csv');
-            if (fs.existsSync(m)) metrics = m;
-          }
-          if (!metricsJson) {
-            const j = path.join(dir, 'server_metrics_export.json');
-            if (fs.existsSync(j)) metricsJson = j;
-          }
-        }
-        if (!profile && !metrics && !metricsJson) continue;
-        const suffix = stripBmkAndAgenticPrefix(d);
-        traceReplayPaths.set(suffix, {
-          profileJsonl: profile,
-          serverMetricsCsv: metrics,
-          serverMetricsJson: metricsJson,
-        });
-      }
-    }
+    const traceReplayPaths = discoverTraceReplayArtifacts(artifactsDir);
     if (traceReplayPaths.size > 0) {
       console.log(`  Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
     }
@@ -452,6 +416,12 @@ async function main(): Promise<void> {
         ? data
         : [data as Record<string, any>];
 
+      for (const rawRow of rawRows) {
+        if (!rawRow || typeof rawRow !== 'object') continue;
+        const datasetSlug = datasetSlugFromBenchmarkRow(rawRow);
+        if (datasetSlug) datasetSlugs.add(datasetSlug);
+      }
+
       const rows = rawRows
         .filter((r) => typeof r === 'object' && r !== null)
         .map((r) => mapBenchmarkRow(r, tracker))
@@ -514,7 +484,11 @@ async function main(): Promise<void> {
           // `bmk_agentic_<suffix>` artifact we just ingested.
           if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
             const suffix = stripBmkAndAgenticPrefix(parentDir);
-            const trace = traceReplayPaths.get(suffix);
+            const concMatch = path.basename(file).match(/_conc(?<conc>\d+)\.json$/u);
+            const trace =
+              (concMatch?.groups?.conc
+                ? traceReplayPaths.get(`${suffix}|${concMatch.groups.conc}`)
+                : undefined) ?? traceReplayPaths.get(suffix);
             if (trace) {
               try {
                 const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
@@ -524,7 +498,10 @@ async function main(): Promise<void> {
                 const metricsJson = trace.serverMetricsJson
                   ? fs.readFileSync(trace.serverMetricsJson)
                   : null;
-                await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson);
+                await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, {
+                  framework: toInsert[0]?.config.framework,
+                  disagg: toInsert[0]?.config.disagg,
+                });
                 totalTraceReplayLinked += insertedIds.length;
               } catch (error: any) {
                 tracker.recordDbError(`trace_replay for ${suffix}`, error);
@@ -553,6 +530,22 @@ async function main(): Promise<void> {
         tracker.recordDbError('availability', error);
       }
     }
+
+    if (datasetSlugs.size > 1) {
+      throw new Error(
+        `Conflicting dataset provenance in workflow run ${runId}: ${[...datasetSlugs].toSorted().join(', ')}`,
+      );
+    }
+    const [datasetSlug] = datasetSlugs;
+    if (datasetSlug) {
+      await sql`
+        insert into run_datasets (workflow_run_id, dataset_slug)
+        values (${workflowRunId}, ${datasetSlug})
+        on conflict (workflow_run_id) do update
+        set dataset_slug = excluded.dataset_slug
+      `;
+      console.log(`  Dataset: linked workflow run to ${datasetSlug}`);
+    }
   }
 
   // ── Ingest run stats ──────────────────────────────────────────────────
diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
index e00471d7..ed6774c0 100644
--- a/packages/db/src/ingest-weka-dataset.ts
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -22,9 +22,11 @@ import { createAdminSql } from './etl/db-utils';
 import { hasNoSslFlag } from './cli-utils';
 import {
   buildConversationStructure,
+  countConversationRequests,
   linearHistogram,
   logHistogram,
   logHistogramWithZero,
+  subagentRequestTurns,
   summarizeValues,
   type ConversationStructure,
   type RawWekaConversation,
@@ -146,6 +148,9 @@ interface Accumulator {
   outputPerTurn: number[];
   cachedFractionPerTurn: number[]; // cached/in, for turns with in>0
   turnsPerConv: number[]; // main (top-level) turns
+  requestsPerConv: number[]; // main turns + subagent child turns
+  subagentInputPerRequest: number[];
+  subagentOutputPerRequest: number[];
   subagentGroupsPerConv: number[];
   subagentTurnsPerGroup: number[];
   totalIn: number;
@@ -164,6 +169,9 @@ function newAccumulator(): Accumulator {
     outputPerTurn: [],
     cachedFractionPerTurn: [],
     turnsPerConv: [],
+    requestsPerConv: [],
+    subagentInputPerRequest: [],
+    subagentOutputPerRequest: [],
     subagentGroupsPerConv: [],
     subagentTurnsPerGroup: [],
     totalIn: 0,
@@ -191,6 +199,11 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void {
   acc.mainTurns += s.totals.numTurns;
   acc.subagentGroups += s.totals.numSubagentGroups;
   acc.turnsPerConv.push(s.totals.numTurns);
+  acc.requestsPerConv.push(countConversationRequests(s));
+  for (const turn of subagentRequestTurns(s)) {
+    acc.subagentInputPerRequest.push(turn.in);
+    acc.subagentOutputPerRequest.push(turn.out);
+  }
   acc.subagentGroupsPerConv.push(s.totals.numSubagentGroups);
   for (const node of s.nodes) {
     if (node.kind === 'turn') {
@@ -205,7 +218,7 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void {
 
 function buildChartData(acc: Accumulator) {
   return {
-    version: 2,
+    version: 3,
     inputTokensPerTurn: {
       bins: logHistogram(acc.inputPerTurn),
       stats: summarizeValues(acc.inputPerTurn),
@@ -218,6 +231,14 @@ function buildChartData(acc: Accumulator) {
       bins: logHistogram(acc.outputPerTurn),
       stats: summarizeValues(acc.outputPerTurn),
     },
+    subagentInputTokensPerRequest: {
+      bins: logHistogram(acc.subagentInputPerRequest),
+      stats: summarizeValues(acc.subagentInputPerRequest),
+    },
+    subagentOutputTokensPerRequest: {
+      bins: logHistogram(acc.subagentOutputPerRequest),
+      stats: summarizeValues(acc.subagentOutputPerRequest),
+    },
     turnsPerConversation: {
       bins: linearHistogram(acc.turnsPerConv),
       stats: summarizeValues(acc.turnsPerConv),
@@ -235,8 +256,10 @@ function buildChartData(acc: Accumulator) {
 
 function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | null) {
   const cachedPct = acc.totalIn > 0 ? acc.totalCached / acc.totalIn : 0;
+  const requestsPerConversation = summarizeValues(acc.requestsPerConv);
+  const subagentsPerTrace = summarizeValues(acc.subagentGroupsPerConv);
   return {
-    version: 1,
+    version: 3,
     blockSize,
     hashIdScope,
     totalIn: acc.totalIn,
@@ -246,6 +269,10 @@ function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string |
     mainTurns: acc.mainTurns,
     subagentGroups: acc.subagentGroups,
     subagentTurns: acc.subagentTurns,
+    meanRequestsPerConversation: requestsPerConversation.mean,
+    medianRequestsPerConversation: requestsPerConversation.median,
+    meanSubagentsPerTrace: subagentsPerTrace.mean,
+    medianSubagentsPerTrace: subagentsPerTrace.median,
     modelMix: acc.modelCounts,
   };
 }
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index da5d18a0..4493b7dc 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -36,8 +36,10 @@ import type { DbClient } from '../connection.js';
  * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
  * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
  * they do for vllm runs.
+ *
+ * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL.
  */
-export const STATS_VERSION = 3;
+export const STATS_VERSION = 4;
 
 export interface MetricPercentiles {
   mean: number;
diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts
index 89c6ca5e..cfefe391 100644
--- a/packages/db/src/queries/datasets.ts
+++ b/packages/db/src/queries/datasets.ts
@@ -20,6 +20,10 @@ export interface DatasetSummary {
   mainTurns?: number;
   subagentGroups?: number;
   subagentTurns?: number;
+  meanRequestsPerConversation?: number;
+  medianRequestsPerConversation?: number;
+  meanSubagentsPerTrace?: number;
+  medianSubagentsPerTrace?: number;
   modelMix?: Record<string, number>;
   [k: string]: unknown;
 }
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
index 321434be..afc5b22d 100644
--- a/packages/db/src/queries/derived-agentic-metrics.test.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -24,6 +24,21 @@ describe('computeDerivedFromBlob', () => {
     const out = computeDerivedFromBlob('');
     expect(out.normalized_session_time_s).toBeNull();
     expect(out.p90_prefill_tps_per_user).toBeNull();
+    expect(out.normalized_e2e_400).toBeNull();
+  });
+
+  it('normalizes each request to 400 output tokens before taking percentiles', () => {
+    const jsonl = [
+      // Both requests have TTFT=2s and ITL=20ms, despite very different OSL/E2E.
+      rec('s1', 0, { isl: 100, osl: 100, ttft_ms: 2000, latency_ms: 3980 }),
+      rec('s2', 0, { isl: 100, osl: 1000, ttft_ms: 2000, latency_ms: 21_980 }),
+    ].join('\n');
+
+    const out = computeDerivedFromBlob(jsonl);
+    // 2s TTFT + 399 × 20ms ITL = 9.98s for both requests.
+    expect(out.normalized_e2e_400?.n).toBe(2);
+    expect(out.normalized_e2e_400?.p75).toBeCloseTo(9.98, 8);
+    expect(out.normalized_e2e_400?.p90).toBeCloseTo(9.98, 8);
   });
 
   it('rescales single-session time and computes P90 prefill', () => {
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index 35a4b76c..fda44280 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -20,8 +20,10 @@
 
 import { gunzipSync } from 'node:zlib';
 
+import { NORMALIZED_E2E_OUTPUT_TOKENS } from '@semianalysisai/inferencex-constants';
+
 import type { DbClient } from '../connection.js';
-import { STATS_VERSION } from './agentic-aggregates';
+import { percentilesOf, STATS_VERSION, type MetricPercentiles } from './agentic-aggregates';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */
@@ -30,6 +32,10 @@ export interface DerivedAgenticMetric {
   normalized_session_time_s: number | null;
   /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */
   p90_prefill_tps_per_user: number | null;
+  /** P75 normalized per-request E2E at a fixed 400-token output length. */
+  p75_normalized_e2e_400_s: number | null;
+  /** P90 normalized per-request E2E at a fixed 400-token output length. */
+  p90_normalized_e2e_400_s: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
@@ -111,6 +117,7 @@ function meanOf(xs: number[]): number {
 export function computeDerivedFromBlob(jsonl: string): {
   normalized_session_time_s: number | null;
   p90_prefill_tps_per_user: number | null;
+  normalized_e2e_400: MetricPercentiles | null;
 } {
   // Group records by conversation_id, filter to the profiling phase.
   const bySession = new Map<string, TurnFields[]>();
@@ -135,7 +142,11 @@ export function computeDerivedFromBlob(jsonl: string): {
     list.push(turn);
   }
   if (bySession.size === 0) {
-    return { normalized_session_time_s: null, p90_prefill_tps_per_user: null };
+    return {
+      normalized_session_time_s: null,
+      p90_prefill_tps_per_user: null,
+      normalized_e2e_400: null,
+    };
   }
 
   // Per-session aggregates for session time; per-turn prefill rates pool into
@@ -143,6 +154,7 @@ export function computeDerivedFromBlob(jsonl: string): {
   const sessionTimesS: number[] = [];
   const sessionLoads: number[] = [];
   const allPrefillRates: number[] = [];
+  const allNormalizedE2eS: number[] = [];
   for (const turns of bySession.values()) {
     let timeMs = 0;
     let load = 0;
@@ -151,6 +163,21 @@ export function computeDerivedFromBlob(jsonl: string): {
       load += t.isl + t.osl;
       const ttftSec = t.ttft_ms / 1000;
       if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec);
+
+      // Keep the observed TTFT, then project the request's mean decode
+      // interval to a fixed output length. Do this per request before taking
+      // percentiles so long original outputs do not dominate the tail.
+      const observedDecodeIntervals = Math.max(t.osl - 1, 1);
+      const itlMs = (t.request_latency_ms - t.ttft_ms) / observedDecodeIntervals;
+      const normalizedMs = t.ttft_ms + (NORMALIZED_E2E_OUTPUT_TOKENS - 1) * itlMs;
+      if (
+        Number.isFinite(itlMs) &&
+        itlMs >= 0 &&
+        Number.isFinite(normalizedMs) &&
+        normalizedMs > 0
+      ) {
+        allNormalizedE2eS.push(normalizedMs / 1000);
+      }
     }
     if (load > 0) {
       sessionTimesS.push(timeMs / 1000);
@@ -182,6 +209,7 @@ export function computeDerivedFromBlob(jsonl: string): {
   return {
     normalized_session_time_s: normalized,
     p90_prefill_tps_per_user: prefill,
+    normalized_e2e_400: percentilesOf(allNormalizedE2eS),
   };
 }
 
@@ -210,6 +238,7 @@ export async function getDerivedAgenticMetrics(
       version?: number;
       normalizedSessionTimeS?: number | null;
       p90PrefillTpsPerUser?: number | null;
+      normalizedE2e400?: MetricPercentiles | null;
     } | null;
   }[];
 
@@ -221,6 +250,8 @@ export async function getDerivedAgenticMetrics(
         id,
         normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null,
         p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null,
+        p75_normalized_e2e_400_s: row.stats.normalizedE2e400?.p75 ?? null,
+        p90_normalized_e2e_400_s: row.stats.normalizedE2e400?.p90 ?? null,
       };
     } else {
       idsNeedingBlob.push(id);
@@ -250,11 +281,14 @@ export async function getDerivedAgenticMetrics(
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');
-      const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl);
+      const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } =
+        computeDerivedFromBlob(jsonl);
       result[Number(row.benchmark_result_id)] = {
         id: Number(row.benchmark_result_id),
         normalized_session_time_s,
         p90_prefill_tps_per_user,
+        p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null,
+        p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null,
       };
     } catch {
       // Skip malformed blobs silently — frontend treats missing ids as "no data".
diff --git a/packages/db/src/queries/request-timeline.test.ts b/packages/db/src/queries/request-timeline.test.ts
new file mode 100644
index 00000000..62ba5385
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.test.ts
@@ -0,0 +1,45 @@
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
+import type { DbClient } from '../connection.js';
+
+import { getRequestTimeline } from './request-timeline';
+
+function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } {
+  const responses = [...queue];
+  const calls: string[] = [];
+  const sql = ((strings: TemplateStringsArray) => {
+    calls.push(strings.join('?'));
+    return Promise.resolve(responses.shift() ?? []);
+  }) as unknown as DbClient;
+  return { sql, calls };
+}
+
+const timeline: RequestTimeline = {
+  version: REQUEST_TIMELINE_VERSION,
+  startNs: 100,
+  endNs: 200,
+  durationS: 0.0000001,
+  requests: [],
+};
+
+describe('getRequestTimeline', () => {
+  it('returns the current precomputed timeline without selecting the raw profile blob', async () => {
+    const { sql, calls } = mockSql([
+      [{ trace_replay_id: 870, has_blob: true, request_timeline: timeline }],
+    ]);
+
+    await expect(getRequestTimeline(sql, 422991)).resolves.toEqual(timeline);
+    expect(calls).toHaveLength(1);
+    expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob');
+  });
+
+  it('does not fetch a blob when neither a current timeline nor a blob exists', async () => {
+    const { sql, calls } = mockSql([
+      [{ trace_replay_id: 870, has_blob: false, request_timeline: null }],
+    ]);
+
+    await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull();
+    expect(calls).toHaveLength(1);
+  });
+});
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
index 2bd3e251..2a6bb40c 100644
--- a/packages/db/src/queries/request-timeline.ts
+++ b/packages/db/src/queries/request-timeline.ts
@@ -18,23 +18,29 @@ import type { DbClient } from '../connection.js';
 
 export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
 
-interface RawRow {
-  blob: Buffer | null;
+interface RawMetaRow {
+  trace_replay_id: number;
+  has_blob: boolean;
   request_timeline: RequestTimeline | null;
 }
 
+interface RawBlobRow {
+  blob: Buffer | null;
+}
+
 export async function getRequestTimeline(
   sql: DbClient,
   benchmarkResultId: number,
 ): Promise<RequestTimeline | null> {
   const rows = (await sql`
     select
-      atr.profile_export_jsonl_gz as blob,
+      atr.id as trace_replay_id,
+      (atr.profile_export_jsonl_gz is not null) as has_blob,
       atr.request_timeline
     from benchmark_results br
     join agentic_trace_replay atr on atr.id = br.trace_replay_id
     where br.id = ${benchmarkResultId}
-  `) as unknown as RawRow[];
+  `) as unknown as RawMetaRow[];
   const row = rows[0];
   if (!row) return null;
 
@@ -43,6 +49,16 @@ export async function getRequestTimeline(
     return row.request_timeline;
   }
 
-  // Slow path: recompute from the blob (rare — only stale/missing rows).
-  return computeRequestTimeline(row.blob);
+  if (!row.has_blob) return null;
+
+  // Slow path only: fetch the large profile blob after establishing that the
+  // pre-computed timeline is stale or missing. Long trace runs can have blobs
+  // large enough to exceed Neon's 64 MiB encoded-response limit, so the fast
+  // path must never select the blob alongside request_timeline.
+  const blobRows = (await sql`
+    select profile_export_jsonl_gz as blob
+    from agentic_trace_replay
+    where id = ${row.trace_replay_id}
+  `) as unknown as RawBlobRow[];
+  return computeRequestTimeline(blobRows[0]?.blob ?? null);
 }
diff --git a/packages/db/src/queries/trace-histograms.test.ts b/packages/db/src/queries/trace-histograms.test.ts
new file mode 100644
index 00000000..c3c6ec8a
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.test.ts
@@ -0,0 +1,78 @@
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
+import type { DbClient } from '../connection.js';
+
+import { getTraceHistograms } from './trace-histograms';
+
+function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } {
+  const pending = queue.slice(); // private copy, so the caller's fixture stays intact
+  const calls: string[] = [];
+  const tag = (strings: TemplateStringsArray): Promise<unknown> => {
+    calls.push(strings.join('?')); // query text with '?' where each parameter sat
+    return Promise.resolve(pending.shift() ?? []);
+  };
+  return { sql: tag as unknown as DbClient, calls };
+}
+
+const timeline: RequestTimeline = {
+  version: REQUEST_TIMELINE_VERSION,
+  startNs: 0,
+  endNs: 10,
+  durationS: 0.00000001,
+  requests: [
+    {
+      cid: 'session-1',
+      ti: 0,
+      wid: '0',
+      ad: 0,
+      phase: 'profiling',
+      credit: 0,
+      start: 1,
+      ack: 2,
+      end: 3,
+      ttftMs: 1,
+      tpotMs: 2,
+      isl: 4096,
+      osl: 512,
+      cancelled: false,
+    },
+    {
+      cid: 'session-1',
+      ti: 1,
+      wid: '0',
+      ad: 0,
+      phase: 'profiling',
+      credit: 4,
+      start: 5,
+      ack: 6,
+      end: 7,
+      ttftMs: 1,
+      tpotMs: 2,
+      isl: null,
+      osl: 128,
+      cancelled: false,
+    },
+  ],
+};
+
+describe('getTraceHistograms', () => {
+  it('builds distributions from the precomputed timeline without selecting the raw blob', async () => {
+    const { sql, calls } = mockSql([
+      [
+        {
+          benchmark_result_id: 422991,
+          trace_replay_id: 870,
+          request_timeline: timeline,
+          has_blob: true,
+        },
+      ],
+    ]);
+
+    await expect(getTraceHistograms(sql, [422991])).resolves.toEqual({
+      422991: { id: 422991, isl: [4096], osl: [512, 128] },
+    });
+    expect(calls).toHaveLength(1);
+    expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob');
+  });
+});
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
index 20ebc0d5..24b96c35 100644
--- a/packages/db/src/queries/trace-histograms.ts
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -14,6 +14,8 @@
 
 import { gunzipSync } from 'node:zlib';
 
+import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline';
+
 import type { DbClient } from '../connection.js';
 
 export interface TraceHistogramPoint {
@@ -27,13 +29,28 @@ export interface TraceHistogramPoint {
 
 export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
 
-/**
- * Cap the number of blobs we pull in a single Neon HTTP query — the serverless
- * driver returns 507 ("response is too large, max 64 MB") if the combined gzip
- * payload exceeds that. Each profile_export.jsonl blob can be ~1-2 MB
- * compressed, so we stay well below the cap at 12.
- */
 const QUERY_CHUNK_SIZE = 12;
+// Bytea values expand in Neon's JSON-over-HTTP response. Keep raw fallback
+// reads comfortably below its 64 MiB response cap; current ingests should use
+// request_timeline instead and never need this path.
+const MAX_FALLBACK_BLOB_BYTES = 24 * 1024 * 1024;
+
+interface TimelineRow {
+  benchmark_result_id: number;
+  trace_replay_id: number;
+  request_timeline: RequestTimeline | null;
+  has_blob: boolean;
+}
+
+/** Collects the finite per-request ISL/OSL values of `timeline` into one histogram point. */
+function histogramFromTimeline(id: number, timeline: RequestTimeline): TraceHistogramPoint {
+  const finite = (v: unknown): v is number => typeof v === 'number' && Number.isFinite(v);
+  return {
+    id,
+    isl: timeline.requests.map((r) => r.isl).filter(finite),
+    osl: timeline.requests.map((r) => r.osl).filter(finite),
+  };
+}
 
 export async function getTraceHistograms(
   sql: DbClient,
@@ -41,25 +58,47 @@ export async function getTraceHistograms(
 ): Promise<TraceHistogramMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  const result: TraceHistogramMap = {};
+  const fallbackRows: TimelineRow[] = [];
   for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
     const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
     const chunkRows = (await sql`
       select
         br.id as benchmark_result_id,
-        atr.profile_export_jsonl_gz as blob
+        atr.id as trace_replay_id,
+        atr.request_timeline,
+        (atr.profile_export_jsonl_gz is not null) as has_blob
       from benchmark_results br
       join agentic_trace_replay atr on atr.id = br.trace_replay_id
       where br.id = any(${chunk}::bigint[])
-        and atr.profile_export_jsonl_gz is not null
-    `) as { benchmark_result_id: number; blob: Buffer }[];
-    rows.push(...chunkRows);
+    `) as unknown as TimelineRow[];
+    for (const row of chunkRows) {
+      const id = Number(row.benchmark_result_id);
+      if (
+        row.request_timeline &&
+        Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION
+      ) {
+        result[id] = histogramFromTimeline(id, row.request_timeline);
+      } else if (row.has_blob) {
+        fallbackRows.push(row);
+      }
+    }
   }
 
-  const result: TraceHistogramMap = {};
-  for (const row of rows) {
+  // Compatibility fallback for pre-timeline rows. Fetch one small blob at a
+  // time; oversized legacy rows are omitted instead of turning the whole API
+  // response into a 507.
+  for (const row of fallbackRows) {
+    const blobRows = (await sql`
+      select profile_export_jsonl_gz as blob
+      from agentic_trace_replay
+      where id = ${row.trace_replay_id}
+        and octet_length(profile_export_jsonl_gz) <= ${MAX_FALLBACK_BLOB_BYTES}
+    `) as unknown as { blob: Buffer }[];
+    const blob = blobRows[0]?.blob;
+    if (!blob) continue;
     try {
-      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const jsonl = gunzipSync(blob).toString('utf8');
       const isl: number[] = [];
       const osl: number[] = [];
       for (const line of jsonl.split('\n')) {
diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts
new file mode 100644
index 00000000..61d21d35
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.test.ts
@@ -0,0 +1,104 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, type ChartSeries } from '../etl/compute-chart-series';
+import type { DbClient } from '../connection.js';
+
+import { getTraceServerMetrics } from './trace-server-metrics';
+
+function currentSeries(): ChartSeries {
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs: 0,
+    endNs: 1e9,
+    durationS: 1,
+    timeslicesCount: 1,
+    kvCacheUsage: [],
+    prefixCacheHitRate: [],
+    queueDepth: [],
+    promptTokensBySource: {},
+    prefillTps: [{ t: 0, value: 100 }],
+    decodeTps: [],
+    prefixCacheHitsTps: [],
+    hostKvCacheUsage: [],
+    kvCacheUsageByEngine: [],
+    metricSources: [],
+  };
+}
+
+function metaRow(overrides: Record<string, unknown> = {}) {
+  return {
+    id: 42,
+    trace_replay_id: 7,
+    has_blob: true,
+    chart_series: currentSeries(),
+    hardware: 'gb200',
+    framework: 'dynamo-vllm',
+    model: 'deepseek-r1-0528',
+    precision: 'fp8',
+    spec_method: 'none',
+    disagg: true,
+    conc: 128,
+    offload_mode: 'off',
+    isl: null,
+    osl: null,
+    benchmark_type: 'agentic_traces',
+    date: '2026-06-23',
+    run_url: null,
+    server_gpu_cache_hit_rate: null,
+    server_cpu_cache_hit_rate: null,
+    ...overrides,
+  };
+}
+
+function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } {
+  const pending = queue.slice(); // private copy, so the caller's fixture stays intact
+  const calls: string[] = [];
+  const tag = (strings: TemplateStringsArray): Promise<unknown> => {
+    calls.push(strings.join('?')); // query text with '?' where each parameter sat
+    return Promise.resolve(pending.shift() ?? []);
+  };
+  return { sql: tag as unknown as DbClient, calls };
+}
+
+describe('getTraceServerMetrics', () => {
+  it('returns current precomputed series without selecting the raw blob', async () => {
+    const { sql, calls } = mockSql([[metaRow()]]);
+
+    const result = await getTraceServerMetrics(sql, 42);
+
+    expect(result?.prefillTps).toEqual([{ t: 0, value: 100 }]);
+    expect(calls).toHaveLength(1);
+    expect(calls[0]).not.toContain('server_metrics_json_gz as blob');
+  });
+
+  it('fetches and computes the raw blob only when chart_series is stale', async () => {
+    const raw = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metrics: {
+            'vllm:prompt_tokens': {
+              series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 321 }] }],
+            },
+          },
+        }),
+      ),
+    );
+    const stale = { ...currentSeries(), version: CHART_SERIES_VERSION - 1 };
+    const { sql, calls } = mockSql([[metaRow({ chart_series: stale })], [{ blob: raw }]]);
+
+    const result = await getTraceServerMetrics(sql, 42);
+
+    expect(result?.prefillTps).toEqual([{ t: 0, value: 321 }]);
+    expect(calls).toHaveLength(2);
+    expect(calls[1]).toContain('server_metrics_json_gz as blob');
+  });
+
+  it('returns null without a blob and does not issue a second query', async () => {
+    const { sql, calls } = mockSql([[metaRow({ has_blob: false, chart_series: null })]]);
+
+    await expect(getTraceServerMetrics(sql, 42)).resolves.toBeNull();
+    expect(calls).toHaveLength(1);
+  });
+});
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 5594d514..61cacaae 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -14,6 +14,7 @@ import {
   CHART_SERIES_VERSION,
   computeChartSeries,
   type ChartSeries,
+  type MetricSourceSeries,
   type QueueDepthPoint,
   type TimeSeriesPoint,
 } from '../etl/compute-chart-series';
@@ -80,13 +81,20 @@ export interface TraceServerMetrics {
    * the cluster-average `kvCacheUsage` line covers that case alone.
    */
   kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /** Orchestrator-normalized metrics grouped by endpoint/worker. */
+  metricSources: MetricSourceSeries[];
 }
 
 interface RawMetaRow extends PointMeta {
-  blob: Buffer | null;
+  trace_replay_id: number | null;
+  has_blob: boolean;
   chart_series: ChartSeries | null;
 }
 
+interface RawBlobRow {
+  blob: Buffer | null;
+}
+
 function buildMeta(row: RawMetaRow): PointMeta {
   return {
     id: Number(row.id),
@@ -128,6 +136,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     hostKvCacheUsage: series.hostKvCacheUsage ?? [],
     // v8+ field; older chart_series rows lack it → omit per-engine overlay.
     kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [],
+    // v9+ field; old rows are served without a source selector until backfilled.
+    metricSources: series.metricSources ?? [],
   };
 }
 
@@ -137,7 +147,8 @@ export async function getTraceServerMetrics(
 ): Promise<TraceServerMetrics | null> {
   const rows = (await sql`
     select
-      atr.server_metrics_json_gz as blob,
+      br.trace_replay_id,
+      (atr.server_metrics_json_gz is not null) as has_blob,
       atr.chart_series,
       br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
       br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
@@ -153,7 +164,7 @@ export async function getTraceServerMetrics(
   `) as unknown as RawMetaRow[];
   const row = rows[0];
   if (!row) return null;
-  if (!row.blob) return null;
+  if (!row.has_blob || row.trace_replay_id === null) return null;
   const meta = buildMeta(row);
 
   // Fast path: pre-computed chart_series at the current version.
@@ -161,10 +172,25 @@ export async function getTraceServerMetrics(
     return merge(meta, row.chart_series);
   }
 
-  // Slow path: compute from the blob. `computeChartSeries` handles
+  // Slow path only: fetch the large raw blob after establishing that the
+  // pre-computed series is missing or stale. Disaggregated blobs can be tens
+  // of MB compressed, so selecting this in the metadata query defeats the
+  // fast path even when chart_series is current.
+  const blobRows = (await sql`
+    select server_metrics_json_gz as blob
+    from agentic_trace_replay
+    where id = ${row.trace_replay_id}
+  `) as unknown as RawBlobRow[];
+  const blob = blobRows[0]?.blob;
+  if (!blob) return null;
+
+  // `computeChartSeries` handles
   // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP
   // rows succeed even before the backfill drains them.
-  const series = await computeChartSeries(row.blob);
+  const series = await computeChartSeries(blob, {
+    framework: row.framework,
+    disagg: row.disagg,
+  });
   if (!series) return null;
   return merge(meta, series);
 }

From 8b243e47e96465adcde20dbd00f551830af61bc2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 30 Jun 2026 18:38:14 -0500
Subject: [PATCH 102/111] feat(agentic): KV-cache pool ceiling +
 warmup/profiling phase split

Agentic point-detail enhancements (shared files across workstreams):

- KV-cache pool size: derive total pool tokens from the authoritative vLLM
  "GPU KV cache size: N tokens" server-log line (summed across DP engine
  cores; TP already aggregated), stored on benchmark_results.metrics. The
  vllm:cache_config_info metric is unreliable for MLA models, so the log is
  the source of truth. Drawn as a horizontal ceiling on the "unique input
  tokens in flight" chart via a reusable TimeSeriesChart refLines prop.
  - Fix ingest: link agentic server logs (bmk_agentic_<key> ->
    server_logs_<key> key mismatch meant agentic rows never got a server log).
  - New: server-log-metrics parser (line-based, robust to multi-MB log lines),
    db:backfill-agentic-server-logs and db:backfill-kv-pool scripts.

- Warmup/profiling phase split: chart_series now merges the warmup_metrics
  block; per-request phase tags drive timeline + per-point phase slicing
  (phase-slice).

- Request timeline: restore zoom/scroll/filter position on browser back via a
  one-shot sessionStorage snapshot; phase toggle is Profiling/Warmup.

Tests added for the parser, snapshot (de)serialization, and chart-series.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 959 ++++++++++--------
 .../agentic-point/phase-slice.test.ts         | 212 ++++
 .../inference/agentic-point/phase-slice.ts    | 188 ++++
 .../agentic-point/request-timeline.test.ts    |  75 +-
 .../agentic-point/request-timeline.tsx        | 181 +++-
 .../agentic-point/time-series-chart.test.ts   |  33 +-
 .../agentic-point/time-series-chart.tsx       |  71 +-
 .../src/hooks/api/use-trace-server-metrics.ts |   5 +
 packages/db/package.json                      |   2 +
 .../db/src/backfill-agentic-server-logs.ts    | 267 +++++
 packages/db/src/backfill-kv-pool.ts           | 137 +++
 packages/db/src/etl/benchmark-ingest.ts       |  12 +-
 .../db/src/etl/compute-chart-series.test.ts   |  43 +
 packages/db/src/etl/compute-chart-series.ts   |  58 +-
 .../db/src/etl/server-log-metrics.test.ts     |  43 +
 packages/db/src/etl/server-log-metrics.ts     |  65 ++
 packages/db/src/ingest-ci-run.ts              |   9 +-
 .../src/queries/trace-server-metrics.test.ts  |   1 +
 .../db/src/queries/trace-server-metrics.ts    |  23 +-
 19 files changed, 1924 insertions(+), 460 deletions(-)
 create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.test.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.ts
 create mode 100644 packages/db/src/backfill-agentic-server-logs.ts
 create mode 100644 packages/db/src/backfill-kv-pool.ts
 create mode 100644 packages/db/src/etl/server-log-metrics.test.ts
 create mode 100644 packages/db/src/etl/server-log-metrics.ts

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 77d87997..c6697442 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -2,12 +2,11 @@
 
 import Link from 'next/link';
 import { usePathname, useRouter, useSearchParams } from 'next/navigation';
-import { useCallback, useState } from 'react';
+import { useCallback, useMemo, useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
 import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-request-timeline';
-import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
   type MetricSource,
@@ -29,6 +28,14 @@ import { track } from '@/lib/analytics';
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
+import {
+  phaseBoundarySec,
+  sliceServerSeriesByPhase,
+  sliceTimelineByPhase,
+  timelineHasWarmup,
+  type ServerSeriesLike,
+  type StagePhase,
+} from './phase-slice';
 import { RequestTimelineView } from './request-timeline';
 import { SiblingNav, chipLabel } from './sibling-nav';
 import {
@@ -57,6 +64,13 @@ interface Props {
 const fmtPct = (v: number | null | undefined): string =>
   v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`;
 
+/** Compact token count for labels: 306808 → "307K tok", 3.2e6 → "3.2M tok"; never "1000K tok". */
+const fmtTokensCompact = (n: number): string => {
+  if (n >= 999500) return `${(n / 1e6).toFixed(1)}M tok`; // 999.5K+ would round to "1000K"
+  if (n >= 999.5) return `${Math.round(n / 1e3)}K tok`; // 999.5+ would round to "1000 tok"
+  return `${Math.round(n)} tok`;
+};
+
 function MetaLine({ label, value }: { label: string; value: React.ReactNode }) {
   return (
     <div className="flex flex-col gap-0.5">
@@ -155,6 +169,14 @@ const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption<SequenceMetricView>[] = [
   { value: 'inflight', label: 'In-flight avg' },
 ];
 
+// Warmup vs profiling stage selector. Drives the server-metric charts AND the
+// request-derived charts (ISL/OSL, latency-over-time, in-flight). Only shown
+// when the point actually has a warmup phase.
+const STAGE_PHASE_OPTIONS: SegmentedToggleOption<StagePhase>[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'stage-phase-profiling' },
+  { value: 'warmup', label: 'Warmup', testId: 'stage-phase-warmup' },
+];
+
 const SOURCE_ROLE_LABEL: Record<MetricSource['role'], string> = {
   router: 'Router',
   prefill: 'Prefill',
@@ -285,21 +307,25 @@ function RequestMetricOverTime({
 
 function SequenceMetricCard({
   metric,
-  values,
   timeline,
-  histogramLoading,
   timelineLoading,
 }: {
   metric: 'isl' | 'osl';
-  values: readonly number[] | undefined;
+  /** Phase-scoped timeline — distribution values + in-flight are both derived from it. */
   timeline: RequestTimeline | null | undefined;
-  histogramLoading: boolean;
   timelineLoading: boolean;
 }) {
   const [view, setView] = useState<SequenceMetricView>('distribution');
   const acronym = metric.toUpperCase();
   const fullName = metric === 'isl' ? 'Input sequence length' : 'Output sequence length';
   const testPrefix = `${metric}-metric`;
+  // Per-request ISL/OSL for the selected phase (request_timeline carries both,
+  // so the distribution honours the warmup/profiling toggle for free).
+  const values = timeline
+    ? timeline.requests
+        .map((r) => r[metric])
+        .filter((v): v is number => typeof v === 'number' && Number.isFinite(v))
+    : undefined;
   return (
     <ExpandableChart
       title={view === 'distribution' ? `${fullName} distribution` : `Average ${acronym} in flight`}
@@ -323,8 +349,9 @@ function SequenceMetricCard({
       render={(expanded) => {
         const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
         if (view === 'distribution') {
-          if (values) return <Distribution values={values} unit="tokens" {...size} />;
-          return histogramLoading ? <Skeleton /> : <Empty />;
+          if (values && values.length > 0)
+            return <Distribution values={values} unit="tokens" {...size} />;
+          return timelineLoading ? <Skeleton /> : <Empty />;
         }
         if (!timeline) return timelineLoading ? <Skeleton /> : <Empty />;
         const raw = averageSequenceLengthInFlight(timeline.requests, metric);
@@ -376,11 +403,9 @@ export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
   const pathname = usePathname();
   const searchParams = useSearchParams();
-  const histQuery = useTraceHistograms([id], true);
   const metricsQuery = useTraceServerMetrics(id, true);
   const siblingsQuery = useBenchmarkSiblings(id);
 
-  const hist = histQuery.data?.[id];
   const metrics = metricsQuery.data;
   const siblingsData = siblingsQuery.data;
 
@@ -407,25 +432,73 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
-  // Per-request timeline used by both the timeline view AND the per-point
-  // "Unique input tokens in flight" chart, so fetch whenever we're on
-  // either view.
+  // Per-request timeline used by the timeline view AND every per-point
+  // request-derived chart (ISL/OSL, latency-over-time, in-flight), so fetch
+  // whenever we're on either view.
   const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
+  const timeline = timelineQuery.data;
+
+  // Warmup vs profiling stage. Only meaningful when the point actually has a
+  // warmup phase (older runs are profiling-only) — when absent the toggle is
+  // hidden and everything falls back to the full (profiling) run.
+  const [phase, setPhase] = useState<StagePhase>('profiling');
+  const hasWarmup = useMemo(() => timelineHasWarmup(timeline), [timeline]);
+  const effectivePhase: StagePhase = hasWarmup ? phase : 'profiling';
+
+  // Server-metric boundary on the chart's own t-axis (rebased through absolute
+  // ns — see phase-slice header for the origin-gap invariant). Request charts
+  // get a phase-scoped timeline (filtered + rebased) so they share a 0-based
+  // axis with the server charts for the selected phase.
+  const boundarySec = useMemo(() => phaseBoundarySec(metrics, timeline), [metrics, timeline]);
+  const phaseTimeline = useMemo(
+    () => (timeline ? sliceTimelineByPhase(timeline, effectivePhase) : null),
+    [timeline, effectivePhase],
+  );
+
   const metricSources = metrics?.metricSources ?? [];
   const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId);
-  const serverSeries = selectedMetricSource
-    ? {
-        kvCacheUsage: selectedMetricSource.kvCacheUsage,
-        prefixCacheHitRate: selectedMetricSource.prefixCacheHitRate,
-        queueDepth: selectedMetricSource.queueDepth,
-        promptTokensBySource: selectedMetricSource.promptTokensBySource,
-        prefillTps: selectedMetricSource.promptTps,
-        decodeTps: selectedMetricSource.generationTps,
-        prefixCacheHitsTps: selectedMetricSource.prefixCacheHitsTps,
-        hostKvCacheUsage: selectedMetricSource.hostKvCacheUsage,
-        kvCacheUsageByEngine: selectedMetricSource.kvCacheUsageByEngine,
-      }
-    : metrics;
+  const baseServerSeries: ServerSeriesLike | undefined = useMemo(() => {
+    const src = metrics?.metricSources?.find((m) => m.source.id === metricSourceId);
+    if (src) {
+      return {
+        kvCacheUsage: src.kvCacheUsage,
+        prefixCacheHitRate: src.prefixCacheHitRate,
+        queueDepth: src.queueDepth,
+        promptTokensBySource: src.promptTokensBySource,
+        prefillTps: src.promptTps,
+        decodeTps: src.generationTps,
+        prefixCacheHitsTps: src.prefixCacheHitsTps,
+        hostKvCacheUsage: src.hostKvCacheUsage,
+        kvCacheUsageByEngine: src.kvCacheUsageByEngine,
+      };
+    }
+    return metrics ?? undefined;
+  }, [metrics, metricSourceId]);
+  // Phase-sliced server series (+ matching durationS) consumed by every server
+  // chart. Null only when there are no server metrics at all. Each chart reads
+  // `sliced.series` (locally aliased to `serverSeries`) and `sliced.durationS`.
+  const sliced = useMemo(
+    () =>
+      baseServerSeries
+        ? sliceServerSeriesByPhase(
+            baseServerSeries,
+            effectivePhase,
+            boundarySec,
+            metrics?.durationS ?? 0,
+          )
+        : null,
+    [baseServerSeries, effectivePhase, boundarySec, metrics?.durationS],
+  );
+  // Some runs only scrape server metrics during profiling — `chart_series`
+  // starts at the profiling boundary, so the warmup slice collapses to ~0–1
+  // points (just the t=0 origin) even though request-level warmup data exists.
+  // Require ≥2 points in some series to count as real warmup coverage; otherwise
+  // show an explanatory note instead of six silently-blank charts.
+  const slicedHasServerData =
+    (sliced?.series.kvCacheUsage.length ?? 0) > 1 ||
+    (sliced?.series.queueDepth.length ?? 0) > 1 ||
+    (sliced?.series.prefillTps.length ?? 0) > 1 ||
+    (sliced?.series.prefixCacheHitRate.length ?? 0) > 1;
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -489,45 +562,67 @@ export function AgenticPointDetail({ id }: Props) {
         )}
       </div>
 
-      {view === 'point' && metricSources.length > 1 && (
+      {view === 'point' && (metricSources.length > 1 || hasWarmup) && (
         <div
-          className="sticky top-16 z-40 flex items-center justify-end gap-2 rounded-lg border border-border/40 bg-background/90 px-3 py-2 shadow-sm backdrop-blur"
+          className="sticky top-16 z-40 flex items-center justify-between gap-2 rounded-lg border border-border/40 bg-background/90 px-3 py-2 shadow-sm backdrop-blur"
           data-testid="metric-source-toolbar"
         >
-          <span className="text-xs text-muted-foreground">Server metrics</span>
-          <Select
-            value={selectedMetricSource?.source.id ?? 'all'}
-            onValueChange={(value) => {
-              setMetricSourceId(value);
-              const source = metricSources.find((entry) => entry.source.id === value)?.source;
-              track('inference_agentic_metric_source_changed', {
-                source: value,
-                role: source?.role ?? 'all',
-                adapter: source?.adapter ?? metrics?.meta.framework ?? 'unknown',
-              });
-            }}
-          >
-            <SelectTrigger
-              size="sm"
-              className="max-w-72"
-              aria-label="Server metrics source"
-              data-testid="metric-source-select"
-            >
-              <SelectValue />
-            </SelectTrigger>
-            <SelectContent>
-              <SelectItem value="all">All endpoints</SelectItem>
-              {metricSources.map(({ source }) => (
-                <SelectItem
-                  key={source.id}
-                  value={source.id}
-                  title={source.endpointUrl ?? undefined}
+          {hasWarmup ? (
+            <div className="flex items-center gap-2">
+              <span className="text-xs text-muted-foreground">Stage</span>
+              <SegmentedToggle
+                value={phase}
+                options={STAGE_PHASE_OPTIONS}
+                onValueChange={(value) => {
+                  setPhase(value);
+                  track('inference_agentic_phase_changed', { phase: value });
+                }}
+                ariaLabel="Stage phase"
+                testId="stage-phase-toggle"
+                buttonClassName="px-2.5 py-1 text-xs"
+              />
+            </div>
+          ) : (
+            <span />
+          )}
+          {metricSources.length > 1 ? (
+            <div className="flex items-center gap-2">
+              <span className="text-xs text-muted-foreground">Server metrics</span>
+              <Select
+                value={selectedMetricSource?.source.id ?? 'all'}
+                onValueChange={(value) => {
+                  setMetricSourceId(value);
+                  const source = metricSources.find((entry) => entry.source.id === value)?.source;
+                  track('inference_agentic_metric_source_changed', {
+                    source: value,
+                    role: source?.role ?? 'all',
+                    adapter: source?.adapter ?? metrics?.meta.framework ?? 'unknown',
+                  });
+                }}
+              >
+                <SelectTrigger
+                  size="sm"
+                  className="max-w-72"
+                  aria-label="Server metrics source"
+                  data-testid="metric-source-select"
                 >
-                  {metricSourceLabel(source)}
-                </SelectItem>
-              ))}
-            </SelectContent>
-          </Select>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  <SelectItem value="all">All endpoints</SelectItem>
+                  {metricSources.map(({ source }) => (
+                    <SelectItem
+                      key={source.id}
+                      value={source.id}
+                      title={source.endpointUrl ?? undefined}
+                    >
+                      {metricSourceLabel(source)}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+            </div>
+          ) : null}
         </div>
       )}
 
@@ -546,6 +641,7 @@ export function AgenticPointDetail({ id }: Props) {
           <RequestTimelineView
             data={timelineQuery.data}
             datasetSlug={siblingsQuery.data?.sku.dataset_slug}
+            pointId={id}
           />
         ) : (
           <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
@@ -554,385 +650,410 @@ export function AgenticPointDetail({ id }: Props) {
           </div>
         )
       ) : (
-        <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-          <SequenceMetricCard
-            metric="isl"
-            values={hist?.isl}
-            timeline={timelineQuery.data}
-            histogramLoading={histQuery.isLoading}
-            timelineLoading={timelineQuery.isLoading}
-          />
-          <SequenceMetricCard
-            metric="osl"
-            values={hist?.osl}
-            timeline={timelineQuery.data}
-            histogramLoading={histQuery.isLoading}
-            timelineLoading={timelineQuery.isLoading}
-          />
+        <>
+          {effectivePhase === 'warmup' && (
+            <p
+              className="rounded-md border-l-2 border-amber-500/60 bg-amber-500/10 px-3 py-2 text-xs text-muted-foreground"
+              data-testid="warmup-phase-note"
+            >
+              Showing the <span className="font-medium text-foreground">warmup</span> phase — a
+              cache-warming pass whose outputs are capped at 1 token. Warmup OSL ≈ 1, and
+              interactivity/decode are blank (single-token outputs have no inter-token latency).
+              {!slicedHasServerData &&
+                ' Warmup server-side metrics aren’t available for this point, so the server charts below are empty — the request-level charts above still reflect warmup.'}
+            </p>
+          )}
+          <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+            <SequenceMetricCard
+              metric="isl"
+              timeline={phaseTimeline}
+              timelineLoading={timelineQuery.isLoading}
+            />
+            <SequenceMetricCard
+              metric="osl"
+              timeline={phaseTimeline}
+              timelineLoading={timelineQuery.isLoading}
+            />
 
-          <RequestMetricOverTime
-            title="Interactivity over time"
-            metric="interactivity"
-            timeline={timelineQuery.data}
-            isLoading={timelineQuery.isLoading}
-          />
+            <RequestMetricOverTime
+              title="Interactivity over time"
+              metric="interactivity"
+              timeline={phaseTimeline}
+              isLoading={timelineQuery.isLoading}
+            />
 
-          <RequestMetricOverTime
-            title="TTFT over time"
-            metric="ttft"
-            timeline={timelineQuery.data}
-            isLoading={timelineQuery.isLoading}
-            latencySelector
-          />
+            <RequestMetricOverTime
+              title="TTFT over time"
+              metric="ttft"
+              timeline={phaseTimeline}
+              isLoading={timelineQuery.isLoading}
+              latencySelector
+            />
 
-          <ExpandableChart
-            title="KV cache utilization over time"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics || !serverSeries) return <Skeleton />;
-              // For SGLang hicache rows we have both GPU (HBM) util and
-              // host (CPU offload pool) util — overlay them as two lines.
-              const hasHost = serverSeries.hostKvCacheUsage.length > 0;
-              // DEP runs report one series per engine. When there's more
-              // than one, draw one line per rank in distinct colors so
-              // load skew is visible at a glance; cluster-average sits on
-              // top in white so it stands out.
-              const perEngine = serverSeries.kvCacheUsageByEngine ?? [];
-              const hasPerEngine = perEngine.length > 1;
-              // Render order matters: per-engine first → average drawn on top.
-              const series = [
-                ...(hasPerEngine
-                  ? perEngine.map((e, i) => ({
-                      name: `DP ${e.engineLabel}`,
-                      data: rollingAverage(e.points, 50),
-                      color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
-                      // Thin + translucent so the Avg line on top reads as
-                      // the headline number, not just one more series.
-                      strokeWidth: 1,
-                      strokeOpacity: 0.5,
-                    }))
-                  : []),
-                {
-                  name: hasHost
-                    ? 'GPU HBM (avg n=50)'
-                    : hasPerEngine
-                      ? 'Avg'
-                      : 'GPU KV cache (avg n=50)',
-                  data: rollingAverage(serverSeries.kvCacheUsage, 50),
-                  // Skip raw scatter when per-engine overlay is on — the
-                  // DP-rank lines already convey the spread, dots would be noise.
-                  rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage,
-                  // Bold red Avg sits on top of the translucent per-DP lines.
-                  // DP 1 in the palette is #ef4444 (lighter red); the darker
-                  // #dc2626 here plus the heavier stroke keeps it distinct.
-                  color: hasPerEngine ? '#dc2626' : '#3b82f6',
-                  strokeWidth: hasPerEngine ? 3.5 : 2,
-                },
-                ...(hasHost
-                  ? [
-                      {
-                        name: 'CPU offload pool (avg n=50)',
-                        data: rollingAverage(serverSeries.hostKvCacheUsage, 50),
-                        rawData: serverSeries.hostKvCacheUsage,
-                        color: '#f97316',
-                        strokeWidth: 2,
-                      },
-                    ]
-                  : []),
-              ];
-              return (
-                <TimeSeriesChart
-                  series={series}
-                  durationS={metrics.durationS}
-                  yMax={1}
-                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                  yAxisLabel="KV cache (%)"
-                  {...size}
-                />
-              );
-            }}
-          />
+            <ExpandableChart
+              title="KV cache utilization over time"
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (!metrics || !sliced) return <Skeleton />;
+                const serverSeries = sliced.series;
+                // For SGLang hicache rows we have both GPU (HBM) util and
+                // host (CPU offload pool) util — overlay them as two lines.
+                const hasHost = serverSeries.hostKvCacheUsage.length > 0;
+                // DEP runs report one series per engine. When there's more
+                // than one, draw one line per rank in distinct colors so
+                // load skew is visible at a glance; cluster-average sits on
+                // top in bold red so it stands out.
+                const perEngine = serverSeries.kvCacheUsageByEngine ?? [];
+                const hasPerEngine = perEngine.length > 1;
+                // Render order matters: per-engine first → average drawn on top.
+                const series = [
+                  ...(hasPerEngine
+                    ? perEngine.map((e, i) => ({
+                        name: `DP ${e.engineLabel}`,
+                        data: rollingAverage(e.points, 50),
+                        color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
+                        // Thin + translucent so the Avg line on top reads as
+                        // the headline number, not just one more series.
+                        strokeWidth: 1,
+                        strokeOpacity: 0.5,
+                      }))
+                    : []),
+                  {
+                    name: hasHost
+                      ? 'GPU HBM (avg n=50)'
+                      : hasPerEngine
+                        ? 'Avg'
+                        : 'GPU KV cache (avg n=50)',
+                    data: rollingAverage(serverSeries.kvCacheUsage, 50),
+                    // Skip raw scatter when per-engine overlay is on — the
+                    // DP-rank lines already convey the spread, dots would be noise.
+                    rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage,
+                    // Bold red Avg sits on top of the translucent per-DP lines.
+                    // DP 1 in the palette is #ef4444 (lighter red); the darker
+                    // #dc2626 here plus the heavier stroke keeps it distinct.
+                    color: hasPerEngine ? '#dc2626' : '#3b82f6',
+                    strokeWidth: hasPerEngine ? 3.5 : 2,
+                  },
+                  ...(hasHost
+                    ? [
+                        {
+                          name: 'CPU offload pool (avg n=50)',
+                          data: rollingAverage(serverSeries.hostKvCacheUsage, 50),
+                          rawData: serverSeries.hostKvCacheUsage,
+                          color: '#f97316',
+                          strokeWidth: 2,
+                        },
+                      ]
+                    : []),
+                ];
+                return (
+                  <TimeSeriesChart
+                    series={series}
+                    durationS={sliced.durationS}
+                    yMax={1}
+                    yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                    yAxisLabel="KV cache (%)"
+                    {...size}
+                  />
+                );
+              }}
+            />
 
-          <ExpandableChart
-            title={
-              requestActivityView === 'queue'
-                ? 'Request queue depth'
-                : 'Cumulative completed requests'
-            }
-            testId="request-activity-chart"
-            controls={
-              <SegmentedToggle
-                value={requestActivityView}
-                options={REQUEST_ACTIVITY_OPTIONS}
-                onValueChange={(value) => {
-                  setRequestActivityView(value);
-                  track('inference_agentic_request_activity_changed', { view: value });
-                }}
-                ariaLabel="Request activity metric"
-                testId="request-activity-toggle"
-                buttonClassName="px-2 py-1 text-xs"
-              />
-            }
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (requestActivityView === 'completed') {
-                if (!timelineQuery.data) {
-                  return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+            <ExpandableChart
+              title={
+                requestActivityView === 'queue'
+                  ? 'Request queue depth'
+                  : 'Cumulative completed requests'
+              }
+              testId="request-activity-chart"
+              controls={
+                <SegmentedToggle
+                  value={requestActivityView}
+                  options={REQUEST_ACTIVITY_OPTIONS}
+                  onValueChange={(value) => {
+                    setRequestActivityView(value);
+                    track('inference_agentic_request_activity_changed', { view: value });
+                  }}
+                  ariaLabel="Request activity metric"
+                  testId="request-activity-toggle"
+                  buttonClassName="px-2 py-1 text-xs"
+                />
+              }
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (requestActivityView === 'completed') {
+                  if (!phaseTimeline) {
+                    return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+                  }
+                  return (
+                    <TimeSeriesChart
+                      series={[
+                        {
+                          name: 'Completed requests',
+                          data: cumulativeCompletedRequests(phaseTimeline.requests),
+                          color: '#3b82f6',
+                          strokeWidth: 2.5,
+                        },
+                      ]}
+                      durationS={phaseTimeline.durationS}
+                      yAxisLabel="Requests"
+                      {...size}
+                    />
+                  );
                 }
+                if (!metrics || !sliced) return <Skeleton />;
+                const serverSeries = sliced.series;
                 return (
                   <TimeSeriesChart
                     series={[
                       {
-                        name: 'Completed requests',
-                        data: cumulativeCompletedRequests(timelineQuery.data.requests),
+                        name: 'Running (avg n=50)',
+                        data: rollingAverage(
+                          serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+                            t: p.t,
+                            value: p.running,
+                          })),
+                          50,
+                        ),
+                        color: '#22c55e',
+                        strokeWidth: 2,
+                      },
+                      {
+                        name: 'Waiting (avg n=50)',
+                        data: rollingAverage(
+                          serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+                            t: p.t,
+                            value: p.waiting,
+                          })),
+                          50,
+                        ),
+                        color: '#ef4444',
+                        strokeWidth: 2,
+                      },
+                      {
+                        name: 'Total (avg n=50)',
+                        data: rollingAverage(
+                          serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
+                            t: p.t,
+                            value: p.total,
+                          })),
+                          50,
+                        ),
                         color: '#3b82f6',
-                        strokeWidth: 2.5,
+                        strokeWidth: 2,
                       },
                     ]}
-                    durationS={timelineQuery.data.durationS}
+                    durationS={sliced.durationS}
                     yAxisLabel="Requests"
                     {...size}
                   />
                 );
-              }
-              if (!metrics || !serverSeries) return <Skeleton />;
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'Running (avg n=50)',
-                      data: rollingAverage(
-                        serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
-                          t: p.t,
-                          value: p.running,
-                        })),
-                        50,
-                      ),
-                      color: '#22c55e',
-                      strokeWidth: 2,
-                    },
-                    {
-                      name: 'Waiting (avg n=50)',
-                      data: rollingAverage(
-                        serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
-                          t: p.t,
-                          value: p.waiting,
-                        })),
-                        50,
-                      ),
-                      color: '#ef4444',
-                      strokeWidth: 2,
-                    },
-                    {
-                      name: 'Total (avg n=50)',
-                      data: rollingAverage(
-                        serverSeries.queueDepth.map((p: QueueDepthPoint) => ({
-                          t: p.t,
-                          value: p.total,
-                        })),
-                        50,
-                      ),
-                      color: '#3b82f6',
-                      strokeWidth: 2,
-                    },
-                  ]}
-                  durationS={metrics.durationS}
-                  yAxisLabel="Requests"
-                  {...size}
-                />
-              );
-            }}
-          />
+              }}
+            />
 
-          <ExpandableChart
-            title="Prefix cache hit rate per interval"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics || !serverSeries) return <Skeleton />;
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'GPU (HBM, avg n=50)',
-                      data: rollingAverage(serverSeries.prefixCacheHitRate, 50),
-                      rawData: serverSeries.prefixCacheHitRate,
-                      color: '#a855f7',
-                      strokeWidth: 2,
-                    },
-                  ]}
-                  durationS={metrics.durationS}
-                  yMax={1}
-                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                  yAxisLabel="Hit rate (%)"
-                  {...size}
-                />
-              );
-            }}
-          />
+            <ExpandableChart
+              title="Prefix cache hit rate per interval"
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (!metrics || !sliced) return <Skeleton />;
+                const serverSeries = sliced.series;
+                return (
+                  <TimeSeriesChart
+                    series={[
+                      {
+                        name: 'GPU (HBM, avg n=50)',
+                        data: rollingAverage(serverSeries.prefixCacheHitRate, 50),
+                        rawData: serverSeries.prefixCacheHitRate,
+                        color: '#a855f7',
+                        strokeWidth: 2,
+                      },
+                    ]}
+                    durationS={sliced.durationS}
+                    yMax={1}
+                    yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                    yAxisLabel="Hit rate (%)"
+                    {...size}
+                  />
+                );
+              }}
+            />
 
-          <ExpandableChart
-            title={
-              selectedMetricSource
-                ? `Throughput · ${metricSourceLabel(selectedMetricSource.source)}`
-                : 'Throughput (input & decode)'
-            }
-            controls={
-              <div className="flex items-center gap-1" data-testid="throughput-series-toggle">
-                {(
-                  [
-                    ['input', 'Input'],
-                    ['decode', 'Decode'],
-                  ] as const
-                ).map(([key, label]) => {
-                  const active = throughputSeries.has(key);
-                  const isOnlyActive = active && throughputSeries.size === 1;
-                  return (
-                    <button
-                      key={key}
-                      type="button"
-                      aria-pressed={active}
-                      disabled={isOnlyActive}
-                      data-testid={`throughput-series-${key}`}
-                      className={`rounded px-2 py-1 text-xs font-medium transition-colors ${
-                        active
-                          ? key === 'input'
-                            ? 'bg-blue-500/20 text-blue-600 dark:text-blue-300'
-                            : 'bg-orange-500/20 text-orange-600 dark:text-orange-300'
-                          : 'bg-muted text-muted-foreground hover:text-foreground'
-                      } disabled:cursor-not-allowed disabled:opacity-60`}
-                      onClick={() => {
-                        const next = toggleThroughputSeries(throughputSeries, key);
-                        if (next === throughputSeries) return;
-                        setThroughputSeries(next);
-                        track('inference_agentic_throughput_series_toggled', {
-                          series: key,
-                          enabled: next.has(key),
-                        });
-                      }}
-                    >
-                      {label}
-                    </button>
-                  );
-                })}
-              </div>
-            }
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics || !serverSeries) return <Skeleton />;
-              return (
-                <TimeSeriesChart
-                  series={buildThroughputChartSeries(
-                    serverSeries.prefillTps,
-                    serverSeries.decodeTps,
-                    throughputSeries,
-                  )}
-                  durationS={metrics.durationS}
-                  yAxisLabel="Tokens / sec"
-                  {...size}
-                />
-              );
-            }}
-          />
+            <ExpandableChart
+              title={
+                selectedMetricSource
+                  ? `Throughput · ${metricSourceLabel(selectedMetricSource.source)}`
+                  : 'Throughput (input & decode)'
+              }
+              controls={
+                <div className="flex items-center gap-1" data-testid="throughput-series-toggle">
+                  {(
+                    [
+                      ['input', 'Input'],
+                      ['decode', 'Decode'],
+                    ] as const
+                  ).map(([key, label]) => {
+                    const active = throughputSeries.has(key);
+                    const isOnlyActive = active && throughputSeries.size === 1;
+                    return (
+                      <button
+                        key={key}
+                        type="button"
+                        aria-pressed={active}
+                        disabled={isOnlyActive}
+                        data-testid={`throughput-series-${key}`}
+                        className={`rounded px-2 py-1 text-xs font-medium transition-colors ${
+                          active
+                            ? key === 'input'
+                              ? 'bg-blue-500/20 text-blue-600 dark:text-blue-300'
+                              : 'bg-orange-500/20 text-orange-600 dark:text-orange-300'
+                            : 'bg-muted text-muted-foreground hover:text-foreground'
+                        } disabled:cursor-not-allowed disabled:opacity-60`}
+                        onClick={() => {
+                          const next = toggleThroughputSeries(throughputSeries, key);
+                          if (next === throughputSeries) return;
+                          setThroughputSeries(next);
+                          track('inference_agentic_throughput_series_toggled', {
+                            series: key,
+                            enabled: next.has(key),
+                          });
+                        }}
+                      >
+                        {label}
+                      </button>
+                    );
+                  })}
+                </div>
+              }
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (!metrics || !sliced) return <Skeleton />;
+                const serverSeries = sliced.series;
+                return (
+                  <TimeSeriesChart
+                    series={buildThroughputChartSeries(
+                      serverSeries.prefillTps,
+                      serverSeries.decodeTps,
+                      throughputSeries,
+                    )}
+                    durationS={sliced.durationS}
+                    yAxisLabel="Tokens / sec"
+                    {...size}
+                  />
+                );
+              }}
+            />
 
-          <ExpandableChart
-            title="Cumulative prompt token source breakdown"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics || !serverSeries) return <Skeleton />;
-              return (
-                <StackedAreaChart
-                  sourceSeries={serverSeries.promptTokensBySource}
-                  durationS={metrics.durationS}
-                  {...size}
-                />
-              );
-            }}
-          />
+            <ExpandableChart
+              title="Cumulative prompt token source breakdown"
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (!metrics || !sliced) return <Skeleton />;
+                const serverSeries = sliced.series;
+                return (
+                  <StackedAreaChart
+                    sourceSeries={serverSeries.promptTokensBySource}
+                    durationS={sliced.durationS}
+                    {...size}
+                  />
+                );
+              }}
+            />
 
-          <ExpandableChart
-            title="Total unique input tokens over time"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!metrics || !serverSeries) return <Skeleton />;
-              // Unique = total prompt tokens received minus tokens served from
-              // any cache tier — i.e. the freshly prefill-computed tokens. Prefer
-              // the promptTokensBySource breakdown (its buckets sum to the real
-              // prompt-token total, so subtracting cache tiers is exact). Fall
-              // back to cumsum(prefillTps - prefixCacheHitsTps) only for older
-              // data without the breakdown: vllm:prefix_cache_hits re-counts
-              // tokens across scheduler passes, so its cumulative can exceed the
-              // prompt tokens received, driving the diff negative and freezing
-              // the monotonic-clamped line after a few seconds.
-              const uniqueFromBreakdown = cumulativeUniqueInputTokens(
-                serverSeries.promptTokensBySource,
-              );
-              const uniqueData =
-                uniqueFromBreakdown.length > 0
-                  ? uniqueFromBreakdown
-                  : cumulativeDifferenceMonotonic(
-                      serverSeries.prefillTps,
-                      serverSeries.prefixCacheHitsTps,
-                    );
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'Cumulative unique input tokens',
-                      data: uniqueData,
-                      color: '#3b82f6',
-                      strokeWidth: 2,
-                    },
-                  ]}
-                  durationS={metrics.durationS}
-                  yAxisLabel="Tokens"
-                  {...size}
-                />
-              );
-            }}
-          />
+            <ExpandableChart
+              title="Total unique input tokens over time"
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (!metrics || !sliced) return <Skeleton />;
+                const serverSeries = sliced.series;
+                // Unique = total prompt tokens received minus tokens served from
+                // any cache tier — i.e. the freshly prefill-computed tokens. Prefer
+                // the promptTokensBySource breakdown (its buckets sum to the real
+                // prompt-token total, so subtracting cache tiers is exact). Fall
+                // back to cumsum(prefillTps - prefixCacheHitsTps) only for older
+                // data without the breakdown: vllm:prefix_cache_hits re-counts
+                // tokens across scheduler passes, so its cumulative can exceed the
+                // prompt tokens received, driving the diff negative and freezing
+                // the monotonic-clamped line after a few seconds.
+                const uniqueFromBreakdown = cumulativeUniqueInputTokens(
+                  serverSeries.promptTokensBySource,
+                );
+                const uniqueData =
+                  uniqueFromBreakdown.length > 0
+                    ? uniqueFromBreakdown
+                    : cumulativeDifferenceMonotonic(
+                        serverSeries.prefillTps,
+                        serverSeries.prefixCacheHitsTps,
+                      );
+                return (
+                  <TimeSeriesChart
+                    series={[
+                      {
+                        name: 'Cumulative unique input tokens',
+                        data: uniqueData,
+                        color: '#3b82f6',
+                        strokeWidth: 2,
+                      },
+                    ]}
+                    durationS={sliced.durationS}
+                    yAxisLabel="Tokens"
+                    {...size}
+                  />
+                );
+              }}
+            />
 
-          <ExpandableChart
-            title="Unique input tokens in flight"
-            testId="unique-input-inflight-chart"
-            render={(expanded) => {
-              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-              if (!timelineQuery.data) {
-                return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
-              }
-              // Step function: at each request start/end, sum the ISLs of
-              // currently-active requests across distinct cids. Within one
-              // cid turns are sequential so each cid contributes at most
-              // one in-flight ISL; across cids we treat content as
-              // independent (cross-conv prefix sharing adds <1pp in
-              // practice). Smooth with a 30s time-weighted rolling average
-              // so brief turn-handoff dips don't dominate the chart.
-              const raw = inflightUniqueTokens(timelineQuery.data.requests);
-              const smoothed = timeRollingAverage(raw, 30);
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'In flight (avg 30s)',
-                      data: smoothed,
-                      rawData: raw,
-                      color: '#a855f7',
-                      strokeWidth: 2,
-                    },
-                    {
-                      name: 'Cumulative average',
-                      data: cumulativeTimeAverage(raw),
-                      color: '#ef4444',
-                      strokeWidth: 3,
-                    },
-                  ]}
-                  durationS={timelineQuery.data.durationS}
-                  yAxisLabel="Tokens"
-                  {...size}
-                />
-              );
-            }}
-          />
-        </div>
+            <ExpandableChart
+              title="Unique input tokens in flight"
+              testId="unique-input-inflight-chart"
+              render={(expanded) => {
+                const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+                if (!phaseTimeline) {
+                  return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+                }
+                // Step function: at each request start/end, sum the ISLs of
+                // currently-active requests across distinct cids. Within one
+                // cid turns are sequential so each cid contributes at most
+                // one in-flight ISL; across cids we treat content as
+                // independent (cross-conv prefix sharing adds <1pp in
+                // practice). Smooth with a 30s time-weighted rolling average
+                // so brief turn-handoff dips don't dominate the chart.
+                const raw = inflightUniqueTokens(phaseTimeline.requests);
+                const smoothed = timeRollingAverage(raw, 30);
+                // KV-cache pool size (vLLM only) drawn as a constant ceiling so
+                // you can see how close the working set gets to eviction
+                // pressure. Phase-independent — it's a static config value.
+                const pool = metrics?.kvCachePoolTokens ?? null;
+                return (
+                  <TimeSeriesChart
+                    series={[
+                      {
+                        name: 'In flight (avg 30s)',
+                        data: smoothed,
+                        rawData: raw,
+                        color: '#a855f7',
+                        strokeWidth: 2,
+                      },
+                      {
+                        name: 'Cumulative average',
+                        data: cumulativeTimeAverage(raw),
+                        color: '#ef4444',
+                        strokeWidth: 3,
+                      },
+                    ]}
+                    durationS={phaseTimeline.durationS}
+                    yAxisLabel="Tokens"
+                    refLines={
+                      pool && pool > 0
+                        ? [{ value: pool, label: `KV cache pool · ${fmtTokensCompact(pool)}` }]
+                        : undefined
+                    }
+                    {...size}
+                  />
+                );
+              }}
+            />
+          </div>
+        </>
       )}
     </div>
   );
diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.test.ts b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts
new file mode 100644
index 00000000..ef6cdaab
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts
@@ -0,0 +1,212 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline';
+import {
+  phaseBoundaryNs,
+  phaseBoundarySec,
+  requestsForPhase,
+  sliceServerSeriesByPhase,
+  sliceTimelineByPhase,
+  timelineHasWarmup,
+  type ServerSeriesLike,
+} from './phase-slice';
+
+function req(overrides: Partial<RequestRecord>): RequestRecord {
+  return {
+    cid: 'c',
+    ti: 0,
+    wid: 'w',
+    ad: 0,
+    phase: 'profiling',
+    credit: 0,
+    start: 0,
+    ack: null,
+    end: 1,
+    ttftMs: null,
+    tpotMs: null,
+    isl: null,
+    osl: null,
+    cancelled: false,
+    ...overrides,
+  };
+}
+
+function timeline(requests: RequestRecord[], startNs = 1_000): RequestTimeline {
+  return { version: 3, startNs, endNs: startNs + 1, durationS: 1, requests };
+}
+
+function makeSeries(ts: number[]): ServerSeriesLike {
+  const pts = ts.map((t) => ({ t, value: t * 10 }));
+  return {
+    kvCacheUsage: pts,
+    prefixCacheHitRate: pts,
+    queueDepth: ts.map((t) => ({ t, running: t, waiting: t + 1, total: 2 * t + 1 })),
+    promptTokensBySource: { src: pts },
+    prefillTps: pts,
+    decodeTps: pts,
+    prefixCacheHitsTps: pts,
+    hostKvCacheUsage: pts,
+    kvCacheUsageByEngine: [{ engineLabel: 'e0', points: pts }],
+  };
+}
+
+describe('phaseBoundaryNs', () => {
+  it('returns null when there are no profiling requests', () => {
+    expect(phaseBoundaryNs(timeline([req({ phase: 'warmup', start: 5 })]))).toBeNull();
+  });
+
+  it('returns null when there are no warmup requests', () => {
+    expect(phaseBoundaryNs(timeline([req({ phase: 'profiling', start: 5 })]))).toBeNull();
+  });
+
+  it('returns startNs + earliest profiling start when both phases present', () => {
+    const t = timeline(
+      [
+        req({ phase: 'warmup', start: 0 }),
+        req({ phase: 'profiling', start: 900 }),
+        req({ phase: 'profiling', start: 700 }),
+      ],
+      1_000,
+    );
+    expect(phaseBoundaryNs(t)).toBe(1_700);
+  });
+
+  it('returns null for nullish timeline', () => {
+    expect(phaseBoundaryNs(null)).toBeNull();
+    expect(phaseBoundaryNs(undefined)).toBeNull();
+  });
+});
+
+describe('phaseBoundarySec', () => {
+  it('rebases through absolute ns by subtracting serverMetrics.startNs (origin gap)', () => {
+    // timeline origin and server-metrics origin differ — the classic ~124s gap.
+    const tl = timeline(
+      [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 600 * 1e9 })],
+      200 * 1e9, // timeline.startNs
+    );
+    // boundaryNs = 200e9 + 600e9 = 800e9 ; serverMetrics origin = 124e9 earlier
+    const boundarySec = phaseBoundarySec({ startNs: 76 * 1e9 }, tl);
+    // (800e9 - 76e9)/1e9 = 724
+    expect(boundarySec).toBe(724);
+  });
+
+  it('clamps a negative mapping to 0', () => {
+    const tl = timeline(
+      [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 0 })],
+      0,
+    );
+    expect(phaseBoundarySec({ startNs: 5 * 1e9 }, tl)).toBe(0);
+  });
+
+  it('returns null when serverMetrics missing or no split', () => {
+    const tl = timeline(
+      [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 1e9 })],
+      0,
+    );
+    expect(phaseBoundarySec(null, tl)).toBeNull();
+    expect(phaseBoundarySec({ startNs: 0 }, timeline([req({ phase: 'profiling' })]))).toBeNull();
+  });
+});
+
+describe('timelineHasWarmup', () => {
+  it('detects warmup presence', () => {
+    expect(timelineHasWarmup(timeline([req({ phase: 'profiling' })]))).toBe(false);
+    expect(timelineHasWarmup(timeline([req({ phase: 'warmup' })]))).toBe(true);
+    expect(timelineHasWarmup(null)).toBe(false);
+  });
+});
+
+describe('sliceServerSeriesByPhase', () => {
+  it('is an identity passthrough (full duration) when boundary is null', () => {
+    const s = makeSeries([0, 1, 2]);
+    const out = sliceServerSeriesByPhase(s, 'profiling', null, 99);
+    expect(out.series).toBe(s);
+    expect(out.durationS).toBe(99);
+  });
+
+  it('warmup keeps t < boundary, no rebase, durationS = boundary', () => {
+    const s = makeSeries([0, 1, 2, 3, 4]);
+    const out = sliceServerSeriesByPhase(s, 'warmup', 2, 5);
+    expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1]); // excludes t===2
+    expect(out.durationS).toBe(2);
+  });
+
+  it('profiling keeps t >= boundary and rebases to start at 0', () => {
+    const s = makeSeries([0, 1, 2, 3, 4]);
+    const out = sliceServerSeriesByPhase(s, 'profiling', 2, 5);
+    expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1, 2]); // 2,3,4 -> 0,1,2
+    expect(out.series.kvCacheUsage.map((p) => p.value)).toEqual([20, 30, 40]); // values preserved
+    expect(out.durationS).toBe(3); // 5 - 2
+  });
+
+  it('slices queueDepth, promptTokensBySource, and kvCacheUsageByEngine; preserves queue fields', () => {
+    const s = makeSeries([0, 1, 2, 3]);
+    const out = sliceServerSeriesByPhase(s, 'profiling', 2, 4);
+    expect(out.series.queueDepth).toEqual([
+      { t: 0, running: 2, waiting: 3, total: 5 },
+      { t: 1, running: 3, waiting: 4, total: 7 },
+    ]);
+    expect(out.series.promptTokensBySource.src.map((p) => p.t)).toEqual([0, 1]);
+    expect(out.series.kvCacheUsageByEngine[0]!.points.map((p) => p.t)).toEqual([0, 1]);
+    expect(out.series.kvCacheUsageByEngine[0]!.engineLabel).toBe('e0');
+  });
+
+  it('does not mutate the input series', () => {
+    const s = makeSeries([0, 1, 2]);
+    const before = s.kvCacheUsage.map((p) => p.t);
+    sliceServerSeriesByPhase(s, 'profiling', 1, 3);
+    expect(s.kvCacheUsage.map((p) => p.t)).toEqual(before);
+  });
+});
+
+describe('requestsForPhase', () => {
+  const rs = [
+    req({ phase: 'warmup', isl: 1 }),
+    req({ phase: 'profiling', isl: 2 }),
+    req({ phase: 'unknown', isl: 3 }),
+  ];
+
+  it('profiling selects only profiling rows', () => {
+    expect(requestsForPhase(rs, 'profiling').map((r) => r.isl)).toEqual([2]);
+  });
+
+  it('warmup selects everything that is not profiling', () => {
+    expect(requestsForPhase(rs, 'warmup').map((r) => r.isl)).toEqual([1, 3]);
+  });
+});
+
+describe('sliceTimelineByPhase', () => {
+  // startNs origin = 1000; warmup request at offset 0..50, profiling at 100..300.
+  const tl = timeline(
+    [
+      req({ phase: 'warmup', credit: 0, start: 0, ack: 10, end: 50, isl: 1 }),
+      req({ phase: 'profiling', credit: 90, start: 100, ack: 120, end: 300, isl: 2 }),
+    ],
+    1_000,
+  );
+  // tl.durationS default = 1 from helper; override for window math.
+  const tlDur: RequestTimeline = { ...tl, durationS: 3 };
+
+  it('returns the input unchanged for a single-phase timeline', () => {
+    const single = timeline([req({ phase: 'profiling', start: 5 })]);
+    expect(sliceTimelineByPhase(single, 'profiling')).toBe(single);
+  });
+
+  it('warmup keeps pre-boundary requests, no rebase, startNs unchanged', () => {
+    const out = sliceTimelineByPhase(tlDur, 'warmup');
+    expect(out.requests.map((r) => r.isl)).toEqual([1]);
+    expect(out.requests[0]!.start).toBe(0); // not rebased
+    expect(out.startNs).toBe(1_000);
+  });
+
+  it('profiling keeps post-boundary requests and rebases offsets + startNs', () => {
+    const out = sliceTimelineByPhase(tlDur, 'profiling');
+    expect(out.requests.map((r) => r.isl)).toEqual([2]);
+    // boundary offset = 100 → rebased: start 100→0, end 300→200, ack 120→20, credit 90→-10
+    expect(out.requests[0]!.start).toBe(0);
+    expect(out.requests[0]!.end).toBe(200);
+    expect(out.requests[0]!.ack).toBe(20);
+    // startNs shifts forward by the boundary offset so absolute time is preserved
+    expect(out.startNs).toBe(1_100);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.ts b/packages/app/src/components/inference/agentic-point/phase-slice.ts
new file mode 100644
index 00000000..e6e17719
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/phase-slice.ts
@@ -0,0 +1,188 @@
+/**
+ * Warmup vs profiling phase slicing for the agentic per-point detail page.
+ *
+ * Agentic trace-replay runs have two phases: a warmup (cache-warming) pass, then
+ * the measured profiling window. The server-metric time-series (`chart_series`)
+ * spans the whole run with no per-point phase label, but the per-request
+ * `request_timeline` IS phase-tagged. We derive the warmup→profiling boundary
+ * from the timeline and slice the server series at it.
+ *
+ * ⚠️ ORIGIN-GAP INVARIANT: the two payloads share the aiperf clock but have
+ * DIFFERENT zero origins — `serverMetrics.startNs` is the first server scrape,
+ * `timeline.startNs` is the first request's credit (observed ~124 s apart in
+ * real runs). The boundary must therefore be rebased through absolute ns by
+ * subtracting `serverMetrics.startNs`; a same-axis offset comparison would be
+ * off by the origin gap. This rebasing lives in `phaseBoundarySec` only.
+ */
+
+import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline';
+import type {
+  QueueDepthPoint,
+  TimeSeriesPoint,
+  TraceServerMetrics,
+} from '@/hooks/api/use-trace-server-metrics';
+
+export type StagePhase = 'warmup' | 'profiling';
+
+/**
+ * The subset of server-metric series the per-point charts render. Both the
+ * top-level `TraceServerMetrics` and a per-source object (after the detail page
+ * remaps `promptTps`→`prefillTps`, `generationTps`→`decodeTps`) are assignable.
+ */
+export interface ServerSeriesLike {
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+  prefixCacheHitsTps: TimeSeriesPoint[];
+  hostKvCacheUsage: TimeSeriesPoint[];
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+}
+
+/** Whether any request has a phase other than 'profiling'; a nullish timeline yields false. */
+export function timelineHasWarmup(timeline: RequestTimeline | null | undefined): boolean {
+  return timeline != null && timeline.requests.some((req) => req.phase !== 'profiling');
+}
+
+/**
+ * Wall-clock instant (absolute ns) at which profiling starts, computed as
+ * `timeline.startNs` plus the smallest `start` offset among profiling requests.
+ * A boundary only exists when the timeline holds BOTH a profiling request and a
+ * non-profiling (warmup) one; otherwise — or for a nullish timeline — it is null.
+ */
+export function phaseBoundaryNs(timeline: RequestTimeline | null | undefined): number | null {
+  if (!timeline) return null;
+  let sawWarmup = false;
+  let earliest: number | null = null;
+  for (const { phase, start } of timeline.requests) {
+    if (phase !== 'profiling') {
+      sawWarmup = true;
+      continue;
+    }
+    if (earliest === null || start < earliest) earliest = start;
+  }
+  if (!sawWarmup || earliest === null) return null;
+  return timeline.startNs + earliest;
+}
+
+/**
+ * Profiling-start boundary in seconds on the SERVER-METRIC chart's t-axis, i.e.
+ * measured from `serverMetrics.startNs` rather than from the timeline origin.
+ * The two origins differ (origin-gap invariant, file header), so the absolute
+ * boundary is rebased by subtracting `serverMetrics.startNs`; a boundary that
+ * would land before that origin is clamped to 0. Null when `serverMetrics` is
+ * absent or the timeline has no warmup/profiling split.
+ */
+export function phaseBoundarySec(
+  serverMetrics: Pick<TraceServerMetrics, 'startNs'> | null | undefined,
+  timeline: RequestTimeline | null | undefined,
+): number | null {
+  const absoluteNs = serverMetrics ? phaseBoundaryNs(timeline) : null;
+  if (!serverMetrics || absoluteNs === null) return null;
+  const offsetNs = absoluteNs - serverMetrics.startNs;
+  return Math.max(0, offsetNs / 1e9);
+}
+
+export interface PhaseSlicedSeries<S> {
+  series: S;
+  durationS: number;
+}
+
+/**
+ * Cut every server-metric series down to a single phase at `boundarySec`:
+ *  - warmup:    points with `t < boundarySec`, timestamps untouched; the
+ *               returned `durationS` is the boundary itself
+ *  - profiling: points with `t >= boundarySec` (a point exactly on the boundary
+ *               counts as profiling), shifted so the boundary becomes `t = 0`;
+ *               `durationS` is `fullDurationS - boundarySec`, floored at 1
+ *
+ * With a null boundary the input `series` object itself is returned alongside
+ * `fullDurationS`. Otherwise the result is a fresh object built from new arrays
+ * and new points — the input is never mutated. Properties of `series` beyond
+ * the `ServerSeriesLike` fields are copied across unchanged. Because profiling
+ * restarts at 0, cumulative charts read as "since profiling start" (intended).
+ */
+export function sliceServerSeriesByPhase<S extends ServerSeriesLike>(
+  series: S,
+  phase: StagePhase,
+  boundarySec: number | null,
+  fullDurationS: number,
+): PhaseSlicedSeries<S> {
+  if (boundarySec === null) return { series, durationS: fullDurationS };
+  const cut = boundarySec;
+  const isWarmup = phase === 'warmup';
+  const isProfiling = phase === 'profiling';
+
+  // One generic slicer for both point shapes: select on `t`, rewrite only `t`.
+  const inPhase = (t: number) => (isWarmup ? t < cut : t >= cut);
+  const slice = <P extends { t: number }>(points: P[]): P[] =>
+    points.flatMap((p) => (inPhase(p.t) ? [{ ...p, t: isProfiling ? p.t - cut : p.t }] : []));
+  const sliceBySource = (bySource: Record<string, TimeSeriesPoint[]>) => {
+    const sliced: Record<string, TimeSeriesPoint[]> = {};
+    Object.entries(bySource).forEach(([source, points]) => {
+      sliced[source] = slice(points);
+    });
+    return sliced;
+  };
+
+  const slicedFields: ServerSeriesLike = {
+    kvCacheUsage: slice(series.kvCacheUsage),
+    prefixCacheHitRate: slice(series.prefixCacheHitRate),
+    queueDepth: slice(series.queueDepth),
+    promptTokensBySource: sliceBySource(series.promptTokensBySource),
+    prefillTps: slice(series.prefillTps),
+    decodeTps: slice(series.decodeTps),
+    prefixCacheHitsTps: slice(series.prefixCacheHitsTps),
+    hostKvCacheUsage: slice(series.hostKvCacheUsage),
+    kvCacheUsageByEngine: series.kvCacheUsageByEngine.map(({ engineLabel, points }) => ({
+      engineLabel,
+      points: slice(points),
+    })),
+  };
+
+  const durationS = isWarmup ? cut : Math.max(1, fullDurationS - cut);
+  return { series: { ...series, ...slicedFields } as S, durationS };
+}
+
+/** Requests belonging to `phase`, by tag: 'profiling' is exact, warmup is every other tag. */
+export function requestsForPhase(requests: RequestRecord[], phase: StagePhase): RequestRecord[] {
+  const wantProfiling = phase !== 'warmup';
+  // A request is "profiling" only when tagged so; any other tag counts as warmup.
+  return requests.filter((req) => (req.phase === 'profiling') === wantProfiling);
+}
+
+/**
+ * Scope a request timeline to one phase: keep the requests on that side of the
+ * warmup→profiling boundary and, for profiling, rebase every ns offset (and
+ * `startNs`) so the phase starts at t=0 — mirroring `sliceServerSeriesByPhase`.
+ * `durationS` becomes the phase window. Returns the input unchanged when there
+ * is no warmup/profiling split. Pure — new object, original untouched.
+ *
+ * The boundary is the earliest profiling `start`, read directly on the REQUEST
+ * clock. It is not derived as `phaseBoundaryNs() - startNs`: once `startNs`
+ * exceeds 2^53 that float round trip can move the boundary past that request.
+ */
+export function sliceTimelineByPhase(
+  timeline: RequestTimeline,
+  phase: StagePhase,
+): RequestTimeline {
+  const hasWarmup = timeline.requests.some((r) => r.phase !== 'profiling');
+  const starts = timeline.requests.filter((r) => r.phase === 'profiling').map((r) => r.start);
+  if (!hasWarmup || starts.length === 0) return timeline;
+  const boundaryOff = starts.reduce((min, s) => (s < min ? s : min)); // ns, request clock
+  const inPhase = (r: RequestRecord) =>
+    phase === 'warmup' ? r.start < boundaryOff : r.start >= boundaryOff;
+  const shift = phase === 'profiling' ? boundaryOff : 0;
+  const requests = timeline.requests.filter(inPhase).map((r) => ({
+    ...r,
+    credit: r.credit - shift,
+    start: r.start - shift,
+    ack: r.ack === null ? null : r.ack - shift,
+    end: r.end - shift,
+  }));
+  const durationS =
+    phase === 'warmup' ? boundaryOff / 1e9 : Math.max(1, timeline.durationS - boundaryOff / 1e9);
+  return { ...timeline, startNs: timeline.startNs + shift, requests, durationS };
+}
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
index d15da878..fe3c1231 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -2,7 +2,13 @@ import { describe, expect, it } from 'vitest';
 
 import type { RequestRecord } from '@/hooks/api/use-request-timeline';
 
-import { buildRequestTimelineRows, requestIdleStats, splitTimelineCid } from './request-timeline';
+import {
+  buildRequestTimelineRows,
+  parseTimelineViewSnapshot,
+  requestIdleStats,
+  splitTimelineCid,
+  type TimelineViewSnapshot,
+} from './request-timeline';
 
 const request = (start: number, end: number): RequestRecord => ({
   cid: 'conversation',
@@ -99,3 +105,70 @@ describe('subagent timeline hierarchy', () => {
     expect(rows[2]!.requests).toHaveLength(1);
   });
 });
+
+describe('parseTimelineViewSnapshot', () => {
+  const full: TimelineViewSnapshot = {
+    viewStart: 1_000,
+    viewEnd: 5_000,
+    rowMode: 'worker',
+    phaseFilter: 'warmup',
+    expanded: ['conv::sa:subagent_001_abcd'],
+    scrollTop: 240,
+    scrollLeft: 80,
+  };
+
+  it('round-trips a full snapshot', () => {
+    expect(parseTimelineViewSnapshot(JSON.stringify(full))).toEqual(full);
+  });
+
+  it('round-trips the profiling phase and rejects the removed "all" value', () => {
+    expect(
+      parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'profiling' }))?.phaseFilter,
+    ).toBe('profiling');
+    // 'all' is no longer a valid phase — coerces back to the profiling default.
+    expect(
+      parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'all' }))?.phaseFilter,
+    ).toBe('profiling');
+  });
+
+  it('returns null for absent or unparseable input', () => {
+    expect(parseTimelineViewSnapshot(null)).toBeNull();
+    expect(parseTimelineViewSnapshot('')).toBeNull();
+    expect(parseTimelineViewSnapshot('{not json')).toBeNull();
+    expect(parseTimelineViewSnapshot('42')).toBeNull();
+  });
+
+  it('preserves a null viewEnd (not zoomed) and rejects non-finite viewEnd', () => {
+    const restored = parseTimelineViewSnapshot(JSON.stringify({ ...full, viewEnd: null }));
+    expect(restored?.viewEnd).toBeNull();
+    // NaN / Infinity don't survive JSON, but a malformed string value must coerce to null.
+    expect(parseTimelineViewSnapshot('{"viewEnd":"oops"}')?.viewEnd).toBeNull();
+  });
+
+  it('falls back to defaults for invalid enums and missing numbers', () => {
+    expect(parseTimelineViewSnapshot('{}')).toEqual({
+      viewStart: 0,
+      viewEnd: null,
+      rowMode: 'conversation',
+      phaseFilter: 'profiling',
+      expanded: [],
+      scrollTop: 0,
+      scrollLeft: 0,
+    });
+    const bogus = parseTimelineViewSnapshot(
+      JSON.stringify({ rowMode: 'nope', phaseFilter: 'nope', viewStart: 'x', scrollTop: null }),
+    )!;
+    expect(bogus.rowMode).toBe('conversation');
+    expect(bogus.phaseFilter).toBe('profiling');
+    expect(bogus.viewStart).toBe(0);
+    expect(bogus.scrollTop).toBe(0);
+  });
+
+  it('drops non-string entries from the expanded list', () => {
+    expect(parseTimelineViewSnapshot('{"expanded":["a",1,null,"b"]}')!.expanded).toEqual([
+      'a',
+      'b',
+    ]);
+    expect(parseTimelineViewSnapshot('{"expanded":"nope"}')!.expanded).toEqual([]);
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index bdf0a9b9..f3870bb1 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -1,12 +1,14 @@
 'use client';
 
-import { useCallback, useMemo, useRef, useState } from 'react';
+import { useCallback, useLayoutEffect, useMemo, useRef, useState } from 'react';
 import { useRouter } from 'next/navigation';
 
 import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
 import { track } from '@/lib/analytics';
 
+import { requestsForPhase, type StagePhase } from './phase-slice';
+
 /**
  * The dataset conversation id for a request: the cid with any subagent/forked
  * suffix (`::sa:…`, `::fa:…`) stripped. This is exactly the `conv_id` stored in
@@ -79,13 +81,110 @@ const ROW_MODE_OPTIONS: SegmentedToggleOption<RowMode>[] = [
   { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' },
 ];
 
-type PhaseFilter = 'all' | 'profiling';
+// Two phases shown separately (no combined view) — matches the per-point detail
+// stage toggle. Reuses StagePhase so the filter predicate is shared.
+type PhaseFilter = StagePhase;
 
 const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
   { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' },
-  { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' },
+  { value: 'warmup', label: 'Warmup', testId: 'timeline-phase-warmup' },
 ];
 
+/**
+ * Persisted snapshot of the timeline's view state, used to restore the user's
+ * zoom / scroll / filter position when they return to the page (e.g. clicking a
+ * request to open the dataset flamegraph, then hitting the browser back button).
+ * Stored in sessionStorage keyed by point id; written on click-through and
+ * consumed once on the next mount.
+ */
+export interface TimelineViewSnapshot {
+  /** Zoom-pan window start (ns offset from dataStart). */
+  viewStart: number;
+  /** Zoom-pan window end, or null when not zoomed (full extent). */
+  viewEnd: number | null;
+  rowMode: RowMode;
+  phaseFilter: PhaseFilter;
+  /** Keys of expanded multi-stream subagent rows. */
+  expanded: string[];
+  /** Scroll container offsets (vertical row scroll + horizontal). */
+  scrollTop: number;
+  scrollLeft: number;
+}
+
+const TIMELINE_VIEW_SNAPSHOT_PREFIX = 'agentic-timeline-view:';
+const ROW_MODE_VALUES: readonly RowMode[] = ['conversation', 'worker'];
+const PHASE_FILTER_VALUES: readonly PhaseFilter[] = ['warmup', 'profiling'];
+
+const finiteOr = (value: unknown, fallback: number): number =>
+  typeof value === 'number' && Number.isFinite(value) ? value : fallback;
+
+/**
+ * Parse a persisted snapshot, validating each field and falling back to its default
+ * so a stale/malformed blob can never break restore. Returns null when the input is
+ * absent, not parseable JSON, or not a JSON object (primitives and arrays are rejected).
+ */
+export function parseTimelineViewSnapshot(raw: string | null): TimelineViewSnapshot | null {
+  if (!raw) return null;
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    return null;
+  }
+  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return null;
+  const record = parsed as Record<string, unknown>;
+  const rowMode = ROW_MODE_VALUES.includes(record.rowMode as RowMode)
+    ? (record.rowMode as RowMode)
+    : 'conversation';
+  const phaseFilter = PHASE_FILTER_VALUES.includes(record.phaseFilter as PhaseFilter)
+    ? (record.phaseFilter as PhaseFilter)
+    : 'profiling';
+  const viewEnd =
+    typeof record.viewEnd === 'number' && Number.isFinite(record.viewEnd) ? record.viewEnd : null;
+  const expanded = Array.isArray(record.expanded)
+    ? record.expanded.filter((entry): entry is string => typeof entry === 'string')
+    : [];
+  return {
+    viewStart: finiteOr(record.viewStart, 0),
+    viewEnd,
+    rowMode,
+    phaseFilter,
+    expanded,
+    scrollTop: finiteOr(record.scrollTop, 0),
+    scrollLeft: finiteOr(record.scrollLeft, 0),
+  };
+}
+
+function timelineSnapshotKey(pointId: number): string {
+  return TIMELINE_VIEW_SNAPSHOT_PREFIX + String(pointId); // one storage key per point id
+}
+
+function saveTimelineViewSnapshot(pointId: number, snapshot: TimelineViewSnapshot): void {
+  if (typeof window === 'undefined') return; // no `window` → nowhere to persist
+  const storageKey = timelineSnapshotKey(pointId);
+  try {
+    window.sessionStorage.setItem(storageKey, JSON.stringify(snapshot));
+  } catch {
+    // Private mode / quota errors are swallowed: restore is best-effort only.
+  }
+}
+
+/**
+ * One-shot read: return this point's stored snapshot and delete it in the same
+ * step, so a later reload of the point starts from defaults instead of restoring.
+ */
+function consumeTimelineViewSnapshot(pointId: number): TimelineViewSnapshot | null {
+  if (typeof window === 'undefined') return null;
+  const storageKey = timelineSnapshotKey(pointId);
+  try {
+    const raw = window.sessionStorage.getItem(storageKey);
+    window.sessionStorage.removeItem(storageKey);
+    return parseTimelineViewSnapshot(raw);
+  } catch {
+    return null;
+  }
+}
+
 // The timeline body is capped at this height and scrolls internally, so a run
 // with many conversations/workers doesn't make the card grow unbounded and push
 // the rest of the detail page down. Sized to show ~16 rows + the header.
@@ -497,19 +596,54 @@ function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean })
 export function RequestTimelineView({
   data,
   datasetSlug,
+  pointId,
 }: {
   data: RequestTimeline;
   /** Source dataset slug for this run; enables click-to-conversation deep links. */
   datasetSlug?: string | null;
+  /** benchmark_results.id — keys the per-point view-state snapshot for restore. */
+  pointId: number;
 }) {
   const router = useRouter();
   const [rowMode, setRowMode] = useState<RowMode>('conversation');
   const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
 
+  // The scroll container (vertical row scroll + horizontal chart scroll) and a
+  // ref mirror of the live view state, so click-through can snapshot the exact
+  // position without rebuilding openConversation on every zoom/pan tick.
+  const scrollRef = useRef<HTMLDivElement>(null);
+  const liveStateRef = useRef<{
+    viewStart: number;
+    viewEnd: number | null;
+    rowMode: RowMode;
+    phaseFilter: PhaseFilter;
+    expandedSubagents: ReadonlySet<string>;
+  }>({
+    viewStart: 0,
+    viewEnd: null,
+    rowMode: 'conversation',
+    phaseFilter: 'profiling',
+    expandedSubagents: new Set(),
+  });
+
   const openConversation = useCallback(
     (req: RequestRecord) => {
       if (!datasetSlug) return;
+      // Snapshot the current zoom/scroll/filter position so the browser back
+      // button restores it (see the restore effect below).
+      if (scrollRef.current) {
+        const live = liveStateRef.current;
+        saveTimelineViewSnapshot(pointId, {
+          viewStart: live.viewStart,
+          viewEnd: live.viewEnd,
+          rowMode: live.rowMode,
+          phaseFilter: live.phaseFilter,
+          expanded: [...live.expandedSubagents],
+          scrollTop: scrollRef.current.scrollTop,
+          scrollLeft: scrollRef.current.scrollLeft,
+        });
+      }
       const convId = datasetConvId(req.cid);
       // Carry the turn (and, for subagent requests, the subagent id) so the
       // flamegraph can scroll to / highlight the exact node this bar maps to.
@@ -521,7 +655,7 @@ export function RequestTimelineView({
         `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`,
       );
     },
-    [datasetSlug, router],
+    [datasetSlug, router, pointId],
   );
   // Which multi-stream subagents currently have their per-stream rows
   // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
@@ -549,10 +683,7 @@ export function RequestTimelineView({
   // Apply phase filter, then group into rows. With no warmup data the filter
   // collapses to "profiling" regardless of the (hidden) toggle state.
   const filtered = useMemo(
-    () =>
-      phaseFilter === 'all' && hasWarmup
-        ? data.requests
-        : data.requests.filter((r) => r.phase === 'profiling'),
+    () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'),
     [data.requests, phaseFilter, hasWarmup],
   );
   const rows = useMemo(
@@ -595,6 +726,34 @@ export function RequestTimelineView({
   const visibleDur = Math.max(vEnd - vStart, 1);
   const isZoomed = viewEnd !== null;
 
+  // Mirror the live view state into a ref so the click-through snapshot reads
+  // the latest values without rebuilding openConversation on every zoom tick.
+  liveStateRef.current = { viewStart, viewEnd, rowMode, phaseFilter, expandedSubagents };
+
+  // Restore the snapshot written on click-through (e.g. open a request in the
+  // dataset flamegraph, then hit the browser back button). Runs once per mount,
+  // keyed by point id; the snapshot is consumed so a later reload starts fresh.
+  // Scroll is applied after the restored filters/expansions re-render the rows
+  // (rAF fires after that synchronous commit, before paint — no visible jump).
+  useLayoutEffect(() => {
+    const snapshot = consumeTimelineViewSnapshot(pointId);
+    if (!snapshot) return;
+    setRowMode(snapshot.rowMode);
+    setPhaseFilter(snapshot.phaseFilter);
+    setExpandedSubagents(new Set(snapshot.expanded));
+    setViewStart(snapshot.viewStart);
+    setViewEnd(snapshot.viewEnd);
+    const target = { top: snapshot.scrollTop, left: snapshot.scrollLeft };
+    requestAnimationFrame(() => {
+      const el = scrollRef.current;
+      if (!el) return;
+      el.scrollTop = target.top;
+      el.scrollLeft = target.left;
+    });
+    // setState setters are stable; only re-run if the point itself changes.
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [pointId]);
+
   // Layout
   // Wide enough for a full 36-char conversation id at 10px font, plus the
   // indent + color stripe + count badge. Subagent rows inherit the same
@@ -778,7 +937,11 @@ export function RequestTimelineView({
             horizontally inside it, so the card doesn't grow to fit every
             conversation/worker AND the horizontal scrollbar stays pinned to the
             window's bottom edge (rather than the bottom of the tall content). */}
-        <div className="overflow-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
+        <div
+          ref={scrollRef}
+          className="overflow-auto"
+          style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}
+        >
           <div className="flex w-max">
             {/* Label column — pinned left (sticky) so it stays put during
                 horizontal scroll, while scrolling vertically with the rows. */}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
index a9ece859..9f6adc6a 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
@@ -79,11 +79,11 @@ describe('rollingRequestMetric', () => {
     expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8);
   });
 
-  it('drops warmup, cancelled, missing, and non-positive samples', () => {
+  it('drops cancelled, missing, and non-positive samples (phase is the caller’s concern)', () => {
     const result = rollingRequestMetric(
       [
         request(1, 100, 10),
-        request(2, 200, 20, { phase: 'warmup' }),
+        request(2, 200, 20, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline
         request(3, 300, 30, { cancelled: true }),
         request(4, null, null),
         request(5, 0, 0),
@@ -92,9 +92,10 @@ describe('rollingRequestMetric', () => {
       'p90',
     );
 
-    expect(result.raw).toEqual([{ t: 1, value: 0.1 }]);
-    expect(result.trend).toEqual([{ t: 1, value: 0.1 }]);
-    expect(result.cumulative).toEqual([{ t: 1, value: 0.1 }]);
+    expect(result.raw).toEqual([
+      { t: 1, value: 0.1 },
+      { t: 2, value: 0.2 },
+    ]);
   });
 });
 
@@ -161,22 +162,23 @@ describe('cumulativeTimeAverage', () => {
 });
 
 describe('cumulativeCompletedRequests', () => {
-  it('sorts profiling completions and excludes warmup and cancelled requests', () => {
+  it('sorts completions and excludes cancelled requests (phase is the caller’s concern)', () => {
     expect(
       cumulativeCompletedRequests([
         request(4, 100, 10),
         request(2, 100, 10),
-        request(1, 100, 10, { phase: 'warmup' }),
+        request(1, 100, 10, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline
         request(3, 100, 10, { cancelled: true }),
       ]),
     ).toEqual([
       { t: 0, value: 0 },
-      { t: 2, value: 1 },
-      { t: 4, value: 2 },
+      { t: 1, value: 1 },
+      { t: 2, value: 2 },
+      { t: 4, value: 3 },
     ]);
   });
 
-  it('returns no series when there are no successful profiling completions', () => {
+  it('returns no series when there are no successful completions', () => {
     expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]);
   });
 });
@@ -199,17 +201,22 @@ describe('averageSequenceLengthInFlight', () => {
     ]);
   });
 
-  it('excludes cancelled, warmup, and missing sequence lengths', () => {
+  it('excludes cancelled and missing sequence lengths (phase is the caller’s concern)', () => {
+    // Only the null-osl and cancelled rows are dropped; the warmup row is kept
+    // (the caller passes a phase-scoped timeline), so it produces a step series.
     expect(
       averageSequenceLengthInFlight(
         [
           request(1, 100, 10, { osl: null }),
           request(2, 100, 10, { osl: 20, cancelled: true }),
-          request(3, 100, 10, { osl: 30, phase: 'warmup' }),
+          request(3, 100, 10, { osl: 30, phase: 'warmup', start: 0, end: 3_000_000_000 }),
         ],
         'osl',
       ),
-    ).toEqual([]);
+    ).toEqual([
+      { t: 0, value: 30 },
+      { t: 3, value: 0 },
+    ]);
   });
 });
 
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index ab744286..088a5e3b 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -23,6 +23,14 @@ interface Series {
   hideFromHover?: boolean;
 }
 
+/** A constant horizontal reference line (e.g. a capacity ceiling). */
+export interface ReferenceLine {
+  value: number;
+  label: string;
+  /** Line + label color. Defaults to a muted emerald. */
+  color?: string;
+}
+
 interface TimeSeriesChartProps {
   series: Series[];
   durationS: number;
@@ -31,8 +39,16 @@ interface TimeSeriesChartProps {
   yAxisLabel?: string;
   width?: number;
   height?: number;
+  /**
+   * Horizontal reference lines drawn across the plot. Their values are folded
+   * into the auto y-max so the line stays on-chart even when it exceeds the
+   * data (e.g. a KV-cache pool ceiling well above the working set).
+   */
+  refLines?: readonly ReferenceLine[];
 }
 
+const NO_REF_LINES: readonly ReferenceLine[] = [];
+
 export type RequestMetric = 'interactivity' | 'ttft' | 'e2e';
 export type RequestPercentile = 'p75' | 'p90';
 export type ThroughputSeriesKey = 'input' | 'decode';
@@ -74,8 +90,11 @@ export function rollingRequestMetric(
   windowSize = 50,
 ): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } {
   const q = percentile === 'p75' ? 0.75 : 0.9;
+  // Phase is the caller's concern — the agentic detail page passes a
+  // phase-scoped (warmup or profiling) timeline. Here we only drop cancelled
+  // requests and samples without a usable latency value.
   const samples = requests
-    .filter((request) => request.phase === 'profiling' && !request.cancelled)
+    .filter((request) => !request.cancelled)
     .flatMap((request) => {
       const latencyMs =
         metric === 'ttft'
@@ -244,10 +263,13 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
-/** Cumulative count of successfully completed profiling requests by end time. */
+/**
+ * Cumulative count of successfully completed (non-cancelled) requests by end
+ * time. Phase is the caller's concern — pass a phase-scoped timeline.
+ */
 export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] {
   const completionTimes = requests
-    .filter((request) => request.phase === 'profiling' && !request.cancelled)
+    .filter((request) => !request.cancelled)
     .map((request) => request.end / 1e9)
     .filter(Number.isFinite)
     .toSorted((a, b) => a - b);
@@ -271,10 +293,10 @@ export function averageSequenceLengthInFlight(
     events.set(t, current);
   };
 
+  // Phase is the caller's concern — pass a phase-scoped timeline.
   for (const request of requests) {
     const tokens = request[metric];
     if (
-      request.phase !== 'profiling' ||
       request.cancelled ||
       tokens === null ||
       !Number.isFinite(tokens) ||
@@ -527,6 +549,7 @@ export function TimeSeriesChart({
   yAxisLabel,
   width = 720,
   height = 260,
+  refLines = NO_REF_LINES,
 }: TimeSeriesChartProps) {
   const W = width;
   const H = height;
@@ -536,11 +559,15 @@ export function TimeSeriesChart({
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
     const xMax = Math.max(durationS, 1);
-    const yMax = yMaxOpt ?? Math.max(1e-9, ...series.flatMap((s) => s.data.map((d) => d.value)));
+    // Fold reference-line values into the auto max so a ceiling above the data
+    // (e.g. KV-cache pool >> working set) still renders inside the plot.
+    const refMax = refLines.length > 0 ? Math.max(...refLines.map((r) => r.value)) : 0;
+    const yMax =
+      yMaxOpt ?? Math.max(1e-9, refMax, ...series.flatMap((s) => s.data.map((d) => d.value)));
     const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
     const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
     return { innerW, innerH, xMax, yMax, xScale, yScale };
-  }, [series, durationS, yMaxOpt, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+  }, [series, durationS, yMaxOpt, refLines, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
 
   const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
 
@@ -640,6 +667,38 @@ export function TimeSeriesChart({
         );
       })}
 
+      {/* Horizontal reference lines (e.g. KV-cache pool ceiling). Drawn on top
+          of the data lines, with a label pinned to the right edge. */}
+      {refLines.map((ref, i) => {
+        if (!Number.isFinite(ref.value) || ref.value < 0 || ref.value > yMax) return null;
+        const y = yScale(ref.value);
+        const color = ref.color ?? '#16a34a';
+        return (
+          <g key={`ref${i}`}>
+            <line
+              x1={PAD.left}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke={color}
+              strokeWidth={1.5}
+              strokeDasharray="5 4"
+              opacity={0.85}
+            />
+            <text
+              x={PAD.left + innerW - 4}
+              y={y - 4}
+              fontSize={10}
+              fill={color}
+              opacity={0.95}
+              textAnchor="end"
+            >
+              {ref.label}
+            </text>
+          </g>
+        );
+      })}
+
       {/* X-axis */}
       <line
         x1={PAD.left}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index a16be558..00853e8a 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -77,6 +77,11 @@ export interface TraceServerMetrics {
    * the cluster-average `kvCacheUsage` line covers that case alone.
    */
   kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * Total KV-cache pool size in tokens (num_gpu_blocks × block_size, summed
+   * across engines). vLLM only — null for SGLang/TRT or older rows.
+   */
+  kvCachePoolTokens: number | null;
   /** Orchestrator-normalized metrics grouped by endpoint/worker. */
   metricSources: MetricSourceSeries[];
 }
diff --git a/packages/db/package.json b/packages/db/package.json
index 17d6f627..df28a208 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -22,7 +22,9 @@
     "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+    "db:backfill-agentic-server-logs": "dotenv -e ../../.env -- tsx src/backfill-agentic-server-logs.ts",
     "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts",
+    "db:backfill-kv-pool": "dotenv -e ../../.env -- tsx src/backfill-kv-pool.ts",
     "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
diff --git a/packages/db/src/backfill-agentic-server-logs.ts b/packages/db/src/backfill-agentic-server-logs.ts
new file mode 100644
index 00000000..9f826caf
--- /dev/null
+++ b/packages/db/src/backfill-agentic-server-logs.ts
@@ -0,0 +1,267 @@
+/**
+ * Backfill server logs (and the derived KV-cache pool size) for AGENTIC
+ * benchmark points.
+ *
+ * Agentic runs upload their vLLM server log as a `server_logs_<key>` artifact,
+ * but the ingest path historically failed to link it to agentic rows (the
+ * `bmk_agentic_<key>` → `server_logs_<key>` key mismatch, now fixed in
+ * ingest-ci-run). As a result the agentic server log text was never stored, so
+ * `kv_cache_pool_tokens` cannot be derived from the DB — we must re-fetch the
+ * artifacts from GitHub.
+ *
+ * For each agentic workflow run this:
+ *   1. lists the run's artifacts and keeps only `server_logs_*` + `bmk_agentic_*`
+ *      (dedup by logical name, mirroring ingest's runner-suffix collapse),
+ *   2. downloads + unzips just those (small — skips the multi-MB trace dirs),
+ *   3. maps each `bmk_agentic_<key>` JSON → config → benchmark_results rows via
+ *      the same mapBenchmarkRow/config-cache logic ingest uses,
+ *   4. calls insertServerLog(), which stores+links the log AND derives
+ *      `kv_cache_pool_tokens` into benchmark_results.metrics.
+ *
+ * Idempotent: insertServerLog only links rows whose server_log_id is null.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-server-logs
+ *     [--limit N]   only process the first N workflow runs
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { insertServerLog } from './etl/benchmark-ingest.js';
+import { mapBenchmarkRow } from './etl/benchmark-mapper.js';
+import { createConfigCache } from './etl/config-cache.js';
+import { createAdminSql } from './etl/db-utils.js';
+import { createSkipTracker } from './etl/skip-tracker.js';
+
+const REPO = 'SemiAnalysisAI/InferenceX';
+// Strip the trailing `_<runner-pool>_<idx>` token so `server_logs_<key>` and
+// `bmk_agentic_<key>` collapse to the same logical key (matches ingest).
+const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/u;
+
+function parseFlags(): { limit: number | null } {
+  let limit: number | null = null;
+  for (let i = 2; i < process.argv.length; i++) {
+    if (process.argv[i] === '--limit') {
+      const next = process.argv[++i];
+      if (!next || Number.isNaN(Number(next))) {
+        console.error('--limit requires a numeric argument');
+        process.exit(1);
+      }
+      limit = Number(next);
+    }
+  }
+  return { limit };
+}
+
+const flags = parseFlags();
+const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} });
+
+interface ArtifactMeta {
+  name: string;
+  archive_download_url: string;
+  created_at: string;
+}
+
+/** List the run's artifacts, dedup by logical name keeping the most recent. */
+function listArtifacts(githubRunId: string): Map<string, ArtifactMeta> {
+  const json = execSync(
+    `gh api "repos/${REPO}/actions/runs/${githubRunId}/artifacts" --paginate --jq '.artifacts[]'`,
+    { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 },
+  );
+  const byLogical = new Map<string, ArtifactMeta>();
+  for (const line of json.trim().split('\n')) {
+    if (!line) continue;
+    let a: ArtifactMeta;
+    try {
+      a = JSON.parse(line) as ArtifactMeta;
+    } catch {
+      continue;
+    }
+    if (!a.name.startsWith('server_logs_') && !a.name.startsWith('bmk_agentic_')) continue;
+    const key = a.name.replace(RUNNER_SUFFIX_RE, '');
+    const existing = byLogical.get(key);
+    if (!existing || a.created_at > existing.created_at) byLogical.set(key, a);
+  }
+  return byLogical;
+}
+
+function download(artifact: ArtifactMeta, destRoot: string): string {
+  const zipPath = path.join(destRoot, 'a.zip');
+  execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
+    stdio: ['pipe', 'pipe', 'inherit'],
+  });
+  const destDir = path.join(destRoot, artifact.name);
+  fs.mkdirSync(destDir, { recursive: true });
+  execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' });
+  fs.unlinkSync(zipPath);
+  return destDir;
+}
+
+/** Logical key shared by a server_logs_/bmk_agentic_ artifact pair. */
+function logicalKey(name: string): string {
+  return name
+    .replace(/^server_logs_/u, '')
+    .replace(/^bmk_agentic_/u, '')
+    .replace(RUNNER_SUFFIX_RE, '');
+}
+
+/**
+ * Read up to `maxBytes` of a (possibly huge) server log as UTF-8, stripping NUL
+ * bytes. vLLM's "GPU KV cache size" startup lines are near the top, so a head
+ * read is enough to derive the KV pool — and it caps storage for the rare
+ * multi-hundred-MB logs that exceed V8's ~512 MB string limit.
+ */
+const stripNul = (s: string): string => s.replaceAll(String.fromCodePoint(0), '');
+
+function readServerLogCapped(p: string, maxBytes = 64 * 1024 * 1024): string {
+  if (fs.statSync(p).size <= maxBytes) return stripNul(fs.readFileSync(p, 'utf8'));
+  const fd = fs.openSync(p, 'r');
+  try {
+    const buf = Buffer.allocUnsafe(maxBytes);
+    const n = fs.readSync(fd, buf, 0, maxBytes, 0);
+    return stripNul(buf.subarray(0, n).toString('utf8'));
+  } finally {
+    fs.closeSync(fd);
+  }
+}
+
+function findJsonFiles(dir: string): string[] {
+  const out: string[] = [];
+  const walk = (d: string) => {
+    for (const e of fs.readdirSync(d, { withFileTypes: true })) {
+      const p = path.join(d, e.name);
+      if (e.isDirectory()) walk(p);
+      else if (e.name.endsWith('.json')) out.push(p);
+    }
+  };
+  walk(dir);
+  return out;
+}
+
+async function main(): Promise<void> {
+  console.log('=== backfill-agentic-server-logs ===');
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Agentic workflow runs with at least one result row that has no linked server log.
+  const runs = await sql<{ github_run_id: string; workflow_run_id: number }[]>`
+    select distinct wr.github_run_id::text as github_run_id, wr.id as workflow_run_id
+    from benchmark_results br
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    where br.benchmark_type = 'agentic_traces'
+      and br.server_log_id is null
+    order by wr.id
+    ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+  `;
+
+  if (runs.length === 0) {
+    console.log('\n  Nothing to do — all agentic rows already have a server log.');
+    return;
+  }
+  console.log(`\n  ${runs.length} agentic workflow run(s) to process.`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  const cache = createConfigCache(sql);
+  await cache.preloadConfigs();
+  const tracker = createSkipTracker();
+
+  let linkedRows = 0;
+  let runsOk = 0;
+  let runsFailed = 0;
+  const t0 = Date.now();
+
+  for (const { github_run_id: githubRunId, workflow_run_id: wrId } of runs) {
+    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `kvpool-${githubRunId}-`));
+    try {
+      const artifacts = listArtifacts(githubRunId);
+      // server log path by logical key
+      const serverLogByKey = new Map<string, string>();
+      const bmkDirs: string[] = [];
+      for (const art of artifacts.values()) {
+        const dir = download(art, tmp);
+        if (art.name.startsWith('server_logs_')) {
+          const logPath = path.join(dir, 'server.log');
+          if (fs.existsSync(logPath)) serverLogByKey.set(logicalKey(art.name), logPath);
+        } else {
+          bmkDirs.push(dir);
+        }
+      }
+
+      let runLinked = 0;
+      for (const bmkDir of bmkDirs) {
+        const key = logicalKey(path.basename(bmkDir));
+        const logPath = serverLogByKey.get(key);
+        if (!logPath) continue;
+        for (const file of findJsonFiles(bmkDir)) {
+          let raw: unknown;
+          try {
+            raw = JSON.parse(fs.readFileSync(file, 'utf8'));
+          } catch {
+            continue;
+          }
+          const rows = Array.isArray(raw) ? raw : [raw];
+          for (const row of rows) {
+            if (!row || typeof row !== 'object') continue;
+            const mapped = mapBenchmarkRow(row as Record<string, unknown>, tracker);
+            if (!mapped || mapped.benchmarkType !== 'agentic_traces') continue;
+            const configId = await cache.getOrCreateConfig(mapped.config);
+            const ids = await sql<{ id: number }[]>`
+              select id from benchmark_results
+              where workflow_run_id = ${wrId}
+                and config_id = ${configId}
+                and conc = ${mapped.conc}
+                and benchmark_type = 'agentic_traces'
+                and server_log_id is null
+            `;
+            if (ids.length === 0) continue;
+            const serverLog = readServerLogCapped(logPath);
+            await insertServerLog(
+              sql,
+              ids.map((r) => r.id),
+              serverLog,
+            );
+            runLinked += ids.length;
+          }
+        }
+      }
+      linkedRows += runLinked;
+      runsOk++;
+      const elapsed = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ run ${githubRunId}: ${serverLogByKey.size} log(s), linked ${runLinked} row(s) ` +
+          `(${runsOk}/${runs.length}, ${elapsed}s total)`,
+      );
+    } catch (error) {
+      runsFailed++;
+      console.error(
+        `  ✗ run ${githubRunId}: ${error instanceof Error ? (error.stack ?? error.message) : String(error)}`,
+      );
+    } finally {
+      fs.rmSync(tmp, { recursive: true, force: true });
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(
+    `\n=== complete: ${linkedRows} row(s) linked across ${runsOk} run(s) ` +
+      `(${runsFailed} failed) in ${totalSec}s ===`,
+  );
+  if (runsFailed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-agentic-server-logs failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/backfill-kv-pool.ts b/packages/db/src/backfill-kv-pool.ts
new file mode 100644
index 00000000..6cf40a33
--- /dev/null
+++ b/packages/db/src/backfill-kv-pool.ts
@@ -0,0 +1,137 @@
+/**
+ * Backfill `benchmark_results.metrics->kv_cache_pool_tokens` from the captured
+ * server logs. The value is parsed from vLLM's authoritative
+ * "GPU KV cache size: N tokens" startup line(s), summed across data-parallel
+ * engine cores (see {@link kvCachePoolTokensFromServerLog}).
+ *
+ * The ingest path now derives this inline in `insertServerLog`, but existing
+ * rows need this one-time pass. Idempotent: re-running only touches rows that
+ * still lack the value (unless --force).
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-kv-pool
+ *     [--limit N]   only process the first N candidate server logs
+ *     [--force]     recompute even when the value is already set
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { createAdminSql } from './etl/db-utils.js';
+import { kvCachePoolTokensFromServerLog } from './etl/server-log-metrics.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      const next = process.argv[++i];
+      if (!next || Number.isNaN(Number(next))) {
+        console.error('--limit requires a numeric argument');
+        process.exit(1);
+      }
+      limit = Number(next);
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-kv-pool ===');
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // One server log can be linked to several benchmark_results (multiple
+  // concurrency points share a server). Group by log id so we parse each log
+  // once and fan the value out to all its rows.
+  const candidates = flags.force
+    ? await sql<{ server_log_id: number }[]>`
+        select distinct server_log_id
+        from benchmark_results
+        where server_log_id is not null
+        order by server_log_id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ server_log_id: number }[]>`
+        select distinct server_log_id
+        from benchmark_results
+        where server_log_id is not null
+          and metrics->>'kv_cache_pool_tokens' is null
+        order by server_log_id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate server log(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let updated = 0;
+  let logsWithValue = 0;
+  let logsNoValue = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { server_log_id: logId } of candidates) {
+    try {
+      const [row] = await sql<{ server_log: string | null }[]>`
+        select server_log from server_logs where id = ${logId}
+      `;
+      const tokens = kvCachePoolTokensFromServerLog(row?.server_log ?? null);
+      if (tokens === null) {
+        logsNoValue++;
+        continue; // non-vLLM or no startup line — leave unset
+      }
+      logsWithValue++;
+      const targets = flags.force
+        ? sql`server_log_id = ${logId}`
+        : sql`server_log_id = ${logId} and metrics->>'kv_cache_pool_tokens' is null`;
+      const result = await sql`
+        update benchmark_results
+        set metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${tokens}::bigint))
+        where ${targets}
+      `;
+      updated += result.count;
+      console.log(`  ✓ log=${logId}: ${tokens.toLocaleString()} tok → ${result.count} row(s)`);
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ log=${logId}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(
+    `\n=== backfill complete: ${updated} row(s) updated from ${logsWithValue} log(s) ` +
+      `(${logsNoValue} log(s) had no KV-pool line, ${failed} failed) in ${totalSec}s ===`,
+  );
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-kv-pool failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index 343d7fb7..a405789d 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -4,6 +4,7 @@
 
 import type postgres from 'postgres';
 import type { BenchmarkParams } from './benchmark-mapper';
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -106,9 +107,18 @@ export async function insertServerLog(
     insert into server_logs (server_log) values (${serverLog})
     returning id
   `;
+  // Derive the KV-cache pool size (tokens) from the log's authoritative
+  // "GPU KV cache size: N tokens" line(s) and stash it on the result's metrics
+  // JSON, mirroring how trace-replay-ingest derives cache-hit rates. The
+  // scraped vllm:cache_config_info metric can't reconstruct this for MLA models.
+  const kvCachePoolTokens = kvCachePoolTokensFromServerLog(serverLog);
   await sql`
     update benchmark_results
-    set server_log_id = ${logId}
+    set server_log_id = ${logId}${
+      kvCachePoolTokens === null
+        ? sql``
+        : sql`, metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${kvCachePoolTokens}::bigint))`
+    }
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
 }
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
index 7d292207..3f088cd6 100644
--- a/packages/db/src/etl/compute-chart-series.test.ts
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -138,6 +138,49 @@ describe('computeChartSeries', () => {
     ]);
   });
 
+  it('merges warmup_metrics before profiling into one continuous series (v11)', async () => {
+    // warmup scrapes at t=0,1s; profiling scrapes at t=10,11s (own start_ns).
+    const blob = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          warmup_metrics: {
+            'vllm:kv_cache_usage_perc': {
+              series: [
+                {
+                  timeslices: [
+                    { start_ns: 0, end_ns: 1e9, avg: 0.2 },
+                    { start_ns: 1e9, end_ns: 2e9, avg: 0.3 },
+                  ],
+                },
+              ],
+            },
+          },
+          metrics: {
+            'vllm:kv_cache_usage_perc': {
+              series: [
+                {
+                  timeslices: [
+                    { start_ns: 10e9, end_ns: 11e9, avg: 0.8 },
+                    { start_ns: 11e9, end_ns: 12e9, avg: 0.9 },
+                  ],
+                },
+              ],
+            },
+          },
+        }),
+      ),
+    );
+    const series = await computeChartSeries(blob);
+    // Origin is the earliest (warmup) start_ns, so warmup sits at low t and
+    // profiling follows on the same axis — the frontend slices at the boundary.
+    expect(series?.kvCacheUsage).toEqual([
+      { t: 0, value: 0.2 },
+      { t: 1, value: 0.3 },
+      { t: 10, value: 0.8 },
+      { t: 11, value: 0.9 },
+    ]);
+  });
+
   it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
     const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
     expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 394a5826..c87df26b 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -63,8 +63,18 @@ import {
  * v10: only emit per-source series for disaggregated configs with a recognized
  * orchestrator adapter. Non-disaggregated and unsupported configs retain the
  * existing aggregate-only behavior.
+ *
+ * v12: also consume the `warmup_metrics` block from the server-metrics blob and
+ * merge its scrapes into the same series as the profiling `metrics` block.
+ * Warmup and profiling timeslices carry their own absolute `start_ns` and never
+ * overlap in time, so the merged series is continuous (warmup at lower t,
+ * profiling after). This lets the agentic detail page slice `chart_series` into
+ * warmup vs profiling at the request-derived boundary; older blobs without a
+ * warmup block are unaffected. (v11 was a short-lived, since-reverted attempt to
+ * carry kvCachePoolTokens in chart_series; that value now lives in
+ * benchmark_results.metrics, derived from the server log — unrelated to this.)
  */
-export const CHART_SERIES_VERSION = 10;
+export const CHART_SERIES_VERSION = 12;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -193,14 +203,37 @@ const CHART_METRIC_KEYS = new Set([
  * subtrees the chart needs. Avoids Node's 512 MB max-string-length cap that
  * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows.
  */
-async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+/**
+ * Merge a warmup phase metric map into the profiling one by concatenating each
+ * metric's `series`. The two phases' timeslices carry their own absolute
+ * `start_ns` and never overlap in time, so `buildSeriesFromMetrics` (which keys
+ * by `start_ns`) yields one continuous series — warmup scrapes at lower t,
+ * profiling after. No-ops when either side is empty (older blobs have no warmup).
+ */
+function mergePhaseMetrics(profiling: MetricsMap, warmup: MetricsMap): MetricsMap {
+  if (Object.keys(warmup).length === 0) return profiling;
+  if (Object.keys(profiling).length === 0) return warmup;
+  const out: MetricsMap = {};
+  for (const name of new Set([...Object.keys(profiling), ...Object.keys(warmup)])) {
+    out[name] = {
+      series: [...(profiling[name]?.series ?? []), ...(warmup[name]?.series ?? [])],
+    };
+  }
+  return out;
+}
+
+/** Stream-collect one top-level phase block (`metrics` or `warmup_metrics`). */
+async function streamCollectPhase(
+  buffer: Buffer,
+  filter: 'metrics' | 'warmup_metrics',
+): Promise<MetricsMap> {
   /* eslint-disable @typescript-eslint/no-explicit-any */
   const collected: MetricsMap = {};
   const pipeline = chain([
     Readable.from(buffer),
     createGunzip(),
     parser(),
-    pick({ filter: 'metrics' }),
+    pick({ filter }),
     streamObject(),
   ]);
   await new Promise<void>((resolve, reject) => {
@@ -215,15 +248,28 @@ async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
   return collected;
 }
 
+/** Stream-parse fallback: collect both phase blocks and merge (see v12). */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  const [profiling, warmup] = await Promise.all([
+    streamCollectPhase(buffer, 'metrics'),
+    streamCollectPhase(buffer, 'warmup_metrics'),
+  ]);
+  return mergePhaseMetrics(profiling, warmup);
+}
+
 /**
  * Parse the gzipped server_metrics blob into the metric map. Tries the
  * synchronous fast path first; falls back to stream-parse on
- * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed.
+ * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. Merges the warmup block
+ * into the profiling one (v12) so the series span both phases.
  */
 async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
   try {
-    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap };
-    return obj.metrics ?? {};
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as {
+      metrics?: MetricsMap;
+      warmup_metrics?: MetricsMap;
+    };
+    return mergePhaseMetrics(obj.metrics ?? {}, obj.warmup_metrics ?? {});
   } catch (error) {
     const code = error && (error as NodeJS.ErrnoException).code;
     const msg = error instanceof Error ? error.message : String(error);
diff --git a/packages/db/src/etl/server-log-metrics.test.ts b/packages/db/src/etl/server-log-metrics.test.ts
new file mode 100644
index 00000000..9e0fa852
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from 'vitest';
+
+import { kvCachePoolTokensFromServerLog } from './server-log-metrics';
+
+describe('kvCachePoolTokensFromServerLog', () => {
+  it('returns null for empty / missing logs', () => {
+    expect(kvCachePoolTokensFromServerLog(null)).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('')).toBeNull();
+    expect(kvCachePoolTokensFromServerLog('no kv cache line here')).toBeNull();
+  });
+
+  it('reads a single-engine (ep1) pool size', () => {
+    const log = `
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1744] GPU KV cache size: 11,294,463 tokens
+(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1745] Maximum concurrency for 1,048,576 tokens per request: 10.77x
+`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(11_294_463);
+  });
+
+  it('sums across data-parallel engine cores (ep8)', () => {
+    const lines = Array.from(
+      { length: 8 },
+      (_, i) =>
+        `(EngineCore_DP${i} pid=${2337827 + i}) INFO [kv_cache_utils.py:1744] GPU KV cache size: 11,577,333 tokens`,
+    ).join('\n');
+    expect(kvCachePoolTokensFromServerLog(lines)).toBe(11_577_333 * 8);
+  });
+
+  it('dedups reprinted lines for the same engine core', () => {
+    const log = `
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens
+(EngineCore_DP1 pid=2) GPU KV cache size: 5,000,000 tokens
+`;
+    // DP0 counted once + DP1 once = 10M, not 15M.
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(10_000_000);
+  });
+
+  it('falls back to bare lines when no engine-core prefix is present', () => {
+    const log = `INFO GPU KV cache size: 1,234,567 tokens`;
+    expect(kvCachePoolTokensFromServerLog(log)).toBe(1_234_567);
+  });
+});
diff --git a/packages/db/src/etl/server-log-metrics.ts b/packages/db/src/etl/server-log-metrics.ts
new file mode 100644
index 00000000..b8b26dd1
--- /dev/null
+++ b/packages/db/src/etl/server-log-metrics.ts
@@ -0,0 +1,65 @@
+/**
+ * Derive server-side scalars from the captured vLLM server log
+ * (`server_logs.server_log`). These come from startup log lines rather than the
+ * scraped Prometheus `/metrics`, because for MLA / sparse-attention models the
+ * `vllm:cache_config_info` labels (num_gpu_blocks × block_size) do NOT
+ * reconstruct the real KV-cache token capacity — they undercount by a
+ * non-constant factor. vLLM's own `GPU KV cache size: N tokens` line is the
+ * authoritative number.
+ */
+
+/**
+ * Total KV-cache pool size in tokens.
+ *
+ * vLLM prints one `GPU KV cache size: N tokens` line per engine core (one per
+ * data-parallel rank; tensor-parallel is already aggregated into that single
+ * per-engine number). We sum across distinct engine cores so the result is the
+ * deployment-wide total:
+ *
+ *   (EngineCore pid=…)      GPU KV cache size: 11,294,463 tokens   → ep1 total
+ *   (EngineCore_DP0 pid=…)  GPU KV cache size: 11,577,333 tokens   ┐ DP0 … DP7,
+ *   (EngineCore_DP7 pid=…)  GPU KV cache size: 11,577,333 tokens   ┘ → ×8 = total
+ *
+ * Returns null for a null/empty log or one with no such line (non-vLLM
+ * frameworks, or a log that didn't capture engine startup).
+ */
+export function kvCachePoolTokensFromServerLog(serverLog: string | null): number | null {
+  if (!serverLog) return null;
+
+  // Scan line-by-line. We deliberately avoid a global regex over the whole blob
+  // with a lazy `[^\n]*?` bridge between the engine tag and the size: some logs
+  // contain multi-megabyte single lines (progress bars, tracebacks) that make
+  // such a regex recurse and blow the stack. A per-line substring pre-filter
+  // means the (cheap) regexes only ever run on the short KV-size lines.
+  //
+  // Each engine core prints one line; the tag (e.g. `EngineCore_DP3`) is stable
+  // across a run while the pid is not, so key on the tag to dedup reprints and
+  // sum across data-parallel ranks.
+  const tagRe = /\((?<tag>EngineCore(?:_DP\d+)?)\s+pid=\d+\)/u;
+  const sizeRe = /GPU KV cache size:\s*(?<tokens>[\d,]+)\s*tokens/u;
+  const perEngine = new Map<string, number>();
+  let bareTotal = 0;
+  let bareFound = false;
+  for (const line of serverLog.split('\n')) {
+    if (!line.includes('GPU KV cache size')) continue;
+    const sizeMatch = sizeRe.exec(line);
+    if (!sizeMatch) continue;
+    const tokens = Number(sizeMatch.groups!.tokens!.replaceAll(',', ''));
+    if (!Number.isFinite(tokens) || tokens <= 0) continue;
+    const tagMatch = tagRe.exec(line);
+    if (tagMatch) {
+      perEngine.set(tagMatch.groups!.tag!, tokens);
+    } else {
+      // Fallback for logs without the engine-core prefix: count each occurrence
+      // (so reprints double count). Best-effort; unused if any tagged line exists.
+      bareTotal += tokens;
+      bareFound = true;
+    }
+  }
+  if (perEngine.size > 0) {
+    let total = 0;
+    for (const v of perEngine.values()) total += v;
+    return total;
+  }
+  return bareFound ? bareTotal : null;
+}
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 2a5f15f0..8ec1fb9e 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -467,8 +467,15 @@ async function main(): Promise<void> {
 
           const parentDir = path.basename(path.dirname(file));
           if (parentDir.startsWith('bmk_') && insertedIds.length > 0) {
+            // Single-turn artifacts are `bmk_<key>` paired with
+            // `server_logs_<key>`. Agentic artifacts are `bmk_agentic_<key>`
+            // but the server log is still `server_logs_<key>` (no `agentic_`
+            // prefix), so fall back to the fully-stripped suffix — otherwise
+            // agentic rows never get their server log (and KV-pool size) linked.
             const configKey = parentDir.replace(/^bmk_/u, '');
-            const logPath = serverLogPaths.get(configKey);
+            const logPath =
+              serverLogPaths.get(configKey) ??
+              serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir));
             if (logPath) {
               try {
                 const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', '');
diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts
index 61d21d35..f045dfda 100644
--- a/packages/db/src/queries/trace-server-metrics.test.ts
+++ b/packages/db/src/queries/trace-server-metrics.test.ts
@@ -48,6 +48,7 @@ function metaRow(overrides: Record<string, unknown> = {}) {
     run_url: null,
     server_gpu_cache_hit_rate: null,
     server_cpu_cache_hit_rate: null,
+    kv_cache_pool_tokens: null,
     ...overrides,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 61cacaae..d24d0879 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -81,6 +81,11 @@ export interface TraceServerMetrics {
    * the cluster-average `kvCacheUsage` line covers that case alone.
    */
   kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
+  /**
+   * Total KV-cache pool size in tokens, summed across engines (parsed from
+   * `GPU KV cache size: N tokens` log lines). vLLM only — null for SGLang/TRT or older rows.
+   */
+  kvCachePoolTokens: number | null;
   /** Orchestrator-normalized metrics grouped by endpoint/worker. */
   metricSources: MetricSourceSeries[];
 }
@@ -89,6 +94,8 @@ interface RawMetaRow extends PointMeta {
   trace_replay_id: number | null;
   has_blob: boolean;
   chart_series: ChartSeries | null;
+  /** Derived at server-log ingest from "GPU KV cache size: N tokens" lines. */
+  kv_cache_pool_tokens: string | null;
 }
 
 interface RawBlobRow {
@@ -118,9 +125,14 @@ function buildMeta(row: RawMetaRow): PointMeta {
   };
 }
 
-function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
+function merge(
+  meta: PointMeta,
+  series: ChartSeries,
+  kvCachePoolTokens: number | null,
+): TraceServerMetrics {
   return {
     meta,
+    kvCachePoolTokens,
     startNs: series.startNs,
     endNs: series.endNs,
     durationS: series.durationS,
@@ -155,7 +167,8 @@ export async function getTraceServerMetrics(
       br.date::text,
       case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
       (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
-      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate,
+      (br.metrics ->> 'kv_cache_pool_tokens')::numeric as kv_cache_pool_tokens
     from benchmark_results br
     join configs c on c.id = br.config_id
     join workflow_runs wr on wr.id = br.workflow_run_id
@@ -166,10 +179,12 @@ export async function getTraceServerMetrics(
   if (!row) return null;
   if (!row.has_blob || row.trace_replay_id === null) return null;
   const meta = buildMeta(row);
+  const kvCachePoolTokens =
+    row.kv_cache_pool_tokens === null ? null : Number(row.kv_cache_pool_tokens);
 
   // Fast path: pre-computed chart_series at the current version.
   if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) {
-    return merge(meta, row.chart_series);
+    return merge(meta, row.chart_series, kvCachePoolTokens);
   }
 
   // Slow path only: fetch the large raw blob after establishing that the
@@ -192,5 +207,5 @@ export async function getTraceServerMetrics(
     disagg: row.disagg,
   });
   if (!series) return null;
-  return merge(meta, series);
+  return merge(meta, series, kvCachePoolTokens);
 }

From af6bc11987e41c7cf5ca9298231fe02e90b5d9ce Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 1 Jul 2026 16:16:05 -0500
Subject: [PATCH 103/111] fix(agentic): stable conversation row order + color
 across timeline phase toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switching the request timeline between Warmup and Profiling re-derived both the
row order (sorted by first-start) and the color palette (assigned in iteration
order) from the phase-filtered subset, so a conversation jumped rows and swapped
color on every toggle.

Compute a stable per-group index (conversation cid or worker id) from the full,
unfiltered request set — keyed by earliest start across all phases — and drive
both ordering and color from it. buildRequestTimelineRows takes it as an optional
4th arg (falls back to the legacy self-contained behavior for unit tests).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.test.ts    | 61 +++++++++++++++++
 .../agentic-point/request-timeline.tsx        | 68 ++++++++++++++++---
 2 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
index fe3c1231..6fcf1c57 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -4,6 +4,7 @@ import type { RequestRecord } from '@/hooks/api/use-request-timeline';
 
 import {
   buildRequestTimelineRows,
+  computeStableRowIndex,
   parseTimelineViewSnapshot,
   requestIdleStats,
   splitTimelineCid,
@@ -106,6 +107,66 @@ describe('subagent timeline hierarchy', () => {
   });
 });
 
+describe('stable row order + color across phase filters', () => {
+  // Same conversations appear in both warmup and profiling. Their global
+  // first-start order is A (0) < B (10) < C (only profiling, 50). The bug:
+  // filtering to a phase re-sorted + re-colored by the visible subset, so a
+  // conversation jumped rows and swapped color when toggling phases.
+  const rec = (
+    cid: string,
+    phase: RequestRecord['phase'],
+    start: number,
+    end: number,
+  ): RequestRecord => ({ ...request(start, end), cid, phase });
+  const full: RequestRecord[] = [
+    rec('A', 'warmup', 0, 5),
+    rec('A', 'profiling', 100, 110),
+    rec('B', 'warmup', 10, 15),
+    rec('B', 'profiling', 120, 130),
+    rec('C', 'profiling', 50, 60), // profiling-only; earliest profiling start
+  ];
+
+  it('keeps each conversation in the same position and color when the phase changes', () => {
+    const index = computeStableRowIndex(full, 'conversation');
+    const warmupRows = buildRequestTimelineRows(
+      full.filter((r) => r.phase === 'warmup'),
+      'conversation',
+      new Set(),
+      index,
+    ).filter((r) => r.kind === 'parent');
+    const profilingRows = buildRequestTimelineRows(
+      full.filter((r) => r.phase === 'profiling'),
+      'conversation',
+      new Set(),
+      index,
+    ).filter((r) => r.kind === 'parent');
+
+    // Position: A before B in both phases (C only shows in profiling, and sorts
+    // after A/B by its global index — NOT first by its earlier profiling start).
+    expect(warmupRows.map((r) => r.label)).toEqual(['A', 'B']);
+    expect(profilingRows.map((r) => r.label)).toEqual(['A', 'B', 'C']);
+
+    // Color: identical per conversation across phases, distinct between them.
+    const warmupColors = Object.fromEntries(warmupRows.map((r) => [r.label, r.color]));
+    const profilingColors = Object.fromEntries(profilingRows.map((r) => [r.label, r.color]));
+    expect(warmupColors.A).toBe(profilingColors.A);
+    expect(warmupColors.B).toBe(profilingColors.B);
+    expect(warmupColors.A).not.toBe(warmupColors.B);
+  });
+
+  it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => {
+    // Sanity: the legacy self-contained path (no index arg) orders by the
+    // subset's own first-start, which is exactly why the shared index is needed.
+    const profilingOnly = buildRequestTimelineRows(
+      full.filter((r) => r.phase === 'profiling'),
+      'conversation',
+      new Set(),
+    ).filter((r) => r.kind === 'parent');
+    // C (start 50) sorts first here, ahead of A (100) and B (120).
+    expect(profilingOnly.map((r) => r.label)).toEqual(['C', 'A', 'B']);
+  });
+});
+
 describe('parseTimelineViewSnapshot', () => {
   const full: TimelineViewSnapshot = {
     viewStart: 1_000,
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index f3870bb1..db1ac93f 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -275,6 +275,36 @@ export function splitTimelineCid(cid: string): {
   return { parent, subagentBase: raw, stream: null, aux: null };
 }
 
+/**
+ * Stable order/color index for the top-level row groups (conversations in
+ * conversation mode, workers in worker mode), keyed by group id and computed
+ * over the FULL (unfiltered) request set. Both the row ordering and the color
+ * palette are driven by this index, so a conversation/worker keeps the same
+ * position and color when the phase filter changes the visible subset — without
+ * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset
+ * is showing, making rows jump and swap colors.
+ *
+ * Returns group id → 0-based rank. Rank order is the group's earliest request
+ * start across all phases; ties break on the group id for determinism.
+ */
+export function computeStableRowIndex(
+  requests: readonly RequestRecord[],
+  mode: RowMode,
+): Map<string, number> {
+  const firstStart = new Map<string, number>();
+  for (const r of requests) {
+    const key = mode === 'conversation' ? splitTimelineCid(r.cid).parent : r.wid;
+    const cur = firstStart.get(key);
+    if (cur === undefined || r.start < cur) firstStart.set(key, r.start);
+  }
+  const keys = [...firstStart.keys()].toSorted(
+    (a, b) => firstStart.get(a)! - firstStart.get(b)! || (a < b ? -1 : a > b ? 1 : 0),
+  );
+  const index = new Map<string, number>();
+  keys.forEach((key, i) => index.set(key, i));
+  return index;
+}
+
 /**
  * Group requests into rows. In conversation mode, output order is:
  *   parent_conv
@@ -289,12 +319,23 @@ export function splitTimelineCid(cid: string): {
  * stream children. Bars on a collapsed subagent are the UNION of all its
  * streams' requests — overlapping bars visually communicate the
  * stream-level parallelism without expanding.
+ *
+ * `stableRowIndex` (optional) pins the top-level order + color per group so they
+ * survive phase-filter changes; when omitted it's derived from `requests` (the
+ * legacy self-contained behavior, used by unit tests).
  */
 export function buildRequestTimelineRows(
   requests: RequestRecord[],
   mode: RowMode,
   expandedSubagents: ReadonlySet<string>,
+  stableRowIndex?: ReadonlyMap<string, number>,
 ): RequestTimelineRow[] {
+  const index = stableRowIndex ?? computeStableRowIndex(requests, mode);
+  const colorFor = (key: string) =>
+    ROW_COLORS[
+      (((index.get(key) ?? 0) % ROW_COLORS.length) + ROW_COLORS.length) % ROW_COLORS.length
+    ]!;
+  const orderOf = (key: string) => index.get(key) ?? Number.POSITIVE_INFINITY;
   if (mode !== 'conversation') {
     // Worker mode: flat rows, sorted by first activity.
     const groups = new Map<string, RequestRecord[]>();
@@ -307,20 +348,20 @@ export function buildRequestTimelineRows(
       list.push(r);
     }
     const rows: RequestTimelineRow[] = [];
-    let i = 0;
     for (const [key, list] of groups) {
       list.sort((a, b) => a.start - b.start);
       rows.push({
         key,
         label: shortenWid(key),
-        color: ROW_COLORS[i % ROW_COLORS.length]!,
+        color: colorFor(key),
         requests: list,
         depth: 0,
         kind: 'worker',
       });
-      i++;
     }
-    rows.sort((a, b) => a.requests[0]!.start - b.requests[0]!.start);
+    rows.sort(
+      (a, b) => orderOf(a.key) - orderOf(b.key) || a.requests[0]!.start - b.requests[0]!.start,
+    );
     return rows;
   }
 
@@ -370,12 +411,12 @@ export function buildRequestTimelineRows(
     if (r.start < tree.firstStart) tree.firstStart = r.start;
   }
 
-  const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
+  const sortedTrees = [...trees.values()].toSorted(
+    (a, b) => orderOf(a.parentCid) - orderOf(b.parentCid) || a.firstStart - b.firstStart,
+  );
   const rows: RequestTimelineRow[] = [];
-  let colorIdx = 0;
   for (const tree of sortedTrees) {
-    const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
-    colorIdx++;
+    const color = colorFor(tree.parentCid);
     // Parent row (use a placeholder key if the parent itself wasn't replayed).
     tree.parentReqs.sort((a, b) => a.start - b.start);
     rows.push({
@@ -686,9 +727,16 @@ export function RequestTimelineView({
     () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'),
     [data.requests, phaseFilter, hasWarmup],
   );
+  // Stable order/color per conversation (or worker), computed over the FULL
+  // request set — NOT the phase-filtered subset — so a row keeps its position
+  // and color when the user toggles between warmup and profiling.
+  const stableRowIndex = useMemo(
+    () => computeStableRowIndex(data.requests, rowMode),
+    [data.requests, rowMode],
+  );
   const rows = useMemo(
-    () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents),
-    [filtered, rowMode, expandedSubagents],
+    () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents, stableRowIndex),
+    [filtered, rowMode, expandedSubagents, stableRowIndex],
   );
   const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]);
 

From 25ac7f910bc5933b16798108ff016de38d844bcf Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 1 Jul 2026 22:51:46 -0500
Subject: [PATCH 104/111] feat(agentic): open-in-new-tab for request timeline
 bars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Render each linkable request bar as a real SVG <a href> (the dataset
conversation deep link) instead of a <g> with an onClick, so the browser's
native "open in new tab" works — right-click → Open Link in New Tab, plus
⌘/Ctrl-click and middle-click. Plain left-click still does the in-app SPA
navigation + view-state snapshot; modified/non-primary clicks fall through to
the browser, and native link-drag is suppressed so it doesn't fight the pan
gesture.

Extract the URL construction into an exported conversationHref() helper shared
by the click handler and the href. Non-linkable points (no source dataset) keep
the plain <g>.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.test.ts    | 19 +++++
 .../agentic-point/request-timeline.tsx        | 74 ++++++++++++++-----
 2 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
index 6fcf1c57..3a3ebcc5 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -5,6 +5,7 @@ import type { RequestRecord } from '@/hooks/api/use-request-timeline';
 import {
   buildRequestTimelineRows,
   computeStableRowIndex,
+  conversationHref,
   parseTimelineViewSnapshot,
   requestIdleStats,
   splitTimelineCid,
@@ -107,6 +108,24 @@ describe('subagent timeline hierarchy', () => {
   });
 });
 
+describe('conversationHref', () => {
+  it('builds a turn-carrying dataset link for a main-conversation request', () => {
+    expect(
+      conversationHref('cc-traces-weka-062126', { ...request(0, 10), cid: 'abc123', ti: 4 }),
+    ).toBe('/datasets/cc-traces-weka-062126/conversations/abc123?turn=4');
+  });
+
+  it('carries the subagent id and strips the ::sa suffix from the conv id', () => {
+    expect(
+      conversationHref('slug', {
+        ...request(0, 10),
+        cid: 'abc123::sa:subagent_001_bf1c5c16:s2',
+        ti: 7,
+      }),
+    ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16');
+  });
+});
+
 describe('stable row order + color across phase filters', () => {
   // Same conversations appear in both warmup and profiling. Their global
   // first-start order is A (0) < B (10) < C (only profiling, 50). The bug:
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index db1ac93f..3e0edd9e 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -34,6 +34,21 @@ export function subagentIdOf(cid: string): string | null {
   return colon === -1 ? raw : raw.slice(0, colon);
 }
 
+/**
+ * Deep-link URL for the dataset conversation a request maps to. Carries the turn
+ * (and, for subagent requests, the subagent id) so the flamegraph can scroll to
+ * / highlight the exact node. Used both for SPA navigation on click and as the
+ * real `href` on the request bar so the browser's native "open in new tab"
+ * (right-click, ⌘/Ctrl-click, middle-click) works.
+ */
+export function conversationHref(datasetSlug: string, req: RequestRecord): string {
+  const convId = datasetConvId(req.cid);
+  const params = new URLSearchParams({ turn: String(req.ti) });
+  const sa = subagentIdOf(req.cid);
+  if (sa) params.set('sa', sa);
+  return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`;
+}
+
 export interface RequestIdleStats {
   /** Total time between the first start and last end with no request running. */
   idleNs: number;
@@ -685,16 +700,8 @@ export function RequestTimelineView({
           scrollLeft: scrollRef.current.scrollLeft,
         });
       }
-      const convId = datasetConvId(req.cid);
-      // Carry the turn (and, for subagent requests, the subagent id) so the
-      // flamegraph can scroll to / highlight the exact node this bar maps to.
-      const params = new URLSearchParams({ turn: String(req.ti) });
-      const sa = subagentIdOf(req.cid);
-      if (sa) params.set('sa', sa);
       track('agentic_timeline_to_dataset', { slug: datasetSlug });
-      router.push(
-        `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`,
-      );
+      router.push(conversationHref(datasetSlug, req));
     },
     [datasetSlug, router, pointId],
   );
@@ -1156,14 +1163,12 @@ export function RequestTimelineView({
                     const runW = Math.max(xEnd - xStart, 1);
                     const queueW = Math.max(xStart - xCredit, 0);
                     const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
-                    return (
-                      <g
-                        key={`${req.cid}-${req.ti}-${req.start}`}
-                        onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
-                        onMouseLeave={() => setTooltip(null)}
-                        onClick={datasetSlug ? () => openConversation(req) : undefined}
-                        style={datasetSlug ? { cursor: 'pointer' } : undefined}
-                      >
+                    const barKey = `${req.cid}-${req.ti}-${req.start}`;
+                    const showTooltip = (e: React.MouseEvent) =>
+                      setTooltip({ x: e.clientX, y: e.clientY, row, req });
+                    const hideTooltip = () => setTooltip(null);
+                    const barChildren = (
+                      <>
                         {/* Queue lead-in (faint) — only drawn when noticeable. */}
                         {queueW >= 1 && (
                           <rect
@@ -1216,7 +1221,40 @@ export function RequestTimelineView({
                             opacity={0.6}
                           />
                         )}
-                      </g>
+                      </>
+                    );
+                    // No source dataset → not linkable; plain group.
+                    if (!datasetSlug) {
+                      return (
+                        <g key={barKey} onMouseMove={showTooltip} onMouseLeave={hideTooltip}>
+                          {barChildren}
+                        </g>
+                      );
+                    }
+                    // Linkable: render a real SVG anchor with the conversation
+                    // href so the browser's native "open in new tab" works
+                    // (right-click menu, ⌘/Ctrl-click, middle-click). Plain
+                    // left-click stays an in-app navigation; modified or
+                    // non-primary clicks fall through to the browser. Suppress
+                    // the native link drag so it doesn't fight the pan gesture.
+                    return (
+                      <a
+                        key={barKey}
+                        href={conversationHref(datasetSlug, req)}
+                        onMouseMove={showTooltip}
+                        onMouseLeave={hideTooltip}
+                        onClick={(e) => {
+                          if (e.metaKey || e.ctrlKey || e.shiftKey || e.altKey || e.button !== 0) {
+                            return;
+                          }
+                          e.preventDefault();
+                          openConversation(req);
+                        }}
+                        onDragStart={(e) => e.preventDefault()}
+                        style={{ cursor: 'pointer' }}
+                      >
+                        {barChildren}
+                      </a>
                     );
                   });
                 })}

From 7558faa58930a386af8dfb83d27afb460ba6e564 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 1 Jul 2026 22:56:28 -0500
Subject: [PATCH 105/111] fix(agentic): show 1-based turn number in request
 timeline tooltip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The timeline tooltip showed the raw 0-based turn_index while the dataset
flamegraph labels turns 1-based ("Turn 1" for the first turn), so the same
request read as a different turn in the two views. Display req.ti + 1 to align
them. The deep-link `turn` param stays 0-based — the flamegraph matches it
against a 0-based turn ordinal for scroll/highlight, so click-through targeting
is unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../components/inference/agentic-point/request-timeline.tsx  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 3e0edd9e..834b7a83 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -592,7 +592,10 @@ function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean })
       <div className="flex items-center gap-2 font-medium text-foreground">
         <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: row.color }} />
         <span className="truncate">{row.label}</span>
-        <span className="text-muted-foreground">· turn {req.ti}</span>
+        {/* Display 1-based to match the dataset flamegraph's "Turn N" labels.
+            The deep-link `turn` param stays 0-based (req.ti) — the flamegraph
+            matches it against a 0-based turn ordinal for highlighting. */}
+        <span className="text-muted-foreground">· turn {req.ti + 1}</span>
         {req.cancelled && <span className="text-destructive">· cancelled</span>}
       </div>
       <div className="mt-1.5 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">

From ad85bed7236f44c4a48d71b3fffb3147734efaf3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 1 Jul 2026 23:28:47 -0500
Subject: [PATCH 106/111] feat(agentic): link timeline requests by raw source

Signed-off-by: Cam Quilici <cjquilici@gmail.com>
---
 .../components/datasets/conversation-view.tsx |  5 +++-
 .../components/datasets/trace-flamegraph.tsx  | 17 +++++++++--
 .../agentic-point/request-timeline.test.ts    | 13 ++++++++
 .../agentic-point/request-timeline.tsx        | 26 +++++++++++++---
 .../app/src/hooks/api/use-request-timeline.ts |  6 ++++
 .../src/etl/compute-request-timeline.test.ts  | 30 +++++++++++++++++++
 .../db/src/etl/compute-request-timeline.ts    | 14 ++++++++-
 packages/db/src/etl/weka-structure.test.ts    | 14 +++++++++
 packages/db/src/etl/weka-structure.ts         |  5 +++-
 9 files changed, 120 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index ce10241a..0be8e58a 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -11,13 +11,15 @@ import { compact } from './format';
 export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
 
-  // Deep-link target from a request-timeline click: ?turn=<ti>[&sa=<agentId>].
+  // Deep-link target from a request-timeline click: ?raw=<outerIdx> or ?turn=<ti>[&sa=<agentId>].
   // useSearchParams (not a one-shot window.location read) so the params are
   // present on the very first client-side navigation, not just after a reload.
   const params = useSearchParams();
   const turnRaw = params.get('turn');
+  const sourceRaw = params.get('raw');
   const highlight = {
     turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null,
+    raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null,
     agent: params.get('sa'),
   };
 
@@ -95,6 +97,7 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
         <TraceFlamegraph
           structure={data.structure}
           highlightTurn={highlight.turn}
+          highlightRawIndex={highlight.raw}
           highlightAgentId={highlight.agent}
         />
       </Card>
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 158c03c3..f82f0b5f 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -169,20 +169,31 @@ interface TooltipState {
 export function TraceFlamegraph({
   structure,
   highlightTurn,
+  highlightRawIndex,
   highlightAgentId,
 }: {
   structure: ConversationStructure;
   /** Turn index to scroll to / highlight (from a request-timeline deep link). */
   highlightTurn?: number | null;
+  /** Raw Weka top-level request index to scroll to / highlight. */
+  highlightRawIndex?: number | null;
   /** Subagent id when the highlighted turn is inside a subagent group. */
   highlightAgentId?: string | null;
 }) {
   const nodes = structure.nodes;
 
   // Resolve the deep-link target to a row key (+ the group that must be open to
-  // show it). Main turns match by their main-turn ordinal; subagent turns match
-  // the group by agentId, then the ti-th child.
+  // show it). Raw Weka indexes are exact source coordinates and take precedence;
+  // otherwise main turns match by main-turn ordinal and subagent turns match the
+  // group by agentId, then the ti-th child.
   const target = useMemo(() => {
+    if (typeof highlightRawIndex === 'number' && highlightRawIndex >= 0) {
+      const i = nodes.findIndex(
+        (node) => node.kind === 'turn' && node.rawIndex === highlightRawIndex,
+      );
+      if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null };
+      return null;
+    }
     if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null;
     if (highlightAgentId) {
       const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId);
@@ -199,7 +210,7 @@ export function TraceFlamegraph({
       }
     }
     return null;
-  }, [nodes, highlightTurn, highlightAgentId]);
+  }, [nodes, highlightTurn, highlightRawIndex, highlightAgentId]);
 
   // Subagent groups collapsed by default — except the deep-link target's group.
   const [expanded, setExpanded] = useState<Set<number>>(() =>
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
index 3a3ebcc5..bebb63a9 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -124,6 +124,19 @@ describe('conversationHref', () => {
       }),
     ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16');
   });
+
+  it('uses raw source provenance for flattened-agent dataset links', () => {
+    expect(
+      conversationHref('slug', {
+        ...request(0, 10),
+        cid: '02bc0afb13f7a2d9efa86c28511261d85c0e::fa:003',
+        ti: 3,
+        srcTrace: '02bc0afb13f7a2d9efa86c28511261d85c0e',
+        srcOuter: 204,
+        srcKind: 'weka_flat',
+      }),
+    ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204');
+  });
 });
 
 describe('stable row order + color across phase filters', () => {
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 834b7a83..592f5c37 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -42,8 +42,11 @@ export function subagentIdOf(cid: string): string | null {
  * (right-click, ⌘/Ctrl-click, middle-click) works.
  */
 export function conversationHref(datasetSlug: string, req: RequestRecord): string {
-  const convId = datasetConvId(req.cid);
+  const convId = req.srcTrace ?? datasetConvId(req.cid);
   const params = new URLSearchParams({ turn: String(req.ti) });
+  if (typeof req.srcOuter === 'number' && Number.isInteger(req.srcOuter) && req.srcOuter >= 0) {
+    params.set('raw', String(req.srcOuter));
+  }
   const sa = subagentIdOf(req.cid);
   if (sa) params.set('sa', sa);
   return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`;
@@ -299,21 +302,36 @@ export function splitTimelineCid(cid: string): {
  * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset
  * is showing, making rows jump and swap colors.
  *
- * Order key is the group's earliest request start across all phases; ties break
- * on the group id for determinism.
+ * Groups that span BOTH phases sort first. The shared set is by definition
+ * present in either phase's view, so this leading block renders identically in
+ * both — a conversation that carries over from warmup into profiling stays on
+ * the exact same row when the toggle flips. Phase-exclusive groups follow, and
+ * only they reflow between views. Within each block the order key is the
+ * group's earliest request start across all phases; ties break on the group id
+ * for determinism.
  */
 export function computeStableRowIndex(
   requests: readonly RequestRecord[],
   mode: RowMode,
 ): Map<string, number> {
   const firstStart = new Map<string, number>();
+  // Which phases each group appears in. Mirrors requestsForPhase's split:
+  // 'profiling' is exact, anything else counts as warmup.
+  const inProfiling = new Set<string>();
+  const inWarmup = new Set<string>();
   for (const r of requests) {
     const key = mode === 'conversation' ? splitTimelineCid(r.cid).parent : r.wid;
     const cur = firstStart.get(key);
     if (cur === undefined || r.start < cur) firstStart.set(key, r.start);
+    if (r.phase === 'profiling') inProfiling.add(key);
+    else inWarmup.add(key);
   }
+  const spansBoth = (key: string) => inProfiling.has(key) && inWarmup.has(key);
   const keys = [...firstStart.keys()].toSorted(
-    (a, b) => firstStart.get(a)! - firstStart.get(b)! || (a < b ? -1 : a > b ? 1 : 0),
+    (a, b) =>
+      Number(spansBoth(b)) - Number(spansBoth(a)) ||
+      firstStart.get(a)! - firstStart.get(b)! ||
+      (a < b ? -1 : a > b ? 1 : 0),
   );
   const index = new Map<string, number>();
   keys.forEach((key, i) => index.set(key, i));
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
index 094d2230..306d1416 100644
--- a/packages/app/src/hooks/api/use-request-timeline.ts
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -5,6 +5,12 @@ export interface RequestRecord {
   cid: string;
   /** Zero-based turn index within the conversation. */
   ti: number;
+  /** Source trace id from the original raw dataset, when provided by AIPerf. */
+  srcTrace?: string;
+  /** Original raw top-level request index within srcTrace. */
+  srcOuter?: number;
+  /** Loader-specific source kind, e.g. weka_main or weka_flat. */
+  srcKind?: string;
   /** Worker id (concurrency slot that handled this request). */
   wid: string;
   /** Sub-agent depth (0 = top-level). */
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
index 61e69fe8..409dc091 100644
--- a/packages/db/src/etl/compute-request-timeline.test.ts
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -7,6 +7,9 @@ import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-requ
 interface SyntheticRequest {
   cid: string;
   ti: number;
+  srcTrace?: string;
+  srcOuter?: number;
+  srcKind?: string;
   wid?: string;
   ad?: number;
   phase?: string;
@@ -28,6 +31,9 @@ function makeBlob(requests: SyntheticRequest[]) {
       metadata: {
         conversation_id: r.cid,
         turn_index: r.ti,
+        ...(r.srcTrace === undefined ? {} : { source_trace_id: r.srcTrace }),
+        ...(r.srcOuter === undefined ? {} : { source_outer_idx: r.srcOuter }),
+        ...(r.srcKind === undefined ? {} : { source_kind: r.srcKind }),
         worker_id: r.wid ?? 'worker_default',
         agent_depth: r.ad ?? 0,
         benchmark_phase: r.phase ?? 'profiling',
@@ -119,6 +125,30 @@ describe('computeRequestTimeline', () => {
     expect(r.phase).toBe('profiling');
   });
 
+  it('preserves raw source provenance fields when present', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'trace::fa:003',
+          ti: 3,
+          srcTrace: 'trace',
+          srcOuter: 204,
+          srcKind: 'weka_flat',
+          credit: 0,
+          start: 10,
+          end: 100,
+        },
+      ]),
+    );
+    expect(tl?.requests[0]).toMatchObject({
+      cid: 'trace::fa:003',
+      ti: 3,
+      srcTrace: 'trace',
+      srcOuter: 204,
+      srcKind: 'weka_flat',
+    });
+  });
+
   it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => {
     const tl = computeRequestTimeline(
       makeBlob([
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
index 707e8c54..85f782fc 100644
--- a/packages/db/src/etl/compute-request-timeline.ts
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -14,13 +14,19 @@
 import { gunzipSync } from 'node:zlib';
 
 /** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const REQUEST_TIMELINE_VERSION = 3;
+export const REQUEST_TIMELINE_VERSION = 4;
 
 export interface RequestRecord {
   /** Conversation id (groups turns of one agent session). */
   cid: string;
   /** Zero-based turn index within the conversation. */
   ti: number;
+  /** Source trace id from the original raw dataset, when distinct from replay cid. */
+  srcTrace?: string;
+  /** Original raw top-level request index within srcTrace. */
+  srcOuter?: number;
+  /** Loader-specific source kind, e.g. weka_main or weka_flat. */
+  srcKind?: string;
   /** Worker id (concurrency slot that handled this request). */
   wid: string;
   /** Sub-agent depth (0 = top-level). */
@@ -60,6 +66,9 @@ export interface RequestTimeline {
 interface RawMetadata {
   conversation_id?: string;
   turn_index?: number;
+  source_trace_id?: string;
+  source_outer_idx?: number;
+  source_kind?: string;
   worker_id?: string;
   agent_depth?: number;
   benchmark_phase?: string;
@@ -164,6 +173,9 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
     requests.push({
       cid: m.conversation_id ?? 'unknown',
       ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined,
+      srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined,
+      srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined,
       wid: m.worker_id ?? 'unknown',
       ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
       phase: m.benchmark_phase ?? 'unknown',
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index 97e8759d..dec2254c 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -61,6 +61,20 @@ describe('buildConversationStructure', () => {
     });
   });
 
+  it('stamps top-level turns with their raw Weka request index', () => {
+    const structure = buildConversationStructure({
+      id: 'raw-index',
+      requests: [
+        { type: 'n', in: 1, out: 1 },
+        { type: 'subagent', requests: [{ type: 'n', in: 1, out: 1 }] },
+        { type: 'n', in: 1, out: 1 },
+      ],
+    });
+
+    expect((structure.nodes[0] as TurnNode).rawIndex).toBe(0);
+    expect((structure.nodes[2] as TurnNode).rawIndex).toBe(2);
+  });
+
   it('clamps cached to the effective input on a partial last block', () => {
     const conv: RawWekaConversation = {
       id: 'c2',
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index f6cea1c1..bbdb8791 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -48,6 +48,8 @@ export interface RawWekaConversation {
 export interface TurnNode {
   kind: 'turn';
   turnIndex: number;
+  /** Zero-based index in the raw Weka requests array, when this row maps to one. */
+  rawIndex?: number;
   /** Seconds from the start of the conversation. */
   startS?: number;
   /** End of the original request interval (`startS + api_time`). */
@@ -218,7 +220,7 @@ export function buildConversationStructure(
   let numSubagentGroups = 0;
   let turnIndex = 0;
 
-  for (const entry of conv.requests ?? []) {
+  for (const [idx, entry] of (conv.requests ?? []).entries()) {
     if (isSubagent(entry)) {
       const { startS, endS } = subagentTimeRange(entry);
       const childSeen = new Set(seen); // snapshot at spawn; not merged back
@@ -272,6 +274,7 @@ export function buildConversationStructure(
       nodes.push({
         kind: 'turn',
         turnIndex: turnIndex++,
+        rawIndex: idx,
         startS,
         endS: requestEndS(startS, entry.api_time),
         model: entry.model,

From c3c3d4044c3b590454ffccdce4583119ccb88690 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 1 Jul 2026 23:33:36 -0500
Subject: [PATCH 107/111] test(agentic): absolute row alignment for
 phase-spanning timeline conversations

Covers the shared-first ordering in computeStableRowIndex (landed in ad85bed):
conversations present in both warmup and profiling must occupy the same
absolute row in both phase views, with phase-exclusive conversations filling in
below. Guards the case where earlier-starting warmup-only conversations would
otherwise push the shared block down in one view only.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.test.ts    | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
index bebb63a9..779b79f3 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -186,6 +186,37 @@ describe('stable row order + color across phase filters', () => {
     expect(warmupColors.A).not.toBe(warmupColors.B);
   });
 
+  it('phase-spanning conversations occupy the same ABSOLUTE row in both phase views', () => {
+    // Warmup-only conversations start earliest — under a plain global-start
+    // ordering they'd sit above the shared ones in the warmup view but be
+    // absent from the profiling view, sliding every shared row up when the
+    // toggle flips. Spanning conversations must sort first so the leading block
+    // is identical in both views and a carried-over conversation never moves.
+    const data: RequestRecord[] = [
+      rec('W1', 'warmup', 0, 2),
+      rec('W2', 'warmup', 3, 4),
+      rec('A', 'warmup', 5, 8),
+      rec('A', 'profiling', 100, 110),
+      rec('B', 'warmup', 10, 15),
+      rec('B', 'profiling', 120, 130),
+      rec('P', 'profiling', 50, 60),
+    ];
+    const index = computeStableRowIndex(data, 'conversation');
+    const parentLabels = (phase: RequestRecord['phase']) =>
+      buildRequestTimelineRows(
+        data.filter((r) => r.phase === phase),
+        'conversation',
+        new Set(),
+        index,
+      )
+        .filter((r) => r.kind === 'parent')
+        .map((r) => r.label);
+    // Shared block [A, B] leads both views at rows 0 and 1; phase-unique
+    // conversations fill in below.
+    expect(parentLabels('warmup')).toEqual(['A', 'B', 'W1', 'W2']);
+    expect(parentLabels('profiling')).toEqual(['A', 'B', 'P']);
+  });
+
   it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => {
     // Sanity: the legacy self-contained path (no index arg) orders by the
     // subset's own first-start, which is exactly why the shared index is needed.

From 9a31af3880d560177879be6e527baa71091d1948 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 1 Jul 2026 23:50:05 -0500
Subject: [PATCH 108/111] feat(agentic): link subagent requests by source

Signed-off-by: Cam Quilici <cjquilici@gmail.com>
---
 .../components/datasets/conversation-view.tsx |  3 +++
 .../components/datasets/trace-flamegraph.tsx  | 23 +++++++++++++++----
 .../agentic-point/request-timeline.test.ts    | 16 +++++++++++++
 .../agentic-point/request-timeline.tsx        | 18 +++++++++++----
 .../app/src/hooks/api/use-request-timeline.ts |  2 ++
 .../src/etl/compute-request-timeline.test.ts  |  4 ++++
 .../db/src/etl/compute-request-timeline.ts    |  6 ++++-
 packages/db/src/etl/weka-structure.test.ts    |  5 ++++
 packages/db/src/etl/weka-structure.ts         |  9 +++++++-
 9 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index 0be8e58a..359ca381 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -17,9 +17,11 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
   const params = useSearchParams();
   const turnRaw = params.get('turn');
   const sourceRaw = params.get('raw');
+  const sourceInner = params.get('inner');
   const highlight = {
     turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null,
     raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null,
+    inner: sourceInner !== null && /^\d+$/u.test(sourceInner) ? Number(sourceInner) : null,
     agent: params.get('sa'),
   };
 
@@ -98,6 +100,7 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
           structure={data.structure}
           highlightTurn={highlight.turn}
           highlightRawIndex={highlight.raw}
+          highlightInnerIndex={highlight.inner}
           highlightAgentId={highlight.agent}
         />
       </Card>
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index f82f0b5f..a3366342 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -170,6 +170,7 @@ export function TraceFlamegraph({
   structure,
   highlightTurn,
   highlightRawIndex,
+  highlightInnerIndex,
   highlightAgentId,
 }: {
   structure: ConversationStructure;
@@ -177,17 +178,31 @@ export function TraceFlamegraph({
   highlightTurn?: number | null;
   /** Raw Weka top-level request index to scroll to / highlight. */
   highlightRawIndex?: number | null;
+  /** Raw Weka nested request index under highlightRawIndex, for subagent children. */
+  highlightInnerIndex?: number | null;
   /** Subagent id when the highlighted turn is inside a subagent group. */
   highlightAgentId?: string | null;
 }) {
   const nodes = structure.nodes;
 
   // Resolve the deep-link target to a row key (+ the group that must be open to
-  // show it). Raw Weka indexes are exact source coordinates and take precedence;
-  // otherwise main turns match by main-turn ordinal and subagent turns match the
-  // group by agentId, then the ti-th child.
+  // show it). Raw Weka source coordinates are exact and take precedence:
+  //   raw=<outer>             -> top-level Weka request
+  //   raw=<outer>&inner=<idx> -> subagent child inside that top-level marker
+  // Otherwise main turns match by main-turn ordinal and subagent turns match
+  // the group by agentId, then the ti-th child.
   const target = useMemo(() => {
     if (typeof highlightRawIndex === 'number' && highlightRawIndex >= 0) {
+      if (typeof highlightInnerIndex === 'number' && highlightInnerIndex >= 0) {
+        const gi = nodes.findIndex(
+          (node) => node.kind === 'subagent' && node.rawIndex === highlightRawIndex,
+        );
+        if (gi === -1) return null;
+        const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+        const ci = group.children.findIndex((child) => child.innerIndex === highlightInnerIndex);
+        if (ci === -1) return null;
+        return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi };
+      }
       const i = nodes.findIndex(
         (node) => node.kind === 'turn' && node.rawIndex === highlightRawIndex,
       );
@@ -210,7 +225,7 @@ export function TraceFlamegraph({
       }
     }
     return null;
-  }, [nodes, highlightTurn, highlightRawIndex, highlightAgentId]);
+  }, [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId]);
 
   // Subagent groups collapsed by default — except the deep-link target's group.
   const [expanded, setExpanded] = useState<Set<number>>(() =>
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
index 779b79f3..17d6d1bc 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts
@@ -137,6 +137,22 @@ describe('conversationHref', () => {
       }),
     ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204');
   });
+
+  it('uses raw nested source provenance for subagent child links', () => {
+    expect(
+      conversationHref('slug', {
+        ...request(0, 10),
+        cid: '117ebe75819d050f308a0a81647893abd02d::sa:subagent_010_32ee2daa',
+        ti: 16,
+        srcTrace: '117ebe75819d050f308a0a81647893abd02d',
+        srcOuter: 39,
+        srcInner: 16,
+        srcKind: 'weka_subagent',
+      }),
+    ).toBe(
+      '/datasets/slug/conversations/117ebe75819d050f308a0a81647893abd02d?turn=16&raw=39&inner=16',
+    );
+  });
 });
 
 describe('stable row order + color across phase filters', () => {
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 592f5c37..9afad5e6 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -46,12 +46,23 @@ export function conversationHref(datasetSlug: string, req: RequestRecord): strin
   const params = new URLSearchParams({ turn: String(req.ti) });
   if (typeof req.srcOuter === 'number' && Number.isInteger(req.srcOuter) && req.srcOuter >= 0) {
     params.set('raw', String(req.srcOuter));
+    if (typeof req.srcInner === 'number' && Number.isInteger(req.srcInner) && req.srcInner >= 0) {
+      params.set('inner', String(req.srcInner));
+    }
   }
   const sa = subagentIdOf(req.cid);
-  if (sa) params.set('sa', sa);
+  if (sa && !params.has('inner')) params.set('sa', sa);
   return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`;
 }
 
+function requestSourceLabel(req: RequestRecord): string {
+  if (typeof req.srcOuter === 'number') {
+    if (typeof req.srcInner === 'number') return `raw ${req.srcOuter} / child ${req.srcInner}`;
+    return `raw ${req.srcOuter}`;
+  }
+  return `replay turn ${req.ti + 1}`;
+}
+
 export interface RequestIdleStats {
   /** Total time between the first start and last end with no request running. */
   idleNs: number;
@@ -610,10 +621,7 @@ function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean })
       <div className="flex items-center gap-2 font-medium text-foreground">
         <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: row.color }} />
         <span className="truncate">{row.label}</span>
-        {/* Display 1-based to match the dataset flamegraph's "Turn N" labels.
-            The deep-link `turn` param stays 0-based (req.ti) — the flamegraph
-            matches it against a 0-based turn ordinal for highlighting. */}
-        <span className="text-muted-foreground">· turn {req.ti + 1}</span>
+        <span className="text-muted-foreground">· {requestSourceLabel(req)}</span>
         {req.cancelled && <span className="text-destructive">· cancelled</span>}
       </div>
       <div className="mt-1.5 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
index 306d1416..d2143c2b 100644
--- a/packages/app/src/hooks/api/use-request-timeline.ts
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -9,6 +9,8 @@ export interface RequestRecord {
   srcTrace?: string;
   /** Original raw top-level request index within srcTrace. */
   srcOuter?: number;
+  /** Original nested request index within srcOuter, for subagent children. */
+  srcInner?: number;
   /** Loader-specific source kind, e.g. weka_main or weka_flat. */
   srcKind?: string;
   /** Worker id (concurrency slot that handled this request). */
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
index 409dc091..1ad9e63b 100644
--- a/packages/db/src/etl/compute-request-timeline.test.ts
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -9,6 +9,7 @@ interface SyntheticRequest {
   ti: number;
   srcTrace?: string;
   srcOuter?: number;
+  srcInner?: number;
   srcKind?: string;
   wid?: string;
   ad?: number;
@@ -33,6 +34,7 @@ function makeBlob(requests: SyntheticRequest[]) {
         turn_index: r.ti,
         ...(r.srcTrace === undefined ? {} : { source_trace_id: r.srcTrace }),
         ...(r.srcOuter === undefined ? {} : { source_outer_idx: r.srcOuter }),
+        ...(r.srcInner === undefined ? {} : { source_inner_idx: r.srcInner }),
         ...(r.srcKind === undefined ? {} : { source_kind: r.srcKind }),
         worker_id: r.wid ?? 'worker_default',
         agent_depth: r.ad ?? 0,
@@ -133,6 +135,7 @@ describe('computeRequestTimeline', () => {
           ti: 3,
           srcTrace: 'trace',
           srcOuter: 204,
+          srcInner: 16,
           srcKind: 'weka_flat',
           credit: 0,
           start: 10,
@@ -145,6 +148,7 @@ describe('computeRequestTimeline', () => {
       ti: 3,
       srcTrace: 'trace',
       srcOuter: 204,
+      srcInner: 16,
       srcKind: 'weka_flat',
     });
   });
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
index 85f782fc..2cbe5174 100644
--- a/packages/db/src/etl/compute-request-timeline.ts
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -14,7 +14,7 @@
 import { gunzipSync } from 'node:zlib';
 
 /** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const REQUEST_TIMELINE_VERSION = 4;
+export const REQUEST_TIMELINE_VERSION = 5;
 
 export interface RequestRecord {
   /** Conversation id (groups turns of one agent session). */
@@ -25,6 +25,8 @@ export interface RequestRecord {
   srcTrace?: string;
   /** Original raw top-level request index within srcTrace. */
   srcOuter?: number;
+  /** Original nested request index within srcOuter, for subagent children. */
+  srcInner?: number;
   /** Loader-specific source kind, e.g. weka_main or weka_flat. */
   srcKind?: string;
   /** Worker id (concurrency slot that handled this request). */
@@ -68,6 +70,7 @@ interface RawMetadata {
   turn_index?: number;
   source_trace_id?: string;
   source_outer_idx?: number;
+  source_inner_idx?: number;
   source_kind?: string;
   worker_id?: string;
   agent_depth?: number;
@@ -175,6 +178,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
       ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
       srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined,
       srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined,
+      srcInner: typeof m.source_inner_idx === 'number' ? m.source_inner_idx : undefined,
       srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined,
       wid: m.worker_id ?? 'unknown',
       ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index dec2254c..5900d151 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -131,6 +131,7 @@ describe('buildConversationStructure', () => {
     expect(sub.kind).toBe('subagent');
     expect(sub.label).toBe('Explore');
     expect(sub.agentId).toBe('a1');
+    expect(sub.rawIndex).toBe(1);
     expect(sub.durationMs).toBe(1234);
     expect(sub.startS).toBe(12.5);
     expect(sub.endS).toBeCloseTo(13.734, 6);
@@ -141,6 +142,10 @@ describe('buildConversationStructure', () => {
       [12.5, 12.5],
       [13.1, 13.1],
     ]);
+    expect(sub.children.map((child) => [child.rawIndex, child.innerIndex])).toEqual([
+      [1, 0],
+      [1, 1],
+    ]);
     expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
     expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
     expect(sub.in).toBe(256);
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index bbdb8791..edc192ea 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -50,6 +50,8 @@ export interface TurnNode {
   turnIndex: number;
   /** Zero-based index in the raw Weka requests array, when this row maps to one. */
   rawIndex?: number;
+  /** Zero-based index within a raw nested request array, when this row maps to one. */
+  innerIndex?: number;
   /** Seconds from the start of the conversation. */
   startS?: number;
   /** End of the original request interval (`startS + api_time`). */
@@ -67,6 +69,8 @@ export interface SubagentNode {
   kind: 'subagent';
   label: string;
   agentId?: string;
+  /** Zero-based index of the raw top-level subagent marker. */
+  rawIndex?: number;
   /** Seconds from the start of the conversation. */
   startS?: number;
   /** Seconds from the start of the conversation. */
@@ -229,13 +233,15 @@ export function buildConversationStructure(
       let gout = 0;
       let gcached = 0;
       let guncached = 0;
-      for (const inner of entry.requests ?? []) {
+      for (const [innerIdx, inner] of (entry.requests ?? []).entries()) {
         const split = splitInput(inner, childSeen, blockSize);
         const out = Math.max(0, Math.round(inner.out ?? 0));
         const childStartS = subagentRequestStartS(entry, inner);
         children.push({
           kind: 'turn',
           turnIndex: turnIndex++,
+          rawIndex: idx,
+          innerIndex: innerIdx,
           startS: childStartS,
           endS: requestEndS(childStartS, inner.api_time),
           model: inner.model,
@@ -253,6 +259,7 @@ export function buildConversationStructure(
         kind: 'subagent',
         label: subagentLabel(entry),
         agentId: entry.agent_id,
+        rawIndex: idx,
         startS,
         endS,
         durationMs: entry.duration_ms,

From 173836e6ad25d7256cb1b2143798331a80687993 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 00:13:31 -0500
Subject: [PATCH 109/111] feat(ingest): support v3 agentic agg schema (nested
 request/server metrics)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Agentic benchmark rows (2026-07-02+) restructure the flat metric keys into
nested request_metrics / server_metrics containers, add p50 percentiles, ship
intvty already slow-tail inverted, scope the hw id (cluster:b300-nv), and
replace offload_mode with kv_offloading ('none'|'dram'|…) + kv_offload_backend.

- flattenAgenticAggRow(): map the nested containers back onto the canonical
  flat metric schema before the rest of the mapper runs, so v1/v2/v3 rows all
  produce one consistent metrics shape. p50 stats are stored as median_* to
  match the existing naming; the derive-from-itl intvty invariant is kept (it
  now agrees with the artifact's pre-inverted values).
- hwToGpuKey(): strip the v3 scope prefix (cluster:) — without this every v3
  row would be skipped as unmapped hardware.
- offload: kv_offloading descriptor reduces to the binary on/off used for row
  identity; the tier + backend strings are preserved as metrics for the UI.
- METRIC_KEYS: register gpu_kv_cache_usage_pct and
  server_external_cache_hit_rate (both previously warned as auto-captured).
- Deliberately not mapped yet: prefix hit/query counts, cpu KV detail,
  prompt_by_source split, sources[] — noted inline for when a view needs them.

No DB schema changes — everything lands in the existing metrics JSONB and
offload_mode column. Verified against the real artifacts from GH run
28553943579 (71 flat metrics on the conc16 row; kvdram-mooncake row maps to
offload on with backend preserved).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 packages/constants/src/metric-keys.ts        |   5 +
 packages/db/src/etl/benchmark-mapper.test.ts | 247 +++++++++++++++++++
 packages/db/src/etl/benchmark-mapper.ts      | 164 +++++++++++-
 packages/db/src/etl/normalizers.test.ts      |   5 +
 packages/db/src/etl/normalizers.ts           |   8 +-
 5 files changed, 419 insertions(+), 10 deletions(-)

diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index 0acf3fbf..914eed4b 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -106,7 +106,12 @@ export const METRIC_KEYS = new Set([
   // server prefix-cache observability (agentic aiperf)
   'server_gpu_cache_hit_rate',
   'server_cpu_cache_hit_rate',
+  'server_external_cache_hit_rate',
   'theoretical_cache_hit_rate',
+  // server KV-cache occupancy — mean GPU KV-cache usage fraction (0-1) over the
+  // profiling window (agentic aiperf; flat in v2 artifacts, mapped from
+  // server_metrics.kv_cache.gpu_usage_pct in v3)
+  'gpu_kv_cache_usage_pct',
   // measured power / energy (emitted by runner's aggregate_power.py)
   // avg_power_w:             mean per-GPU draw (W) during the load window
   // joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE on
diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts
index 5fe9ffde..cde2f74b 100644
--- a/packages/db/src/etl/benchmark-mapper.test.ts
+++ b/packages/db/src/etl/benchmark-mapper.test.ts
@@ -614,3 +614,250 @@ describe('mapBenchmarkRow — agentic interactivity normalization', () => {
     expect(result!.metrics.p90_intvty).toBe(999);
   });
 });
+
+/**
+ * Minimal v3 agentic row (2026-07-02+): nested request_metrics/server_metrics,
+ * p50 percentiles, pre-inverted intvty, kv_offloading descriptors. Mirrors the
+ * real artifact from GH run 28553943579 (trimmed).
+ */
+function makeV3AgenticRow(overrides: Record<string, any> = {}): Record<string, any> {
+  return {
+    infmax_model_prefix: 'dsv4',
+    hw: 'cluster:b300-nv',
+    framework: 'vllm',
+    precision: 'fp4',
+    spec_decoding: 'none',
+    disagg: false,
+    scenario_type: 'agentic-coding',
+    is_multinode: false,
+    tp: 4,
+    ep: 1,
+    dp_attention: 'false',
+    conc: 16,
+    image: 'vllm/vllm-openai:v0.23.0',
+    kv_offloading: 'none',
+    kv_offload_backend: '',
+    num_requests_total: 1648,
+    num_requests_successful: 1648,
+    dataset: {
+      source_type: 'public_dataset',
+      hf_dataset_name: 'semianalysisai/cc-traces-weka-062126',
+    },
+    request_metrics: {
+      qps: {
+        window_seconds: 1,
+        samples: 7209,
+        mean: 0.22846,
+        p50: 0,
+        p75: 0,
+        p90: 1,
+        p95: 1,
+        std: 0.60707,
+      },
+      latency: {
+        ttft: {
+          mean: 12.90033,
+          p50: 1.49712,
+          p75: 12.09501,
+          p90: 56.22194,
+          p95: 68.03156,
+          std: 22.68353,
+        },
+        e2el: {
+          mean: 81.05644,
+          p50: 26.18817,
+          p75: 84.93601,
+          p90: 199.85996,
+          p95: 360.31579,
+          std: 149.59205,
+        },
+        itl: {
+          mean: 0.07548,
+          p50: 0.03677,
+          p75: 0.10253,
+          p90: 0.16652,
+          p95: 0.22255,
+          std: 0.08327,
+        },
+        tpot: {
+          mean: 0.07548,
+          p50: 0.03677,
+          p75: 0.10253,
+          p90: 0.16652,
+          p95: 0.22255,
+          std: 0.08327,
+        },
+        // already slow-tail inverted upstream (pXX_intvty = 1/pXX_itl)
+        intvty: {
+          mean: 13.2482,
+          p50: 27.19411,
+          p75: 9.75304,
+          p90: 6.00526,
+          p95: 4.49335,
+          std: 24.77636,
+        },
+      },
+      tokens: {
+        input: {
+          mean: 157676.054,
+          p50: 96047,
+          p75: 197684.25,
+          p90: 404935.9,
+          p95: 547502.85,
+          std: 152480.17653,
+        },
+        output_actual: {
+          mean: 849.06735,
+          p50: 290.5,
+          p75: 783.5,
+          p90: 2231.8,
+          p95: 3915.45,
+          std: 1568.90823,
+        },
+        output_expected: {
+          mean: 1432.32728,
+          p50: 571.5,
+          p75: 1820,
+          p90: 3927,
+          p95: 5312.9,
+          std: 2067.19215,
+        },
+      },
+      throughput: {
+        input: { tokens_per_second: 35980.14001 },
+        output: { tokens_per_second: 193.7489 },
+        total: { tokens_per_second: 36173.88892 },
+        duration_seconds: 7222.04352,
+        per_gpu: {
+          total_tput_tps: 9043.47223,
+          output_tput_tps: 48.43723,
+          input_tput_tps: 8995.035,
+        },
+      },
+      cache: { theoretical_cache_hit_rate: 0.97509 },
+    },
+    server_metrics: {
+      present: true,
+      adapter: 'vllm',
+      metric_count: 49,
+      cache: {
+        gpu_cache_hit_rate: 0.78539,
+        cpu_cache_hit_rate: 0,
+        external_cache_hit_rate: 0,
+        overall_cache_hit_rate: 0.78539,
+        prefix_cache_hits: 205576960,
+        prefix_cache_queries: 261750519,
+        frontend_cache_hit_rate: null,
+      },
+      kv_cache: { gpu_usage_pct: 0.82134, cpu_usage_pct: null, cpu_used_tokens: null },
+      tokens: {
+        prompt_total: 261750519,
+        generation_total: 1422696,
+        requests_completed: 1648,
+        prompt_by_source: {
+          gpu_cache_hit: 205576960,
+          cpu_or_external_cache_hit: 0,
+          computed: 56173559,
+        },
+      },
+      sources: [{ id: 'combined|http://localhost:8888/metrics|engine=0', role: 'combined' }],
+    },
+    ...overrides,
+  };
+}
+
+describe('mapBenchmarkRow — v3 agentic nested agg schema', () => {
+  it('maps identity/routing and flattens the nested containers', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV3AgenticRow(), tracker);
+
+    expect(result).not.toBeNull();
+    expect(result!.benchmarkType).toBe('agentic_traces');
+    expect(result!.config.hardware).toBe('b300');
+    expect(result!.conc).toBe(16);
+    expect(result!.isl).toBeNull();
+    expect(result!.osl).toBeNull();
+
+    const m = result!.metrics;
+    // latency distributions, p50 stored under the canonical median_* name
+    expect(m.median_ttft).toBeCloseTo(1.49712, 6);
+    expect(m.p90_ttft).toBeCloseTo(56.22194, 6);
+    expect(m.std_e2el).toBeCloseTo(149.59205, 6);
+    expect(m.p95_itl).toBeCloseTo(0.22255, 6);
+    expect(m.mean_tpot).toBeCloseTo(0.07548, 6);
+    // qps + token distributions
+    expect(m.median_qps).toBe(0);
+    expect(m.p90_input_tokens).toBeCloseTo(404935.9, 3);
+    expect(m.median_output_tokens_actual).toBeCloseTo(290.5, 3);
+    expect(m.p95_output_tokens_expected).toBeCloseTo(5312.9, 3);
+    // throughput scalars under the v2 flat names
+    expect(m.tput_per_gpu).toBeCloseTo(9043.47223, 3);
+    expect(m.output_tput_per_gpu).toBeCloseTo(48.43723, 3);
+    expect(m.input_tput_per_gpu).toBeCloseTo(8995.035, 3);
+    expect(m.total_tput_tps).toBeCloseTo(36173.88892, 3);
+    expect(m.duration_seconds).toBeCloseTo(7222.04352, 3);
+    // cache / kv / totals
+    expect(m.theoretical_cache_hit_rate).toBeCloseTo(0.97509, 6);
+    expect(m.server_gpu_cache_hit_rate).toBeCloseTo(0.78539, 6);
+    expect(m.server_external_cache_hit_rate).toBe(0);
+    expect(m.gpu_kv_cache_usage_pct).toBeCloseTo(0.82134, 6);
+    expect(m.total_prompt_tokens).toBe(261750519);
+    expect(m.total_generation_tokens).toBe(1422696);
+    expect(m.total_requests_completed).toBe(1648);
+    // nested containers must not leak into metrics
+    expect(m).not.toHaveProperty('request_metrics');
+    expect(m).not.toHaveProperty('server_metrics');
+  });
+
+  it('re-derives *_intvty from *_itl (matching the pre-inverted artifact values)', () => {
+    const tracker = createSkipTracker();
+    const m = mapBenchmarkRow(makeV3AgenticRow(), tracker)!.metrics;
+    // The artifact already ships slow-tail intvty; the derive invariant keeps
+    // one definition and must agree with it (up to the artifact's rounding).
+    expect(m.median_intvty).toBeCloseTo(1 / 0.03677, 6);
+    expect(m.p90_intvty).toBeCloseTo(1 / 0.16652, 6);
+    expect(m.median_intvty).toBeCloseTo(27.19411, 2);
+    expect(m.p90_intvty).toBeCloseTo(6.00526, 2);
+    // std is never inverted — passes through from the artifact
+    expect(m.std_intvty).toBeCloseTo(24.77636, 6);
+  });
+
+  it("maps kv_offloading 'none' to offload off and skips the empty backend", () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(makeV3AgenticRow(), tracker);
+    expect(result!.offloadMode).toBe('off');
+    expect(result!.metrics).not.toHaveProperty('kv_offload_backend');
+  });
+
+  it("maps kv_offloading 'dram' + backend to offload on with the backend preserved", () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ kv_offloading: 'dram', kv_offload_backend: 'mooncake', conc: 32 }),
+      tracker,
+    );
+    expect(result!.offloadMode).toBe('on');
+    expect((result!.metrics as Record<string, unknown>).kv_offloading).toBe('dram');
+    expect((result!.metrics as Record<string, unknown>).kv_offload_backend).toBe('mooncake');
+  });
+
+  it('still applies the failed-run guard to v3 rows', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 100 }),
+      tracker,
+    );
+    expect(result).toBeNull();
+    expect(tracker.skips.failedRun).toBe(1);
+  });
+
+  it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => {
+    const tracker = createSkipTracker();
+    const result = mapBenchmarkRow(
+      makeAgenticRow({ p90_itl: 0.1, mean_ttft: 1.5, offload_mode: 'on' }),
+      tracker,
+    );
+    expect(result!.metrics.mean_ttft).toBe(1.5);
+    expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6);
+    expect(result!.offloadMode).toBe('on');
+  });
+});
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 5ec3343c..59309945 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -63,6 +63,14 @@ const NON_METRIC_KEYS = new Set([
   'offload_mode',
   'num_requests_total',
   'num_requests_successful',
+  // v3 agentic KV-offload descriptors ('none'|'dram'|… + backend name). Mapped
+  // to offloadMode / stringified metrics explicitly in mapBenchmarkRow.
+  'kv_offloading',
+  'kv_offload_backend',
+  // v3 agentic nested containers — flattened by flattenAgenticAggRow before
+  // the auto-capture loop runs; the raw objects themselves are not metrics.
+  'request_metrics',
+  'server_metrics',
   // Public-dataset provenance emitted by aiperf. The ingest runner uses this
   // object to populate run_datasets; it is not a benchmark metric.
   'dataset',
@@ -79,6 +87,136 @@ const NON_METRIC_KEYS = new Set([
  */
 export type BenchmarkType = 'single_turn' | 'agentic_traces';
 
+// ---------------------------------------------------------------------------
+// v3 agentic agg schema (2026-07-02+): nested containers → canonical flat keys
+// ---------------------------------------------------------------------------
+
+/**
+ * Distribution stat names accepted from v3 nested stat blocks, with the rename
+ * applied when flattening. `p50` is stored as `median_*` to match the
+ * established METRIC_KEYS naming (fixed-seq runs and the frontend both use
+ * `median_*`; no `p50_*` key exists anywhere downstream).
+ */
+const V3_STAT_KEYS: Record<string, string> = {
+  mean: 'mean',
+  p50: 'median',
+  median: 'median',
+  p75: 'p75',
+  p90: 'p90',
+  p95: 'p95',
+  p99: 'p99',
+  'p99.9': 'p99.9',
+  std: 'std',
+};
+
+/** v3 `request_metrics.latency` sub-blocks → flat metric suffix (same name). */
+const V3_LATENCY_METRICS = ['ttft', 'e2el', 'itl', 'tpot', 'intvty'] as const;
+
+/** v3 `request_metrics.tokens` sub-blocks → flat metric suffix. */
+const V3_TOKEN_METRICS: Record<string, string> = {
+  input: 'input_tokens',
+  output_actual: 'output_tokens_actual',
+  output_expected: 'output_tokens_expected',
+};
+
+/**
+ * Scalar paths in the v3 nested containers → canonical flat metric key. Keys
+ * reuse the flat v2-agentic names wherever one existed so already-ingested runs
+ * and the frontend see one consistent schema; genuinely new information gets a
+ * new key (registered in METRIC_KEYS).
+ */
+const V3_SCALAR_PATHS: [string[], string][] = [
+  // client-side throughput
+  [['request_metrics', 'throughput', 'input', 'tokens_per_second'], 'input_tput_tps'],
+  [['request_metrics', 'throughput', 'output', 'tokens_per_second'], 'output_tput_tps'],
+  [['request_metrics', 'throughput', 'total', 'tokens_per_second'], 'total_tput_tps'],
+  [['request_metrics', 'throughput', 'duration_seconds'], 'duration_seconds'],
+  [['request_metrics', 'throughput', 'per_gpu', 'total_tput_tps'], 'tput_per_gpu'],
+  [['request_metrics', 'throughput', 'per_gpu', 'output_tput_tps'], 'output_tput_per_gpu'],
+  [['request_metrics', 'throughput', 'per_gpu', 'input_tput_tps'], 'input_tput_per_gpu'],
+  [['request_metrics', 'cache', 'theoretical_cache_hit_rate'], 'theoretical_cache_hit_rate'],
+  // server-side prefix-cache observability (same fields v2 emitted flat)
+  [['server_metrics', 'cache', 'gpu_cache_hit_rate'], 'server_gpu_cache_hit_rate'],
+  [['server_metrics', 'cache', 'cpu_cache_hit_rate'], 'server_cpu_cache_hit_rate'],
+  [['server_metrics', 'cache', 'external_cache_hit_rate'], 'server_external_cache_hit_rate'],
+  // KV-cache occupancy (gpu key predates v3 as a flat auto-captured field)
+  [['server_metrics', 'kv_cache', 'gpu_usage_pct'], 'gpu_kv_cache_usage_pct'],
+  // server token totals
+  [['server_metrics', 'tokens', 'prompt_total'], 'total_prompt_tokens'],
+  [['server_metrics', 'tokens', 'generation_total'], 'total_generation_tokens'],
+  [['server_metrics', 'tokens', 'requests_completed'], 'total_requests_completed'],
+  // Deliberately NOT mapped (yet): cache.overall_cache_hit_rate, cache.prefix_cache_hits,
+  // cache.prefix_cache_queries, kv_cache.cpu_*, tokens.prompt_by_source, sources[] — new v3
+  // detail we don't consume anywhere; add here + METRIC_KEYS when a view needs them.
+];
+
+/** Reduce an offload descriptor ('none'|'dram'|…) to the binary on/off. */
+function descriptorToOnOff(v: unknown): string | null {
+  return typeof v === 'string' && v.length > 0 ? (v === 'none' ? 'off' : 'on') : null;
+}
+
+/** Walk a nested object path; returns undefined on any non-object hop. */
+function atPath(obj: Record<string, any>, path: string[]): unknown {
+  let cur: unknown = obj;
+  for (const seg of path) {
+    if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined;
+    cur = (cur as Record<string, unknown>)[seg];
+  }
+  return cur;
+}
+
+/** Flatten one v3 stat block ({mean, p50, …}) into `out` as `{stat}_{suffix}`. */
+function flattenStatBlock(block: unknown, suffix: string, out: Record<string, number>): void {
+  if (!block || typeof block !== 'object' || Array.isArray(block)) return;
+  for (const [stat, canonical] of Object.entries(V3_STAT_KEYS)) {
+    const n = parseNum((block as Record<string, unknown>)[stat]);
+    if (n !== undefined) out[`${canonical}_${suffix}`] = n;
+  }
+}
+
+/**
+ * Flatten a v3 agentic agg row (nested `request_metrics` / `server_metrics`
+ * containers, 2026-07-02+) into the canonical flat metric schema that v1/v2
+ * artifacts emitted directly and that the DB / API / frontend consume.
+ *
+ * Returns the row unchanged when `request_metrics` is absent (v1/v2 rows pass
+ * through untouched). Otherwise returns a copy with the flattened metrics
+ * merged in; the nested containers stay on the row (they're in NON_METRIC_KEYS
+ * so the auto-capture loop ignores them).
+ *
+ * Notes on the v3 source data:
+ * - `p50` percentiles are new (v2 had no median for agentic); stored as
+ *   `median_*` to match the frontend's naming.
+ * - `latency.intvty` arrives already slow-tail inverted (pXX_intvty =
+ *   1/pXX_itl). It's flattened here for completeness, but mapBenchmarkRow's
+ *   derive-from-itl invariant still overwrites it, keeping one definition
+ *   across all harness versions.
+ */
+export function flattenAgenticAggRow(row: Record<string, any>): Record<string, any> {
+  const rm = row.request_metrics;
+  if (!rm || typeof rm !== 'object' || Array.isArray(rm)) return row;
+
+  const flat: Record<string, number> = {};
+
+  // latency distributions
+  for (const metric of V3_LATENCY_METRICS) {
+    flattenStatBlock(atPath(row, ['request_metrics', 'latency', metric]), metric, flat);
+  }
+  // qps distribution (window_seconds / samples are intentionally not stats)
+  flattenStatBlock(atPath(row, ['request_metrics', 'qps']), 'qps', flat);
+  // per-request token-count distributions
+  for (const [src, suffix] of Object.entries(V3_TOKEN_METRICS)) {
+    flattenStatBlock(atPath(row, ['request_metrics', 'tokens', src]), suffix, flat);
+  }
+  // scalars
+  for (const [path, key] of V3_SCALAR_PATHS) {
+    const n = parseNum(atPath(row, path));
+    if (n !== undefined) flat[key] = n;
+  }
+
+  return { ...row, ...flat };
+}
+
 /**
  * METRIC_KEYS from constants is the canonical set of known metric keys.
  * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -148,6 +286,11 @@ export function mapBenchmarkRow(
   tracker: SkipTracker,
   islOslFallback?: { isl: number; osl: number } | null,
 ): BenchmarkParams | null {
+  // v3 agentic rows nest their metrics; flatten to the canonical flat schema
+  // first so the rest of the mapper (auto-capture, intvty invariant, guards)
+  // is version-agnostic. No-op for v1/v2 rows.
+  row = flattenAgenticAggRow(row);
+
   const modelKey = resolveModelKey(row);
   if (!modelKey) {
     tracker.skips.unmappedModel++;
@@ -192,16 +335,15 @@ export function mapBenchmarkRow(
     return null;
   }
 
-  // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
-  // ('none' → 'off'; any other non-empty value → 'on').
+  // Agentic offload signal: prefer `offload_mode` ('on'|'off'), then the v3
+  // `kv_offloading` descriptor ('none'|'dram'|…), then legacy `offloading`.
+  // Descriptors reduce to the binary on/off used for row identity ('none' →
+  // 'off', anything else → 'on') so v3 offload points land on the same row key
+  // as their v2 predecessors instead of forking a third offload_mode value.
   const offloadModeRaw =
     typeof row.offload_mode === 'string' && row.offload_mode.length > 0
       ? row.offload_mode
-      : typeof row.offloading === 'string' && row.offloading.length > 0
-        ? row.offloading === 'none'
-          ? 'off'
-          : 'on'
-        : 'off';
+      : (descriptorToOnOff(row.kv_offloading) ?? descriptorToOnOff(row.offloading) ?? 'off');
 
   const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
   const isMultinode = parseBool(row.is_multinode);
@@ -265,8 +407,16 @@ export function mapBenchmarkRow(
 
   // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
   // — preserve as a stringified metric so the frontend can expose it in tooltips.
+  // v3 rows additionally carry the offload tier + backend ('dram'/'mooncake');
+  // keep them so the UI can say *what kind* of offload, not just on/off.
   if (isAgentic) {
     (metrics as Record<string, unknown>).offload_mode = offloadModeRaw;
+    if (typeof row.kv_offloading === 'string' && row.kv_offloading.length > 0) {
+      (metrics as Record<string, unknown>).kv_offloading = row.kv_offloading;
+    }
+    if (typeof row.kv_offload_backend === 'string' && row.kv_offload_backend.length > 0) {
+      (metrics as Record<string, unknown>).kv_offload_backend = row.kv_offload_backend;
+    }
   }
 
   // Slow-tail interactivity invariant. Agentic artifacts ship `*_intvty`, but the
diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts
index e569143a..82aaf67c 100644
--- a/packages/db/src/etl/normalizers.test.ts
+++ b/packages/db/src/etl/normalizers.test.ts
@@ -25,6 +25,11 @@ describe('hwToGpuKey', () => {
     expect(hwToGpuKey('mi300x-amd')).toBe('mi300x');
   });
 
+  it('strips a v3 scope prefix (cluster:…)', () => {
+    expect(hwToGpuKey('cluster:b300-nv')).toBe('b300');
+    expect(hwToGpuKey('cluster:h200')).toBe('h200');
+  });
+
   it('strips -amds suffix', () => {
     expect(hwToGpuKey('mi355x-amds')).toBe('mi355x');
   });
diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index c5ff69dc..844e1751 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -22,9 +22,11 @@ export { GPU_KEYS };
  *   stripped base is not in `GPU_KEYS`.
  */
 export function hwToGpuKey(hw: string): string | null {
-  // Take the first segment before `-` as the canonical key. Subsumes all the
-  // prior explicit suffix strips (-nv, -amds, -dgxc-slurm, -p1, -cw, …).
-  const base = hw.toLowerCase().split('-')[0];
+  // v3 agentic artifacts scope the hw id (`cluster:b300-nv`) — drop everything
+  // up to the last `:` first. Then take the first segment before `-` as the
+  // canonical key; that subsumes all the prior explicit suffix strips
+  // (-nv, -amds, -dgxc-slurm, -p1, -cw, …).
+  const base = hw.toLowerCase().split(':').pop()!.split('-')[0];
   return GPU_KEYS.has(base) ? base : null;
 }
 

From 3a06f891158eee16851d9fe86484d6fb8161f681 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 00:43:29 -0500
Subject: [PATCH 110/111] fix(ingest): find server.log in the v3 harness's
 nested results/ layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

feat-agentx-v1.0 moved the log inside server_logs_* artifacts from the root
(server.log) to results/server.log; the discovery loop only checked the root,
so run 28553943579's first ingest pass attached 0/20 server logs. Check both
locations.

Also harden the ingest agent doc: changelog entries are mandatory for every
ingest (derive from the run name when no text is given, never block asking),
and fix a doc typo that pointed the cache purge at port 3000 — the port the
same doc forbids touching.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .claude/agents/ingest.md         | 15 ++++++++++-----
 packages/db/src/ingest-ci-run.ts |  9 +++++++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
index 59045378..10e37d6c 100644
--- a/.claude/agents/ingest.md
+++ b/.claude/agents/ingest.md
@@ -130,7 +130,12 @@ The `spec_method` column has a lowercase check constraint — always lowercase.
 
 Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`.
 
-## Adding a perf changelog entry
+## Adding a perf changelog entry — MANDATORY for every ingest
+
+**You MUST ALWAYS add a changelog entry for every run you ingest. This is not optional.** Every standard ingest, delete+reingest, and partial ingest gets exactly one changelog entry. Never finish an ingest without one.
+
+- If the user gave changelog text, use it verbatim (substitute `<SKU>` with the run's hardware SKU when the text contains that placeholder).
+- If the user did NOT specify text, DO NOT skip the changelog — derive a sensible description from the run name (see convention below) and add it anyway, then tell the user what you used so they can adjust.
 
 Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `<model>-<precision>-<hw>-<framework>` (matches what the user actually sees in the filter chain).
 
@@ -147,7 +152,7 @@ Description convention from prior entries: `<HW upper> <Model> Ingest #<N> (<not
 - `MI355X Kimi Ingest #2`
 - `H200 Kimi Ingest #1 (mmap cache)`
 
-If user doesn't specify a description, ask for one OR derive from the run name.
+If the user doesn't specify a description, DO NOT skip the entry and DO NOT block on asking — derive a description from the run name, add the entry, and report what you used so the user can adjust.
 
 ## Common gotchas
 
@@ -165,9 +170,9 @@ If user doesn't specify a description, ask for one OR derive from the run name.
 2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding.
 3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line.
 4. **Refresh materialized view**.
-5. **Add changelog entry** if the user asked or if the run is a "marker" worth surfacing.
-6. **Purge both caches** (localhost 3002 + preview).
-7. **Report** the row count, date, hardware, run id, and changelog id (if added).
+5. **Add changelog entry — ALWAYS, MANDATORY.** Every ingest gets exactly one changelog entry (see "Adding a perf changelog entry — MANDATORY for every ingest"). Use the user's text if given (substituting `<SKU>`); otherwise derive one from the run name and add it anyway. Never skip this step.
+6. **Purge both caches** (localhost 3002 + preview — never port 3000).
+7. **Report** the row count, date, hardware, run id, and the changelog id (always present).
 
 ## Related: ingesting agentic _datasets_ (not benchmark runs)
 
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 8ec1fb9e..8bea3378 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -384,8 +384,13 @@ async function main(): Promise<void> {
     if (fs.existsSync(artifactsDir)) {
       for (const d of fs.readdirSync(artifactsDir)) {
         if (!d.startsWith('server_logs_')) continue;
-        const logPath = path.join(artifactsDir, d, 'server.log');
-        if (!fs.existsSync(logPath)) continue;
+        // feat-agentx-v1.0 harness nests the log under `results/server.log`;
+        // older runs keep it at the artifact root. Check both.
+        const logPath = [
+          path.join(artifactsDir, d, 'server.log'),
+          path.join(artifactsDir, d, 'results', 'server.log'),
+        ].find((p) => fs.existsSync(p));
+        if (!logPath) continue;
         const configKey = d.replace(/^server_logs_/u, '');
         serverLogPaths.set(configKey, logPath);
       }

From f67b349d96aa0e3c53cbe6577954bdd32053f251 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 2 Jul 2026 09:44:53 -0500
Subject: [PATCH 111/111] fix(datasets): resolve raw= deep links for
 pre-rawIndex conversation structures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Timeline request bars link to the dataset flamegraph with raw weka
coordinates (raw=/inner=), but stored conversation structures ingested
before rawIndex/innerIndex existed have no such fields, so the lookup
never matched and the deep link silently did nothing. Since
buildConversationStructure emits exactly one node per raw entry, array
position is definitionally the raw index — resolve via
(node.rawIndex ?? arrayIndex) in an extracted, unit-tested
resolveDeepLinkTarget. Out-of-range coords still return null rather
than guessing; positional turn=/sa= links are unchanged.

Verified with Playwright against real data: point 425111 (new mapping)
turn-1, later-turn, and subagent-child links highlight the exact row;
old positional-link runs (424976) unregressed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../datasets/trace-flamegraph.test.ts         | 102 ++++++++++++++-
 .../components/datasets/trace-flamegraph.tsx  | 118 ++++++++++++------
 2 files changed, 179 insertions(+), 41 deletions(-)

diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts
index 2ead726b..0cbf92f3 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.test.ts
+++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts
@@ -1,6 +1,16 @@
 import { describe, expect, it } from 'vitest';
 
-import { findRequestOverlapGroups, formatElapsedTime } from './trace-flamegraph';
+import type {
+  StructureNode,
+  SubagentNode,
+  TurnNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+import {
+  findRequestOverlapGroups,
+  formatElapsedTime,
+  resolveDeepLinkTarget,
+} from './trace-flamegraph';
 
 describe('formatElapsedTime', () => {
   it('formats elapsed seconds below and above one hour', () => {
@@ -53,3 +63,93 @@ describe('findRequestOverlapGroups', () => {
     expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]);
   });
 });
+
+const turn = (turnIndex: number, extra: Partial<TurnNode> = {}): TurnNode => ({
+  kind: 'turn',
+  turnIndex,
+  in: 100,
+  out: 10,
+  cached: 0,
+  uncached: 100,
+  ...extra,
+});
+const subagent = (children: TurnNode[], extra: Partial<SubagentNode> = {}): SubagentNode => ({
+  kind: 'subagent',
+  label: 'Subagent',
+  in: 100,
+  out: 10,
+  cached: 0,
+  uncached: 100,
+  children,
+  ...extra,
+});
+
+describe('resolveDeepLinkTarget', () => {
+  // Node layout mirroring a real Weka conversation: raw entries
+  //   0: turn, 1: subagent (2 children), 2: turn
+  const withRawIndexes: StructureNode[] = [
+    turn(0, { rawIndex: 0 }),
+    subagent([turn(1, { rawIndex: 1, innerIndex: 0 }), turn(2, { rawIndex: 1, innerIndex: 1 })], {
+      agentId: 'subagent_001_abcd1234',
+      rawIndex: 1,
+    }),
+    turn(3, { rawIndex: 2 }),
+  ];
+  // The same conversation as stored by the pre-rawIndex ingest (fields absent).
+  const legacy: StructureNode[] = [
+    turn(0),
+    subagent([turn(1), turn(2)], { agentId: 'subagent_001_abcd1234' }),
+    turn(3),
+  ];
+
+  it('resolves raw source coordinates against explicit rawIndex fields', () => {
+    expect(resolveDeepLinkTarget(withRawIndexes, { raw: 2 })).toEqual({
+      rowKey: 't-2',
+      expandGroup: null,
+    });
+    expect(resolveDeepLinkTarget(withRawIndexes, { raw: 1, inner: 1 })).toEqual({
+      rowKey: 'g-1-c-1',
+      expandGroup: 1,
+    });
+  });
+
+  it('falls back to node array position for structures ingested before rawIndex existed', () => {
+    // One node per raw entry means position === raw index, so the deep link
+    // must still resolve exactly (regression: it previously returned null and
+    // the flamegraph neither scrolled nor highlighted anything).
+    expect(resolveDeepLinkTarget(legacy, { raw: 2, turn: 1 })).toEqual({
+      rowKey: 't-2',
+      expandGroup: null,
+    });
+    expect(resolveDeepLinkTarget(legacy, { raw: 0, turn: 0 })).toEqual({
+      rowKey: 't-0',
+      expandGroup: null,
+    });
+  });
+
+  it('resolves subagent children positionally when innerIndex is absent', () => {
+    expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 1, turn: 1 })).toEqual({
+      rowKey: 'g-1-c-1',
+      expandGroup: 1,
+    });
+  });
+
+  it('returns null for out-of-range raw coordinates instead of guessing', () => {
+    expect(resolveDeepLinkTarget(legacy, { raw: 9 })).toBeNull();
+    expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 5 })).toBeNull();
+    // raw pointing at a subagent marker without inner does not match a turn.
+    expect(resolveDeepLinkTarget(legacy, { raw: 1 })).toBeNull();
+  });
+
+  it('keeps the positional turn/agent fallback for links without raw coordinates', () => {
+    expect(resolveDeepLinkTarget(legacy, { turn: 1 })).toEqual({
+      rowKey: 't-2',
+      expandGroup: null,
+    });
+    expect(resolveDeepLinkTarget(legacy, { turn: 1, agent: 'subagent_001_abcd1234' })).toEqual({
+      rowKey: 'g-1-c-1',
+      expandGroup: 1,
+    });
+    expect(resolveDeepLinkTarget(legacy, {})).toBeNull();
+  });
+});
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index a3366342..d57567e5 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -160,6 +160,73 @@ interface TooltipState {
   row: VisibleRow;
 }
 
+export interface DeepLinkHighlight {
+  turn?: number | null;
+  raw?: number | null;
+  inner?: number | null;
+  agent?: string | null;
+}
+
+export interface DeepLinkTarget {
+  rowKey: string;
+  expandGroup: number | null;
+}
+
+/**
+ * Resolve a request-timeline deep link to a flamegraph row key (+ the subagent
+ * group that must be expanded to show it). Raw Weka source coordinates are
+ * exact and take precedence:
+ *   raw=<outer>             -> top-level Weka request
+ *   raw=<outer>&inner=<idx> -> subagent child inside that top-level marker
+ * Otherwise main turns match by main-turn ordinal and subagent turns match the
+ * group by agentId, then the child at index `turn` within that group.
+ *
+ * `buildConversationStructure` emits exactly one node per raw Weka entry (and
+ * one child per nested entry), so a node's array position IS its raw index.
+ * Structures ingested before rawIndex/innerIndex existed lack those fields, so
+ * the lookup falls back to the array position; deep links keep resolving for
+ * those older rows instead of silently doing nothing.
+ */
+export function resolveDeepLinkTarget(
+  nodes: readonly StructureNode[],
+  highlight: DeepLinkHighlight,
+): DeepLinkTarget | null {
+  const { turn, raw, inner, agent } = highlight;
+  if (typeof raw === 'number' && raw >= 0) {
+    if (typeof inner === 'number' && inner >= 0) {
+      const gi = nodes.findIndex(
+        (node, i) => node.kind === 'subagent' && (node.rawIndex ?? i) === raw,
+      );
+      if (gi === -1) return null;
+      const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+      const ci = group.children.findIndex((child, i) => (child.innerIndex ?? i) === inner);
+      if (ci === -1) return null;
+      return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi };
+    }
+    const i = nodes.findIndex(
+      (node, idx) => node.kind === 'turn' && (node.rawIndex ?? idx) === raw,
+    );
+    if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null };
+    return null;
+  }
+  if (typeof turn !== 'number' || turn < 0) return null;
+  if (agent) {
+    const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === agent);
+    if (gi === -1) return null;
+    const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+    if (turn >= group.children.length) return null;
+    return { rowKey: `g-${gi}-c-${turn}`, expandGroup: gi };
+  }
+  let ordinal = 0;
+  for (let i = 0; i < nodes.length; i++) {
+    if (nodes[i].kind === 'turn') {
+      if (ordinal === turn) return { rowKey: `t-${i}`, expandGroup: null };
+      ordinal += 1;
+    }
+  }
+  return null;
+}
+
 /**
  * Per-conversation flamegraph driven by the precomputed `structure` JSONB.
  * One row per turn; subagent groups render a collapsible header with indented
@@ -186,46 +253,17 @@ export function TraceFlamegraph({
   const nodes = structure.nodes;
 
   // Resolve the deep-link target to a row key (+ the group that must be open to
-  // show it). Raw Weka source coordinates are exact and take precedence:
-  //   raw=<outer>             -> top-level Weka request
-  //   raw=<outer>&inner=<idx> -> subagent child inside that top-level marker
-  // Otherwise main turns match by main-turn ordinal and subagent turns match
-  // the group by agentId, then the ti-th child.
-  const target = useMemo(() => {
-    if (typeof highlightRawIndex === 'number' && highlightRawIndex >= 0) {
-      if (typeof highlightInnerIndex === 'number' && highlightInnerIndex >= 0) {
-        const gi = nodes.findIndex(
-          (node) => node.kind === 'subagent' && node.rawIndex === highlightRawIndex,
-        );
-        if (gi === -1) return null;
-        const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
-        const ci = group.children.findIndex((child) => child.innerIndex === highlightInnerIndex);
-        if (ci === -1) return null;
-        return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi };
-      }
-      const i = nodes.findIndex(
-        (node) => node.kind === 'turn' && node.rawIndex === highlightRawIndex,
-      );
-      if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null };
-      return null;
-    }
-    if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null;
-    if (highlightAgentId) {
-      const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId);
-      if (gi === -1) return null;
-      const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
-      if (highlightTurn >= group.children.length) return null;
-      return { rowKey: `g-${gi}-c-${highlightTurn}`, expandGroup: gi };
-    }
-    let ordinal = 0;
-    for (let i = 0; i < nodes.length; i++) {
-      if (nodes[i].kind === 'turn') {
-        if (ordinal === highlightTurn) return { rowKey: `t-${i}`, expandGroup: null };
-        ordinal += 1;
-      }
-    }
-    return null;
-  }, [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId]);
+  // show it). See resolveDeepLinkTarget for the matching rules.
+  const target = useMemo(
+    () =>
+      resolveDeepLinkTarget(nodes, {
+        turn: highlightTurn,
+        raw: highlightRawIndex,
+        inner: highlightInnerIndex,
+        agent: highlightAgentId,
+      }),
+    [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId],
+  );
 
   // Subagent groups collapsed by default — except the deep-link target's group.
   const [expanded, setExpanded] = useState<Set<number>>(() =>