From 0e35e5f0b10c2c9db10094031a2ac92e59fff9f3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 23 Apr 2026 13:40:27 -0500 Subject: [PATCH 001/111] feat: agentic benchmark ingest + UI with offload-mode halo Adds agentic_traces scenario end-to-end: - Schema migrations for agentic scenario, availability, and KV offload mode - DB ingest/ETL + query updates to carry scenario, offload_mode, and server/theoretical cache-hit rates through to the API layer - Frontend types, filters (GlobalFilterContext / InferenceContext / ChartControls), URL state, and tooltip rows for agentic-only fields - ScatterGraph: subtle dashed halo on Pareto-frontier points that used KV offload so the tradeoff is visible at a glance --- packages/app/cypress/support/mock-data.ts | 2 + .../app/src/app/api/unofficial-run/route.ts | 2 + .../src/components/GlobalFilterContext.tsx | 12 +- .../components/inference/InferenceContext.tsx | 15 ++- .../inference/hooks/useChartData.ts | 34 +++-- .../app/src/components/inference/types.ts | 26 ++++ .../components/inference/ui/ChartControls.tsx | 27 +++- .../components/inference/ui/ScatterGraph.tsx | 21 +++ .../inference/utils/tooltipUtils.ts | 54 +++++++- .../app/src/components/ui/chart-selectors.tsx | 124 ++++++++++++++++++ .../unofficial-run-provider.test.ts | 2 + .../components/unofficial-run-provider.tsx | 4 +- packages/app/src/lib/api.ts | 14 +- .../app/src/lib/benchmark-transform.test.ts | 2 + packages/app/src/lib/benchmark-transform.ts | 65 ++++++++- packages/app/src/lib/data-mappings.ts | 72 +++++++++- packages/app/src/lib/url-state.ts | 2 + packages/constants/src/models.ts | 17 +++ .../db/migrations/002_agentic_scenario.sql | 30 +++++ .../migrations/003_agentic_availability.sql | 21 +++ packages/db/migrations/004_offload_mode.sql | 42 ++++++ packages/db/src/etl/benchmark-ingest.ts | 28 ++-- packages/db/src/etl/benchmark-mapper.ts | 45 ++++++- packages/db/src/ingest-ci-run.ts | 6 +- packages/db/src/ingest-gcs-backup.ts | 6 +- 
packages/db/src/ingest-supplemental.ts | 14 +- packages/db/src/json-provider.ts | 8 +- packages/db/src/queries/benchmarks.ts | 13 +- packages/db/src/queries/workflow-info.ts | 15 ++- 29 files changed, 645 insertions(+), 78 deletions(-) create mode 100644 packages/db/migrations/002_agentic_scenario.sql create mode 100644 packages/db/migrations/003_agentic_availability.sql create mode 100644 packages/db/migrations/004_offload_mode.sql diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index e6720c0b..7a4f59a9 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,6 +189,8 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), + selectedPercentile: 'median', + setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), selectedE2eXAxisMetric: null, diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 79ac0665..dbfb9c33 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -49,6 +49,8 @@ export function normalizeArtifactRows( decode_num_workers: config.decodeNumWorkers, num_prefill_gpu: config.numPrefillGpu, num_decode_gpu: config.numDecodeGpu, + benchmark_type: params.benchmarkType, + offload_mode: params.offloadMode, isl: params.isl, osl: params.osl, conc: params.conc, diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 65f510cd..f603081a 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from 
'@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; @@ -172,11 +172,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { const availableSequences = useMemo(() => { if (!availabilityRows) return SEQUENCE_OPTIONS; const seqs = [ - ...new Set( - modelRows - .map((r) => islOslToSequence(r.isl, r.osl)) - .filter((s): s is Sequence => s !== null), - ), + ...new Set(modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null)), ]; return seqs.length > 0 ? seqs : SEQUENCE_OPTIONS; }, [availabilityRows, modelRows]); @@ -190,7 +186,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // Precisions available for the selected model + sequence const availablePrecisions = useMemo(() => { if (!availabilityRows) return ['fp4']; - const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const precs = [...new Set(rows.map((r) => r.precision))].toSorted(); return precs.length > 0 ? 
precs : ['fp4']; }, [availabilityRows, modelRows, effectiveSequence]); @@ -205,7 +201,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // Dates available for selected model + sequence + precisions const availableDates = useMemo(() => { if (!availabilityRows) return []; - const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision)); if (rows.length === 0) { return [...new Set(seqRows.map((r) => r.date))].toSorted(); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 7fa416fd..6f45d8d7 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import { FAVORITE_PRESETS, type FavoritePreset } from '@/components/favorites/favorite-presets'; @@ -110,6 +110,11 @@ export function InferenceProvider({ const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState( () => getUrlParam('i_e2e_xmetric') || null, ); + // Latency percentile applied to the chart x-axis for agentic scenarios. + // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. 
+ const [selectedPercentile, setSelectedPercentile] = useState( + () => getUrlParam('i_pctl') || 'median', + ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', ); @@ -163,6 +168,7 @@ export function InferenceProvider({ effectiveRunDate, isActive, latestDate, + selectedPercentile, ); // For GPU comparison date picker — use shared availability data from global filters @@ -176,7 +182,7 @@ export function InferenceProvider({ if (!availabilityRows) return availableDates; const rows = availabilityRows.filter((r) => { if (!dbModelKeys.includes(r.model)) return false; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false; + if (rowToSequence(r) !== effectiveSequence) return false; if (!effectivePrecisions.includes(r.precision)) return false; if (!r.hardware) return false; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -201,7 +207,7 @@ export function InferenceProvider({ const hwKeys = new Set(); for (const r of availabilityRows) { if (!dbModelKeys.includes(r.model)) continue; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue; + if (rowToSequence(r) !== effectiveSequence) continue; if (!effectivePrecisions.includes(r.precision)) continue; if (!r.hardware) continue; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -589,6 +595,7 @@ export function InferenceProvider({ useUrlStateSync( { i_metric: selectedYAxisMetric, + i_pctl: selectedPercentile, i_gpus: selectedGPUs.join(','), i_dates: selectedDates.join(','), i_dstart: selectedDateRange.startDate, @@ -783,6 +790,8 @@ export function InferenceProvider({ workflowInfo, selectedYAxisMetric, setSelectedYAxisMetric: setSelectedYAxisMetricAndClear, + selectedPercentile, + setSelectedPercentile, selectedGPUs, setSelectedGPUs: setSelectedGPUsAndClear, availableGPUs, diff --git 
a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 625e63ab..81ab0780 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -1,7 +1,7 @@ import { useMemo, useRef } from 'react'; import { useQueries } from '@tanstack/react-query'; -import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants'; +import { rowToSequence } from '@semianalysisai/inferencex-constants'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { @@ -15,7 +15,7 @@ import type { import { filterDataByCostLimit } from '@/components/inference/utils'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants'; -import { transformBenchmarkRows } from '@/lib/benchmark-transform'; +import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; import type { Model, Sequence } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; @@ -79,6 +79,7 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, + selectedPercentile = 'median', ) { // When the selected date is the latest available, use '' (empty string) to match // the initial no-date query key, reusing the eagerly-fetched benchmarks from the @@ -119,11 +120,13 @@ export function useChartData( // Merge main rows with comparison date rows. // Stamp each row with the *requested* date (not the actual DB date) so that // GPUGraph's activeDates filter (keyed by user-selected date) matches the points. 
- const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]); + // + // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via + // benchmark_type), so one filter covers every scenario. const rows = useMemo(() => { - if (!allRows || !sequenceIslOsl) return []; - const seqFilter = (r: { isl: number; osl: number }) => - r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl; + if (!allRows) return []; + const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) => + rowToSequence(r) === selectedSequence; const seqFiltered = allRows.filter(seqFilter); // For each (hw, framework, spec_method, disagg, precision) group, keep only @@ -150,14 +153,14 @@ export function useChartData( .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })), ); return [...mainRows, ...extraRows]; - }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]); + }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]); // Transform filtered rows into chart data const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => { if (rows.length === 0) return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig }; - return transformBenchmarkRows(rows); - }, [rows]); + return transformBenchmarkRows(rows, selectedPercentile); + }, [rows, selectedPercentile]); // Sort hardware config — stabilize reference when keys haven't changed. // Different sequences for the same model often have the same GPU configs, @@ -192,8 +195,11 @@ export function useChartData( (chartDefinitions as ChartDefinition[]).map((chartDef) => { const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; - // Determine dynamic x-axis - let xAxisField: keyof AggDataEntry = chartDef.x; + // Default x-axis = chart's natural latency metric, percentile-adjusted + // for the agentic case (median_e2el → p99_e2el etc.). 
For non-agentic + // scenarios `withPercentile` is a no-op when percentile === 'median'. + const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry; + let xAxisField: keyof AggDataEntry = naturalX; let xAxisLabel = chartDef.x_label; const metricTitle = @@ -232,8 +238,10 @@ export function useChartData( // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). // E2EL → TTFT keeps the same direction ("lower is better" for both), // so no roofline flip is needed for the e2e chart. + // Compare against `naturalX` (percentile-adjusted) — switching the + // percentile of the same logical metric is NOT a flip. const xAxisFlipped = - xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride); + xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride); const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition; const dynamicYLabel = chartDef[yLabelKey]; @@ -261,7 +269,7 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric], + [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile], ); // Build renderable graphs (data processing + stable chart definitions) diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a23707ba..53c8d84c 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -88,6 +88,29 @@ export interface AggDataEntry { actualDate?: string; /** URL to the GitHub Actions workflow run that produced this data point. */ run_url?: string; + /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */ + benchmark_type?: string; + /** ISL in tokens — null for agentic_traces. */ + isl?: number | null; + /** OSL in tokens — null for agentic_traces. 
*/ + osl?: number | null; + // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ── + /** "on" | "off" — whether KV cache offload to CPU was enabled. */ + offload_mode?: string; + /** Actual server-observed GPU prefix-cache hit rate (0..1). */ + server_gpu_cache_hit_rate?: number; + /** Actual server-observed CPU prefix-cache hit rate (0..1). */ + server_cpu_cache_hit_rate?: number; + /** Infinite-cache theoretical hit rate (0..1) computed from trace. */ + theoretical_cache_hit_rate?: number; + /** Total requests attempted during the window. */ + num_requests_total?: number; + /** Requests that completed successfully. */ + num_requests_successful?: number; + /** Total prompt tokens served. */ + total_prompt_tokens?: number; + /** Total generated (output) tokens. */ + total_generation_tokens?: number; } /** @@ -468,6 +491,9 @@ export interface InferenceChartContextType { workflowInfo: any; selectedYAxisMetric: string; setSelectedYAxisMetric: (metric: string) => void; + /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). 
*/ + selectedPercentile: string; + setSelectedPercentile: (p: string) => void; selectedXAxisMetric: string | null; setSelectedXAxisMetric: (metric: string | null) => void; selectedE2eXAxisMetric: string | null; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 5f8e7787..e4f55ad7 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -1,11 +1,14 @@ 'use client'; +import { useEffect, useState } from 'react'; + import { track } from '@/lib/analytics'; import { useInference } from '@/components/inference/InferenceContext'; import { ModelSelector, - SequenceSelector, + ScenarioSelector, + PercentileSelector, PrecisionSelector, } from '@/components/ui/chart-selectors'; import { DateRangePicker } from '@/components/ui/date-range-picker'; @@ -23,7 +26,7 @@ import { import { TooltipProvider } from '@/components/ui/tooltip'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { ChartDefinition } from '@/components/inference/types'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model, type Percentile } from '@/lib/data-mappings'; // Build Y-axis metric options from static chart config JSON — available immediately, no API wait const METRIC_GROUPS = [ @@ -78,6 +81,13 @@ interface ChartControlsProps { } export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) { + // The percentile selector is rendered conditionally on `selectedSequence`, + // which on the client is hydrated from URL params. SSR doesn't see the URL, + // so deferring the conditional until after mount keeps the initial DOM + // identical between server and client (avoids hydration warnings). 
+ const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const { selectedModel, setSelectedModel, @@ -87,6 +97,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro setSelectedPrecisions, selectedYAxisMetric, setSelectedYAxisMetric, + selectedPercentile, + setSelectedPercentile, graphs, selectedGPUs, setSelectedGPUs, @@ -203,12 +215,19 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro availableModels={availableModels} data-testid="model-selector" /> - + {mounted && selectedSequence === Sequence.AgenticTraces && ( + setSelectedPercentile(p)} + data-testid="percentile-selector" + /> + )} ('.dot-group').each(function (d) { + const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`); + const showHalo = onFrontier && d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); + // Double-click to track/untrack zoomGroup .selectAll('.dot-group') @@ -1567,6 +1585,9 @@ const ScatterGraph = React.memo( chartDefinition.chartType, xScaleConfig._isLog, yScaleConfig.type, + optimalPointKeys, + getCssColor, + resolveColor, ], ); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index e88e9930..7391225e 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -88,6 +88,51 @@ const runLinkHTML = (runUrl?: string) => const tooltipLine = (label: string, value: string | number) => `
${label}: ${value}
`; +const formatPct = (v: number | undefined): string | null => + v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; + +/** + * Agentic-only tooltip rows: offload mode, KV cache hit rates, request + * success, token totals. Returns an empty string for non-agentic rows. + */ +const generateAgenticHTML = (d: InferenceData): string => { + if (d.benchmark_type !== 'agentic_traces') return ''; + + const parts: string[] = []; + if (d.offload_mode) { + parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase())); + } + + const gpuHit = formatPct(d.server_gpu_cache_hit_rate); + const cpuHit = formatPct(d.server_cpu_cache_hit_rate); + const theoHit = formatPct(d.theoretical_cache_hit_rate); + if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit)); + if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit)); + if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit)); + + if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) { + const successPct = + d.num_requests_total > 0 + ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)` + : ''; + parts.push( + tooltipLine( + 'Requests', + `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`, + ), + ); + } + + if (d.total_prompt_tokens !== undefined) { + parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens))); + } + if (d.total_generation_tokens !== undefined) { + parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens))); + } + + return parts.join(''); +}; + /** * Generates HTML for the parallelism configuration section of a tooltip. * Falls back to GPU count for old data without parallelism fields. @@ -177,9 +222,10 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} ${ isPinned @@ -231,9 +277,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`; }; @@ -292,9 +339,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)}
`; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index 75e2f257..1c843e12 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -19,12 +19,16 @@ import { type Model, type Precision, type Sequence, + type Percentile, + PERCENTILE_OPTIONS, getModelCategory, getModelLabel, + getPercentileLabel, getPrecisionLabel, getSequenceCategory, getSequenceLabel, groupByCategory, + sequenceKind, } from '@/lib/data-mappings'; function DeprecatedLabel({ reason }: { reason: string }) { @@ -167,6 +171,126 @@ export function SequenceSelector({ ); } +interface ScenarioSelectorProps { + id?: string; + value: string; + onChange: (value: Sequence) => void; + availableSequences: string[]; + 'data-testid'?: string; +} + +/** + * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length", + * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL + * framing only applies to the fixed-seq subset). + */ +export function ScenarioSelector({ + id = 'scenario-select', + value, + onChange, + availableSequences, + 'data-testid': testId, +}: ScenarioSelectorProps) { + const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq'); + const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic'); + const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence)); + + return ( +
+ + +
+ ); +} + +interface PercentileSelectorProps { + id?: string; + value: string; + onChange: (value: Percentile) => void; + 'data-testid'?: string; +} + +/** + * Latency percentile selector for agentic-trace charts. The selected value + * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so + * picking p99 plots p99 e2e latency / interactivity instead of the median. + */ +export function PercentileSelector({ + id = 'percentile-select', + value, + onChange, + 'data-testid': testId, +}: PercentileSelectorProps) { + return ( +
+ + +
+ ); +} + interface PrecisionSelectorProps { id?: string; value: string[]; diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts index f4263d2c..05b522c5 100644 --- a/packages/app/src/components/unofficial-run-provider.test.ts +++ b/packages/app/src/components/unofficial-run-provider.test.ts @@ -29,6 +29,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 128, diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index 2dccdf7f..42530a51 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -12,7 +12,7 @@ import { import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types'; import { UnofficialBanner } from '@/components/ui/unofficial-banner'; -import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants'; import { computeToggle } from '@/hooks/useTogglableSet'; import type { BenchmarkRow, EvalRow } from '@/lib/api'; import { normalizeEvalHardwareKey } from '@/lib/chart-utils'; @@ -93,7 +93,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData const groups = new Map(); for (const row of benchmarks) { const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? 
row.model; - const sequence = islOslToSequence(row.isl, row.osl); + const sequence = rowToSequence(row); if (!sequence) continue; const key = `${displayModel}_${sequence}`; if (!groups.has(key)) groups.set(key, []); diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 11ba4521..240251c3 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -23,9 +23,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -140,13 +144,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. 
+ isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index be76438e..6a6c97c8 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -23,6 +23,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 64, diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 107f0b12..69745da2 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -15,9 +15,39 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils'; import { getHardwareConfig } from '@/lib/constants'; import type { BenchmarkRow } from '@/lib/api'; +/** + * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl + * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here: + * e2el ≡ ttlt (time-to-last-token == end-to-end latency) + * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output) + * intvty ≡ 1/itl (tok/s from the user's perspective) + * Existing fields win if present; we only fill in the gaps. 
+ */ +function agenticAliases(m: Record): Record { + const out: Record = {}; + for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) { + const itl = m[`${suffix}_itl`]; + const ttlt = m[`${suffix}_ttlt`]; + if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; + if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl; + if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) { + out[`${suffix}_intvty`] = 1 / itl; + } + } + return out; +} + /** Convert a DB benchmark row to an AggDataEntry. */ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { - const m = row.metrics; + const isAgentic = row.benchmark_type === 'agentic_traces'; + const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics; + // Prefer the dedicated column (added in migration 004); fall back to the + // legacy stash inside `metrics` for any rows ingested before that column + // existed. + const rawMetrics = row.metrics as Record; + const offloadMode = + row.offload_mode ?? + (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined); return { hw: row.hardware, framework: row.framework, @@ -68,6 +98,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { date: row.date, actualDate: (row as any).actualDate ?? row.date, run_url: row.run_url ?? 
undefined, + benchmark_type: row.benchmark_type, + isl: row.isl, + osl: row.osl, + offload_mode: offloadMode, + server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate, + server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate, + theoretical_cache_hit_rate: m.theoretical_cache_hit_rate, + num_requests_total: m.num_requests_total, + num_requests_successful: m.num_requests_successful, + total_prompt_tokens: m.total_prompt_tokens, + total_generation_tokens: m.total_generation_tokens, }; } @@ -77,13 +118,30 @@ interface PreparedEntry { date: string; } +/** + * Rewrite a chart x-axis key to use a different latency percentile prefix + * (`median_` → `p99_` etc). Only touches keys that start with a known + * percentile prefix; leaves everything else alone. + */ +export function withPercentile(key: string, percentile: string): string { + return key.replace(/^(mean|median|p90|p99|p99\.9)_/, `${percentile}_`); +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). * * Converts rows to AggDataEntry once, then reuses for each chart definition. + * + * @param percentile Optional latency percentile for the chart x-axis + * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart + * definition for the chosen percentile — only agentic rows carry the + * full set (median/p90/p99/p99.9) so this mainly affects that scenario. 
*/ -export function transformBenchmarkRows(rows: BenchmarkRow[]): { +export function transformBenchmarkRows( + rows: BenchmarkRow[], + percentile = 'median', +): { chartData: InferenceData[][]; hardwareConfig: HardwareConfig; } { @@ -109,13 +167,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): { // Phase 2: Build chart data per chart definition (reusing prepared entries) const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => { + const xKey = withPercentile(chartDef.x, percentile); const groupedByHw: Record = {}; for (const { entry, hwKey, date } of prepared) { const dataPoint = createChartDataPoint( date, entry, - chartDef.x as keyof AggDataEntry, + xKey as keyof AggDataEntry, chartDef.y as keyof AggDataEntry, hwKey, ); diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 823b6823..8900f50e 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -102,17 +102,77 @@ export enum Sequence { OneK_OneK = '1k/1k', OneK_EightK = '1k/8k', EightK_OneK = '8k/1k', + AgenticTraces = 'agentic-traces', } -const SEQUENCE_CONFIG: Record = - { - [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' }, - [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' }, - [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' }, - }; +/** + * Top-level scenario kind. Fixed-seq sequences cluster under a single group + * in the selector; agentic traces sit alongside as their own kind. + */ +export type ScenarioKind = 'fixed-seq' | 'agentic'; + +export function sequenceKind(seq: Sequence): ScenarioKind { + return seq === Sequence.AgenticTraces ? 
'agentic' : 'fixed-seq'; +} + +const SEQUENCE_CONFIG: Record< + Sequence, + { label: string; compact: string; category: CategoryTag; kind: ScenarioKind } +> = { + [Sequence.OneK_OneK]: { + label: '1K / 1K', + compact: '1k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.OneK_EightK]: { + label: '1K / 8K', + compact: '1k8k', + category: 'deprecated', + kind: 'fixed-seq', + }, + [Sequence.EightK_OneK]: { + label: '8K / 1K', + compact: '8k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.AgenticTraces]: { + label: 'Agentic Traces', + compact: 'agentic', + category: 'default', + kind: 'agentic', + }, +}; export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; +/** + * Percentile of the latency distribution used for the chart x-axis when + * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants + * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which + * slice to plot. + */ +export enum Percentile { + Median = 'median', + P90 = 'p90', + P99 = 'p99', + P99_9 = 'p99.9', +} + +const PERCENTILE_CONFIG: Record = { + [Percentile.Median]: { label: 'p50 (median)' }, + [Percentile.P90]: { label: 'p90' }, + [Percentile.P99]: { label: 'p99' }, + [Percentile.P99_9]: { label: 'p99.9' }, +}; + +export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; + +export function getPercentileLabel(p: Percentile): string { + return PERCENTILE_CONFIG[p]?.label ?? 
p; +} + export const DEPRECATED_SEQUENCES: ReadonlySet = new Set( (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][]) .filter(([, c]) => c.category === 'deprecated') diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts index 3947488f..fb2e9d70 100644 --- a/packages/app/src/lib/url-state.ts +++ b/packages/app/src/lib/url-state.ts @@ -22,6 +22,7 @@ const URL_STATE_KEYS = [ 'i_seq', 'i_prec', 'i_metric', + 'i_pctl', 'i_xmetric', 'i_e2e_xmetric', 'i_scale', @@ -61,6 +62,7 @@ export const PARAM_DEFAULTS: Record = { i_seq: '8k/1k', i_prec: 'fp4', i_metric: 'y_tpPerGpu', + i_pctl: 'median', i_xmetric: 'p99_ttft', i_e2e_xmetric: '', i_scale: 'auto', diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts index 6d646f08..d9a3d2d1 100644 --- a/packages/constants/src/models.ts +++ b/packages/constants/src/models.ts @@ -53,3 +53,20 @@ export function islOslToSequence(isl: number, osl: number): string | null { }; return map[`${isl}_${osl}`] ?? null; } + +/** + * Map a benchmark/availability row to its sequence (scenario) string. + * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl. + * - Other rows (today: `single_turn`) fall back to `islOslToSequence`. + * Returns `null` for rows that can't be classified (e.g. `single_turn` with + * unmapped isl/osl values). + */ +export function rowToSequence(row: { + isl: number | null; + osl: number | null; + benchmark_type: string; +}): string | null { + if (row.benchmark_type === 'agentic_traces') return 'agentic-traces'; + if (row.isl === null || row.osl === null) return null; + return islOslToSequence(row.isl, row.osl); +} diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql new file mode 100644 index 00000000..c143914e --- /dev/null +++ b/packages/db/migrations/002_agentic_scenario.sql @@ -0,0 +1,30 @@ +-- Support agentic scenarios in benchmark_results. 
+-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. + +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql new file mode 100644 index 00000000..e96cbd50 --- /dev/null +++ b/packages/db/migrations/003_agentic_availability.sql @@ -0,0 +1,21 @@ +-- Extend the availability table to cover agentic scenarios. +-- +-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. 
+ +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql new file mode 100644 index 00000000..24b617f1 --- /dev/null +++ b/packages/db/migrations/004_offload_mode.sql @@ -0,0 +1,42 @@ +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. 
+drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index 67173c64..ea802d3f 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows( // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears // more than once in a single batch. Deduplicate within the batch, keeping - // the last occurrence (last metrics for each unique config/isl/osl/conc). + // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode). const seen = new Map(); - for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r); + for (const r of rows) { + seen.set( + `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? 
''}-${r.conc}-${r.offloadMode}`, + r, + ); + } const deduped = [...seen.values()]; const configIds = deduped.map((r) => r.configId); + const benchmarkTypes = deduped.map((r) => r.benchmarkType); + const offloadModes = deduped.map((r) => r.offloadMode); const isls = deduped.map((r) => r.isl); const osls = deduped.map((r) => r.osl); const concs = deduped.map((r) => r.conc); @@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows( const result = await sql<{ inserted: boolean; id: number }[]>` insert into benchmark_results ( - workflow_run_id, config_id, benchmark_type, date, + workflow_run_id, config_id, benchmark_type, offload_mode, date, isl, osl, conc, image, metrics ) select ${workflowRunId}, unnest(${sql.array(configIds)}::int[]), - 'single_turn', + unnest(${sql.array(benchmarkTypes)}::text[]), + unnest(${sql.array(offloadModes)}::text[]), ${date}::date, unnest(${sql.array(isls)}::int[]), unnest(${sql.array(osls)}::int[]), unnest(${sql.array(concs)}::int[]), unnest(${sql.array(images)}), unnest(${sql.array(metricsJsons)}::jsonb[]) - on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc) + on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode) do update set metrics = excluded.metrics, image = excluded.image @@ -147,13 +155,14 @@ export async function bulkUpsertAvailability( sql: Sql, rows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[], date: string, ): Promise { @@ -162,7 +171,7 @@ export async function bulkUpsertAvailability( const seen = new Set(); const unique: typeof rows = []; for (const r of rows) { - const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`; + const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? 
''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`; if (!seen.has(key)) { seen.add(key); unique.push(r); @@ -170,7 +179,7 @@ export async function bulkUpsertAvailability( } await sql` - insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date) + insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date) select unnest(${sql.array(unique.map((r) => r.model))}::text[]), unnest(${sql.array(unique.map((r) => r.isl))}::int[]), @@ -180,6 +189,7 @@ export async function bulkUpsertAvailability( unnest(${sql.array(unique.map((r) => r.framework))}::text[]), unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]), unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]), + unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]), ${date}::date on conflict do nothing `; diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 7d78e175..5b120843 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([ 'decode_num_workers', 'num_prefill_gpu', 'num_decode_gpu', + // agentic scenario + 'scenario_type', + 'users', + 'offload_mode', + 'num_requests_total', + 'num_requests_successful', ]); +/** + * `benchmark_type` values understood by the ingest. + * - `single_turn` — fixed sequence-length runs (isl/osl set). + * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc). + */ +export type BenchmarkType = 'single_turn' | 'agentic_traces'; + /** * METRIC_KEYS from constants is the canonical set of known metric keys. 
* Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured @@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set(); export interface BenchmarkParams { config: ConfigParams; - isl: number; - osl: number; + benchmarkType: BenchmarkType; + // Null for agentic_traces; present for single_turn. + isl: number | null; + osl: number | null; conc: number; + /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */ + offloadMode: string; image: string | null; metrics: Record; } @@ -114,10 +131,15 @@ export function mapBenchmarkRow( return null; } - const isl = parseInt2(row.isl) ?? islOslFallback?.isl; - const osl = parseInt2(row.osl) ?? islOslFallback?.osl; - const conc = parseInt2(row.conc); - if (!isl || !osl || !conc) { + // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants), + // no isl/osl, and `users` instead of `conc`. Everything else stays as-is. + const isAgentic = String(row.scenario_type ?? '').startsWith('agentic'); + const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn'; + + const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); + const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); + const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc); + if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } @@ -182,6 +204,12 @@ export function mapBenchmarkRow( } } + // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it + // as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic && typeof row.offload_mode === 'string') { + (metrics as Record).offload_mode = row.offload_mode; + } + // Artifact names encode '/' as '#' to avoid path separators; restore the URI. const image = row.image ? 
String(row.image).replaceAll('#', '/') : null; @@ -205,9 +233,14 @@ export function mapBenchmarkRow( numPrefillGpu, numDecodeGpu, }, + benchmarkType, isl, osl, conc, + offloadMode: + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : 'off', image, metrics, }; diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 14c7b4d0..8cce43ca 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -248,13 +248,14 @@ async function main(): Promise { const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; let totalNewBmk = 0, @@ -367,6 +368,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index e20278d6..6dc604e9 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -596,13 +596,14 @@ async function main(): Promise { // Upsert availability rows only for successfully resolved configs const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const r of allInserted) { availRows.push({ @@ -614,6 +615,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } if (availRows.length > 0) { diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts index 1e494e9f..43aae047 100644 --- a/packages/db/src/ingest-supplemental.ts +++ 
b/packages/db/src/ingest-supplemental.ts @@ -219,8 +219,10 @@ async function ingestSupplementalBmk( const rows: { configId: number; - isl: number; - osl: number; + benchmarkType: 'single_turn' | 'agentic_traces'; + offloadMode: string; + isl: number | null; + osl: number | null; conc: number; image: string | null; metrics: Record; @@ -271,6 +273,8 @@ async function ingestSupplementalBmk( rows.push({ configId, + benchmarkType: 'single_turn', + offloadMode: 'off', isl: entry.isl, osl: entry.osl, conc: entry.conc, @@ -294,13 +298,14 @@ async function ingestSupplementalBmk( // to `rows` are exactly the valid ones. const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const entry of entries) { const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined }); @@ -317,6 +322,7 @@ async function ingestSupplementalBmk( framework, specMethod, disagg, + benchmarkType: 'single_turn', }); } if (availRows.length > 0) { diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 0d9373d3..f09a2686 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -290,6 +290,8 @@ function toBenchmarkRow( decode_num_workers: c.decode_num_workers, num_prefill_gpu: c.num_prefill_gpu, num_decode_gpu: c.num_decode_gpu, + benchmark_type: br.benchmark_type ?? 'single_turn', + offload_mode: (br as { offload_mode?: string }).offload_mode ?? 
'off', isl: br.isl, osl: br.osl, conc: br.conc, @@ -410,7 +412,11 @@ export function getAvailabilityData(): AvailabilityRow[] { for (const a of s.availability) { const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`; if (validKeys.has(key)) { - rows.push({ ...a, date: toDateString(a.date) }); + rows.push({ + ...a, + benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn', + date: toDateString(a.date), + }); } } diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 1c30b1fd..74e20380 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -18,9 +18,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces; numeric for single_turn fixed-seq runs. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. 
*/ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -68,6 +72,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -106,6 +112,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, lb.conc, @@ -153,6 +161,7 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, br.isl, br.osl, br.conc, diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts index b4e4f255..d5e2d933 100644 --- a/packages/db/src/queries/workflow-info.ts +++ b/packages/db/src/queries/workflow-info.ts @@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise { const rows = await sql` - SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text + SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text FROM availability a WHERE EXISTS ( SELECT 1 @@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise Date: Thu, 30 Apr 2026 19:01:56 -0500 Subject: [PATCH 002/111] =?UTF-8?q?fix:=20agentic=20offload=20variants=20?= =?UTF-8?q?=E2=80=94=20render=20both=20halos=20+=20map=20renamed=20fields?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ScatterGraph: include `offload_mode` in `buildPointConfigId` so d3's data join keeps both `on` and `off` variants for the same (config, conc). Without it, the second variant collapsed onto the first key, so FP8 offload-on points (and their halos) silently disappeared. 
- benchmark-mapper: handle older artifacts that emit `users`/`offload_mode` AND newer ones that emit `conc`/`offloading` (with 'none' → 'off' mapping). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 4 +++ packages/db/src/etl/benchmark-mapper.ts | 27 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 15bb60f0..55a206ce 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -295,6 +295,10 @@ const ScatterGraph = React.memo( const buildPointConfigId = useCallback((point: InferenceData): string => { let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`; if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`; + // Agentic runs emit two rows per (config, conc) — one offload=on, one off. + // Without this suffix, d3's data join treats them as the same point and + // drops one variant (along with its halo). + if (point.offload_mode) key += `|offload-${point.offload_mode}`; return key; }, []); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 5b120843..d842276e 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -138,12 +138,24 @@ export function mapBenchmarkRow( const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); - const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc); + // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones. + const conc = isAgentic ? (parseInt2(row.users) ?? 
parseInt2(row.conc)) : parseInt2(row.conc); if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` + // ('none' → 'off'; any other non-empty value → 'on'). + const offloadModeRaw = + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : typeof row.offloading === 'string' && row.offloading.length > 0 + ? row.offloading === 'none' + ? 'off' + : 'on' + : 'off'; + const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg); const isMultinode = parseBool(row.is_multinode); const precision = normalizePrecision(String(row.precision ?? '')); @@ -204,10 +216,10 @@ export function mapBenchmarkRow( } } - // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it - // as a stringified metric so the frontend can expose it in tooltips. - if (isAgentic && typeof row.offload_mode === 'string') { - (metrics as Record).offload_mode = row.offload_mode; + // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`) + // — preserve as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic) { + (metrics as Record).offload_mode = offloadModeRaw; } // Artifact names encode '/' as '#' to avoid path separators; restore the URI. @@ -237,10 +249,7 @@ export function mapBenchmarkRow( isl, osl, conc, - offloadMode: - typeof row.offload_mode === 'string' && row.offload_mode.length > 0 - ? row.offload_mode - : 'off', + offloadMode: offloadModeRaw, image, metrics, }; From 07ba10636dae87b5a819afa524d7c10322fae41b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 00:29:55 -0500 Subject: [PATCH 003/111] fix: render offload halo on every offload-on point, not just frontier The halo's purpose is to surface KV-offload usage; restricting it to Pareto-frontier-only points hid the indicator on most runs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/inference/ui/ScatterGraph.tsx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 55a206ce..61ac0983 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -1516,10 +1516,9 @@ const ScatterGraph = React.memo( .attr('pointer-events', 'none'); }); - // Offload halo: dashed ring on frontier points that used KV offload + // Offload halo: dashed ring on every point that used KV offload (Pareto or not) zoomGroup.selectAll('.dot-group').each(function (d) { - const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`); - const showHalo = onFrontier && d.offload_mode === 'on'; + const showHalo = d.offload_mode === 'on'; d3.select(this) .selectAll('.offload-halo') .data(showHalo ? [true] : []) From 95e9dc77431adf5354ef0df36989816199624383 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 01:13:42 -0500 Subject: [PATCH 004/111] fix: strip runner-pool suffix (-p1, -p2, ...) from hw identifier b300-p1 (and similar) artifacts were skipping ingest because the runner-pool suffix wasn't in the strip list and didn't normalize to the canonical b300 GPU key. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/normalizers.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index ad12a454..bd497f7a 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null { .replace(/-dgxc-slurm$/, '') .replace(/-dgxc$/, '') .replace(/-nb$/, '') - .replace(/-nv$/, ''); + .replace(/-nv$/, '') + .replace(/-p\d+$/, ''); // strip runner-pool suffix (e.g. 
b300-p1 → b300) return GPU_KEYS.has(base) ? base : null; } From 982106da5f4421983841304f0503b6467033852d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 09:25:33 -0500 Subject: [PATCH 005/111] feat: bold scatter labels with concurrency tag + collision avoidance - Label text now includes `C={conc}` alongside the GPU/parallelism tag (default `{tp} C={conc}`, advanced `{parallelism label} C={conc}`) - Bumped point-label font-weight to 700 so the labels read clearly against the chart fill - Greedy collision-avoidance pass on render and zoom: tries placing each label above/below the point through 4 candidate dy offsets, hiding the label only when no slot is free Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 68 ++++++++++++++++++- .../src/lib/d3-chart/layers/scatter-points.ts | 1 + 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 61ac0983..3fbd8588 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -55,6 +55,63 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; +// Greedy label-collision avoidance: try positions above/below the point; +// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom. +function avoidLabelCollisions( + zoomGroup: d3.Selection, +): void { + const labels: { + el: SVGTextElement; + cx: number; + cy: number; + w: number; + h: number; + }[] = []; + zoomGroup.selectAll('.dot-group').each(function () { + const labelEl = this.querySelector('.point-label'); + if (!labelEl) return; + if ((this as SVGGElement).style.opacity === '0') return; + const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; + const m = transform.match(/translate\(([^,]+),([^)]+)\)/); + if (!m) return; + const cx = parseFloat(m[1]); + const cy = parseFloat(m[2]); + labelEl.setAttribute('dy', '-8'); + labelEl.style.opacity = '1'; + const bbox = labelEl.getBBox(); + labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height }); + }); + labels.sort((a, b) => a.cx - b.cx); + const placed: { left: number; right: number; top: number; bottom: number }[] = []; + const pad = 1; + const candidates = [-8, 14, -22, 28]; + for (const lab of labels) { + let chosenDy: number | null = null; + let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; + for (const dy of candidates) { + const top = lab.cy + dy - lab.h - pad; + const bottom = lab.cy + dy + pad; + const left = lab.cx - lab.w / 2 - pad; + const right = lab.cx + lab.w / 2 + pad; + const collides = placed.some( + (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), + ); + if (!collides) { + chosenDy = dy; + chosenBox = { left, right, top, bottom }; + break; + } + } + if (chosenDy !== null && chosenBox) { + lab.el.setAttribute('dy', String(chosenDy)); + lab.el.style.opacity = '1'; + placed.push(chosenBox); + } else { + lab.el.style.opacity = '0'; + } + } +} + // X-shape path for overlay (unofficial) data points const X_SIZE = 5; const X_HOVER_SIZE = 7; @@ -603,6 +660,7 @@ const ScatterGraph = React.memo( d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any, ); } + avoidLabelCollisions(ctx.layout.zoomGroup); }, }), [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type], @@ -1251,7 +1309,8 @@ const ScatterGraph = React.memo( getOpacity: (d) => (isPointVisible(d) ? 1 : 0), getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + getLabelText: (d) => + useAdvancedLabels ? 
`${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1353,8 +1412,11 @@ const ScatterGraph = React.memo( .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') + .attr('font-weight', '700') .attr('pointer-events', 'none') - .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp)); + .text( + useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, + ); }); // Overlay tooltip handlers @@ -1566,6 +1628,8 @@ const ScatterGraph = React.memo( }); }); + avoidLabelCollisions(zoomGroup); + // Log tick formatting on initial render if (xScaleConfig._isLog) { const xScale = ctx.xScale as d3.ScaleLogarithmic; diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 507654e1..9f2d2f38 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -72,6 +72,7 @@ export function renderScatterPoints Date: Fri, 1 May 2026 09:32:44 -0500 Subject: [PATCH 006/111] fix: stack multi-line point labels upward so they don't overlap the point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tspans now ride above the text's `dy` anchor — the LAST line sits at the anchor (just above the point) and earlier lines stack above it. Previously the second tspan landed below the anchor and crashed into the marker. Also widened collision candidates by label height so the flipped-below position fully clears the point on multi-line labels. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 28 +++++++--- .../src/lib/d3-chart/layers/scatter-points.ts | 52 +++++++++++++------ 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 3fbd8588..f8ce9b8f 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -84,8 +84,11 @@ function avoidLabelCollisions( labels.sort((a, b) => a.cx - b.cx); const placed: { left: number; right: number; top: number; bottom: number }[] = []; const pad = 1; - const candidates = [-8, 14, -22, 28]; for (const lab of labels) { + // Candidates scale with the label's own height so multi-line labels don't + // overlap the point shape when flipped below. + const below = lab.h + 8; + const candidates = [-8, below, -8 - below - 4, 2 * below]; let chosenDy: number | null = null; let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; for (const dy of candidates) { @@ -1310,7 +1313,7 @@ const ScatterGraph = React.memo( getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, getLabelText: (d) => - useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1403,7 +1406,14 @@ const ScatterGraph = React.memo( // Labels const showLabels = !hidePointLabels && !showGradientLabels; overlayPoints.each(function (d) { - d3.select(this) + const lines = showLabels + ? (useAdvancedLabels + ? `${getPointLabel(d)}\nC=${d.conc}` + : `${d.tp}\nC=${d.conc}` + ).split('\n') + : []; + const text = d3 + .select(this) .selectAll('.overlay-label') .data(showLabels ? 
[true] : []) .join('text') @@ -1413,10 +1423,14 @@ const ScatterGraph = React.memo( .style('fill', 'var(--foreground)') .attr('font-size', '10px') .attr('font-weight', '700') - .attr('pointer-events', 'none') - .text( - useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, - ); + .attr('pointer-events', 'none'); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .text((l) => l); }); // Overlay tooltip handlers diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 9f2d2f38..13c588d8 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -63,18 +63,30 @@ export function renderScatterPoints` element — the + // intra-stack offsets stay correct whether the label ends up above or below. if (!config.hideLabels && config.getLabelText && config.foreground) { - entered - .append('text') - .attr('class', 'point-label') - .attr('dy', -8) - .attr('text-anchor', 'middle') - .attr('fill', config.foreground) - .attr('font-size', '10px') - .attr('font-weight', '700') - .attr('pointer-events', 'none') - .text(config.getLabelText); + const labelGetter = config.getLabelText; + entered.each(function (d) { + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .append('text') + .attr('class', 'point-label') + .attr('dy', -8) + .attr('text-anchor', 'middle') + .attr('fill', config.foreground!) + .attr('font-size', '10px') + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + lines.forEach((line, i) => { + const tspanDy = i === 0 ? 
`-${(lines.length - 1) * 1.1}em` : '1.1em'; + text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line); + }); + }); } // Exit: remove stale points @@ -103,9 +115,12 @@ export function renderScatterPoints('.point-label') + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .selectAll('.point-label') .data([true]) .join('text') .attr('class', 'point-label') @@ -113,8 +128,15 @@ export function renderScatterPoints('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .text((l) => l); }); } else { points.selectAll('.point-label').remove(); From 37eecc6e28c10751ffc52c8a0d0588177e43d4d8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 09:38:39 -0500 Subject: [PATCH 007/111] fix: anchor multi-line labels via first tspan + tspan-aware collision pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a `<text>` contains tspans, the parent's `dy` does not shift the bbox cleanly — its (unused) y=0 origin still factors in, so the rendered text ended up centered on the point. Move the absolute offset into the FIRST tspan's `dy`; later tspans cascade by 1.1em. Collision avoidance now drives the first tspan's `dy` and tries four candidate baselines (primary above, primary below, secondary above, secondary below), accounting for full label height when picking a non-overlapping slot. Labels still hidden as a last resort. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 72 +++++++++++++------ .../src/lib/d3-chart/layers/scatter-points.ts | 25 ++++--- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index f8ce9b8f..27d3680c 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -55,58 +55,88 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; -// Greedy label-collision avoidance: try positions above/below the point; -// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom. +// Greedy label-collision avoidance. +// Each candidate is the y-position of the FIRST baseline (relative to point +// center) which we apply via the first tspan's `dy` — later tspans cascade +// down by 1.1em. We try above/below at primary and secondary offsets, and +// hide the label if all four positions collide. function avoidLabelCollisions( zoomGroup: d3.Selection, ): void { - const labels: { + interface LabelInfo { el: SVGTextElement; + firstTspan: SVGTSpanElement; cx: number; cy: number; w: number; - h: number; - }[] = []; + nLines: number; + defaultFirstY: number; + } + const labels: LabelInfo[] = []; + const ASCENT = 9; + const DESCENT = 3; + const LINE_H = 11; + zoomGroup.selectAll('.dot-group').each(function () { const labelEl = this.querySelector('.point-label'); if (!labelEl) return; if ((this as SVGGElement).style.opacity === '0') return; + const tspans = labelEl.querySelectorAll('tspan'); + if (tspans.length === 0) return; const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; const m = transform.match(/translate\(([^,]+),([^)]+)\)/); if (!m) return; const cx = parseFloat(m[1]); const cy = parseFloat(m[2]); - labelEl.setAttribute('dy', '-8'); + const nLines = tspans.length; + const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point + // Reset to default before measuring so prior positioning doesn't bias bbox + tspans[0].setAttribute('dy', `${defaultFirstY}px`); labelEl.style.opacity = '1'; const bbox = labelEl.getBBox(); - labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height }); + labels.push({ + el: labelEl, + firstTspan: tspans[0], + cx, + cy, + w: bbox.width, + nLines, + defaultFirstY, + }); }); + labels.sort((a, b) => a.cx - b.cx); const placed: { left: number; right: number; top: number; bottom: number }[] = []; - const pad = 1; + const pad = 2; + for (const lab of labels) { - // Candidates scale with the label's own height so multi-line labels don't - // overlap the point shape when flipped below. - const below = lab.h + 8; - const candidates = [-8, below, -8 - below - 4, 2 * below]; - let chosenDy: number | null = null; + const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT; + const aboveFirstY = lab.defaultFirstY; + const belowFirstY = 14; // first baseline 14px below point center + const candidates = [ + aboveFirstY, + belowFirstY, + aboveFirstY - blockH - 2, + belowFirstY + blockH + 2, + ]; + let chosenY: number | null = null; let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; - for (const dy of candidates) { - const top = lab.cy + dy - lab.h - pad; - const bottom = lab.cy + dy + pad; + for (const firstY of candidates) { + const top = lab.cy + firstY - ASCENT - pad; + const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad; const left = lab.cx - lab.w / 2 - pad; const right = lab.cx + lab.w / 2 + pad; const collides = placed.some( (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), ); if 
(!collides) { - chosenDy = dy; + chosenY = firstY; chosenBox = { left, right, top, bottom }; break; } } - if (chosenDy !== null && chosenBox) { - lab.el.setAttribute('dy', String(chosenDy)); + if (chosenY !== null && chosenBox) { + lab.firstTspan.setAttribute('dy', `${chosenY}px`); lab.el.style.opacity = '1'; placed.push(chosenBox); } else { @@ -1418,18 +1448,18 @@ const ScatterGraph = React.memo( .data(showLabels ? [true] : []) .join('text') .attr('class', 'overlay-label') - .attr('dy', -10) .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') .attr('font-weight', '700') .attr('pointer-events', 'none'); + const firstDy = -(1 + (lines.length - 1) * 1.1); text .selectAll('tspan') .data(lines) .join('tspan') .attr('x', 0) - .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) .text((l) => l); }); diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 13c588d8..71d1f050 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -64,10 +64,10 @@ export function renderScatterPoints` element — the - // intra-stack offsets stay correct whether the label ends up above or below. + // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't + // pick up the text element's own (unused) y=0 origin. The first tspan is + // raised so the LAST line baseline lands ~8px above the point; subsequent + // tspans cascade down by 1.1em. if (!config.hideLabels && config.getLabelText && config.foreground) { const labelGetter = config.getLabelText; entered.each(function (d) { @@ -76,15 +76,18 @@ export function renderScatterPoints { - const tspanDy = i === 0 ? 
`-${(lines.length - 1) * 1.1}em` : '1.1em'; - text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line); + text + .append('tspan') + .attr('x', 0) + .attr('dy', i === 0 ? `${firstDy}em` : '1.1em') + .text(line); }); }); } @@ -113,7 +116,9 @@ export function renderScatterPoints('tspan') .data(lines) .join('tspan') .attr('x', 0) - .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) .text((l) => l); }); } else { From f317377dfaea35f9cb5dc435ea177966aa17fbf8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 10:21:00 -0500 Subject: [PATCH 008/111] fix: dedupe artifacts by logical name + skip 0-successful agg rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two complementary fixes for runs whose `results_bmk` aggregated artifact ends up containing both a successful row and a failed-attempt row for the same (config, conc, offload) — the failed row's null metrics were overwriting the good row via ON CONFLICT DO UPDATE. 1. Artifact-level: strip the trailing `_<runner>_<index>` suffix from each artifact name and group by the logical name, keeping only the most recent per group. 2. Row-level: skip rows with `num_requests_successful === 0` AND `num_requests_total > 0`. The aggregated artifact merges rows from all runners — including failed ones — so artifact-level dedup alone can't reach inside it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/benchmark-mapper.ts | 14 +++++++++++ packages/db/src/etl/skip-tracker.ts | 10 +++++++- packages/db/src/ingest-ci-run.ts | 33 ++++++++++++++++++++----- packages/db/src/ingest-gcs-backup.ts | 1 + 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index d842276e..1aff5ea9 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -145,6 +145,20 @@ export function mapBenchmarkRow( return null; } + // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from + // every runner, including ones with 0 successful requests and null metrics. + // Without this skip, the empty row's nulls overwrite a good row via + // ON CONFLICT DO UPDATE when both share the same (config, conc, offload). + if ( + typeof row.num_requests_successful === 'number' && + row.num_requests_successful === 0 && + typeof row.num_requests_total === 'number' && + row.num_requests_total > 0 + ) { + tracker.skips.failedRun++; + return null; + } + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` // ('none' → 'off'; any other non-empty value → 'on'). const offloadModeRaw = diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts index 6166ea44..588718dd 100644 --- a/packages/db/src/etl/skip-tracker.ts +++ b/packages/db/src/etl/skip-tracker.ts @@ -8,6 +8,7 @@ export interface Skips { unmappedModel: number; unmappedHw: number; noIslOsl: number; + failedRun: number; dbError: number; } @@ -66,7 +67,14 @@ const MAX_DB_ERRORS = 10; * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets. 
*/ export function createSkipTracker(): SkipTracker { - const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 }; + const skips: Skips = { + badZip: 0, + unmappedModel: 0, + unmappedHw: 0, + noIslOsl: 0, + failedRun: 0, + dbError: 0, + }; const unmappedModels = new Set(); const unmappedHws = new Set(); const unmappedPrecisions = new Set(); diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 8cce43ca..fb1fbbbc 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -101,15 +101,30 @@ if (isDownloadMode) { } catch {} } - const byName = new Map(); + // Strip the trailing `__` token from each + // artifact name, then group by the resulting logical name and keep only + // the most recent per group. Without this, two artifacts produced on + // different runners for the same logical config (e.g. `…_h200-cw_00` and + // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty + // metrics can overwrite the good one via ON CONFLICT DO UPDATE. + // + // The runner pool name itself has no underscores (`h200-cw`, + // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip + // bounded — using `\w` here would over-match across earlier `_` + // separators and collapse different (conc, offload) variants into the + // same logical name. 
+ const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/; + const byLogical = new Map(); for (const a of allArtifacts) { - const existing = byName.get(a.name); + const key = a.name.replace(RUNNER_SUFFIX_RE, ''); + const existing = byLogical.get(key); if (!existing || a.created_at > existing.created_at) { - byName.set(a.name, a); + byLogical.set(key, a); } } - for (const [name, artifact] of byName) { + for (const [, artifact] of byLogical) { + const name = artifact.name; console.log(` ${name}`); const zipPath = path.join(artifactsDir, 'artifact.zip'); execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { @@ -121,7 +136,7 @@ if (isDownloadMode) { fs.unlinkSync(zipPath); } - console.log(`\n Downloaded ${byName.size} artifact(s)`); + console.log(`\n Downloaded ${byLogical.size} artifact(s)`); // Fetch run attempt from API const attemptStr = execSync( @@ -510,11 +525,17 @@ async function main(): Promise { const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker; const totalSkips = - skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError; + skips.badZip + + skips.unmappedModel + + skips.unmappedHw + + skips.noIslOsl + + skips.failedRun + + skips.dbError; if (totalSkips > 0) { console.log(`\n Skipped: ${totalSkips} rows`); const skipLines: [string, number][] = [ ['no isl/osl (old format)', skips.noIslOsl], + ['failed run (0 successful)', skips.failedRun], ['unmapped model', skips.unmappedModel], ['unmapped hw', skips.unmappedHw], ['bad/empty zip', skips.badZip], diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index 6dc604e9..d67f5164 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -434,6 +434,7 @@ async function mapWorkflowDir( unmappedModel: local.skips.unmappedModel, unmappedHw: local.skips.unmappedHw, noIslOsl: local.skips.noIslOsl, + failedRun: local.skips.failedRun, }, localUnmappedModels: new 
Set(local.unmappedModels), localUnmappedHws: new Set(local.unmappedHws), From c2f66f62f5a1dedb6a87c7c5e58ca990b3cb0956 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 7 May 2026 08:41:26 -0500 Subject: [PATCH 009/111] feat: add AIPerf to FRAMEWORK_LABELS Tag display name for the `aiperf` spec_method suffix used by the alternate-harness runs ingested for the agentic minimax sweep. Without this entry the legend shows 'AIPERF' from the default toUpperCase fallback. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/constants/src/framework-aliases.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts index cc5eb6b4..e23a93bc 100644 --- a/packages/constants/src/framework-aliases.ts +++ b/packages/constants/src/framework-aliases.ts @@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record = { ]), ), mtp: 'MTP', + aiperf: 'AIPerf', }; /** From 024797a978a2a6e2954f66a963de3205b62a149e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 12 May 2026 15:02:07 -0500 Subject: [PATCH 010/111] fix(changelog): coerce ids to string when filtering changelog by run bigint workflow_run_id sometimes deserializes as a number on the frontend depending on the postgres adapter's behavior; strict === between a number and a string silently dropped every match, so the changelog popover always reported "no changelog data available." 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/GlobalFilterContext.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 08fc7094..11e56de7 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -87,7 +87,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record { const runs: Record = {}; for (const run of data.runs) { const runId = String(run.github_run_id); - const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id); + const runChangelogs = data.changelogs.filter( + (c) => String(c.workflow_run_id) === String(run.github_run_id), + ); runs[runId] = { runId, runDate: run.created_at, From aa154193dfbc12535f25444cdf6fccc16a3e1382 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 12 May 2026 15:36:57 -0500 Subject: [PATCH 011/111] feat: default sequence to Agentic Traces when available If the selected model has agentic_traces data, prefer that over the default 8K/1K fixed-seq when the user hasn't explicitly chosen via URL. effectiveSequence already falls back to availableSequences[0] for models without agentic, so models with only fixed-seq data still render correctly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/GlobalFilterContext.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 11e56de7..7813d079 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -125,7 +125,9 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { const [selectedSequence, setSelectedSequence] = useState(() => { const urlSeq = getUrlParam('i_seq'); if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence; - return Sequence.EightK_OneK; + // Prefer Agentic Traces by default when the selected model has it; the + // effectiveSequence fallback below handles models without agentic data. + return Sequence.AgenticTraces; }); const [selectedPrecisions, setSelectedPrecisionsRaw] = useState(() => { From 099a33efcb53f5130dc40d715a0f4b86d6136a93 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:25:25 -0500 Subject: [PATCH 012/111] fix(agentic): respect percentile selector for input-throughput x axis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rowToAggDataEntry was only copying median/p99 metric variants — picking p90/p99.9 in the percentile selector silently fell back to 0 and collapsed every point into a vertical line at x=0. Copy the full median/p90/p99/p99.9 set into AggDataEntry. Hide the X-Axis Metric dropdown for agentic mode (it doubled up with the percentile selector) and route the input-metric chart through withPercentile so picking p99 actually plots p99_ttft instead of the hard-coded p99_ttft config default. Percentile options pared back to median + p99. 
--- .../inference/hooks/useChartData.ts | 46 +++++++++++++++++-- .../app/src/components/inference/types.ts | 10 ++++ .../components/inference/ui/ChartControls.tsx | 3 +- packages/app/src/lib/benchmark-transform.ts | 12 ++++- packages/app/src/lib/data-mappings.ts | 8 +--- packages/app/src/lib/energy-metrics.test.ts | 10 ++++ 6 files changed, 77 insertions(+), 12 deletions(-) diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 81ab0780..57e9a1c2 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -16,7 +16,7 @@ import { filterDataByCostLimit } from '@/components/inference/utils'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants'; import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; /** Build deduplicated comparison dates, excluding the main run date. */ @@ -216,7 +216,14 @@ export function useChartData( ? 'P99 Time To First Token (s)' : 'Median Time To First Token (s)'; - if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) { + const isAgentic = selectedSequence === Sequence.AgenticTraces; + + if ( + effectiveXMetric && + chartDef.chartType === 'interactivity' && + isInputMetric && + !isAgentic + ) { xAxisField = effectiveXMetric as keyof AggDataEntry; const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) { @@ -225,15 +232,40 @@ export function useChartData( xAxisLabel = isTtftOverride ? 
ttftLabel : chartDef.x_label; } } else if (chartDef.chartType === 'interactivity' && isInputMetric) { + // Agentic falls through here too — the manual X-axis dropdown is + // hidden in agentic mode (would double up with the percentile + // selector), so the config default + percentile post-processing + // below drives the x axis. const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition; const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x; xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label; - } else if (chartDef.chartType === 'e2e' && isTtftOverride) { + } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) { xAxisField = effectiveXMetric as keyof AggDataEntry; xAxisLabel = ttftLabel; } + // Agentic: rewrite the resolved x metric to the chosen percentile, + // and relabel accordingly. naturalX is already percentile-adjusted, + // so the per-metric override path is the only one that actually + // changes here. + if (isAgentic) { + const adjusted = withPercentile( + xAxisField as string, + selectedPercentile, + ) as keyof AggDataEntry; + if (adjusted !== xAxisField) { + const pctlWord = + selectedPercentile === 'median' + ? 'Median' + : selectedPercentile === 'p99.9' + ? 'P99.9' + : selectedPercentile.toUpperCase(); + xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord); + xAxisField = adjusted; + } + } + // The x-axis is "flipped" only when the good-direction reverses // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). 
// E2EL → TTFT keeps the same direction ("lower is better" for both), @@ -269,7 +301,13 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile], + [ + selectedYAxisMetric, + selectedXAxisMetric, + selectedE2eXAxisMetric, + selectedPercentile, + selectedSequence, + ], ); // Build renderable graphs (data processing + stable chart definitions) diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a2d9ef2e..cddeba54 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -50,23 +50,33 @@ export interface AggDataEntry { mean_ttft: number; median_ttft: number; std_ttft: number; + p90_ttft: number; p99_ttft: number; + 'p99.9_ttft': number; mean_tpot: number; mean_intvty: number; median_tpot: number; median_intvty: number; std_tpot: number; std_intvty: number; + p90_tpot: number; + p90_intvty: number; p99_tpot: number; p99_intvty: number; + 'p99.9_tpot': number; + 'p99.9_intvty': number; mean_itl: number; median_itl: number; std_itl: number; + p90_itl: number; p99_itl: number; + 'p99.9_itl': number; mean_e2el: number; median_e2el: number; std_e2el: number; + p90_e2el: number; p99_e2el: number; + 'p99.9_e2el': number; disagg: boolean; num_prefill_gpu: number; num_decode_gpu: number; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 6707bd9e..7b4fa08f 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -269,7 +269,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
{graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') && - isInputMetric && ( + isInputMetric && + selectedSequence !== Sequence.AgenticTraces && (
): Record { const out: Record = {}; - for (const suffix of ['mean', 'median', 'p90', 'p99']) { + for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) { const itl = m[`${suffix}_itl`]; const ttlt = m[`${suffix}_ttlt`]; if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; @@ -62,23 +62,33 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { mean_ttft: m.mean_ttft ?? 0, median_ttft: m.median_ttft ?? 0, std_ttft: m.std_ttft ?? 0, + p90_ttft: m.p90_ttft ?? 0, p99_ttft: m.p99_ttft ?? 0, + 'p99.9_ttft': m['p99.9_ttft'] ?? 0, mean_tpot: m.mean_tpot ?? 0, median_tpot: m.median_tpot ?? 0, std_tpot: m.std_tpot ?? 0, + p90_tpot: m.p90_tpot ?? 0, p99_tpot: m.p99_tpot ?? 0, + 'p99.9_tpot': m['p99.9_tpot'] ?? 0, mean_intvty: m.mean_intvty ?? 0, median_intvty: m.median_intvty ?? 0, std_intvty: m.std_intvty ?? 0, + p90_intvty: m.p90_intvty ?? 0, p99_intvty: m.p99_intvty ?? 0, + 'p99.9_intvty': m['p99.9_intvty'] ?? 0, mean_itl: m.mean_itl ?? 0, median_itl: m.median_itl ?? 0, std_itl: m.std_itl ?? 0, + p90_itl: m.p90_itl ?? 0, p99_itl: m.p99_itl ?? 0, + 'p99.9_itl': m['p99.9_itl'] ?? 0, mean_e2el: m.mean_e2el ?? 0, median_e2el: m.median_e2el ?? 0, std_e2el: m.std_e2el ?? 0, + p90_e2el: m.p90_e2el ?? 0, p99_e2el: m.p99_e2el ?? 0, + 'p99.9_e2el': m['p99.9_e2el'] ?? 0, disagg: row.disagg, num_prefill_gpu: row.num_prefill_gpu, num_decode_gpu: row.num_decode_gpu, diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index f137875c..bf48c864 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -186,21 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; /** * Percentile of the latency distribution used for the chart x-axis when * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants - * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which - * slice to plot. 
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the + * two most commonly read slices (p50, p99) are surfaced in the UI. */ export enum Percentile { Median = 'median', - P90 = 'p90', P99 = 'p99', - P99_9 = 'p99.9', } const PERCENTILE_CONFIG: Record = { [Percentile.Median]: { label: 'p50 (median)' }, - [Percentile.P90]: { label: 'p90' }, [Percentile.P99]: { label: 'p99' }, - [Percentile.P99_9]: { label: 'p99.9' }, }; export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts index 28cc1e36..54788585 100644 --- a/packages/app/src/lib/energy-metrics.test.ts +++ b/packages/app/src/lib/energy-metrics.test.ts @@ -57,23 +57,33 @@ function makeEntry(overrides: Partial = {}): AggDataEntry { mean_ttft: 0.5, median_ttft: 0.4, std_ttft: 0.1, + p90_ttft: 0.7, p99_ttft: 0.8, + 'p99.9_ttft': 0.9, mean_tpot: 0.02, mean_intvty: 45, median_tpot: 0.02, median_intvty: 44, std_tpot: 0.005, std_intvty: 5, + p90_tpot: 0.025, + p90_intvty: 55, p99_tpot: 0.03, p99_intvty: 60, + 'p99.9_tpot': 0.035, + 'p99.9_intvty': 65, mean_itl: 0.01, median_itl: 0.01, std_itl: 0.002, + p90_itl: 0.013, p99_itl: 0.015, + 'p99.9_itl': 0.018, mean_e2el: 5, median_e2el: 4.8, std_e2el: 0.5, + p90_e2el: 5.5, p99_e2el: 6, + 'p99.9_e2el': 6.5, disagg: false, num_prefill_gpu: 0, num_decode_gpu: 0, From 50a06d1419c70ddd8d24b2c6545da44fe6be3a4d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:27:19 -0500 Subject: [PATCH 013/111] fix(agentic): default percentile to p99 and drop median option --- packages/app/src/components/inference/InferenceContext.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index b4ccb9ef..af2d364e 100644 --- 
a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -122,7 +122,7 @@ export function InferenceProvider({ // Latency percentile applied to the chart x-axis for agentic scenarios. // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. const [selectedPercentile, setSelectedPercentile] = useState( - () => getUrlParam('i_pctl') || 'median', + () => getUrlParam('i_pctl') || 'p99', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index bf48c864..1b4f47c3 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -186,16 +186,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; /** * Percentile of the latency distribution used for the chart x-axis when * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants - * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the - * two most commonly read slices (p50, p99) are surfaced in the UI. + * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p99 + * is surfaced in the UI. 
*/ export enum Percentile { - Median = 'median', P99 = 'p99', } const PERCENTILE_CONFIG: Record = { - [Percentile.Median]: { label: 'p50 (median)' }, [Percentile.P99]: { label: 'p99' }, }; From 3c96e9137776d1c368a0acdfeee6e769d5733464 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:31:27 -0500 Subject: [PATCH 014/111] fix(agentic): keep only p90 as the percentile option --- packages/app/src/components/inference/InferenceContext.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 0ba14a21..accfdf9e 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -136,7 +136,7 @@ export function InferenceProvider({ // Latency percentile applied to the chart x-axis for agentic scenarios. // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. 
const [selectedPercentile, setSelectedPercentile] = useState( - () => getUrlParam('i_pctl') || 'p99', + () => getUrlParam('i_pctl') || 'p90', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 0afb304a..83e6648a 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -191,12 +191,10 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; */ export enum Percentile { P90 = 'p90', - P99 = 'p99', } const PERCENTILE_CONFIG: Record = { [Percentile.P90]: { label: 'p90' }, - [Percentile.P99]: { label: 'p99' }, }; export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; From 642081af77c8165ac89a5177abbd6c0244dfb9c0 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Fri, 15 May 2026 13:31:30 -0400 Subject: [PATCH 015/111] fix(agentic): default percentile to p90, surface only p90/p99 Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/cypress/support/mock-data.ts | 2 +- .../app/src/components/inference/InferenceContext.tsx | 2 +- .../app/src/components/inference/hooks/useChartData.ts | 9 ++------- packages/app/src/components/ui/chart-selectors.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 6 ++++-- packages/app/src/lib/url-state.ts | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index f267dcc9..34b89aba 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,7 +189,7 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), - selectedPercentile: 'median', + 
selectedPercentile: 'p90', setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index accfdf9e..36dc672d 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -134,7 +134,7 @@ export function InferenceProvider({ () => getUrlParam('i_e2e_xmetric') || null, ); // Latency percentile applied to the chart x-axis for agentic scenarios. - // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. + // Values: 'p90' | 'p99'. Non-agentic charts ignore. const [selectedPercentile, setSelectedPercentile] = useState( () => getUrlParam('i_pctl') || 'p90', ); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index f2ef85ec..436fd662 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -83,7 +83,7 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, - selectedPercentile = 'median', + selectedPercentile = 'p90', /** When set, only series for these two registry GPU keys are shown (compare pages). */ compareGpuPair?: readonly [string, string] | null, ) { @@ -261,12 +261,7 @@ export function useChartData( selectedPercentile, ) as keyof AggDataEntry; if (adjusted !== xAxisField) { - const pctlWord = - selectedPercentile === 'median' - ? 'Median' - : selectedPercentile === 'p99.9' - ? 
'P99.9' - : selectedPercentile.toUpperCase(); + const pctlWord = selectedPercentile.toUpperCase(); xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord); xAxisField = adjusted; } diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index d2940de4..e30816fa 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -315,7 +315,7 @@ export function PercentileSelector({ - P99 TTFT - Median TTFT + P90 TTFT
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index f0e1692a..78df2c37 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -408,27 +408,20 @@ export default function ChartDisplay() { if ( graph.chartDefinition.chartType === 'interactivity' && isInputMetric && - selectedXAxisMetric + selectedXAxisMetric === 'p90_ttft' ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; - } + return 'vs. P90 Time To First Token'; } // For e2e chart: render clickable inline dropdown for x-axis if (graph.chartDefinition.chartType === 'e2e') { const xAxisLabel = - selectedE2eXAxisMetric === 'p99_ttft' - ? 'P99 TTFT' - : selectedE2eXAxisMetric === 'median_ttft' - ? 'Median TTFT' - : 'End-to-end Latency'; + selectedE2eXAxisMetric === 'p90_ttft' + ? 
'P90 TTFT' + : 'End-to-end Latency'; const xAxisOptions = [ { value: null, label: 'End-to-end Latency' }, - { value: 'p99_ttft', label: 'P99 TTFT' }, - { value: 'median_ttft', label: 'Median TTFT' }, + { value: 'p90_ttft', label: 'P90 TTFT' }, ]; const zoomPrefix = selectedDateRange.startDate && diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts index 8f8705e1..589ba580 100644 --- a/packages/app/src/components/inference/utils.test.ts +++ b/packages/app/src/components/inference/utils.test.ts @@ -157,12 +157,12 @@ describe('processOverlayChartData', () => { }); it('remaps x to config override for input metrics on interactivity chart', () => { - // inputTputPerGpu has x override to p99_ttft on interactivity chart + // inputTputPerGpu has x override to p90_ttft on interactivity chart const data = [ pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_intvty: 50, } as any), ]; @@ -176,16 +176,11 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - median_ttft: 0.1, + p90_ttft: 0.1, median_intvty: 50, } as any), ]; - const result = processOverlayChartData( - data, - 'interactivity', - 'y_inputTputPerGpu', - 'median_ttft', - ); + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.1); }); @@ -195,76 +190,62 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_e2el: 2.5, } as any), ]; const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null); expect(result).toHaveLength(1); - // e2e uses median_e2el as x (from chart config default), not p99_ttft + // e2e uses median_e2el as x (from chart config default), not p90_ttft expect(result[0].x).toBe(2.5); }); - it('remaps x to TTFT for e2e chart when 
selectedXAxisMetric is p99_ttft', () => { - const data = [ - pt({ - x: 100, - tpPerGpu: { y: 42, roof: false }, - p99_ttft: 0.35, - median_e2el: 2.5, - } as any), - ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); - expect(result).toHaveLength(1); - expect(result[0].x).toBe(0.35); - }); - - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => { + it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => { const data = [ pt({ x: 100, tpPerGpu: { y: 42, roof: false }, - median_ttft: 0.12, + p90_ttft: 0.12, median_e2el: 2.5, } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.12); }); it('filters e2e TTFT outliers exceeding y_latency_limit', () => { const data = [ - pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any), - pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any), + pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any), + pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); it('does not filter interactivity points by latency limit when x-axis is default', () => { - // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity + // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity // chart's x-axis stays median_intvty for non-input metrics. The latency limit // (60) must NOT apply to median_intvty values. 
const data = [ pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any), pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(2); }); it('applies latency limit on interactivity only when x-axis is actually overridden', () => { - // When an input metric IS selected and x-axis overrides to p99_ttft, + // When an input metric IS selected and x-axis overrides to p90_ttft, // the latency limit should apply. const data = [ - pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any), - pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any), + pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any), + pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft'); - // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999 + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); + // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999 expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts index 4b5335b6..735007ab 100644 --- a/packages/app/src/components/inference/utils.ts +++ b/packages/app/src/components/inference/utils.ts @@ -88,8 +88,7 @@ export function processOverlayChartData( let xAxisField: string = chartDef.x; // selectedXAxisMetric is already the effective metric for this chart type // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric) - const isTtftOverride = - 
selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + const isTtftOverride = selectedXAxisMetric === 'p90_ttft'; if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { xAxisField = selectedXAxisMetric; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index e30816fa..19b4bfb0 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -315,7 +315,7 @@ export function PercentileSelector({ From 19b99586353cd39bccd4072bd6e2a2afcaf73367 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 26 May 2026 18:32:26 -0500 Subject: [PATCH 043/111] fix(scenario-selector): wrap Deprecated header in SelectLabel only inside Select MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous commit (b3e315c) changed DeprecatedSectionTitle to render SelectLabel internally, which throws at runtime ("SelectLabel must be used within SelectGroup") in callsites that render the header via MultiSelect — MultiSelect wraps the header in its own div, not a Radix SelectGroup. Revert the component to a plain styled span (MultiSelect's div wrapper supplies the small/muted styling), and wrap with SelectLabel only at the ScenarioSelector callsite, where the header sits directly inside a SelectGroup. Co-Authored-By: Claude Opus 4.7 --- .../app/src/components/ui/chart-selectors.tsx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index 8b91059a..49ea3f1a 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -31,9 +31,16 @@ import { sequenceKind, } from '@/lib/data-mappings'; +/** + * "Deprecated" sub-header used by selectors. 
Rendered as a span (not + * SelectLabel) because some callsites use `MultiSelect`, which wraps + * headers in its own div and isn't a SelectGroup. The span carries no + * styling of its own — the parent context supplies the muted/small + * treatment. ScenarioSelector renders this inside a SelectLabel directly. + */ function DeprecatedSectionTitle({ reason }: { reason: string }) { return ( - + Deprecated @@ -43,7 +50,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) { {reason} - + ); } @@ -282,7 +289,9 @@ export function ScenarioSelector({ ))} {fixedGroups.deprecated.length > 0 && ( <> - + + + {fixedGroups.deprecated.map((seq) => ( {getSequenceLabel(seq as Sequence)} From 7114833409b92a206f7c22b80846db527e01da43 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 13:22:13 -0500 Subject: [PATCH 044/111] feat(agentic-detail): add cumulative input tokens chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces a new chart on the agentic detail page showing the running total of input (prompt) tokens served over the course of the run — useful for seeing how the load actually accumulates vs the instantaneous prefill_tps line we already plot. Adds a `cumulativeSum` helper alongside the existing `cumulativeAverage` and `sumSeries` time-series utilities. No backfill needed — the source data (`chart_series.prefillTps`) is already pre-computed at ingest time for every blob-bearing row. (Input throughput as a Pareto axis is already wired via the existing `y_inputTputPerGpu` y-axis option; no change there.) 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 24 +++++++++++++++++++ .../agentic-point/time-series-chart.tsx | 17 +++++++++++++ 2 files changed, 41 insertions(+) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 2e43b4fb..1a61b93b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -26,6 +26,7 @@ import { StackedAreaChart, TimeSeriesChart, cumulativeAverage, + cumulativeSum, rollingAverage, sumSeries, } from './time-series-chart'; @@ -381,6 +382,29 @@ export function AgenticPointDetail({ id }: Props) { ); }} /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + + ); + }} + /> )} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index cd10aff7..042c4331 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -58,6 +58,23 @@ export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** + * Running cumulative sum of a per-interval rate series. Each output point + * is the integral of the rate from start to that point, assuming the rate + * applies over a 1-second window (aiperf's scrape interval). Use for + * "total tokens served so far" from a tokens-per-second series. 
+ */ +export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { + if (data.length === 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + let sum = 0; + for (let i = 0; i < data.length; i++) { + sum += data[i]!.value; + out[i] = { t: data[i]!.t, value: sum }; + } + return out; +} + /** Pointwise sum of two arrays sharing the same t index. */ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { const n = Math.min(a.length, b.length); From c6697de8ff3d8263924986fd71b4622f1369f9a3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 14:44:19 -0500 Subject: [PATCH 045/111] feat(agentic-detail): plot cumulative unique input tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the "Total input tokens over time" chart with "Total unique input tokens over time" — cumsum of (prompt-token rate − prefix-cache- hit rate per second), which equals the cumulative tokens vllm actually had to prefill from scratch (= vllm:request_prefill_kv_computed_tokens). Adds `prefixCacheHitsTps` to the chart_series JSONB (extracted by summing vllm:prefix_cache_hits.rate across all engine series, same DP- aware path as prefillTps). Bumps CHART_SERIES_VERSION to 3; the existing trace-server-metrics query defaults the field to [] for any older v2 rows so reads stay safe before backfill catches up. Backfilled 62 rows to v3. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 14 +++++++++++--- .../src/hooks/api/use-trace-server-metrics.ts | 2 ++ packages/db/src/etl/compute-chart-series.ts | 16 +++++++++++++++- packages/db/src/queries/trace-server-metrics.ts | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 1a61b93b..4bebd37c 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -384,16 +384,24 @@ export function AgenticPointDetail({ id }: Props) { /> { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; + // Unique = total prompt tokens vllm received minus the tokens + // it served from the prefix cache. The cache-miss portion is + // what actually constitutes "new content" the GPU had to + // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens. + const unique = sumSeries( + metrics.prefillTps, + metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })), + ); return ( ; prefillTps: TimeSeriesPoint[]; decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). 
*/ + prefixCacheHitsTps: TimeSeriesPoint[]; } async function fetchTraceServerMetrics( diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 530600cf..91e89521 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -25,8 +25,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * only series[0], which under-counted by Nx on multi-engine DP/PP * deployments — most visible as a request-queue-depth chart that maxed out * at ~3 when the timeline clearly showed 20+ in-flight). + * + * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative + * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). */ -export const CHART_SERIES_VERSION = 2; +export const CHART_SERIES_VERSION = 3; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -57,6 +60,13 @@ export interface ChartSeries { promptTokensBySource: Record; prefillTps: TimeSeriesPoint[]; decodeTps: TimeSeriesPoint[]; + /** + * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across + * engines. Detail page derives "cumulative unique input tokens" as + * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually + * saved vs the raw queries that came in. + */ + prefixCacheHitsTps: TimeSeriesPoint[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -249,6 +259,9 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { })); const prefillTps = counterRate('vllm:prompt_tokens'); const decodeTps = counterRate('vllm:generation_tokens'); + // Tokens served from prefix cache per scrape. Lets the frontend derive + // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits'); // Per-source prompt tokens — sum across engines per source label. 
const promptBySrcByT = new Map>(); @@ -286,5 +299,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { promptTokensBySource, prefillTps, decodeTps, + prefixCacheHitsTps, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 624b6ed3..76775e77 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -71,6 +71,8 @@ export interface TraceServerMetrics { prefillTps: TimeSeriesPoint[]; /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */ decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ + prefixCacheHitsTps: TimeSeriesPoint[]; } interface RawMetaRow extends PointMeta { @@ -114,6 +116,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { promptTokensBySource: series.promptTokensBySource, prefillTps: series.prefillTps, decodeTps: series.decodeTps, + // v2 chart_series rows pre-backfill don't have this field — default to [] + prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], }; } From b5679bb10acfd6a6765b48a5864b2a0ec73d4915 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 15:00:12 -0500 Subject: [PATCH 046/111] feat(request-timeline): expandable subagent -> stream rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The harness fans a single subagent into multiple parallel ":sN" streams when its inner requests overlap in time (weka_trace._pack_into_streams). Previously each :sN got its own swimlane row, which made one parent conversation with 5 subagents (each fanned into 2-8 streams) render as 23 separate rows — visually implying 23 distinct subagent invocations when really there are 5. Now: each subagent shows as one row by default with a chevron + stream count chip ("subagent 003 · f1e7 ×8"). 
The collapsed row draws the union of all stream bars overlaid, so the concurrency burst is still visible at a glance. Click the chevron to fan into per-stream rows; click again to collapse. For conv 0f5b266f in benchmark 206360: 23 rows → 5 rows by default. Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/request-timeline.tsx | 325 ++++++++++++------ 1 file changed, 226 insertions(+), 99 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index bcbe105a..8762a158 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -53,44 +53,84 @@ const PHASE_COLORS: Record = { unknown: '#64748b', }; +/** + * Row kinds: + * parent — top-level conversation (depth 0) + * worker — worker swimlane (depth 0, worker mode) + * subagent — a subagent invocation (depth 1). Either a single + * stream (renders its own bars), or a multi-stream + * container whose bars are the union of its streams + * when collapsed. + * stream — one :sN stream of a multi-stream subagent (depth 2). + * Hidden by default; toggled in via the parent's chevron. + */ +type RowKind = 'parent' | 'worker' | 'subagent' | 'stream'; + interface Row { key: string; label: string; color: string; requests: RequestRecord[]; - /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */ depth: number; - /** True if this row is a sub-agent ("Subagent N of parent X"). */ - isSubagent: boolean; + kind: RowKind; + /** Number of streams under this subagent (>=1). Only set for subagent rows. */ + streamCount?: number; + /** For stream rows: the parent subagent's row key (drives expand/collapse). 
*/ + parentRowKey?: string; } /** * Conversation ids for subagent calls look like - * ::sa:subagent__ - * Split into the parent cid and a sub-agent label (or the whole thing if - * this is a top-level conversation). + * ::sa:[:s] + * The optional `:s` suffix is set when the harness fans a single + * subagent into multiple parallel "streams" (interval-graph + * decomposition in weka_trace._pack_into_streams). We split it off so + * we can group all streams of one subagent under a single header row. */ -function splitCid(cid: string): { parent: string; subagent: string | null } { +function splitCid(cid: string): { + parent: string; + subagentBase: string | null; + stream: number | null; +} { const sep = cid.indexOf('::sa:'); - if (sep === -1) return { parent: cid, subagent: null }; - return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) }; + if (sep === -1) return { parent: cid, subagentBase: null, stream: null }; + const parent = cid.slice(0, sep); + const raw = cid.slice(sep + 5); + const m = /^(.*):s(\d+)$/.exec(raw); + if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) }; + return { parent, subagentBase: raw, stream: null }; } -/** Group requests into rows; in conversation mode subagents nest under parents. */ -function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { - const groups = new Map(); - for (const r of requests) { - const key = mode === 'conversation' ? r.cid : r.wid; - let list = groups.get(key); - if (!list) { - list = []; - groups.set(key, list); - } - list.push(r); - } - +/** + * Group requests into rows. In conversation mode, output order is: + * parent_conv + * subagent_001 (collapsed by default, container) + * :s0 (hidden unless expanded) + * :s1 + * subagent_002 + * ... + * + * `expandedSubagents` controls which subagent containers reveal their + * stream children. 
Bars on a collapsed subagent are the UNION of all its + * streams' requests — overlapping bars visually communicate the + * stream-level parallelism without expanding. + */ +function buildRows( + requests: RequestRecord[], + mode: RowMode, + expandedSubagents: ReadonlySet, +): Row[] { if (mode !== 'conversation') { // Worker mode: flat rows, sorted by first activity. + const groups = new Map(); + for (const r of requests) { + let list = groups.get(r.wid); + if (!list) { + list = []; + groups.set(r.wid, list); + } + list.push(r); + } const rows: Row[] = []; let i = 0; for (const [key, list] of groups) { @@ -101,7 +141,7 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { color: ROW_COLORS[i % ROW_COLORS.length]!, requests: list, depth: 0, - isSubagent: false, + kind: 'worker', }); i++; } @@ -109,36 +149,40 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { return rows; } - // Conversation mode: build a parent → [subagents] tree so each parent - // group renders as one parent row followed by its sub-agent rows. Color - // is shared inside a tree so the visual grouping reads. + // Conversation mode — tree: parent → subagent → stream. 
interface Tree { parentCid: string; - parentRow: { key: string; requests: RequestRecord[] } | null; - subagents: Map; // subagent label → requests + parentReqs: RequestRecord[]; + // subagentBase → (streamIndex|null → requests) + subagents: Map>; firstStart: number; } const trees = new Map(); - for (const [cid, list] of groups) { - list.sort((a, b) => a.start - b.start); - const { parent, subagent } = splitCid(cid); + for (const r of requests) { + const { parent, subagentBase, stream } = splitCid(r.cid); let tree = trees.get(parent); if (!tree) { tree = { parentCid: parent, - parentRow: null, + parentReqs: [], subagents: new Map(), firstStart: Number.POSITIVE_INFINITY, }; trees.set(parent, tree); } - if (subagent === null) { - tree.parentRow = { key: cid, requests: list }; + if (subagentBase === null) { + tree.parentReqs.push(r); } else { - tree.subagents.set(subagent, list); + let saMap = tree.subagents.get(subagentBase); + if (!saMap) { + saMap = new Map(); + tree.subagents.set(subagentBase, saMap); + } + const list = saMap.get(stream); + if (list) list.push(r); + else saMap.set(stream, [r]); } - const earliest = list[0]!.start; - if (earliest < tree.firstStart) tree.firstStart = earliest; + if (r.start < tree.firstStart) tree.firstStart = r.start; } const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart); @@ -147,39 +191,66 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { for (const tree of sortedTrees) { const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!; colorIdx++; - if (tree.parentRow) { + // Parent row (use a placeholder key if the parent itself wasn't replayed). + tree.parentReqs.sort((a, b) => a.start - b.start); + rows.push({ + key: tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`, + label: tree.parentCid, + color, + requests: tree.parentReqs, + depth: 0, + kind: 'parent', + }); + + // One subagent row per base (which may contain N streams). 
+ const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => { + const aStart = Math.min( + ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ); + const bStart = Math.min( + ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ); + return aStart - bStart; + }); + for (const [saBase, streams] of subagentEntries) { + const subagentKey = `${tree.parentCid}::sa:${saBase}`; + // Union of all stream requests for collapsed-view bars. + const allReqs: RequestRecord[] = []; + for (const reqs of streams.values()) allReqs.push(...reqs); + allReqs.sort((a, b) => a.start - b.start); + const streamCount = streams.size; rows.push({ - key: tree.parentRow.key, - label: shortenCid(tree.parentCid), + key: subagentKey, + label: `↳ ${formatSubagentLabel(saBase)}`, color, - requests: tree.parentRow.requests, - depth: 0, - isSubagent: false, - }); - } else { - // Pseudo-parent header so orphan subagents still render under - // something they belong to. - rows.push({ - key: `__parent_${tree.parentCid}`, - label: shortenCid(tree.parentCid), - color, - requests: [], - depth: 0, - isSubagent: false, - }); - } - const subagentEntries = [...tree.subagents.entries()].toSorted( - (a, b) => a[1][0]!.start - b[1][0]!.start, - ); - for (const [saLabel, list] of subagentEntries) { - rows.push({ - key: `${tree.parentCid}::${saLabel}`, - label: `↳ ${formatSubagentLabel(saLabel)}`, - color, - requests: list, + requests: allReqs, depth: 1, - isSubagent: true, + kind: 'subagent', + streamCount, }); + + // Stream children only when expanded AND there's more than one + // stream (a single-stream subagent has nothing extra to show). + if (streamCount > 1 && expandedSubagents.has(subagentKey)) { + const streamEntries = [...streams.entries()].toSorted((a, b) => { + // Sort by stream index (null first as the "default" stream) + const ai = a[0] ?? -1; + const bi = b[0] ?? 
-1; + return ai - bi; + }); + for (const [streamIdx, reqs] of streamEntries) { + reqs.sort((a, b) => a.start - b.start); + rows.push({ + key: `${subagentKey}:s${streamIdx ?? '∅'}`, + label: `stream ${streamIdx ?? '∅'}`, + color, + requests: reqs, + depth: 2, + kind: 'stream', + parentRowKey: subagentKey, + }); + } + } } } return rows; @@ -192,11 +263,6 @@ function formatSubagentLabel(raw: string): string { return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`; } -function shortenCid(cid: string): string { - if (cid.length <= 12) return cid; - return `${cid.slice(0, 8)}…${cid.slice(-4)}`; -} - function shortenWid(wid: string): string { // worker_4ae87bea → w_4ae8 return wid.replace(/^worker_/, 'w_').slice(0, 12); @@ -314,6 +380,17 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { const [rowMode, setRowMode] = useState('conversation'); const [phaseFilter, setPhaseFilter] = useState('profiling'); const [tooltip, setTooltip] = useState(null); + // Which multi-stream subagents currently have their per-stream rows + // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). + const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set()); + const toggleSubagent = useCallback((key: string) => { + setExpandedSubagents((prev) => { + const next = new Set(prev); + if (next.has(key)) next.delete(key); + else next.add(key); + return next; + }); + }, []); const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); // Apply phase filter, then group into rows. @@ -322,7 +399,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { phaseFilter === 'all' ? 
data.requests : data.requests.filter((r) => r.phase === 'profiling'), [data.requests, phaseFilter], ); - const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]); + const rows = useMemo( + () => buildRows(filtered, rowMode, expandedSubagents), + [filtered, rowMode, expandedSubagents], + ); // Pre-sort the timestamp columns so the cursor-time stats popover can // count "running / waiting at time t" in O(log n). With a few hundred @@ -359,7 +439,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { const isZoomed = viewEnd !== null; // Layout - const LABEL_WIDTH = 160; + // Wide enough for a full 36-char conversation id at 10px font, plus the + // indent + color stripe + count badge. Subagent rows inherit the same + // width but truncate the longer "↳ subagent N · hash" tail with ellipsis. + const LABEL_WIDTH = 360; const ROW_HEIGHT = 22; const ROW_GAP = 3; const HEADER_HEIGHT = 24; @@ -537,33 +620,58 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {rowMode === 'conversation' ? 'Conversation' : 'Worker'} - {rows.map((row) => ( -
- { + const isSubagentRow = row.kind === 'subagent'; + const isStreamRow = row.kind === 'stream'; + const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; + const isExpanded = isExpandable && expandedSubagents.has(row.key); + return ( +
- - {row.label} - - - {row.requests.length > 0 ? row.requests.length : '—'} - -
- ))} + {isExpandable ? ( + + ) : ( + + )} + + + {row.label} + {isExpandable && ( + ×{row.streamCount} + )} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
+ ); + })} {/* Scrollable SVG */} @@ -636,6 +744,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {rows.map((row, rowIdx) => { const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; const barH = ROW_HEIGHT - 4; + // For multi-stream subagent containers, suppress the union + // bars when expanded — the child stream rows draw them + // individually instead, so we'd double-draw otherwise. + if ( + row.kind === 'subagent' && + (row.streamCount ?? 1) > 1 && + expandedSubagents.has(row.key) + ) { + return null; + } return row.requests.map((req) => { const xCredit = xOf(req.credit); const xStart = xOf(req.start); @@ -663,7 +781,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { opacity={0.35} /> )} - {/* Main bar */} + {/* Main bar — opacity stepped down with depth so + parent > subagent > stream reads visually. */} {/* Phase strip at bottom */} Date: Wed, 27 May 2026 15:07:27 -0500 Subject: [PATCH 047/111] fix(agentic-detail): make unique-input-tokens chart monotonic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vllm's per-scrape prompt_tokens.rate and prefix_cache_hits.rate counters can lag each other by several seconds across scrapes (we see prefill=0 at one tick with hits=1.1M, then prefill=1.5M with hits=452K six ticks later — lifetime totals agree but per-tick they don't). Computing cumsum(prefill - hits) per tick made the chart dip well negative at the start. Replaces the per-tick subtraction with `cumulativeDifferenceMonotonic`: union the two series by timestamp, accumulate each independently, take the diff, then enforce a running max so the curve never decreases. End-of-run totals are unchanged (both counters converge to the right value); transient skew just looks like a brief plateau instead of a negative dip. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 21 ++++++----- .../agentic-point/time-series-chart.tsx | 37 +++++++++++++++++++ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 4bebd37c..1abf64e6 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -26,7 +26,7 @@ import { StackedAreaChart, TimeSeriesChart, cumulativeAverage, - cumulativeSum, + cumulativeDifferenceMonotonic, rollingAverage, sumSeries, } from './time-series-chart'; @@ -388,20 +388,21 @@ export function AgenticPointDetail({ id }: Props) { render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; - // Unique = total prompt tokens vllm received minus the tokens - // it served from the prefix cache. The cache-miss portion is - // what actually constitutes "new content" the GPU had to - // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens. - const unique = sumSeries( - metrics.prefillTps, - metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })), - ); + // Unique = total prompt tokens received minus tokens served + // from the prefix cache. Equivalent to cumsum of + // vllm:request_prefill_kv_computed_tokens. We compute it as + // monotonic-non-decreasing cumulative-diff so per-scrape + // timing skew between the prompt_tokens and prefix_cache_hits + // counters can't make the line dip negative. 
return ( [p.t, p.value])); + const bByT = new Map(b.map((p) => [p.t, p.value])); + const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y); + const out: TimeSeriesPoint[] = Array.from({ length: allT.length }); + let cumA = 0; + let cumB = 0; + let runningMax = 0; + for (let i = 0; i < allT.length; i++) { + const t = allT[i]!; + cumA += aByT.get(t) ?? 0; + cumB += bByT.get(t) ?? 0; + const diff = cumA - cumB; + if (diff > runningMax) runningMax = diff; + out[i] = { t, value: runningMax }; + } + return out; +} + /** Pointwise sum of two arrays sharing the same t index. */ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { const n = Math.min(a.length, b.length); From 08bbe6650c73935d7ac7a9fa29a722b141911bc9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 15:15:05 -0500 Subject: [PATCH 048/111] feat(agentic-detail): add unique input tokens in flight chart New chart on the per-point view that plots the deduped count of input tokens currently held by in-flight requests, as a 30s time- weighted rolling average with the raw step series rendered as faint scatter behind it. Useful for seeing the working set the model has to hold KV cache for at any instant. Computation (frontend, from request_timeline): - At each request start/end event, maintain active ISL per cid (within one cid turns are sequential, so each cid contributes at most one in-flight ISL at a time) - total_in_flight(t) = sum over cids with active request of that cid's current ISL - Across cids we treat content as independent (cross-conv prefix sharing measured at <1 pp, so summing is a tight approximation) Adds timeRollingAverage helper: time-weighted (vs sample-count) moving average suitable for irregularly-sampled event series like this one. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 43 ++++++++- .../agentic-point/time-series-chart.tsx | 96 +++++++++++++++++++ 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 1abf64e6..2db2809b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -27,8 +27,10 @@ import { TimeSeriesChart, cumulativeAverage, cumulativeDifferenceMonotonic, + inflightUniqueTokens, rollingAverage, sumSeries, + timeRollingAverage, } from './time-series-chart'; interface Props { @@ -124,8 +126,10 @@ export function AgenticPointDetail({ id }: Props) { // shows how the metric varies across the SKU. const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? []; const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates'); - // Per-request timeline fetched only when the timeline view is active. - const timelineQuery = useRequestTimeline(id, view === 'timeline'); + // Per-request timeline used by both the timeline view AND the per-point + // "Unique input tokens in flight" chart, so fetch whenever we're on + // either view. + const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point'); return (
@@ -414,6 +418,41 @@ export function AgenticPointDetail({ id }: Props) { ); }} /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!timelineQuery.data) { + return timelineQuery.isLoading ? : ; + } + // Step function: at each request start/end, sum the ISLs of + // currently-active requests across distinct cids. Within one + // cid turns are sequential so each cid contributes at most + // one in-flight ISL; across cids we treat content as + // independent (cross-conv prefix sharing adds <1pp in + // practice). Smooth with a 30s time-weighted rolling average + // so brief turn-handoff dips don't dominate the chart. + const raw = inflightUniqueTokens(timelineQuery.data.requests); + const smoothed = timeRollingAverage(raw, 30); + return ( + + ); + }} + />
)} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 25d5a672..520b3ed6 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -27,6 +27,39 @@ interface TimeSeriesChartProps { height?: number; } +/** + * Time-weighted rolling average over a `windowS`-second trailing window. + * Treats the input as a step function (value held constant between + * samples) and integrates over the trailing window, dividing by the + * window length. Good for smoothing irregularly-sampled event series + * (e.g. request start/end events) where the regular sample-count + * `rollingAverage` would over-weight bursts of close-together events. + */ +export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] { + if (data.length === 0 || windowS <= 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + for (let i = 0; i < data.length; i++) { + const tEnd = data[i]!.t; + const tStart = Math.max(0, tEnd - windowS); + // Find the first sample j whose t is >= tStart; the step value at + // tStart is data[j-1].value if j > 0, else data[0].value. + let j = 0; + while (j < data.length && data[j]!.t < tStart) j++; + let prevT = tStart; + let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value; + let area = 0; + for (; j <= i; j++) { + const curT = data[j]!.t; + area += prevV * (curT - prevT); + prevT = curT; + prevV = data[j]!.value; + } + const dur = tEnd - tStart; + out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value }; + } + return out; +} + /** Centered rolling average over `windowSize` samples. 
*/ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] { if (data.length === 0 || windowSize <= 1) return data; @@ -75,6 +108,69 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** + * Per-event step series: at each request start/end, sum the ISLs of + * currently-active requests across distinct `cid`s. Within a single + * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N), + * so each cid contributes at most one in-flight ISL at a time. Across + * different cids we assume content is independent (parent ↔ subagent + * and conv ↔ conv share negligible prefix in practice — cross-conv + * dedup added ~0.25 pp to theoretical hit rate, so treating them as + * independent is a tight approximation of the true in-flight unique + * token count). + * + * Output is a step function: one point per event, value held constant + * until the next event. Time axis is seconds relative to the earliest + * event in `requests`. + */ +export function inflightUniqueTokens( + requests: readonly { cid: string; start: number; end: number; isl: number | null }[], +): TimeSeriesPoint[] { + if (requests.length === 0) return []; + // The request_timeline timestamps are ns-relative to its own origin. + // Convert events to seconds and emit a step series. + interface Event { + tNs: number; + kind: 'start' | 'end'; + cid: string; + isl: number; + } + const events: Event[] = []; + for (const r of requests) { + const isl = r.isl ?? 0; + if (isl <= 0) continue; + events.push({ tNs: r.start, kind: 'start', cid: r.cid, isl }); + events.push({ tNs: r.end, kind: 'end', cid: r.cid, isl }); + } + if (events.length === 0) return []; + // Sort by time; on ties, process 'end' before 'start' so a same-instant + // turn handoff within one cid doesn't transiently double-count. + events.sort((a, b) => a.tNs - b.tNs || (a.kind === 'end' ? 
-1 : 1)); + + // Active ISL per cid (max in case the same cid somehow has overlapping + // events; in practice it's always 0 or 1 request at a time per cid). + const activeByCid = new Map(); + let total = 0; + const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }]; + for (const e of events) { + const tSec = e.tNs / 1e9; + if (e.kind === 'start') { + const prev = activeByCid.get(e.cid) ?? 0; + const next = Math.max(prev, e.isl); + activeByCid.set(e.cid, next); + total += next - prev; + } else { + const cur = activeByCid.get(e.cid) ?? 0; + if (cur > 0) { + total -= cur; + activeByCid.delete(e.cid); + } + } + out.push({ t: tSec, value: Math.max(0, total) }); + } + return out; +} + /** * Monotonic-non-decreasing cumulative difference of two rate series: * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce From 7561deb1cc5a210ce6cd074ab0d4771b3b9f8342 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 20:30:39 -0500 Subject: [PATCH 049/111] feat(chart-series): extract SGLang metrics alongside vllm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our chart_series + aggregate_stats extractors hardcoded vllm:* metric names, so SGLang runs (e.g. qwen3.5/h100/sglang) ingested cleanly but the per-point detail page rendered empty charts — chart_series fields were all zero-length arrays. Adds fallback chains in each extractor: KV cache util vllm:kv_cache_usage_perc → sglang:token_usage Prefix cache hits vllm:prefix_cache_hits → sglang:cached_tokens Prefix cache qrys vllm:prefix_cache_queries → sglang:prompt_tokens Requests running vllm:num_requests_running → sglang:num_running_reqs Requests waiting vllm:num_requests_waiting → sglang:num_queue_reqs Prompt tokens rate vllm:prompt_tokens → sglang:prompt_tokens Generation rate vllm:generation_tokens → sglang:generation_tokens The `pickFirstNonEmpty` helper walks the chain and uses whichever series has data, so a future framework (mori-sglang, dynamo, etc.) 
can plug in by adding its names to each chain — no per-framework branching. CHART_SERIES_VERSION → 4, STATS_VERSION → 3. Both backfills re-ran (86 chart_series rows, 190 aggregate_stats rows). SGLang chart_series for qwen3.5 run 944 verified — was 0-length arrays before, now ~1800 samples each. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 67 +++++++++++++++---- packages/db/src/queries/agentic-aggregates.ts | 56 +++++++++++++--- 2 files changed, 98 insertions(+), 25 deletions(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 91e89521..86b79925 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -28,8 +28,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). + * + * v4: extract sglang:* metrics too (fallback chain in each picker), so + * SGLang runs populate the chart_series the same way vllm runs do. */ -export const CHART_SERIES_VERSION = 3; +export const CHART_SERIES_VERSION = 4; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -89,8 +92,13 @@ interface RawMetric { type MetricsMap = Record; -/** The set of metric subtrees the chart consumes. */ +/** + * The set of metric subtrees the chart consumes. Includes both vllm:* and + * sglang:* names so the stream-parse fallback collects whichever framework + * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric. 
+ */ const CHART_METRIC_KEYS = new Set([ + // vLLM 'vllm:kv_cache_usage_perc', 'vllm:gpu_cache_usage_perc', 'vllm:prefix_cache_hits', @@ -100,6 +108,13 @@ const CHART_METRIC_KEYS = new Set([ 'vllm:prompt_tokens', 'vllm:generation_tokens', 'vllm:prompt_tokens_by_source', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', + 'sglang:generation_tokens', + 'sglang:num_running_reqs', + 'sglang:num_queue_reqs', ]); /** @@ -220,18 +235,37 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { if (!Number.isFinite(startNs)) startNs = 0; const tOf = (ns: number) => (ns - startNs) / 1e9; + // Pick the first metric name whose series array has any data; fallback + // chain lets the same code path serve both vllm:* and sglang:* blobs. + const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; + }; + // KV cache usage (gauge, 0..1) — average across engines so the value // stays a fraction (each engine has its own KV pool). - const kvSeries = - metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvSeries = pickSeries( + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); const kvCacheUsage: TimeSeriesPoint[] = sortedEntries( aggregateByStart(kvSeries, 'avg', 'avg'), ).map(([t, v]) => ({ t: tOf(t), value: v })); // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across - // engines, joined on start_ns. - const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum'); - const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum'); + // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens. 
+ const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + const qsSeries = pickSeries( + 'vllm:prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); + const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum'); + const qsByT = aggregateByStart(qsSeries, 'rate', 'sum'); const prefixCacheHitRate: TimeSeriesPoint[] = []; for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) { const q = qsByT.get(t); @@ -239,8 +273,10 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } // Queue depth: sum running + waiting across engines per timeslice. - const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum'); - const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum'); + const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs'); + const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs'); + const runByT = aggregateByStart(runSeries, 'avg', 'sum'); + const waitByT = aggregateByStart(waitSeries, 'avg', 'sum'); const queueDepth: QueueDepthPoint[] = []; // Union of timestamps so we surface activity even if one of the gauges // didn't report a sample on a given tick. @@ -252,16 +288,19 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } // Throughput: sum the counter `rate` (already per-second) across engines. - const counterRate = (name: string): TimeSeriesPoint[] => - sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({ + // Takes a fallback chain so vllm:* and sglang:* both work. 
+ const counterRate = (...names: string[]): TimeSeriesPoint[] => { + const s = pickSeries(...names); + return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({ t: tOf(t), value: v, })); - const prefillTps = counterRate('vllm:prompt_tokens'); - const decodeTps = counterRate('vllm:generation_tokens'); + }; + const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens'); + const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens'); // Tokens served from prefix cache per scrape. Lets the frontend derive // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). - const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits'); + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); // Per-source prompt tokens — sum across engines per source label. const promptBySrcByT = new Map>(); diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts index 1ad7fd7f..da5d18a0 100644 --- a/packages/db/src/queries/agentic-aggregates.ts +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -32,8 +32,12 @@ import type { DbClient } from '../connection.js'; * * v2: aggregate vllm gauges/counters across all engine series (was reading * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). + * + * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate + * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way + * they do for vllm runs. */ -export const STATS_VERSION = 2; +export const STATS_VERSION = 3; export interface MetricPercentiles { mean: number; @@ -199,6 +203,18 @@ function aggregateSeriesByStart( * Aggregates across all engine series so multi-engine DP/PP deployments are * counted correctly (previously we only read engine 0). */ +/** First metric whose series array is non-empty; supports vllm/sglang fallback. 
*/ +function pickFirstNonEmpty( + metrics: Record, + ...names: string[] +): Series[] | undefined { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; +} + export function extractServerMetricSamples(json: string): { kvCacheUtil: number[]; prefixCacheHitRate: number[]; @@ -208,17 +224,29 @@ export function extractServerMetricSamples(json: string): { // KV cache util — per-engine gauge in [0, 1]. Average across engines so the // value stays a percentage; summing would give meaningless 0..N. - const kvSeriesAll = - metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvSeriesAll = pickFirstNonEmpty( + metrics, + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()]; // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across - // all engines. Sum first, then divide. - const hitsAll = - metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series; - const queriesAll = - metrics['vllm:prefix_cache_queries']?.series ?? - metrics['vllm:gpu_prefix_cache_queries']?.series; + // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens. + const hitsAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_hits', + 'vllm:gpu_prefix_cache_hits', + 'sglang:cached_tokens', + ); + const queriesAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum'); const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum'); const prefixCacheHitRate: number[] = []; @@ -232,12 +260,18 @@ export function extractServerMetricSamples(json: string): { /** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. 
*/ const TARGET_METRIC_KEYS = new Set([ + // vLLM 'vllm:kv_cache_usage_perc', - 'vllm:gpu_cache_usage_perc', // older fallback name + 'vllm:gpu_cache_usage_perc', 'vllm:prefix_cache_hits', 'vllm:prefix_cache_queries', - 'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths) + 'vllm:gpu_prefix_cache_hits', 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', ]); /** From 625d6e85e411cf8081977d3b76ad98d1805ad3c5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 20:48:58 -0500 Subject: [PATCH 050/111] fix(ingest): derive GPU cache hit rate for SGLang at ingest time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SGLang runs' harness JSON doesn't populate server_gpu_cache_hit_rate (vLLM runs do), so the detail-page header and inference chart tooltip showed "—" for SGLang points. Now at trace_replay ingest, if any of the linked benchmark_results rows has a null server_gpu_cache_hit_rate and we have non-empty prefill/hits time-series in the computed chart_series, derive the lifetime cluster ratio as sum(hits.rate) / sum(prompt.rate) and write it into the row's metrics JSONB. Already-stored SGLang rows from runs 944/945 backfilled via a one-off UPDATE earlier in this session (8 rows, mostly ~87-89% hit rate, one high-conc outlier at 2.4%). 
Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 8cc03f2a..8d1e01b8 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -100,4 +100,23 @@ export async function insertTraceReplay( set trace_replay_id = ${traceReplayId} where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; + + // Derive a lifetime GPU cache hit rate from chart_series for any linked + // row whose harness JSON didn't set one (SGLang runs don't populate + // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has + // no usable prefill data — leaves the field null in that case, matching + // legacy "no trace_replay" behavior. + if (chartSeries && chartSeries.prefillTps.length > 0) { + const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); + const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + if (sumPrompts > 0) { + const rate = sumHits / sumPrompts; + await sql` + update benchmark_results + set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric)) + where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) + and (metrics->>'server_gpu_cache_hit_rate') is null + `; + } + } } From aa76e9eca423d3ab2c7079ff28d74b70adefae1c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 14:38:52 -0500 Subject: [PATCH 051/111] feat(chart-series): map sglang:realtime_tokens to promptTokensBySource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Cumulative prompt token source breakdown" chart was empty for SGLang runs because the vllm-specific vllm:prompt_tokens_by_source metric doesn't exist on SGLang. 
Maps sglang:realtime_tokens (which has mode={prefill_cache, prefill_compute, decode}) into the same source breakdown when no vllm series is present, filtered to prefill_* modes (decode tokens are output throughput, not prompt-token volume). CHART_SERIES_VERSION → 5. Backfilled 128 rows; SGLang rows from runs 944/946/947 now have prefill_cache + prefill_compute sources populated. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 31 ++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 86b79925..0807e238 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -31,8 +31,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * * v4: extract sglang:* metrics too (fallback chain in each picker), so * SGLang runs populate the chart_series the same way vllm runs do. + * + * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) + * into promptTokensBySource so the cumulative prompt-token-source-breakdown + * chart shows useful splits for SGLang runs (filtered to prefill_* modes). */ -export const CHART_SERIES_VERSION = 4; +export const CHART_SERIES_VERSION = 5; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -115,6 +119,7 @@ const CHART_METRIC_KEYS = new Set([ 'sglang:generation_tokens', 'sglang:num_running_reqs', 'sglang:num_queue_reqs', + 'sglang:realtime_tokens', ]); /** @@ -303,6 +308,12 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); // Per-source prompt tokens — sum across engines per source label. + // vllm: vllm:prompt_tokens_by_source has one series per source label + // (local_cache_hit, external_cache_hit, miss, ...). 
Use the + // `source`/`reason`/`kind` label as the breakdown key. + // sglang: sglang:realtime_tokens uses a `mode` label with values + // {prefill_cache, prefill_compute, decode}. Filter to prefill_* + // since decode isn't prompt-token volume. const promptBySrcByT = new Map>(); for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) { const labels = series.labels ?? {}; @@ -318,6 +329,24 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } } + // SGLang fallback: only consider when the vllm metric wasn't found. + if (promptBySrcByT.size === 0) { + for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const mode = labels['mode'] ?? 'unknown'; + if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens) + let byT = promptBySrcByT.get(mode); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(mode, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + } const promptTokensBySource: Record = {}; for (const [source, byT] of promptBySrcByT) { const arr: TimeSeriesPoint[] = []; From 5872a3d8d3c6f5e6feee879e2f8f6f5d0ddd04ac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 14:48:27 -0500 Subject: [PATCH 052/111] feat(chart-series): break out SGLang cache hits by cache_source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously SGLang detail pages showed two stacked-area layers in the prompt-token source breakdown: prefill_cache (everything that hit the cache) + prefill_compute (cache miss). The user wanted finer granularity — specifically a distinction between on-GPU HBM cache and CPU-offloaded (hicache) host cache. 
SGLang's sglang:cached_tokens metric carries a cache_source label that varies per cache tier: - "device" → on-GPU HBM cache hit - "host" → CPU-offload (hicache) cache hit - "total" → older sglang, single series with no tier breakdown Switches the cache-hit portion of the breakdown from the coarse `prefill_cache` mode label to per-cache_source series: - device → "cache hit (HBM)" - host → "cache hit (CPU offload)" - total → "cache hit" - other → "cache hit (${src})" Cache misses still come from realtime_tokens[mode=prefill_compute], relabeled "compute (miss)" for symmetry. Current data only contains device/total (no hicache runs ingested yet) — when hicache runs come in, the chart will automatically split cache hits into HBM + CPU-offload layers with no further code change. CHART_SERIES_VERSION → 6. Backfilled 128 rows. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 47 +++++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 0807e238..1996708f 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -35,8 +35,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) * into promptTokensBySource so the cumulative prompt-token-source-breakdown * chart shows useful splits for SGLang runs (filtered to prefill_* modes). + * + * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source + * breakdown from sglang:cached_tokens — current runs always have one + * cache_source ("device" / HBM) but hicache (CPU offload) runs would + * split into "device" + "host" automatically once ingested. */ -export const CHART_SERIES_VERSION = 5; +export const CHART_SERIES_VERSION = 6; export interface TimeSeriesPoint { /** Seconds from benchmark start.
*/ @@ -330,15 +335,49 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } // SGLang fallback: only consider when the vllm metric wasn't found. + // - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]` + // - Cache hits, split by tier: per-series `sglang:cached_tokens` where each + // series carries a `cache_source` label ("device" = HBM, "host" = CPU + // offload via hicache). Current runs have only `device`; when hicache + // runs land, additional series will appear and the chart will split. if (promptBySrcByT.size === 0) { for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { const labels = series.labels ?? {}; const mode = labels['mode'] ?? 'unknown'; - if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens) - let byT = promptBySrcByT.get(mode); + // Only carry the cache-miss line over — cache hits come from + // sglang:cached_tokens broken out by cache_source below, so we'd + // double-count if we kept `prefill_cache` here too. + if (mode !== 'prefill_compute') continue; + const label = 'compute (miss)'; + let byT = promptBySrcByT.get(label); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(label, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + // Cache hits broken out per cache_source. Strip the noisy "total" label + // (older sglang versions emit a single un-broken-out series labelled + // total — show that as just "cache hit"). + for (const series of metrics['sglang:cached_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const src = labels['cache_source'] ?? 'cache hit'; + const label = + src === 'device' + ? 'cache hit (HBM)' + : src === 'host' + ? 'cache hit (CPU offload)' + : src === 'total' + ? 
'cache hit' + : `cache hit (${src})`; + let byT = promptBySrcByT.get(label); if (!byT) { byT = new Map(); - promptBySrcByT.set(mode, byT); + promptBySrcByT.set(label, byT); } for (const ts of series.timeslices ?? []) { if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { From 94a3e8b1986e54165c062e2a14eda60d9e9dd146 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:01:24 -0500 Subject: [PATCH 053/111] feat(chart-series): host cache util line + fix SGLang stacked-area colors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for SGLang hicache rendering on the agentic detail page: 1. KV cache utilization chart was GPU-HBM-only. SGLang hicache runs also expose sglang:hicache_host_{used,total}_tokens — the CPU offload pool's tokens-in-use over its capacity. Extracted as a new `hostKvCacheUsage` time series; frontend overlays it as a second orange line on the existing chart when the row has hicache data. 2. The cumulative-prompt-token-source-breakdown chart rendered ALL three SGLang sources in the same color, because the colors dict only knew vllm-style names (local_compute, local_cache_hit, etc.). Added explicit colors for the SGLang label names ('cache hit (HBM)', 'cache hit (CPU offload)', 'cache hit', 'compute (miss)') plus a memoized fallback palette so any future unknown source name gets a distinct color rather than falling through to gray. CHART_SERIES_VERSION → 7. Backfilled 128 rows; hicache rows from workflow_run 947 (8 rows) now have ~1830 hostKvCacheUsage samples matching their HBM samples. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 16 ++++++++- .../agentic-point/time-series-chart.tsx | 30 ++++++++++++++-- .../src/hooks/api/use-trace-server-metrics.ts | 2 ++ packages/db/src/etl/compute-chart-series.ts | 36 ++++++++++++++++++- .../db/src/queries/trace-server-metrics.ts | 3 ++ 5 files changed, 83 insertions(+), 4 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 2db2809b..b047ea8f 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -236,16 +236,30 @@ export function AgenticPointDetail({ id }: Props) { render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; + // For SGLang hicache rows we have both GPU (HBM) util and + // host (CPU offload pool) util — overlay them as two lines. + const hasHost = metrics.hostKvCacheUsage.length > 0; return ( = { + // vLLM source names local_compute: '#f97316', local_cache_hit: '#3b82f6', external_kv_transfer: '#22c55e', miss: '#f97316', + // SGLang source names (set by compute-chart-series for sglang rows) + 'cache hit (HBM)': '#3b82f6', + 'cache hit (CPU offload)': '#22c55e', + 'cache hit': '#3b82f6', + 'compute (miss)': '#f97316', }; const labelFor: Record = { local_compute: 'Prefill', @@ -496,6 +502,26 @@ export function StackedAreaChart({ external_kv_transfer: 'Offload Cache Hit', miss: 'Miss', }; + // Fallback palette for any source name not in `colors` so we never + // emit two layers in the same shade. Cycles by insertion order. 
+ const fallbackPalette = [ + '#3b82f6', + '#f97316', + '#22c55e', + '#a855f7', + '#ef4444', + '#06b6d4', + '#f59e0b', + '#ec4899', + ]; + let fallbackIdx = 0; + const colorFor = (name: string): string => { + if (colors[name]) return colors[name]!; + const c = fallbackPalette[fallbackIdx % fallbackPalette.length]!; + fallbackIdx++; + colors[name] = c; // memoize so the SAME unknown name always gets the same color + return c; + }; if (!computed) { return ( @@ -522,7 +548,7 @@ export function StackedAreaChart({ .toReversed() .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`) .join(' ')} Z`; - const color = colors[name] ?? '#6b7280'; + const color = colorFor(name); for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; return { name, color, d }; }); @@ -540,7 +566,7 @@ export function StackedAreaChart({ } } const items: HoverItem[] = stackOrder.map((name) => ({ - color: colors[name] ?? '#6b7280', + color: colorFor(name), label: labelFor[name] ?? name, value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`, })); diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts index 664bc6c7..bac67a50 100644 --- a/packages/app/src/hooks/api/use-trace-server-metrics.ts +++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts @@ -44,6 +44,8 @@ export interface TraceServerMetrics { decodeTps: TimeSeriesPoint[]; /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. 
*/ + hostKvCacheUsage: TimeSeriesPoint[]; } async function fetchTraceServerMetrics( diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 1996708f..8105961e 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -40,8 +40,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * breakdown from sglang:cached_tokens — current runs always have one * cache_source ("device" / HBM) but hicache (CPU offload) runs would * split into "device" + "host" automatically once ingested. + * + * v7: extract sglang:hicache_host_{used,total}_tokens into a new + * hostKvCacheUsage series so the KV cache utilization chart can plot + * the CPU offload pool's usage alongside the on-GPU HBM line. */ -export const CHART_SERIES_VERSION = 6; +export const CHART_SERIES_VERSION = 7; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -79,6 +83,12 @@ export interface ChartSeries { * saved vs the raw queries that came in. */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** + * Host (CPU offload) KV cache utilization, 0..1. Only populated for + * SGLang hicache runs (derived as hicache_host_used / hicache_host_total). + * Frontend overlays this on the KV cache util chart as a second line. + */ + hostKvCacheUsage: TimeSeriesPoint[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -125,6 +135,8 @@ const CHART_METRIC_KEYS = new Set([ 'sglang:num_running_reqs', 'sglang:num_queue_reqs', 'sglang:realtime_tokens', + 'sglang:hicache_host_used_tokens', + 'sglang:hicache_host_total_tokens', ]); /** @@ -312,6 +324,27 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). 
const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + // SGLang hicache: host-pool KV cache utilization as used/total per + // timeslice. Both metrics are gauges in absolute tokens. Total stays + // constant (it's the pool size), used fluctuates. + const hostUsedByT = aggregateByStart( + metrics['sglang:hicache_host_used_tokens']?.series, + 'avg', + 'sum', + ); + const hostTotalByT = aggregateByStart( + metrics['sglang:hicache_host_total_tokens']?.series, + 'avg', + 'sum', + ); + const hostKvCacheUsage: TimeSeriesPoint[] = []; + for (const [t, used] of [...hostUsedByT.entries()].toSorted((a, b) => a[0] - b[0])) { + const total = hostTotalByT.get(t); + if (total !== undefined && total > 0) { + hostKvCacheUsage.push({ t: tOf(t), value: used / total }); + } + } + // Per-source prompt tokens — sum across engines per source label. // vllm: vllm:prompt_tokens_by_source has one series per source label // (local_cache_hit, external_cache_hit, miss, ...). Use the @@ -407,5 +440,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { prefillTps, decodeTps, prefixCacheHitsTps, + hostKvCacheUsage, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 76775e77..eccb0a0c 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -73,6 +73,8 @@ export interface TraceServerMetrics { decodeTps: TimeSeriesPoint[]; /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. 
*/ + hostKvCacheUsage: TimeSeriesPoint[]; } interface RawMetaRow extends PointMeta { @@ -118,6 +120,7 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { decodeTps: series.decodeTps, // v2 chart_series rows pre-backfill don't have this field — default to [] prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], + hostKvCacheUsage: series.hostKvCacheUsage ?? [], }; } From 93e197b7e54d140acfe65b61aeb4f5c48ca27091 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:19:20 -0500 Subject: [PATCH 054/111] fix(stacked-area): align sources by timestamp before computing shares MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cumulative-prompt-token-source-breakdown chart was showing huge "100% compute (miss)" plateaus around minute 20-24 of many SGLang runs. Root cause: the chart computed cumulative shares per ARRAY INDEX (not timestamp), but in SGLang's per-scrape metrics, cache hits and misses fire on different ticks — one scrape reports 193K hits + 0 miss, the next reports 0 hits + 8K miss. So each source has a different timestamp array. Indexing them in lockstep mixed values from different moments and made the share calculation flap to 100% one side or the other. Fix: union timestamps across all sources, then for each unique timestamp carry forward each source's cumulative sum (a source that didn't report at time t holds its previous cumulative value rather than appearing as 0). After fix: shares change smoothly over time as each source's cumulative sum grows; transient single-tick gaps no longer drive the visible share to either extreme. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/time-series-chart.tsx | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 15a15869..75d7bb1e 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -464,15 +464,36 @@ export function StackedAreaChart({ const computed = useMemo(() => { const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); if (entries.length === 0) return null; - const tValues = entries[0]![1].map((p) => p.t); + + // Different sources can land on different scrape timestamps + // (SGLang's hits/misses fire on alternating ticks), so we MUST + // align across all sources before computing shares — otherwise the + // share calculation indexes into each source's own time axis and + // mixes values from different moments. + // + // Approach: union all timestamps across sources, then for each + // unique timestamp carry forward the cumulative sum for every + // source (a source that didn't report at time t holds its previous + // cumulative value rather than dropping to 0). + const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted( + (a, b) => a - b, + ); + + // For each source, walk its (sorted) array and produce a parallel + // cumulative-sum array indexed against `tValues` via carry-forward. 
const cum: Record = {}; for (const [name, arr] of entries) { + const valByT = new Map(arr.map((p) => [p.t, p.value])); + const out: number[] = Array.from({ length: tValues.length }); let acc = 0; - cum[name] = arr.map((p) => { - acc += p.value; - return acc; - }); + for (let i = 0; i < tValues.length; i++) { + const v = valByT.get(tValues[i]!); + if (v !== undefined) acc += v; + out[i] = acc; + } + cum[name] = out; } + const shares: Record = {}; for (const name of Object.keys(cum)) shares[name] = []; for (let i = 0; i < tValues.length; i++) { From c14e19e277930495e4a43c3a6d6f42a611fec336 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:44:07 -0500 Subject: [PATCH 055/111] fix(ingest): split GPU vs CPU cache hit rate for SGLang hicache rows Previous inline derivation (commit 625d6e8) summed ALL cache hit sources into server_gpu_cache_hit_rate, which conflated GPU HBM hits with CPU offload hits on SGLang hicache rows. The harness JSON also never sets server_cpu_cache_hit_rate. Now derives both metrics from chart_series.promptTokensBySource: server_gpu_cache_hit_rate = sum(HBM + 'cache hit') / sum(prompts) server_cpu_cache_hit_rate = sum(CPU offload) / sum(prompts) or null (null when no CPU offload source exists) Falls back to prefixCacheHitsTps for vLLM rows where promptTokensBySource isn't broken out by cache source. Overwrites any pre-existing value so the derivation stays consistent with what the detail-page charts plot. 
Backfilled all existing rows via two-phase SQL update earlier in the session: - 8 hicache rows in workflow_run 947 now show GPU ~1-2% / CPU ~87-91% - Other SGLang rows show GPU ~87% / CPU null - vLLM rows restored to their original GPU hit rates Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 40 +++++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 8d1e01b8..43655d9a 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -101,21 +101,43 @@ export async function insertTraceReplay( where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; - // Derive a lifetime GPU cache hit rate from chart_series for any linked - // row whose harness JSON didn't set one (SGLang runs don't populate - // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has - // no usable prefill data — leaves the field null in that case, matching - // legacy "no trace_replay" behavior. + // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang + // runs don't populate these in the harness JSON; vLLM runs do but only + // for GPU. We always recompute to keep the derivation consistent with + // what the detail-page charts plot — overwriting any pre-existing value. + // + // For hicache (CPU offload) rows the chart_series.promptTokensBySource + // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)" + // sources, letting us split GPU vs CPU hit rate. Other rows just have + // a single cache-hit source (either "cache hit (HBM)" / "cache hit" + // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps + // sum which equals the single cache source's total). 
if (chartSeries && chartSeries.prefillTps.length > 0) { const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); - const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); if (sumPrompts > 0) { - const rate = sumHits / sumPrompts; + const sumOf = (name: string): number => + (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0); + const cpuHits = sumOf('cache hit (CPU offload)'); + const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit'); + // If the source breakdown has a HBM entry, use it (covers SGLang). + // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path). + const gpuHits = + hbmFromBreakdown > 0 + ? hbmFromBreakdown + : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + const gpuRate = gpuHits / sumPrompts; + const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null; await sql` update benchmark_results - set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric)) + set metrics = jsonb_set( + case when ${cpuRate}::numeric is not null + then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric)) + else metrics + end, + '{server_gpu_cache_hit_rate}', + to_jsonb(${gpuRate}::numeric) + ) where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) - and (metrics->>'server_gpu_cache_hit_rate') is null `; } } From 268617ccd85ccc8aea6ed12dd4bd61273c8a37c1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 10:40:04 -0500 Subject: [PATCH 056/111] fix(ingest): recognize vLLM LMCache external_kv_transfer as CPU hit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline cache-hit-rate derivation only handled SGLang's hicache label ('cache hit (CPU offload)'). vLLM with LMCache uses 'external_kv_transfer' in its prompt_tokens_by_source breakdown for the same concept (CPU offload pool serving tokens to GPU). 
Those vLLM rows had cpu rate null even when external_kv_transfer was the dominant source. Adds external_kv_transfer + local_cache_hit to the source name aliases: GPU hits = local_cache_hit + cache hit (HBM) + cache hit CPU hits = external_kv_transfer + cache hit (CPU offload) fallback = prefixCacheHitsTps total (for single-source rows) Backfilled 132 affected rows via SQL — vLLM LMCache rows now show CPU rate where present (e.g. dsv4 b300 conc=128 offload=on shows GPU ~1% + CPU ~87%, matching the actual cache topology). Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 23 ++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 43655d9a..cb022ca9 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -106,21 +106,24 @@ export async function insertTraceReplay( // for GPU. We always recompute to keep the derivation consistent with // what the detail-page charts plot — overwriting any pre-existing value. // - // For hicache (CPU offload) rows the chart_series.promptTokensBySource - // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)" - // sources, letting us split GPU vs CPU hit rate. Other rows just have - // a single cache-hit source (either "cache hit (HBM)" / "cache hit" - // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps - // sum which equals the single cache source's total). 
+ // Source label naming differs by framework / cache topology: + // SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)' + // SGLang older: 'cache hit' (no tier breakdown) + // vLLM LMCache: 'local_cache_hit' + 'external_kv_transfer' (+ 'local_compute' for miss) + // vLLM single: falls back to prefixCacheHitsTps total (= local cache only) if (chartSeries && chartSeries.prefillTps.length > 0) { const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); if (sumPrompts > 0) { const sumOf = (name: string): number => (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0); - const cpuHits = sumOf('cache hit (CPU offload)'); - const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit'); - // If the source breakdown has a HBM entry, use it (covers SGLang). - // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path). + // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer. + const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer'); + // GPU/HBM hits from source breakdown, summed across known aliases. + const hbmFromBreakdown = + sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit'); + // If the source breakdown has any GPU entry, use it. Otherwise fall back + // to total prefixCacheHitsTps sum (single-source vLLM path with no + // by_source metric — equals the lone cache counter's lifetime). const gpuHits = hbmFromBreakdown > 0 ? hbmFromBreakdown From 7fc6b4f7b5a49aa370d912d6df36b40d80b813a6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 13:02:34 -0500 Subject: [PATCH 057/111] fix(scatter): use lightweight presence endpoint for View charts button MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chart pre-fetched full trace_replay JSONL blobs for every visible agentic point just to decide whether to render the "View charts" button in pinned tooltips. 
With the latest run's 8x8 conc=512 rows pushing up to 13 MB compressed per blob, 12-id chunks blew past Neon's 64 MB per-HTTP-response cap and 500'd — hiding the button for every point. New /api/v1/trace-availability returns {id: true} for ids that have a stored blob; ScatterGraph uses that boolean instead. trace-histograms is still used by the detail page (single id, no chunking issue). Co-Authored-By: Claude Opus 4.7 --- .../app/api/v1/trace-availability/route.ts | 59 +++++++++++++++++++ .../components/inference/ui/ScatterGraph.tsx | 23 ++++---- .../inference/utils/tooltipUtils.ts | 15 ++--- .../src/hooks/api/use-trace-availability.ts | 29 +++++++++ packages/db/src/queries/trace-availability.ts | 34 +++++++++++ 5 files changed, 143 insertions(+), 17 deletions(-) create mode 100644 packages/app/src/app/api/v1/trace-availability/route.ts create mode 100644 packages/app/src/hooks/api/use-trace-availability.ts create mode 100644 packages/db/src/queries/trace-availability.ts diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts new file mode 100644 index 00000000..2484ceaf --- /dev/null +++ b/packages/app/src/app/api/v1/trace-availability/route.ts @@ -0,0 +1,59 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceAvailability, + type TraceAvailabilityMap, +} from '@semianalysisai/inferencex-db/queries/trace-availability'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedTraceAvailability = cachedQuery( + (ids: number[]): Promise => getTraceAvailability(getDb(), ids), + 'trace-availability', +); + +const MAX_IDS_PER_REQUEST = 500; + +/** + * GET /api/v1/trace-availability?ids=1,2,3 + * + * Returns `{[id]: true}` for ids that have a stored trace_replay blob. 
+ * Lightweight presence check used by the scatter tooltip to decide whether + * to render the "View charts" button — see queries/trace-availability.ts. + */ +export async function GET(request: NextRequest) { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > MAX_IDS_PER_REQUEST) { + return NextResponse.json( + { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` }, + { status: 400 }, + ); + } + + try { + const sorted = [...ids].toSorted((a, b) => a - b); + const availability = await getCachedTraceAvailability(sorted); + return cachedJson(availability); + } catch (error) { + console.error('Error fetching trace availability:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index fdcf8952..b93799db 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -6,7 +6,7 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react'; import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; -import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; +import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; import { useRouter } from 'next/navigation'; import ChartLegend from '@/components/ui/chart-legend'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; @@ -497,8 +497,11 @@ const ScatterGraph = React.memo( 
// All official points for rendering (unfiltered — visibility via opacity) const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]); - // Trace-replay histograms (ISL / OSL distributions) for agentic points. - // Pre-fetch the whole visible set so tooltip render stays synchronous. + // Bulk presence lookup for agentic points: which ids have a stored + // trace_replay blob → controls the "View charts" button in the pinned + // tooltip. We deliberately don't fetch the histograms themselves here; + // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through + // Neon's HTTP API and trip its 64 MB per-response cap. const agenticIds = useMemo(() => { const ids: number[] = []; for (const p of pointsData) { @@ -506,7 +509,7 @@ const ScatterGraph = React.memo( } return ids; }, [pointsData]); - const { data: traceHistograms } = useTraceHistograms(agenticIds); + const { data: traceAvailability } = useTraceAvailability(agenticIds); const router = useRouter(); // Gradient label data @@ -774,8 +777,7 @@ const ScatterGraph = React.memo( hardwareConfig, isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)), runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined, - traceHistogram: - typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined, + hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false, }), getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x), getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y), @@ -842,10 +844,11 @@ const ScatterGraph = React.memo( removeTrackedConfig, chartDefinition.chartType, selectedPrecisions, - // Tooltip content closure reads traceHistograms to decide whether to - // show the "View charts" button — rebuild config when the histogram - // fetch resolves so the button appears for points that have data. 
- traceHistograms, + // Tooltip content closure reads traceAvailability to decide whether + // to render the "View charts" button — rebuild config when the + // presence fetch resolves so the button appears for points that + // have a trace_replay blob. + traceAvailability, router, ], ); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index ccc371f9..ed68c41b 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -20,12 +20,13 @@ export interface TooltipConfig { /** URL to the GitHub Actions workflow run */ runUrl?: string; /** - * Per-request ISL/OSL arrays for agentic points, sourced from the stored - * aiperf `profile_export.jsonl`. Used to detect whether the point has any - * trace data (so the "View charts" button can appear); the actual - * distributions are rendered on the detail page, not inline. + * Whether this agentic point has a stored trace_replay blob. Controls + * visibility of the "View charts" button — the actual distributions are + * rendered on the detail page, not inline, so all the tooltip needs is a + * presence boolean (sourced from the bulk `/api/v1/trace-availability` + * call so we don't ship megabytes of profile JSONL just for this check). */ - traceHistogram?: { isl: number[]; osl: number[] } | undefined; + hasTrace?: boolean; } export interface OverlayTooltipConfig extends TooltipConfig { @@ -221,7 +222,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => { selectedYAxisMetric, hardwareConfig, runUrl, - traceHistogram, + hasTrace, } = config; return ` @@ -271,7 +272,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => { ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} - ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))} + ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))} ${ isPinned ? 
` + + + {sw.infoTooltip} + + + + )} ))} From de5e51a1330d7c24f51850e729a19a2d8802d990 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 14:50:42 -0500 Subject: [PATCH 063/111] fix(inference): don't scope chart to one run when runs cover different hardware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two workflow runs landing on the same date for the same model+precision but DIFFERENT hardware (e.g. a B300 dsv4 run and a B200 dsv4 run) each get their own changelog entry. The single-run scoping guard matched runs by model+precision only, so both counted as "runs with a changelog for this model", length>1 tripped, and selecting either run scoped the benchmarks query to that one workflow run — hiding the other GPU's curve entirely (carry-forward across hardware silently broke). Scope to a single run only when two runs contest the SAME full config_key (model-precision-hardware-framework) — a genuine same-day re-run of one hardware, where a DISTINCT ON merge could mix them. Complementary different-hardware runs now both render via the normal date carry-forward. Co-Authored-By: Claude Opus 4.7 --- .../components/inference/InferenceContext.tsx | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index c446dc71..244c713c 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -242,25 +242,42 @@ export function InferenceProvider({ const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING) .filter(([, model]) => model === selectedModel) .map(([prefix]) => prefix); - const runIdsWithModelChangelog: string[] = []; + // Map each FULL config_key (model-precision-hardware-framework) a run's + // changelog claims to the set of runs claiming it. 
Single-run scoping should + // only kick in when two runs contest the SAME full key — e.g. a same-day + // re-run of one hardware — because then a DISTINCT ON merge could mix them + // and the user needs to pick which run wins. Runs covering DIFFERENT hardware + // of the same model (e.g. a B300 run and a B200 run on the same date) are + // complementary: both must render via carry-forward. Matching on model+ + // precision alone (the old behavior) wrongly treated those as alternatives + // and scoped the chart to one run, hiding the other GPU's curve. + const runsByConfigKey = new Map>(); if (availableRuns) { for (const [runId, runInfo] of Object.entries(availableRuns)) { if (!runInfo.changelog) continue; - const matches = runInfo.changelog.entries.some((entry) => - entry.config_keys.some((key) => { + for (const entry of runInfo.changelog.entries) { + for (const key of entry.config_keys) { const parts = key.split('-'); - return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!); - }), - ); - if (matches) runIdsWithModelChangelog.push(runId); + if (modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) { + let runs = runsByConfigKey.get(key); + if (!runs) { + runs = new Set(); + runsByConfigKey.set(key, runs); + } + runs.add(runId); + } + } + } } } + // A run is "contested" only if some full config_key it claims is also claimed + // by another run. Only then does picking a run disambiguate anything. + const contestedRunIds = new Set(); + for (const runs of runsByConfigKey.values()) { + if (runs.size > 1) for (const r of runs) contestedRunIds.add(r); + } const benchmarkRunId = - selectedRunId && - runIdsWithModelChangelog.length > 1 && - runIdsWithModelChangelog.includes(selectedRunId) - ? String(selectedRunId) - : undefined; + selectedRunId && contestedRunIds.has(String(selectedRunId)) ? 
String(selectedRunId) : undefined; const { graphs, From af8766ddbe9a3077b9a226cd3487f4f4e040e58b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 11:24:29 -0500 Subject: [PATCH 064/111] fix(inference): carry forward un-contested configs when a run is selected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting a workflow run in the picker scoped the ENTIRE benchmarks query to that run, so any same-day config living in a different workflow run vanished — e.g. with two vLLM runs and one SGLang run on the same date, picking either vLLM run (contested, so scoping kicks in) hid the SGLang curve entirely, while picking the SGLang run (uncontested, no scoping) showed everything. Fetch both the normal latest-per-config rows and the run-scoped rows, and merge: the selected run wins for every (model, precision, hardware, framework, benchmark_type) group it actually produced — preserving the disambiguation that scoping exists for, including dropping base rows for concs the run didn't cover so DISTINCT-ON mixing can't sneak back — and every other config carries forward from the base rows. benchmark_type is part of the replacement key so an agentic-only run can't hide the same config's fixed-seq carry-forward. The base query is the default view query so it's effectively always cached; run selection adds no extra latency in practice. Verified live: Jun 10, DSv4 B300, run 3/3 (vLLM affinity run) now renders both b300_vllm (run-scoped) and b300_sglang (carried forward). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/inference/InferenceContext.tsx | 4 ++ .../inference/hooks/useChartData.ts | 41 ++++++++++--- .../app/src/lib/benchmark-transform.test.ts | 60 ++++++++++++++++++- packages/app/src/lib/benchmark-transform.ts | 29 +++++++++ 4 files changed, 125 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 3b994367..5d165e60 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -286,6 +286,10 @@ export function InferenceProvider({ } // A run is "contested" only if some full config_key it claims is also claimed // by another run. Only then does picking a run disambiguate anything. + // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the + // WHOLE chart to the run: only the configs the run actually produced are + // pinned to it, and every other config (e.g. another framework's same-day + // run) still carries forward from the normal latest-per-config rows. 
const contestedRunIds = new Set(); for (const runs of runsByConfigKey.values()) { if (runs.size > 1) for (const r of runs) contestedRunIds.add(r); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 019d0691..e76c3123 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -19,7 +19,11 @@ import { getModelSortIndex, hardwareKeyMatchesAnyBase, } from '@/lib/constants'; -import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; +import { + mergeRunScopedRows, + transformBenchmarkRows, + withPercentile, +} from '@/lib/benchmark-transform'; import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; import { @@ -183,19 +187,40 @@ export function useChartData( // When the selected date is the latest available, use '' (empty string) to match // the initial no-date query key, reusing the eagerly-fetched benchmarks from the // materialized view instead of firing a redundant second fetch with identical data. - // When a specific run is selected, we always go through the runId branch and the - // date is effectively ignored — keep queryDate set so React Query still has a - // distinct cache key per date if the user navigates back to "latest". const queryDate = selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate ? '' : selectedRunDate; + // Two queries: the normal latest-per-config view (always), plus the + // run-scoped rows when a specific workflow run is selected. The merged + // result pins ONLY the configs the selected run produced to that run, and + // carries every other config forward from the base rows — selecting one of + // two same-day vLLM runs must not hide the day's SGLang curve just because + // it lives in a different workflow run. 
The base query is the default view + // query, so it's almost always already in the React Query cache. + const { + data: baseRows, + isLoading: baseLoading, + error: baseError, + } = useBenchmarks(selectedModel, queryDate, enabled); const { - data: allRows, - isLoading: queryLoading, - error: queryError, - } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId); + data: runRows, + isLoading: runLoading, + error: runError, + } = useBenchmarks(selectedModel, queryDate, enabled && Boolean(selectedRunId), selectedRunId); + + const allRows = useMemo(() => { + if (!selectedRunId) return baseRows; + // Wait for the run rows before rendering a scoped view — rendering base + // rows first would flash the un-scoped chart, then swap contested points. + if (!runRows) return undefined; + if (!baseRows) return runRows; + return mergeRunScopedRows(runRows, baseRows); + }, [selectedRunId, runRows, baseRows]); + + const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading); + const queryError = baseError ?? (selectedRunId ? 
runError : null); // GPU comparison: fetch data for each additional comparison date const comparisonDates = useMemo( diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index 62cc1809..077e8c3e 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -2,7 +2,11 @@ import { describe, it, expect, vi } from 'vitest'; import type { BenchmarkRow } from '@/lib/api'; -import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform'; +import { + mergeRunScopedRows, + rowToAggDataEntry, + transformBenchmarkRows, +} from './benchmark-transform'; function makeRow(overrides: Partial = {}): BenchmarkRow { return { @@ -776,3 +780,57 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => { expect(point.decode_dp_attention).toBe(true); }); }); + +describe('mergeRunScopedRows', () => { + const vllmRun = (over: Partial = {}) => + makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over }); + const sglangBase = (over: Partial = {}) => + makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over }); + + it('pins configs the run covers to the run rows, replacing base rows', () => { + const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })]; + const baseRows = [vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })]; + const merged = mergeRunScopedRows(runRows, baseRows); + // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a + // partial-sweep run must fully own its config or the DISTINCT-ON mixing + // the scoping exists to prevent comes right back. 
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]); + }); + + it('carries forward configs the run does not cover (the same-day other-framework curve)', () => { + const runRows = [vllmRun({ id: 10 })]; + const baseRows = [ + vllmRun({ id: 90 }), + sglangBase({ id: 91 }), + sglangBase({ id: 92, conc: 128 }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]); + }); + + it('keeps base rows of other hardware / precision / model untouched', () => { + const runRows = [vllmRun({ id: 10 })]; + const baseRows = [ + vllmRun({ id: 90, hardware: 'b200' }), + vllmRun({ id: 91, precision: 'fp8' }), + vllmRun({ id: 92, model: 'kimik2.5' }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]); + }); + + it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => { + const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })]; + const baseRows = [ + vllmRun({ id: 90, benchmark_type: 'agentic_traces' }), + vllmRun({ id: 91, benchmark_type: 'single_turn' }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]); + }); + + it('returns base rows unchanged when the run produced nothing', () => { + const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })]; + expect(mergeRunScopedRows([], baseRows)).toBe(baseRows); + }); +}); diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 9f6b43d1..8329c84b 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -172,6 +172,35 @@ export function withPercentile(key: string, percentile: string): string { return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`); } +// 
Replacement granularity for single-run scoping: the changelog config_key +// tuple (model-precision-hardware-framework) plus benchmark_type, so an +// agentic-only run never hides the same config's fixed-seq carry-forward. +const runScopeKey = (r: BenchmarkRow): string => + `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`; + +/** + * Merge run-scoped benchmark rows with the normal latest-per-config rows. + * + * When the user picks a specific workflow run (to disambiguate two same-day + * sweeps of the same config), only the configs that run actually produced + * should be pinned to it — every other config must keep its normal + * carry-forward rows. Scoping the whole chart to the run (the old behavior) + * silently hid complementary configs that happened to land on the same date, + * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve + * vanish because it lived in a different workflow run. + * + * Run rows win for every (model, precision, hardware, framework, + * benchmark_type) group they cover; base rows fill in the rest. + */ +export function mergeRunScopedRows( + runRows: BenchmarkRow[], + baseRows: BenchmarkRow[], +): BenchmarkRow[] { + if (runRows.length === 0) return baseRows; + const claimed = new Set(runRows.map(runScopeKey)); + return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))]; +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). 
From d6d31436abf38eb32e6383ab692ff0b8519ca32c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:25:49 -0500 Subject: [PATCH 065/111] fix: reconcile agentic data after master merge --- .../component/inference-chart-controls.cy.tsx | 4 +- .../inference/hooks/useChartData.ts | 8 +- .../components/inference/ui/ChartDisplay.tsx | 481 +++++++++--------- .../components/inference/ui/ScatterGraph.tsx | 5 +- .../components/unofficial-run-provider.tsx | 10 +- packages/app/src/lib/api.ts | 15 +- packages/db/src/queries/benchmarks.ts | 21 +- 7 files changed, 282 insertions(+), 262 deletions(-) diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx index 03e6a50c..5a6311f4 100644 --- a/packages/app/cypress/component/inference-chart-controls.cy.tsx +++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx @@ -14,8 +14,8 @@ describe('Inference ChartControls', () => { it('renders the sequence selector with the current sequence', () => { // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K" - cy.get('#sequence-select').should('be.visible'); - cy.get('#sequence-select').should('contain.text', '8K / 1K'); + cy.get('#scenario-select').should('be.visible'); + cy.get('#scenario-select').should('contain.text', '8K / 1K'); }); it('renders the precision multi-select with the current precision', () => { diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 0d1eac64..ee5acb88 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -220,13 +220,7 @@ export function useChartData( data: runRows, isLoading: runLoading, error: runError, - } = useBenchmarks( - selectedModel, - '', - enabled && Boolean(selectedRunId), - selectedRunId, - true, - ); + } = useBenchmarks(selectedModel, '', enabled 
&& Boolean(selectedRunId), selectedRunId, true); const allRows = useMemo(() => { if (!selectedRunId) return baseRows; diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 3a431440..caf713cc 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -429,217 +429,206 @@ export default function ChartDisplay() { }); }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]); - const displayGraphs = isFirstLoad || isDerivedLoading - ? [ - - - - - , - ] - : renderableGraphs.length === 0 - ? [] - : renderableGraphs.map((graph, graphIndex) => { - const isTimelineMode = Boolean( - selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, - ); - const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; - return ( -
-
- handleViewModeChange(graphIndex, v)} - ariaLabel="View mode" - testId={`inference-view-toggle-${graphIndex}`} - /> - } - hideImageExport={getViewMode(graphIndex) === 'table'} - setIsLegendExpanded={setIsLegendExpanded} - exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} - onExportMp4={ - replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined - } - onExportCsv={() => { - const visibleData = graph.data.filter((d) => + const displayGraphs = + isFirstLoad || isDerivedLoading + ? [ + + + + + , + ] + : renderableGraphs.length === 0 + ? [] + : renderableGraphs.map((graph, graphIndex) => { + const isTimelineMode = Boolean( + selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, + ); + const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; + return ( +
+
+ - activeOverlayHwTypes.has(p.hwKey as string) && - selectedPrecisions.includes(p.precision), - ); - const issueNotes = matchKnownConfigIssues(graph.model, [ - ...visibleData, - ...visibleOverlayRows, - ]).map((issue) => - knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))), - ); - exportToCsv( - `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, - headers, - rows, - issueNotes, - ); - }} - /> - - {(() => { - const chartCaption = ( - <> -

- { - graph.chartDefinition[ - `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] - }{' '} - {(() => { - // For Input metrics with dynamic x-axis, use dynamic heading - const metricTitle = - (graph.chartDefinition[ - `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] as string) || ''; - const isInputMetric = metricTitle.toLowerCase().includes('input'); - if ( - graph.chartDefinition.chartType === 'interactivity' && - isInputMetric && - selectedXAxisMetric - ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; - } - } - - // The e2e chart heading follows the branch-level x-axis mode - // selector, including agentic-only derived metrics. - if (graph.chartDefinition.chartType === 'e2e') { - if (selectedXAxisMode === 'session-time') { - return 'vs. Mean Normalized Session Time'; - } - if (selectedXAxisMode === 'prefill-tps') { - return 'vs. P90 Prefill TPS / user'; - } - const isAgentic = sequenceKind(selectedSequence) === 'agentic'; - if (selectedE2eXAxisMetric?.endsWith('_ttft')) { - const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); - const word = - percentile === 'median' ? 'Median' : percentile.toUpperCase(); - return `vs. ${word} Time To First Token`; - } - return isAgentic - ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency` - : 'vs. End-to-end Latency'; - } - - // Fall back to configured heading - return ( - graph.chartDefinition[ - `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition - ] || graph.chartDefinition.heading - ); - })()} -

-

- {getModelLabel(graph.model as Model)} •{' '} - {selectedPrecisions - .map((prec) => getPrecisionLabel(prec as Precision)) - .join(', ')}{' '} - • {getSequenceLabel(graph.sequence as Sequence)} •{' '} - {isUnofficialRun - ? 'Source: UNOFFICIAL' - : 'Source: SemiAnalysis InferenceX™'} - {selectedRunDate && ( - <> - {' '} - • Updated:{' '} - {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( - 'en-US', - { - year: 'numeric', - month: '2-digit', - day: '2-digit', - timeZone: 'UTC', - }, - )} - - )} -

- - - - ); - - if (getViewMode(graphIndex) === 'table') { + ? 'gpu_timeseries' + : graph.chartDefinition.chartType === 'e2e' + ? 'latency' + : 'interactivity' + } + leadingControls={ + handleViewModeChange(graphIndex, v)} + ariaLabel="View mode" + testId={`inference-view-toggle-${graphIndex}`} + /> + } + hideImageExport={getViewMode(graphIndex) === 'table'} + setIsLegendExpanded={setIsLegendExpanded} + exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} + onExportMp4={ + replayAvailable + ? () => replayHandlesRef.current[graphIndex]?.open() + : undefined + } + onExportCsv={() => { + const visibleData = graph.data.filter((d) => + isTimelineMode + ? activeDates.has(`${d.date}_${d.hwKey}`) + : activeHwTypes.has(d.hwKey as string) && + selectedPrecisions.includes(d.precision), + ); + const { headers, rows } = inferenceChartToCsv( + visibleData, + graph.model, + graph.sequence, + ); + // Match warnings against the same series the chart annotates, + // including visible unofficial-run overlay series. const overlay = graph.chartDefinition.chartType === 'e2e' ? overlayDataByChartType.e2e : overlayDataByChartType.interactivity; - const overlayRows = (overlay?.data ?? []).filter((p) => - selectedPrecisions.includes(p.precision), + const visibleOverlayRows = isTimelineMode + ? [] + : (overlay?.data ?? []).filter( + (p) => + activeOverlayHwTypes.has(p.hwKey as string) && + selectedPrecisions.includes(p.precision), + ); + const issueNotes = matchKnownConfigIssues(graph.model, [ + ...visibleData, + ...visibleOverlayRows, + ]).map((issue) => + knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))), ); - return ( + exportToCsv( + `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, + headers, + rows, + issueNotes, + ); + }} + /> + + {(() => { + const chartCaption = ( <> - {chartCaption} - 0 ? 
[...graph.data, ...overlayRows] : graph.data - } - chartDefinition={graph.chartDefinition} - selectedYAxisMetric={selectedYAxisMetric} - /> +

+ { + graph.chartDefinition[ + `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition + ] + }{' '} + {(() => { + // For Input metrics with dynamic x-axis, use dynamic heading + const metricTitle = + (graph.chartDefinition[ + `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition + ] as string) || ''; + const isInputMetric = metricTitle.toLowerCase().includes('input'); + if ( + graph.chartDefinition.chartType === 'interactivity' && + isInputMetric && + selectedXAxisMetric + ) { + if (selectedXAxisMetric === 'p99_ttft') { + return 'vs. P99 Time To First Token'; + } else if (selectedXAxisMetric === 'median_ttft') { + return 'vs. Median Time To First Token'; + } + } + + // The e2e chart heading follows the branch-level x-axis mode + // selector, including agentic-only derived metrics. + if (graph.chartDefinition.chartType === 'e2e') { + if (selectedXAxisMode === 'session-time') { + return 'vs. Mean Normalized Session Time'; + } + if (selectedXAxisMode === 'prefill-tps') { + return 'vs. P90 Prefill TPS / user'; + } + const isAgentic = sequenceKind(selectedSequence) === 'agentic'; + if (selectedE2eXAxisMetric?.endsWith('_ttft')) { + const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); + const word = + percentile === 'median' ? 'Median' : percentile.toUpperCase(); + return `vs. ${word} Time To First Token`; + } + return isAgentic + ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency` + : 'vs. End-to-end Latency'; + } + + // Fall back to configured heading + return ( + graph.chartDefinition[ + `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition + ] || graph.chartDefinition.heading + ); + })()} +

+

+ {getModelLabel(graph.model as Model)} •{' '} + {selectedPrecisions + .map((prec) => getPrecisionLabel(prec as Precision)) + .join(', ')}{' '} + • {getSequenceLabel(graph.sequence as Sequence)} •{' '} + {isUnofficialRun + ? 'Source: UNOFFICIAL' + : 'Source: SemiAnalysis InferenceX™'} + {selectedRunDate && ( + <> + {' '} + • Updated:{' '} + {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( + 'en-US', + { + year: 'numeric', + month: '2-digit', + day: '2-digit', + timeZone: 'UTC', + }, + )} + + )} +

+ + ); - } - return selectedGPUs.length > 0 && - ((selectedDateRange.startDate && selectedDateRange.endDate) || - selectedDates.length > 0) ? ( - - ) : ( -
- + selectedPrecisions.includes(p.precision), + ); + return ( + <> + {chartCaption} + 0 + ? [...graph.data, ...overlayRows] + : graph.data + } + chartDefinition={graph.chartDefinition} + selectedYAxisMetric={selectedYAxisMetric} + /> + + ); + } + + return selectedGPUs.length > 0 && + ((selectedDateRange.startDate && selectedDateRange.endDate) || + selectedDates.length > 0) ? ( + - {selectedGPUs.length > 0 && - (!selectedDateRange.startDate || !selectedDateRange.endDate) && - selectedDates.length === 0 && ( -
-

- Select a date range or add a run to view GPU comparison -

-
- )} -
- ); - })()} - {replayAvailable && ( - { - replayHandlesRef.current[graphIndex] = handle; - }} - parentChartId={`chart-${graphIndex}`} - chartDefinition={graph.chartDefinition} - yLabel={`${ - graph.chartDefinition[ - `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition - ] - }`} - xLabel={graph.chartDefinition.x_label} - /> - )} -
-
-
- ); - }); + ) : ( +
+ + {selectedGPUs.length > 0 && + (!selectedDateRange.startDate || !selectedDateRange.endDate) && + selectedDates.length === 0 && ( +
+

+ Select a date range or add a run to view GPU comparison +

+
+ )} +
+ ); + })()} + {replayAvailable && ( + { + replayHandlesRef.current[graphIndex] = handle; + }} + parentChartId={`chart-${graphIndex}`} + chartDefinition={graph.chartDefinition} + yLabel={`${ + graph.chartDefinition[ + `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition + ] + }`} + xLabel={graph.chartDefinition.x_label} + /> + )} + +
+
+ ); + }); return (
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 982c24d2..e1cad1a4 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -7,7 +7,6 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react'; import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; -import { useRouter } from 'next/navigation'; import { pointNearestX } from '@/components/inference/ui/line-label-anchor'; import ChartLegend from '@/components/ui/chart-legend'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; @@ -582,7 +581,6 @@ const ScatterGraph = React.memo( return ids; }, [pointsData]); const { data: traceAvailability } = useTraceAvailability(agenticIds); - const router = useRouter(); // Gradient label data const allPointLabelsByKey = useMemo(() => { @@ -902,7 +900,7 @@ const ScatterGraph = React.memo( }); chartRef.current?.dismissTooltip(); chartRef.current?.hideTooltip(); - router.push(`/inference/agentic/${pointId}`); + window.location.assign(`/inference/agentic/${pointId}`); }); } }, @@ -923,7 +921,6 @@ const ScatterGraph = React.memo( // presence fetch resolves so the button appears for points that // have a trace_replay blob. traceAvailability, - router, ], ); diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index b8e76f38..54b470ff 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -279,11 +279,11 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { // Filter chart data by stamped `run_url`. 
A row belongs to the dismissed // run if its URL matches exactly OR the numeric id parses to the same. const belongsToDismissed = (rowUrl?: string | null) => { - if (!rowUrl) return false; - if (rowUrl === target.url) return true; - const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u); - return m?.groups?.runId === runId; - }; + if (!rowUrl) return false; + if (rowUrl === target.url) return true; + const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u); + return m?.groups?.runId === runId; + }; // Compute the filtered chart data BEFORE any setState so we can pass the // same value to setUnofficialChartData and parseAvailableModelsAndSequences. diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 0dac5883..a9d66715 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types'; import type { SubmissionsResponse } from './submissions-types'; export interface BenchmarkRow { + /** Stable per-point id from benchmark_results; used for agentic detail lookups. */ + id: number; hardware: string; framework: string; model: string; @@ -25,9 +27,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows.
*/ + offload_mode: string; image: string | null; metrics: Record; /** @@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 49c60604..6833756a 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -11,6 +11,8 @@ import type { WorkerPower } from '../etl/benchmark-mapper.js'; export type BenchmarkWorkerRow = WorkerPower; export interface BenchmarkRow { + /** Stable benchmark_results id used for agentic detail lookups. */ + id: number; hardware: string; framework: string; model: string; @@ -28,9 +30,11 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + isl: number | null; + osl: number | null; conc: number; + offload_mode: string; image: string | null; metrics: Record; /** @@ -95,6 +99,7 @@ export async function getLatestBenchmarks( : sql``; const rows = await sql` SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + br.id, c.hardware, c.framework, c.model, @@ -112,6 +117,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -136,6 +143,7 @@ export async function getLatestBenchmarks( // No date filter: use materialized view for instant lookups const rows = await sql` SELECT + lb.id, c.hardware, c.framework, c.model, @@ -153,6 +161,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, 
lb.conc, @@ -185,6 +195,7 @@ export async function getBenchmarksForRun( const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; const rows = await sql` SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + br.id, c.hardware, c.framework, c.model, @@ -202,6 +213,8 @@ export async function getBenchmarksForRun( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -235,6 +248,7 @@ export async function getAllBenchmarksForHistory( const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; const rows = await sql` SELECT + br.id, c.hardware, c.framework, c.model, @@ -252,9 +266,12 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, + br.image, br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics, br.workers, br.date::text, From f60ef9c7f18a1782edd5542510328b242048a2de Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:34:00 -0500 Subject: [PATCH 066/111] fix(gpu-compare): show concurrency (C=) over points GPU compare mode (GPUGraph) labeled points with only the parallelism/tp string, dropping the C= suffix that the single-run scatter chart (ScatterGraph) shows. Append it so compare-mode points are annotated the same way. Verified live in compare mode: points now read e.g. 'DEP8 / C=2048', 'TP4 / C=64'. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/app/src/components/inference/ui/GPUGraph.tsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index e7737a2e..24b1266f 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -759,7 +759,11 @@ const GPUGraph = React.memo( config: { getColor, hideLabels: hidePointLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + // Match ScatterGraph: append the concurrency (C=) to the + // parallelism/tp label so compare-mode points are annotated the + // same way as the single-run scatter chart. + getLabelText: (d) => + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { series: (d) => `${d.date}_${d.hwKey}`, From 22028ccfe3141aa632b4c23aaca26b9c4bd51b58 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:43:42 -0500 Subject: [PATCH 067/111] fix(agentic-timeline): hide no-op phase toggle; fixed-height scroll window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes to the conversation/request-timeline view: 1. The Profiling vs 'All (incl. warmup)' toggle never did anything — aiperf's profile_export only contains profiling-phase requests, so every stored record has phase='profiling' (verified: 297k/297k rows). Hide the toggle unless a non-profiling request actually exists, so it reappears and works only if warmup is ever exported. 2. The timeline grew to fit every conversation/worker, making the card arbitrarily tall. Cap the body at a fixed height (480px) and scroll the rows vertically inside it. Few-row runs still size to content (no empty space); the label column and bars scroll together since they share the one scroll container. 
Verified live on a 3475-request point: phase toggle absent, row-mode toggle still present, window clientHeight 480 with ~3745px scrolling inside. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agentic-point/request-timeline.tsx | 474 +++++++++--------- 1 file changed, 249 insertions(+), 225 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 3c032fdd..2313775e 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -30,6 +30,11 @@ const PHASE_OPTIONS: SegmentedToggleOption[] = [ { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' }, ]; +// The timeline body is capped at this height and scrolls internally, so a run +// with many conversations/workers doesn't make the card grow unbounded and push +// the rest of the detail page down. Sized to show ~16 rows + the header. +const TIMELINE_BODY_MAX_HEIGHT = 480; + /** A stable color palette indexed by row-key hash. */ const ROW_COLORS = [ '#3b82f6', @@ -393,11 +398,24 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { }, []); const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); - // Apply phase filter, then group into rows. + // The phase toggle only means something when warmup requests are actually + // present. aiperf's profile_export only contains profiling-phase requests, so + // in practice every record is `profiling` and the toggle is a no-op — hide it + // unless a non-profiling request exists (keeps it working if warmup is ever + // exported). + const hasWarmup = useMemo( + () => data.requests.some((r) => r.phase !== 'profiling'), + [data.requests], + ); + + // Apply phase filter, then group into rows. 
With no warmup data the filter + // collapses to "profiling" regardless of the (hidden) toggle state. const filtered = useMemo( () => - phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'), - [data.requests, phaseFilter], + phaseFilter === 'all' && hasWarmup + ? data.requests + : data.requests.filter((r) => r.phase === 'profiling'), + [data.requests, phaseFilter, hasWarmup], ); const rows = useMemo( () => buildRows(filtered, rowMode, expandedSubagents), @@ -581,14 +599,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { testId="timeline-row-mode" buttonClassName="px-2.5 py-1 text-xs" /> - + {hasWarmup && ( + + )} {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '} {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '} @@ -606,243 +626,247 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {/* Chart container */}
-
- {/* Label column — sticky, doesn't scroll horizontally with the chart. */} -
+ {/* Fixed-height window: the rows scroll vertically inside it instead of + the card growing to fit every conversation/worker. */} +
+
+ {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
- - {rowMode === 'conversation' ? 'Conversation' : 'Worker'} - -
- {rows.map((row) => { - const isSubagentRow = row.kind === 'subagent'; - const isStreamRow = row.kind === 'stream'; - const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; - const isExpanded = isExpandable && expandedSubagents.has(row.key); - return ( -
- {isExpandable ? ( - - ) : ( - - )} - - + + {rowMode === 'conversation' ? 'Conversation' : 'Worker'} + +
+ {rows.map((row) => { + const isSubagentRow = row.kind === 'subagent'; + const isStreamRow = row.kind === 'stream'; + const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; + const isExpanded = isExpandable && expandedSubagents.has(row.key); + return ( +
- {row.label} - {isExpandable && ( - ×{row.streamCount} + {isExpandable ? ( + + ) : ( + )} - - - {row.requests.length > 0 ? row.requests.length : '—'} - -
- ); - })} -
- - {/* Scrollable SVG */} -
- - {/* Header / time-axis baseline */} - - - {/* Time axis ticks */} - {ticks.map((t) => { - // Convert visible-window ns offset → x px (the tick array - // is already in dataStart-relative coords). - const x = (t - vStart) * scale; - return ( - - - - {formatTickLabel(t)} - - + {row.label} + {isExpandable && ( + ×{row.streamCount} + )} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
); })} +
- {/* Row separators */} - {rows.map((row, idx) => ( + {/* Scrollable SVG */} +
+ + {/* Header / time-axis baseline */} - ))} - - {/* Request bars */} - {rows.map((row, rowIdx) => { - const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; - const barH = ROW_HEIGHT - 4; - // For multi-stream subagent containers, suppress the union - // bars when expanded — the child stream rows draw them - // individually instead, so we'd double-draw otherwise. - if ( - row.kind === 'subagent' && - (row.streamCount ?? 1) > 1 && - expandedSubagents.has(row.key) - ) { - return null; - } - return row.requests.map((req) => { - const xCredit = xOf(req.credit); - const xStart = xOf(req.start); - const xEnd = xOf(req.end); - // Cull bars entirely outside the visible window so big - // benchmarks don't render thousands of zero-width rects. - if (xEnd < -2 || xCredit > chartWidth + 2) return null; - const runW = Math.max(xEnd - xStart, 1); - const queueW = Math.max(xStart - xCredit, 0); - const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; + + {/* Time axis ticks */} + {ticks.map((t) => { + // Convert visible-window ns offset → x px (the tick array + // is already in dataStart-relative coords). + const x = (t - vStart) * scale; return ( - setTooltip({ x: e.clientX, y: e.clientY, row, req })} - onMouseLeave={() => setTooltip(null)} - > - {/* Queue lead-in (faint) — only drawn when noticeable. */} - {queueW >= 1 && ( + + + + {formatTickLabel(t)} + + + ); + })} + + {/* Row separators */} + {rows.map((row, idx) => ( + + ))} + + {/* Request bars */} + {rows.map((row, rowIdx) => { + const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; + const barH = ROW_HEIGHT - 4; + // For multi-stream subagent containers, suppress the union + // bars when expanded — the child stream rows draw them + // individually instead, so we'd double-draw otherwise. + if ( + row.kind === 'subagent' && + (row.streamCount ?? 
1) > 1 && + expandedSubagents.has(row.key) + ) { + return null; + } + return row.requests.map((req) => { + const xCredit = xOf(req.credit); + const xStart = xOf(req.start); + const xEnd = xOf(req.end); + // Cull bars entirely outside the visible window so big + // benchmarks don't render thousands of zero-width rects. + if (xEnd < -2 || xCredit > chartWidth + 2) return null; + const runW = Math.max(xEnd - xStart, 1); + const queueW = Math.max(xStart - xCredit, 0); + const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; + return ( + setTooltip({ x: e.clientX, y: e.clientY, row, req })} + onMouseLeave={() => setTooltip(null)} + > + {/* Queue lead-in (faint) — only drawn when noticeable. */} + {queueW >= 1 && ( + + )} + {/* Main bar — opacity stepped down with depth so + parent > subagent > stream reads visually. */} - )} - {/* Main bar — opacity stepped down with depth so - parent > subagent > stream reads visually. */} - - {/* Phase strip at bottom */} - - {/* Cancelled X overlay */} - {req.cancelled && runW > 6 && ( - - )} - - ); - }); - })} - - {/* Cursor crosshair — drawn on top of bars so it stays visible + {/* Cancelled X overlay */} + {req.cancelled && runW > 6 && ( + + )} + + ); + }); + })} + + {/* Cursor crosshair — drawn on top of bars so it stays visible through dense rows. Stats popover is rendered as fixed HTML below the SVG block. */} - {cursor && ( - - )} - + {cursor && ( + + )} + +
From 28d25a53b7e3543a3d91e9c19f05b2409c20c032 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:50:26 -0500 Subject: [PATCH 068/111] feat(agentic-timeline): sticky bottom h-scroll + double-click to reset zoom The fixed-height window put the chart's horizontal scrollbar at the bottom of the tall (full-height) content, below the fold and unreachable. Make the window itself the single scroll container (overflow-auto, both axes) and pin the label column with position:sticky left-0, so the horizontal scrollbar stays at the window's bottom edge while the label column stays put during horizontal scroll and scrolls with the rows vertically. Also add double-click anywhere on the timeline to reset zoom/pan (same resetZoom the existing button calls) and note it in the hint text. Verified live: window scrollW 1280 > clientW 879 (h-scroll present and working), label column sticky, rows scroll vertically. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agentic-point/request-timeline.tsx | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 2313775e..7c5fdab0 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -626,13 +626,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {/* Chart container */}
- {/* Fixed-height window: the rows scroll vertically inside it instead of - the card growing to fit every conversation/worker. */} -
-
- {/* Label column — sticky, doesn't scroll horizontally with the chart. */} + {/* Fixed-height window: rows scroll vertically and the chart scrolls + horizontally inside it, so the card doesn't grow to fit every + conversation/worker AND the horizontal scrollbar stays pinned to the + window's bottom edge (rather than the bottom of the tall content). */} +
+
+ {/* Label column — pinned left (sticky) so it stays put during + horizontal scroll, while scrolling vertically with the rows. */}
- {/* Scrollable SVG */} -
+ {/* Chart column — horizontal scrolling is handled by the window + container above so its scrollbar stays pinned to the window's + bottom edge; double-click anywhere resets the zoom. */} +
{/* Header / time-axis baseline */} warmup - scroll to zoom · drag to pan + + scroll to zoom · drag to pan · double-click to reset +
{/* Cursor stats popover: count of in-flight / waiting at the cursor's From 6e56bbfb2a29c6ffad2e4d4484bfcb6673fdacfd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 09:29:18 -0500 Subject: [PATCH 069/111] fix(gpu-compare): show CPU-offload halo on points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dashed offload-mode ring (drawn in ScatterGraph's onRender for every point with offload_mode='on') was missing from GPU compare mode (GPUGraph), so the CPU-offloading indicator never appeared there. Mirror it in GPUGraph's onRender — same dashed var(--foreground) ring at POINT_SIZE+4, appended inside each .dot-group so it travels with the point on zoom/pan. Verified live in compare mode (DSv4 B200/B300 agentic): offload points now render the dashed halo (5 rings, r=7.5, dash 3 2). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/components/inference/ui/GPUGraph.tsx | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index 24b1266f..19ba574f 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -26,6 +26,7 @@ import { formatLargeNumber, getShapeKeyForPrecision, logTickFormat, + POINT_SIZE, } from '@/lib/chart-rendering'; import { paretoFrontLowerLeft, @@ -827,6 +828,28 @@ const GPUGraph = React.memo( } // Set foreground color on scatter point labels ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)'); + + // Offload halo: dashed ring on every point that used KV offload + // (mirrors ScatterGraph so compare mode shows the same CPU-offload + // indicator). The ring is a child of the dot-group, so it travels + // with the point on zoom/pan without a separate onZoom pass. 
+ ctx.layout.zoomGroup + .selectAll('.dot-group') + .each(function (d) { + const showHalo = d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); }} legendElement={ Date: Thu, 18 Jun 2026 12:56:08 -0500 Subject: [PATCH 070/111] fix(high-contrast): use full hue wheel for single-vendor comparisons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit generateHighContrastColors clamps each vendor's series into its brand hue zone (NVIDIA=green, AMD=red) at <=PREFERRED_MAX items. The point of that clamp is to keep DIFFERENT vendors apart at a glance — but when only one vendor is present (the common all-NVIDIA agentic comparison: B200/B300 x vLLM/SGLang), there's no rival to separate from, so every series collapses into the same narrow green band and high-contrast mode looks like it does nothing. When a single vendor is present, skip the brand zone and rival-ban and use the full hue wheel for maximum separation. Verified on an all-NVIDIA agentic view: HC now spreads pink/blue/gold/green (hues 45/99/227/330, min adjacent gap 54deg) instead of four near-identical greens. Multi-vendor behavior is unchanged — vendors keep their brand zones so they stay distinguishable. The non-HC palette still carries vendor identity. Updated the single-vendor color tests to assert separability across the full wheel rather than brand-zone confinement. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/app/src/lib/chart-utils.test.ts | 39 ++++++++++-------------- packages/app/src/lib/chart-utils.ts | 19 ++++++++++-- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts index 061037ed..f6828ce2 100644 --- a/packages/app/src/lib/chart-utils.test.ts +++ b/packages/app/src/lib/chart-utils.test.ts @@ -353,30 +353,29 @@ describe('generateHighContrastColors', () => { expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(',')); }); - // ---------- Tier 1: few items → brand zone ---------- - - it('3 NVIDIA GPUs are not red', () => { + // ---------- Single vendor: full wheel for maximum contrast ---------- + // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the + // vendors stay visually separable). With a single vendor there's no rival to + // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed + // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case + // where every series otherwise collapsed into the green brand band). 
+ + it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => { const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark'); - for (const color of Object.values(result)) { - expect(isNotReddish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(3); assertMinDist(result, 30); }); - it('2 AMD GPUs are not green', () => { + it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => { const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark'); - for (const color of Object.values(result)) { - expect(isNotGreenish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(2); assertMinDist(result, 30); }); - it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => { + it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => { const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm']; const result = generateHighContrastColors(keys, 'dark'); - for (const color of Object.values(result)) { - expect(isNotReddish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(4); assertMinDist(result, 25); }); @@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => { assertMinDist(result, 25); }); - // ---------- Tier 2: moderate items → full wheel minus rival color ---------- + // ---------- Single vendor, many items → full wheel, best spacing ---------- - it('10 NVIDIA GPUs: no red hues, still distinguishable', () => { + it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => { const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200']; const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]); const result = generateHighContrastColors(keys, 'dark'); - // Should not be reddish (banned) - for (const color of Object.values(result)) { - const rgb = parseRgb(color); - // Not red-dominant with low green — i.e. 
not in the red/pink zone - const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150; - expect(isRedPink).toBe(false); - } + expect(Object.keys(result)).toHaveLength(10); assertMinDist(result, 20); }); diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts index 33a5b4e3..3eeda15b 100644 --- a/packages/app/src/lib/chart-utils.ts +++ b/packages/app/src/lib/chart-utils.ts @@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map(); /** * Generates high-contrast colors using iwanthue (k-means in CIELab space). * - * Tiered strategy per vendor: + * Tiered strategy per vendor (only when >1 vendor is present): * ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red) * ≤ BAN_MAX → full wheel minus rival's brand color * > BAN_MAX → full wheel, no restrictions, best spacing wins + * + * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 × + * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a + * glance, but with one vendor there's no rival — clamping every series into the + * same narrow hue band just collapses the contrast HC is supposed to maximize. + * So skip both restrictions and use the full wheel, giving the series the widest + * possible separation. */ export const generateHighContrastColors = ( keys: string[], @@ -91,6 +98,12 @@ export const generateHighContrastColors = ( list.push(key); } + // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a + // single vendor present there's nothing to separate from, so those + // restrictions only shrink the usable hue range and kill contrast — open the + // full wheel instead (the common all-NVIDIA agentic comparison case). + const multiVendor = groups.size > 1; + for (const [vendor, vendorKeys] of groups) { const count = vendorKeys.length; const isBanned = BANNED_HUE_TEST[vendor] ?? 
null; @@ -99,8 +112,8 @@ export const generateHighContrastColors = ( // Tier 1: few items → brand zone only // Tier 2: moderate → full wheel minus rival color // Tier 3: many → full wheel, no restrictions - const usePreferred = preferred && count <= PREFERRED_MAX; - const useBan = !usePreferred && isBanned && count <= BAN_MAX; + const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX; + const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX; // Everything iwanthue's output depends on (the ban filter and preferred // zone are functions of vendor; the seed is vendor+theme). From 6275aa70bf0162cd83762ff79a2e0a5c053270e2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 10:17:42 -0500 Subject: [PATCH 071/111] feat(inference): default line labels off, parallelism labels + high contrast on Change the inference chart's default toggle states: - Line Labels: on -> off (i_linelabel=1 overrides on) - Parallelism Labels: off -> on, which also defaults point labels on since parallelism labels ARE point labels (i_advlabel=0 overrides off) - High Contrast: off -> on, via a new opt-in defaultHighContrast on useChartUIState so reliability/evaluation (r_/e_ prefixes) stay off; i_hc=0 overrides off. Historical trends shares the inference context so it inherits the high-contrast default too. URL serialization flipped to omit each param at its new default and only write the override value, so share links stay clean. Updated line-labels, gradient-labels, and url-params E2E specs to the new defaults. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../app/cypress/e2e/gradient-labels.cy.ts | 16 +++++----- packages/app/cypress/e2e/line-labels.cy.ts | 31 ++++++++++++------- packages/app/cypress/e2e/url-params.cy.ts | 14 +++++++-- .../components/inference/InferenceContext.tsx | 25 ++++++++------- packages/app/src/hooks/useChartContext.ts | 12 +++++-- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts index 333baa6d..a0753e90 100644 --- a/packages/app/cypress/e2e/gradient-labels.cy.ts +++ b/packages/app/cypress/e2e/gradient-labels.cy.ts @@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => { cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels'); }); - it('Parallelism Labels toggle is off by default', () => { - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked'); + it('Parallelism Labels toggle is on by default', () => { + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); }); it('per-point labels are visible by default (gradient labels off)', () => { @@ -60,21 +60,19 @@ describe('Gradient Labels Toggle', () => { }); it('both toggles can be enabled simultaneously', () => { - // Turn on Gradient Labels (off by default) + // Parallelism Labels is on by default; ensure it's on, then turn on Gradient. 
+ cy.get('#scatter-parallelism-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('#scatter-gradient-labels').click(); cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); - // Turn on Parallelism Labels - cy.get('#scatter-parallelism-labels').click(); - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Both should be checked cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Reset for next tests + // Reset gradient for next tests (parallelism stays at its default-on). cy.get('#scatter-gradient-labels').click(); - cy.get('#scatter-parallelism-labels').click(); }); it('URL param i_gradlabel=1 enables gradient labels on load', () => { diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts index 84e655f8..23b372df 100644 --- a/packages/app/cypress/e2e/line-labels.cy.ts +++ b/packages/app/cypress/e2e/line-labels.cy.ts @@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => { cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels'); }); - it('Line Labels toggle is on by default', () => { - cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); - - // Line labels render without any interaction - cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); - }); - - it('toggling Line Labels off then back on removes and restores label elements', () => { - // On by default — turn it off first. 
- cy.get('#scatter-line-labels').click(); + it('Line Labels toggle is off by default', () => { cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + + // No line labels render without interaction cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); + }); - // Turn it back on — labels return. + it('toggling Line Labels on then back off adds and removes label elements', () => { + // Off by default — turn it on first. cy.get('#scatter-line-labels').click(); cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); + + // Turn it back off — labels disappear. + cy.get('#scatter-line-labels').click(); + cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); }); it('line labels have colored background rects and text', () => { + // Off by default — ensure on (idempotent; prior test left them off). + cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); // Each line label group should contain a background rect and text cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should( 'have.length.greaterThan', @@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => { }); it('line labels render in the foreground, after the scatter points', () => { - // Labels were toggled on in the test above and remain on here. + // Off by default — ensure on (idempotent; previous test leaves them on). 
+ cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); cy.get('[data-testid="scatter-graph"] svg').then(($svg) => { diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 33282b9c..3c480686 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -236,9 +236,15 @@ describe('URL Parameter Persistence', () => { }); describe('High contrast mode', () => { - it('page loads without high contrast by default', () => { + it('inference loads with high contrast on by default', () => { visitWithDismissedModal('/inference'); cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked'); + }); + + it('i_hc=0 disables high contrast on load', () => { + visitWithDismissedModal('/inference?i_hc=0'); + cy.get('[data-testid="scatter-graph"]').should('exist'); cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); }); @@ -267,10 +273,12 @@ describe('URL Parameter Persistence', () => { cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); - it('historical trends tab has high contrast switch off by default', () => { + it('historical trends tab shares the inference high-contrast default (on)', () => { + // Historical reads highContrast from the same InferenceContext as the + // scatter chart, so it inherits the default-on behavior. 
visitWithDismissedModal('/historical'); cy.get('[data-testid="historical-trends-display"]').should('exist'); - cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); + cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); it('i_hc=1 enables historical trends high contrast', () => { diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index d66febd0..c2c599ff 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -195,6 +195,8 @@ export function InferenceProvider({ ); const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({ urlPrefix: 'i_', + // Inference chart defaults to high contrast (?i_hc=0 overrides off). + defaultHighContrast: true, }); const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0'); @@ -202,21 +204,22 @@ export function InferenceProvider({ // Legacy `?i_nolabel=1` from before the rename: keep hiding point labels // explicitly so the share link's intent survives future default changes. if (getUrlParam('i_nolabel') === '1') return false; + if (getUrlParam('i_label') === '0') return false; if (getUrlParam('i_label') === '1') return true; - // Old share links set `?i_advlabel=1` while keeping the labels default - // (shown). Mirror the toggle's auto-enable side-effect on load so those - // links still render advanced labels under the new default-off behavior. - if (getUrlParam('i_advlabel') === '1') return true; - return false; + // Default on: parallelism labels (also default on) are point labels and + // are pointless without them shown. + return true; }); const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1'); + // Parallelism labels default on (?i_advlabel=0 overrides off). 
const [useAdvancedLabels, setUseAdvancedLabels] = useState( - () => getUrlParam('i_advlabel') === '1', + () => getUrlParam('i_advlabel') !== '0', ); const [showGradientLabels, setShowGradientLabels] = useState( () => getUrlParam('i_gradlabel') === '1', ); - const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0'); + // Line labels default off (?i_linelabel=1 overrides on). + const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1'); const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1'); const [showMinecraftOverlay, setShowMinecraftOverlay] = useState( () => getUrlParam('i_mc') === '1', @@ -983,17 +986,17 @@ export function InferenceProvider({ i_dstart: selectedDateRange.startDate, i_dend: selectedDateRange.endDate, i_optimal: hideNonOptimal ? '' : '0', - i_label: showPointLabels ? '1' : '', - i_hc: highContrast ? '1' : '', + i_label: showPointLabels ? '' : '0', + i_hc: highContrast ? '' : '0', i_log: logScale ? '1' : '', i_xmetric: selectedXAxisMetric || '', i_e2e_xmetric: selectedE2eXAxisMetric || '', i_xmode: selectedXAxisMode, i_scale: scaleType, i_legend: isLegendExpanded ? '' : '0', - i_advlabel: useAdvancedLabels ? '1' : '', + i_advlabel: useAdvancedLabels ? '' : '0', i_gradlabel: showGradientLabels ? '1' : '', - i_linelabel: showLineLabels ? '' : '0', + i_linelabel: showLineLabels ? '1' : '', i_speed: showSpeedOverlay ? '1' : '', i_mc: showMinecraftOverlay ? 
'1' : '', i_active: iActiveStr, diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts index 49812c3e..be095430 100644 --- a/packages/app/src/hooks/useChartContext.ts +++ b/packages/app/src/hooks/useChartContext.ts @@ -37,6 +37,12 @@ export function reconcileActiveSet( interface UseChartStateConfig { /** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */ urlPrefix: string; + /** + * Initial high-contrast value when the URL has no `hc` param. + * Defaults to false; the inference chart opts in to true. A `hc=0` + * URL param overrides it back off. + */ + defaultHighContrast?: boolean; } /** @@ -44,7 +50,7 @@ interface UseChartStateConfig { * Includes mobile-specific legend collapse behavior. */ export function useChartUIState(config: UseChartStateConfig) { - const { urlPrefix } = config; + const { urlPrefix, defaultHighContrast = false } = config; const { getUrlParam } = useUrlState(); const hcParam = `${urlPrefix}hc` as any; @@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) { // Initialize with safe defaults that match SSR output to avoid hydration mismatches. // URL-param values are applied in a mount effect so the state is only set client-side. - const [highContrast, setHighContrast] = useState(false); + const [highContrast, setHighContrast] = useState(defaultHighContrast); const [isLegendExpanded, setIsLegendExpanded] = useState(true); const didInit = useRef(false); @@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) { if (didInit.current) return; didInit.current = true; const hcVal = getUrlParam(hcParam); + // Respect both overrides so the toggle round-trips regardless of the default. 
if (hcVal === '1') setHighContrast(true); + else if (hcVal === '0') setHighContrast(false); const legendVal = getUrlParam(legendParam); if (legendVal === '0') setIsLegendExpanded(false); }, [getUrlParam, hcParam, legendParam]); From 5c290a49f50d7a0834a544d3e837bc1d1ccad5de Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 14:30:44 -0500 Subject: [PATCH 072/111] feat(agentic): use the chart's TP/EP/DEP/TEP parallelism labels on sibling chips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agentic detail page's sibling navigator labeled configs with an ad-hoc `TP{n}EP{n}` / `{p}P+{d}D` scheme that ignored dp-attention and the TEP/DEP collapse, so a DEP4 config read as plain TP4EP4 (and, mid-deploy before the API carried dp_attention, as TEP4). Extract the scatter chart's labeler into a shared parallelism-label module (configSegmentLabel + parallelismLabel) and route both getPointLabel and the sibling chipLabel through it, so the two surfaces describe a config identically (TP/EP/TEP/DEP/DPA…, multinode-disagg worker segments). Carry the fields the labeler needs through the siblings query/API/hook: decode/prefill dp_attention + num_workers + is_multinode. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../inference/agentic-point/sibling-nav.tsx | 20 ++++- .../inference/utils/parallelism-label.test.ts | 58 ++++++++++++++ .../inference/utils/parallelism-label.ts | 79 +++++++++++++++++++ .../inference/utils/tooltipUtils.ts | 69 ++++++---------- .../src/hooks/api/use-benchmark-siblings.ts | 5 ++ packages/db/src/queries/benchmark-siblings.ts | 20 ++++- 6 files changed, 202 insertions(+), 49 deletions(-) create mode 100644 packages/app/src/components/inference/utils/parallelism-label.test.ts create mode 100644 packages/app/src/components/inference/utils/parallelism-label.ts diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx index aa727fdc..f92d6b63 100644 --- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx +++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx @@ -4,6 +4,7 @@ import { useRouter } from 'next/navigation'; import { ChevronLeft, ChevronRight } from 'lucide-react'; import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings'; +import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; const HW_LABELS: Record = { b200: 'B200', @@ -49,9 +50,22 @@ function frameworkLabel(fw: string) { /** Short label for a sibling chip: parallelism + concurrency. */ export function chipLabel(s: BenchmarkSibling): string { - const parallel = s.disagg - ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D` - : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`; + // Same parallelism labeler the chart points use (TP/EP/TEP/DEP/DPA…). 
+ const parallel = parallelismLabel({ + tp: s.decode_tp, + ep: s.decode_ep, + dpAttention: s.decode_dp_attention, + disagg: s.disagg, + isMultinode: s.is_multinode, + prefillTp: s.prefill_tp, + prefillEp: s.prefill_ep, + prefillDpAttention: s.prefill_dp_attention, + prefillNumWorkers: s.prefill_num_workers, + decodeTp: s.decode_tp, + decodeEp: s.decode_ep, + decodeDpAttention: s.decode_dp_attention, + decodeNumWorkers: s.decode_num_workers, + }); const offload = s.offload_mode === 'on' ? ' • off=ON' : ''; return `${parallel} • c=${s.conc}${offload}`; } diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts new file mode 100644 index 00000000..aaf715d3 --- /dev/null +++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; + +import { configSegmentLabel, parallelismLabel } from './parallelism-label'; + +describe('configSegmentLabel', () => { + it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => { + expect(configSegmentLabel(8, 8, false)).toBe('TEP8'); + expect(configSegmentLabel(8, 8, true)).toBe('DEP8'); + }); + + it('uses EP / DPAEP when ep>1 and tp!==ep', () => { + expect(configSegmentLabel(4, 16, false)).toBe('EP16'); + expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16'); + }); + + it('uses TP / DPATP when ep<=1 or absent', () => { + expect(configSegmentLabel(8, 1, false)).toBe('TP8'); + expect(configSegmentLabel(8, undefined, false)).toBe('TP8'); + expect(configSegmentLabel(8, 1, true)).toBe('DPATP8'); + }); +}); + +describe('parallelismLabel', () => { + it('falls back to bare tp when no ep data', () => { + expect(parallelismLabel({ tp: 8 })).toBe('8'); + }); + + it('labels a single-segment config', () => { + expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8'); + expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8'); 
+ }); + + it('builds multinode-disagg per-role worker segments', () => { + expect( + parallelismLabel({ + tp: 8, + ep: 4, + disagg: true, + isMultinode: true, + prefillTp: 4, + prefillEp: 4, + prefillDpAttention: false, + prefillNumWorkers: 2, + decodeTp: 8, + decodeEp: 8, + decodeDpAttention: true, + decodeNumWorkers: 1, + }), + ).toBe('2xTEP4+1xDEP8'); + }); + + it('single-node disagg uses the single (decode) segment, not worker syntax', () => { + // is_multinode false → no "NxPrefill+MxDecode" expansion. + expect( + parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }), + ).toBe('TEP8'); + }); +}); diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts new file mode 100644 index 00000000..98207110 --- /dev/null +++ b/packages/app/src/components/inference/utils/parallelism-label.ts @@ -0,0 +1,79 @@ +/** + * Shared parallelism-config labeling — the single source of truth for the + * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels. + * + * Used by the scatter/GPU chart point labels (via getPointLabel) and the + * agentic detail page's sibling navigator chips, so both surfaces describe a + * config identically. + */ + +/** + * Generates a short config segment label from parallelism params. + * - tp == ep and dp-attn false: "TEP{N}" + * - tp == ep and dp-attn true: "DEP{N}" + * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}" + * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}" + */ +export const configSegmentLabel = ( + tp: number, + ep: number | undefined, + dpAttention: boolean | undefined, +): string => { + if (ep !== null && ep !== undefined && ep > 1 && tp === ep) { + return dpAttention ? `DEP${tp}` : `TEP${tp}`; + } + const dpaPrefix = dpAttention ? 
'DPA' : ''; + if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`; + return `${dpaPrefix}EP${ep}`; +}; + +/** Parallelism params for one benchmark config, framework-agnostic. */ +export interface ParallelismFields { + tp: number; + ep?: number; + dpAttention?: boolean; + disagg?: boolean; + isMultinode?: boolean; + prefillTp?: number; + prefillEp?: number; + prefillDpAttention?: boolean; + prefillNumWorkers?: number; + decodeTp?: number; + decodeEp?: number; + decodeDpAttention?: boolean; + decodeNumWorkers?: number; +} + +/** + * Returns the short parallelism label for a config. + * - No EP data (old rows): falls back to the bare tp value (e.g. "8"). + * - Multinode disagg: per-role segments with worker counts, + * e.g. "2xEP4+1xDPAEP32". + * - Otherwise: a single segment from (tp, ep, dpAttention). + */ +export const parallelismLabel = (f: ParallelismFields): string => { + if ( + (f.ep === null || f.ep === undefined) && + (f.prefillEp === null || f.prefillEp === undefined) + ) { + return String(f.tp); + } + + if (f.isMultinode && f.disagg) { + const prefillLabel = configSegmentLabel( + f.prefillTp ?? f.tp, + f.prefillEp ?? f.ep, + f.prefillDpAttention ?? f.dpAttention, + ); + const decodeLabel = configSegmentLabel( + f.decodeTp ?? f.tp, + f.decodeEp ?? f.ep, + f.decodeDpAttention ?? f.dpAttention, + ); + const pw = f.prefillNumWorkers ?? 1; + const dw = f.decodeNumWorkers ?? 
1; + return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`; + } + + return configSegmentLabel(f.tp, f.ep, f.dpAttention); +}; diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index 14d3b553..ea039336 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -1,6 +1,7 @@ import { formatNumber, getDisplayLabel } from '@/lib/utils'; import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types'; +import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; export interface TooltipConfig { /** The data point to display */ @@ -34,57 +35,37 @@ export interface OverlayTooltipConfig extends TooltipConfig { overlayData: OverlayData; } -/** - * Generates a short config segment label from parallelism params. - * - tp == ep and dp-attn false: "TEP{N}" - * - tp == ep and dp-attn true: "DEP{N}" - * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}" - * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}" - */ -const configSegmentLabel = ( - tp: number, - ep: number | undefined, - dpAttention: boolean | undefined, -): string => { - if (ep !== null && ep !== undefined && ep > 1 && tp === ep) { - return dpAttention ? `DEP${tp}` : `TEP${tp}`; - } - const dpaPrefix = dpAttention ? 'DPA' : ''; - if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`; - return `${dpaPrefix}EP${ep}`; -}; +// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the +// transform narrows "true"/"false" → boolean). Coerce to a plain boolean for +// the shared labeler, treating the legacy string form correctly. +const asBool = (v: boolean | string | undefined): boolean | undefined => + typeof v === 'string' ? v === 'true' : v; /** * Returns the short label for a data point on the chart. * - Non-multinode: e.g. 
"TP8", "EP8", "TEP8", "DEP8", "DPAEP8" * - Multinode disagg: e.g. "2xEP4+1xDPAEP32" * - Old data (no ep field): falls back to tp value + * + * Delegates to the shared {@link parallelismLabel} so the chart points and the + * agentic sibling navigator describe a config identically. */ -export const getPointLabel = (d: InferenceData): string => { - if ( - (d.ep === null || d.ep === undefined) && - (d.prefill_ep === null || d.prefill_ep === undefined) - ) - return String(d.tp); - - if (d.is_multinode && d.disagg) { - const prefillLabel = configSegmentLabel( - d.prefill_tp ?? d.tp, - d.prefill_ep ?? d.ep, - d.prefill_dp_attention ?? d.dp_attention, - ); - const decodeLabel = configSegmentLabel( - d.decode_tp ?? d.tp, - d.decode_ep ?? d.ep, - d.decode_dp_attention ?? d.dp_attention, - ); - const pw = d.prefill_num_workers ?? 1; - const dw = d.decode_num_workers ?? 1; - return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`; - } - - return configSegmentLabel(d.tp, d.ep, d.dp_attention); -}; +export const getPointLabel = (d: InferenceData): string => + parallelismLabel({ + tp: d.tp, + ep: d.ep, + dpAttention: asBool(d.dp_attention), + disagg: d.disagg, + isMultinode: d.is_multinode, + prefillTp: d.prefill_tp, + prefillEp: d.prefill_ep, + prefillDpAttention: asBool(d.prefill_dp_attention), + prefillNumWorkers: d.prefill_num_workers, + decodeTp: d.decode_tp, + decodeEp: d.decode_ep, + decodeDpAttention: asBool(d.decode_dp_attention), + decodeNumWorkers: d.decode_num_workers, + }); const runLinkHTML = (runUrl?: string) => runUrl diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts index 1ea90c0d..e6bc4906 100644 --- a/packages/app/src/hooks/api/use-benchmark-siblings.ts +++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts @@ -6,11 +6,16 @@ export interface BenchmarkSibling { offload_mode: string | null; decode_tp: number; decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; 
prefill_tp: number; prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; disagg: boolean; + is_multinode: boolean; is_current: boolean; has_trace: boolean; } diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts index 245a1170..241a48ba 100644 --- a/packages/db/src/queries/benchmark-siblings.ts +++ b/packages/db/src/queries/benchmark-siblings.ts @@ -14,11 +14,16 @@ export interface BenchmarkSibling { offload_mode: string | null; decode_tp: number; decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; prefill_tp: number; prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; disagg: boolean; + is_multinode: boolean; /** True if this row IS the point passed in. */ is_current: boolean; /** Whether the row has a stored trace_replay blob (for navigation hint). 
*/ @@ -74,8 +79,9 @@ export async function getBenchmarkSiblings( const rows = (await sql` select br.id, br.conc, br.offload_mode, - c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep, - c.num_prefill_gpu, c.num_decode_gpu, c.disagg, + c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers, + c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers, + c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode, (br.trace_replay_id is not null) as has_trace from benchmark_results br join configs c on c.id = br.config_id @@ -93,11 +99,16 @@ export async function getBenchmarkSiblings( offload_mode: string | null; decode_tp: number; decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; prefill_tp: number; prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; disagg: boolean; + is_multinode: boolean; has_trace: boolean; }[]; @@ -107,11 +118,16 @@ export async function getBenchmarkSiblings( offload_mode: r.offload_mode, decode_tp: r.decode_tp, decode_ep: r.decode_ep, + decode_dp_attention: r.decode_dp_attention, + decode_num_workers: r.decode_num_workers, prefill_tp: r.prefill_tp, prefill_ep: r.prefill_ep, + prefill_dp_attention: r.prefill_dp_attention, + prefill_num_workers: r.prefill_num_workers, num_prefill_gpu: r.num_prefill_gpu, num_decode_gpu: r.num_decode_gpu, disagg: r.disagg, + is_multinode: r.is_multinode, is_current: Number(r.id) === benchmarkResultId, has_trace: r.has_trace, })); From 32adf6bec66f41ffe2cfa4f08251afcb333c007d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 14:53:17 -0500 Subject: [PATCH 073/111] feat(agentic): sort dropdown for the sibling point navigator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 'Sort by' dropdown to the agentic detail page's point navigator: - Default (DB order) - Concurrency ↑ - Parallelism (groups all TP, then 
TEP/DEP/EP… by ep→tp→dpa, conc within) - Throughput/GPU ↓ - Total requests ↓ Carry tput_per_gpu and total_requests (total_requests_completed, falling back to legacy num_requests_total) through the siblings query/API/hook. prev/next follow the sorted order, and the chosen sort is persisted in the URL (?sort=) — read on mount and threaded through every point link plus a router.replace — so navigating to another point no longer resets it. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../inference/agentic-point/sibling-nav.tsx | 131 ++++++++++++++++-- .../src/hooks/api/use-benchmark-siblings.ts | 2 + packages/db/src/queries/benchmark-siblings.ts | 16 +++ 3 files changed, 141 insertions(+), 8 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx index f92d6b63..a1a5d1ab 100644 --- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx +++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx @@ -1,10 +1,19 @@ 'use client'; +import { useMemo, useState } from 'react'; import { useRouter } from 'next/navigation'; import { ChevronLeft, ChevronRight } from 'lucide-react'; import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings'; import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { track } from '@/lib/analytics'; const HW_LABELS: Record = { b200: 'B200', @@ -70,12 +79,83 @@ export function chipLabel(s: BenchmarkSibling): string { return `${parallel} • c=${s.conc}${offload}`; } +type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests'; + +const SORT_OPTIONS: { value: SortMode; label: string }[] = [ + { value: 'default', label: 'Default' }, + { value: 'conc', label: 'Concurrency ↑' }, + { value: 'parallelism', label: 'Parallelism' }, 
+ { value: 'tput', label: 'Throughput/GPU ↓' }, + { value: 'requests', label: 'Total requests ↓' }, +]; + +// Group key for the "parallelism" sort: ep first (so TP/EP1 sorts ahead of +// EP/TEP/DEP groups), then tp, then dp-attention, then disagg — every config +// of one parallelism lands together, ordered by concurrency within. +const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => [ + s.decode_ep ?? 0, + s.decode_tp ?? 0, + s.decode_dp_attention ? 1 : 0, + s.disagg ? 1 : 0, +]; + +function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] { + if (mode === 'default') return siblings; + const out = [...siblings]; + if (mode === 'conc') { + out.sort((a, b) => a.conc - b.conc); + } else if (mode === 'tput') { + // Highest throughput/GPU first; rows missing the metric sink to the end. + out.sort((a, b) => (b.tput_per_gpu ?? -Infinity) - (a.tput_per_gpu ?? -Infinity)); + } else if (mode === 'requests') { + // Most total requests first; rows missing the metric sink to the end. + out.sort((a, b) => (b.total_requests ?? -Infinity) - (a.total_requests ?? -Infinity)); + } else { + out.sort((a, b) => { + const ra = parallelRank(a); + const rb = parallelRank(b); + for (let i = 0; i < ra.length; i++) { + if (ra[i] !== rb[i]) return ra[i] - rb[i]; + } + // Within a parallelism group: offload off before on, then concurrency. + const oa = a.offload_mode === 'on' ? 1 : 0; + const ob = b.offload_mode === 'on' ? 1 : 0; + return oa - ob || a.conc - b.conc; + }); + } + return out; +} + +const isSortMode = (v: string | null): v is SortMode => + v !== null && SORT_OPTIONS.some((o) => o.value === v); + export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) { const router = useRouter(); - const currentIdx = siblings.findIndex((s) => s.is_current); - const prev = currentIdx > 0 ? siblings[currentIdx - 1] : null; - const next = - currentIdx !== -1 && currentIdx < siblings.length - 1 ? 
siblings[currentIdx + 1] : null; + // Persist the sort in the URL so clicking a point (which remounts this + // component on the new route) keeps the chosen order instead of resetting. + // Read it once from the URL on mount — this component only renders after the + // client-side siblings query resolves, so `window` is always available here + // (no SSR/hydration mismatch). Matches the app's window-based url-state read. + const [sortMode, setSortMode] = useState(() => { + if (typeof window === 'undefined') return 'default'; + const v = new URLSearchParams(window.location.search).get('sort'); + return isSortMode(v) ? v : 'default'; + }); + + const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]); + + // prev/next follow the displayed (sorted) order so navigation matches the row. + const currentIdx = sorted.findIndex((s) => s.is_current); + const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null; + const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null; + + // Carry the active sort through every point-to-point link. + const hrefFor = (id: number) => + sortMode === 'default' + ? `/inference/agentic/${id}` + : `/inference/agentic/${id}?sort=${sortMode}`; + + const currentId = siblings.find((s) => s.is_current)?.id; const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`; @@ -88,23 +168,58 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
+
+ Sort by + +
- {siblings.map((s) => { + {sorted.map((s) => { const active = s.is_current; return ( + + Page {page + 1} of {pageCount} + + +
+ )} + +
+ ); +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx new file mode 100644 index 00000000..5fcc0dfe --- /dev/null +++ b/packages/app/src/components/datasets/dataset-list.tsx @@ -0,0 +1,85 @@ +'use client'; + +import Link from 'next/link'; + +import { Card } from '@/components/ui/card'; +import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; + +function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + return String(Math.round(n)); +} + +function DatasetCard({ d }: { d: DatasetRecord }) { + const s = d.summary ?? {}; + const cachedPct = typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—'; + return ( + track('datasets_card_clicked', { slug: d.slug })} + className="block transition-colors hover:[&_*]:border-primary/40" + > + +
+

{d.label}

+ + {d.variant} + +
+ {d.description && ( +

{d.description}

+ )} +
+ + + + + + +
+
View dataset →
+
+ + ); +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} + +export function DatasetList() { + const { data, isLoading, isError } = useDatasets(); + + if (isLoading) { + return
Loading datasets…
; + } + if (isError || !data) { + return ( +
Failed to load datasets.
+ ); + } + if (data.length === 0) { + return ( +
+ No datasets ingested yet. +
+ ); + } + + return ( +
+ {data.map((d) => ( + + ))} +
+ ); +} diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx new file mode 100644 index 00000000..7abc367f --- /dev/null +++ b/packages/app/src/components/datasets/distribution-card.tsx @@ -0,0 +1,220 @@ +'use client'; + +import { useMemo } from 'react'; + +import { Card } from '@/components/ui/card'; +import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover'; +import type { Distribution } from '@/hooks/api/use-datasets'; + +/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ +function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + if (abs > 0 && abs < 1) return n.toFixed(2); + return String(Math.round(n)); +} + +interface DistributionCardProps { + title: string; + subtitle?: string; + unit: string; + distribution?: Distribution; + scale?: 'log' | 'linear'; + /** Format the x value (defaults to compact). e.g. percent for cached fraction. */ + formatValue?: (v: number) => string; +} + +const W = 720; +const H = 240; +const PAD = { top: 12, right: 16, bottom: 48, left: 52 }; + +/** + * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a + * themeable bar chart with median/p90 guide lines and a hover tooltip. Bars are + * drawn at equal visual width; for log-scaled bins the edge labels are already + * log-spaced so the shape reads as a log histogram. + */ +export function DistributionCard({ + title, + subtitle, + unit, + distribution, + scale = 'linear', + formatValue = compact, +}: DistributionCardProps) { + const computed = useMemo(() => { + const bins = distribution?.bins ?? 
[]; + if (bins.length === 0) return null; + const maxCount = Math.max(1, ...bins.map((b) => b.count)); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const n = bins.length; + const barW = innerW / n; + // Map a data value to an x pixel by locating its bin (positional — works for + // both linear and log bins since the edges are precomputed at ingest). + const valueToX = (v: number): number | null => { + for (let i = 0; i < n; i++) { + if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) { + return PAD.left + (i + 0.5) * barW; + } + } + if (v <= bins[0].x0) return PAD.left + 0.5 * barW; + return PAD.left + (n - 0.5) * barW; + }; + return { bins, maxCount, innerW, innerH, n, barW, valueToX }; + }, [distribution]); + + if (!computed) { + return ( + +
{title}
+
+ No data +
+
+ ); + } + + const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed; + const stats = distribution?.stats; + + const guides = stats + ? ([ + { label: 'median', value: stats.median, color: '#3b82f6' }, + { label: 'p90', value: stats.p90, color: '#f59e0b' }, + ] as const) + : []; + + // X tick labels from a few bin edges. + const tickIdxs = [0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1]; + + const resolve = (fraction: number) => { + const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n))); + const b = bins[i]; + const items: HoverItem[] = [ + { + color: 'currentColor', + label: 'Range', + value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`, + }, + { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() }, + ]; + return { items }; + }; + + return ( + +
+ {title} + {scale === 'log' && ( + + log scale + + )} +
+ {subtitle &&
{subtitle}
} + {stats && ( +
+ n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '} + {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit} +
+ )} +
+ + {/* bars */} + {bins.map((b, i) => { + const h = (b.count / maxCount) * innerH; + const x = PAD.left + i * barW; + const y = PAD.top + (innerH - h); + return ( + + ); + })} + + {/* guide lines */} + {guides.map((g) => { + const x = valueToX(g.value); + if (x === null) return null; + return ( + + ); + })} + + {/* x axis */} + + {tickIdxs.map((i, k) => { + const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle'; + const x = PAD.left + (i + 0.5) * barW; + return ( + + {formatValue(bins[i].x0)} + + ); + })} + + {unit} + + + {/* guide legend */} + {guides.map((g, i) => ( + + + + {g.label} {formatValue(g.value)} + + + ))} + +
+
+ ); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx new file mode 100644 index 00000000..12588582 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -0,0 +1,273 @@ +'use client'; + +import { useCallback, useMemo, useState } from 'react'; + +import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; + +/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ +function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + return String(Math.round(n)); +} + +// Stacked-bar segment colors. Cached prefix vs uncached input vs output — +// fixed hues (theme-independent) so the meaning is stable in light/dark. +const SEG = { + cached: '#10b981', // emerald-500 — input served from prefix cache + uncached: '#f59e0b', // amber-500 — input that must be (re)computed + output: '#8b5cf6', // violet-500 — generated tokens +} as const; + +const LEGEND = [ + { key: 'cached', label: 'Cached prefix', color: SEG.cached }, + { key: 'uncached', label: 'Uncached input', color: SEG.uncached }, + { key: 'output', label: 'Output', color: SEG.output }, +] as const; + +interface VisibleRow { + key: string; + label: string; + sublabel?: string; + cached: number; + uncached: number; + output: number; + total: number; + indent: number; + isGroup: boolean; + isExpanded: boolean; + groupIndex?: number; +} + +interface TooltipState { + x: number; + y: number; + row: VisibleRow; +} + +/** + * Per-conversation flamegraph driven by the precomputed `structure` JSONB. + * One row per turn; subagent groups render a collapsible header with indented + * children (collapsed by default). Each bar stacks cached-prefix + uncached + * input + output, scaled to the widest visible turn. 
+ */ +export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) { + const nodes = structure.nodes; + + // Subagent groups collapsed by default. + const [expanded, setExpanded] = useState>(() => new Set()); + const [tooltip, setTooltip] = useState(null); + + const groupIndexes = useMemo(() => { + const out: number[] = []; + nodes.forEach((node, i) => { + if (node.kind === 'subagent') out.push(i); + }); + return out; + }, [nodes]); + + const toggle = useCallback((i: number) => { + setExpanded((prev) => { + const next = new Set(prev); + if (next.has(i)) next.delete(i); + else next.add(i); + return next; + }); + }, []); + + const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]); + const collapseAll = useCallback(() => setExpanded(new Set()), []); + + const rows = useMemo(() => { + const out: VisibleRow[] = []; + let turnNo = 0; + nodes.forEach((node: StructureNode, i) => { + if (node.kind === 'turn') { + turnNo += 1; + out.push({ + key: `t-${i}`, + label: `Turn ${turnNo}`, + sublabel: node.model ?? undefined, + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: false, + isExpanded: false, + }); + } else { + const isExpanded = expanded.has(i); + out.push({ + key: `g-${i}`, + label: `${node.label}`, + sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${ + node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '' + }`, + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: true, + isExpanded, + groupIndex: i, + }); + if (isExpanded) { + node.children.forEach((child, ci) => { + out.push({ + key: `g-${i}-c-${ci}`, + label: `↳ subturn ${ci + 1}`, + sublabel: child.model ?? 
undefined, + cached: child.cached, + uncached: child.uncached, + output: child.out, + total: child.in + child.out, + indent: 1, + isGroup: false, + isExpanded: false, + }); + }); + } + } + }); + return out; + }, [nodes, expanded]); + + const maxTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)), + [rows], + ); + + const onMove = (e: React.MouseEvent, row: VisibleRow) => { + setTooltip({ x: e.clientX, y: e.clientY, row }); + }; + + return ( +
+
+
+ {LEGEND.map((l) => ( + + + {l.label} + + ))} +
+ {groupIndexes.length > 0 && ( +
+ + +
+ )} +
+ +
+ {rows.map((row) => { + const barFrac = row.total / maxTotal; + const cw = (row.cached / row.total) * 100; + const uw = (row.uncached / row.total) * 100; + const ow = (row.output / row.total) * 100; + return ( +
+ {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
+ + {/* stacked bar */} +
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} + > +
+
+
+
+
+
+ + {/* total */} +
+ {compact(row.total)} +
+
+ ); + })} +
+ + {tooltip && ( +
+
+ {tooltip.row.label} + {tooltip.row.sublabel ? ( + {tooltip.row.sublabel} + ) : null} +
+
+ Cached prefix + + {compact(tooltip.row.cached)} + + Uncached input + + {compact(tooltip.row.uncached)} + + Output + + {compact(tooltip.row.output)} + + Cached % + + {tooltip.row.cached + tooltip.row.uncached > 0 + ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%` + : '—'} + +
+
+ )} +
+ ); +} diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx index 57965518..5725d99f 100644 --- a/packages/app/src/components/header/header.tsx +++ b/packages/app/src/components/header/header.tsx @@ -46,6 +46,12 @@ const NAV_LINKS = [ testId: 'nav-link-supporters', event: 'header_supporters_clicked', }, + { + href: '/datasets', + label: 'Datasets', + testId: 'nav-link-datasets', + event: 'header_datasets_clicked', + }, { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' }, { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' }, ] as const; diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts new file mode 100644 index 00000000..c1676445 --- /dev/null +++ b/packages/db/src/queries/datasets.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; + +import type { DbClient } from '../connection.js'; +import { getConversation, listConversations, listDatasets } from './datasets.js'; + +/** + * Mock DbClient: returns canned result sets in call order. Each call to the + * tagged-template `sql` shifts the next queued rows array. The query text is + * ignored — these tests assert the JS-side shaping/coercion, not SQL. + */ +function mockSql(queue: unknown[][]): DbClient { + const responses = [...queue]; + return (() => Promise.resolve(responses.shift() ?? 
[])) as unknown as DbClient; +} + +describe('listDatasets', () => { + it('coerces conversation_count to a number', async () => { + const sql = mockSql([ + [ + { + id: 'a/b', + slug: 'b', + label: 'B', + variant: 'full', + conversation_count: '393', + summary: {}, + }, + ], + ]); + const out = await listDatasets(sql); + expect(out).toHaveLength(1); + expect(out[0].conversation_count).toBe(393); + expect(typeof out[0].conversation_count).toBe('number'); + }); +}); + +describe('listConversations', () => { + it('returns null when the dataset slug is unknown', async () => { + const sql = mockSql([[]]); // datasets lookup → no rows + expect(await listConversations(sql, 'missing')).toBeNull(); + }); + + it('returns total + numerically-coerced items', async () => { + const sql = mockSql([ + [{ id: 'ds-id' }], // datasets lookup + [{ n: 2 }], // count + [ + { + conv_id: 'c1', + models: ['m'], + num_turns: '5', + num_subagent_groups: '1', + total_in: '1000', + total_out: '200', + total_cached: '900', + }, + ], // items + ]); + const out = await listConversations(sql, 'b', { sort: 'tokens' }); + expect(out).not.toBeNull(); + expect(out!.total).toBe(2); + expect(out!.items[0]).toMatchObject({ + conv_id: 'c1', + num_turns: 5, + num_subagent_groups: 1, + total_in: 1000, + total_out: 200, + total_cached: 900, + }); + expect(typeof out!.items[0].total_in).toBe('number'); + }); +}); + +describe('getConversation', () => { + it('returns null when the conversation is missing', async () => { + const sql = mockSql([[]]); + expect(await getConversation(sql, 'b', 'nope')).toBeNull(); + }); + + it('coerces counts and passes through the structure', async () => { + const structure = { blockSize: 64, nodes: [], totals: {} }; + const sql = mockSql([ + [ + { + conv_id: 'c1', + models: ['m'], + num_turns: '3', + num_subagent_groups: '0', + total_in: '500', + total_out: '100', + total_cached: '450', + structure, + }, + ], + ]); + const out = await getConversation(sql, 'b', 'c1'); + 
expect(out).not.toBeNull(); + expect(out!.num_turns).toBe(3); + expect(out!.total_cached).toBe(450); + expect(out!.structure).toBe(structure); + }); +}); From 0c50139594a99adcc43f558d0b80ae08870af20e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 16:18:38 -0500 Subject: [PATCH 080/111] docs(ingest): note the separate agentic-dataset ingest script Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/agents/ingest.md | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 .claude/agents/ingest.md diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md new file mode 100644 index 00000000..aa0099ac --- /dev/null +++ b/.claude/agents/ingest.md @@ -0,0 +1,188 @@ +--- +name: ingest +description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL. +tools: Bash, Read, Edit, Write +--- + +You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`. + +## Environment + +- **Repo root**: `/Users/quilicic/InferenceX-app` +- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements: + - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname. + - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`. 
+- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000) +- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app` +- **INVALIDATE_SECRET** lives in repo root `.env` under that key. +- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var. + +## Standard ingest + +```bash +cd /Users/quilicic/InferenceX-app/packages/db +DATABASE_WRITE_URL='' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts --download SemiAnalysisAI/InferenceX +``` + +Then refresh the materialized view (the script's auto-refresh sometimes races): +`REFRESH MATERIALIZED VIEW latest_benchmarks;` + +## Cache purge (always do after any DB mutation) + +```bash +SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2 | tr -d '"') +# Localhost (port 3002, NOT 3000) +curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate +# Preview +mkdir -p /tmp/vp && cd /tmp/vp \ + && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \ + && vercel curl /api/v1/invalidate \ + --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \ + --yes -- -sS -X POST -H "Authorization: Bearer $SECRET" +rm -rf /tmp/vp +``` + +## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision)) + +```sql +BEGIN; +DELETE FROM benchmark_results br USING configs c +WHERE c.id = br.config_id + AND c.model = '' AND c.hardware = '' AND c.framework = '' + AND c.precision = '' AND br.benchmark_type = ''; +DELETE FROM availability +WHERE model = '' AND hardware = '' AND framework = '' + AND precision = '' AND benchmark_type = ''; +COMMIT; +``` + +If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. 
Don't do this unless asked. + +## AIPerf tagging — DO NOT use by default + +AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision). + +Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`. + +
+Explicit-request-only: how to tag a run as `spec_decoding='aiperf'` + +```bash +RID= +TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX) +cd $TMPDIR + +# 1. Logical-name dedup + download +gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \ + --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \ + | python3 -c " +import sys, re, collections +seen = collections.OrderedDict() +for line in sys.stdin: + name, url, created = line.rstrip('\n').split('\t') + key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name) + if key not in seen or seen[key][2] < created: + seen[key] = (name, url, created) +for _, (name, url, _) in seen.items(): + print(f'{name}\t{url}') +" > artifacts.tsv +while IFS=$'\t' read -r name url; do + mkdir -p "$name" + gh api "$url" > "$name/a.zip" 2>/dev/null + unzip -oq "$name/a.zip" -d "$name" 2>/dev/null + rm "$name/a.zip" +done < artifacts.tsv + +# 2. Patch every benchmark JSON to set spec_decoding=aiperf +find $TMPDIR -name "*.json" | python3 -c " +import sys, json +for fn in (l.strip() for l in sys.stdin): + try: + with open(fn) as f: d = json.load(f) + except Exception: continue + rows = d if isinstance(d, list) else [d] + if not rows or not isinstance(rows[0], dict): continue + changed = False + for row in rows: + if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row): + row['spec_decoding'] = 'aiperf' + changed = True + if changed: + with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f) +" + +# 3. Ingest in CI mode (reads INGEST_* env vars) +cd /Users/quilicic/InferenceX-app/packages/db +INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \ +DATABASE_WRITE_URL='' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts +rm -rf $TMPDIR +``` + +The `spec_method` column has a lowercase check constraint — always lowercase. + +
+ +## Don't auto-mention "AIPerf" in changelog entries + +Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`. + +## Adding a perf changelog entry + +Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `<…>-<precision>-<…>-<…>` (four dash-separated fields with the precision second; matches what the user actually sees in the filter chain). + +```sql +INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link) +SELECT id, date, '', '', ARRAY['<config_key>'], '<description>', NULL +FROM latest_workflow_runs WHERE github_run_id = <run_id> +RETURNING id, workflow_run_id, date::text, description; +``` + +Description convention from prior entries: `<HW> <Model> Ingest #<N> (<optional note>)` — e.g. + +- `B200 Kimi Ingest #1` +- `MI355X Kimi Ingest #2` +- `H200 Kimi Ingest #1 (mmap cache)` + +If the user doesn't specify a description, ask for one OR derive it from the run name. + +## Common gotchas + +- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <workflow_run_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`. +- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT. +- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites. 
+- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the trailing `_<runner>_<index>` suffix. +- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection. +- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with the prior full sweep via `UPDATE benchmark_results SET date = '<prior-sweep-date>' WHERE …` (scope the `WHERE` to the re-run's rows) so the frontend's max-date-per-group dedup doesn't drop the older sweep. + +## Process + +1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<run_id> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips). +2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding. +3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line. +4. **Refresh materialized view**. +5. **Add changelog entry** if the user asked or if the run is a "marker" worth surfacing. +6. **Purge both caches** (localhost 3002 + preview). +7. **Report** the row count, date, hardware, run id, and changelog id (if added). + +## Related: ingesting agentic _datasets_ (not benchmark runs) + +This agent ingests **benchmark runs**. 
The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow: + +```bash +cd packages/db && DATABASE_WRITE_URL='' \ + pnpm exec tsx src/ingest-weka-dataset.ts \ + [--label "…"] [--variant full|256k] [--description "…"] [--limit N] +``` + +It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). + +## Don't + +- Don't push to git unless the user asked. +- Don't ingest without permission if it's a delete+reingest of existing data. +- Don't hit port 3000 for cache purge — it's a different project. +- Don't capitalize `spec_method` values (DB has a lowercase check constraint). From 2ae6ebaab06b27bd65f0601aa6ae7905cbd01d79 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 16:24:01 -0500 Subject: [PATCH 081/111] fix(datasets): flamegraph scroll box + dual-scale group bars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap rows in a fixed-height (max-h-[520px]) vertically scrollable bordered box. Subagent group headers carry aggregate token totals that dwarf any single turn, which made their bars overflow the row (width >> 100%). Now turns/subturns use a per-turn scale while group headers use a separate group-aggregate scale (slim muted strips), both clamped to the track — groups stay comparable to each other and nothing overflows. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/datasets/trace-flamegraph.tsx | 111 ++++++++++-------- 1 file changed, 63 insertions(+), 48 deletions(-) diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 12588582..12cc14ec 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -135,10 +135,19 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur return out; }, [nodes, expanded]); + // Two scales: leaf turns/subturns share a per-turn axis (the primary signal — + // how cached/uncached evolves), while subagent group headers carry aggregates + // orders of magnitude larger, so they get their own axis to stay comparable to + // each other. Group bars render slim + muted, so the mixed scale reads as a + // distinct "group summary" track rather than a contradiction. const maxTotal = useMemo( () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)), [rows], ); + const maxGroupTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)), + [rows], + ); const onMove = (e: React.MouseEvent, row: VisibleRow) => { setTooltip({ x: e.clientX, y: e.clientY, row }); @@ -178,61 +187,67 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur )}
-
- {rows.map((row) => { - const barFrac = row.total / maxTotal; - const cw = (row.cached / row.total) * 100; - const uw = (row.uncached / row.total) * 100; - const ow = (row.output / row.total) * 100; - return ( -
- {/* label / group toggle */} -
- {row.isGroup ? ( - - ) : ( - {row.label} - )} -
- - {/* stacked bar */} +
+
+ {rows.map((row) => { + // Group headers use the group axis; turns/subturns use the per-turn + // axis. Clamp to the track width either way. + const denom = row.isGroup ? maxGroupTotal : maxTotal; + const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100)); + const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; + const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; + const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; + return (
onMove(e, row)} - onMouseLeave={() => setTooltip(null)} + key={row.key} + className="flex items-center gap-2" + style={{ paddingLeft: row.indent * 20 }} > + {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
+ + {/* stacked bar — group headers render as a slim muted summary + strip so they read as aggregates, not individual turns. */}
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} > -
-
-
+
+
+
+
+
-
- {/* total */} -
- {compact(row.total)} + {/* total */} +
+ {compact(row.total)} +
-
- ); - })} + ); + })} +
{tooltip && ( From c749f8f271bcfa46293b1ce2ec29adac1907231d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 16:31:40 -0500 Subject: [PATCH 082/111] feat(datasets): link request timeline to source-dataset conversation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add run_datasets (workflow_run → dataset slug) mapping (migration 012) and surface it through the benchmark-siblings sku. The agentic detail page's request timeline now deep-links each request bar to its exact conversation in the /datasets viewer — the request cid, stripped of any ::sa:/::fa: suffix, is the dataset conv_id. Tooltip shows a 'click to view in dataset' hint; bars get a pointer cursor only when a mapping exists. Backfilled workflow_run 27915787191 (the dsv4/b300/vllm run incl. point 422083) → cc-traces-weka-062126. Verified: clicking a timeline bar on /inference/agentic/422083 navigates to the matching /datasets/cc-traces-weka-062126/conversations/. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agentic-point/agentic-point-detail.tsx | 5 ++- .../agentic-point/dataset-conv-id.test.ts | 27 ++++++++++++ .../agentic-point/request-timeline.tsx | 43 +++++++++++++++++-- .../src/hooks/api/use-benchmark-siblings.ts | 1 + packages/db/migrations/012_run_datasets.sql | 19 ++++++++ packages/db/src/queries/benchmark-siblings.ts | 7 ++- 6 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts create mode 100644 packages/db/migrations/012_run_datasets.sql diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 278ad8f7..4a076955 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -225,7 +225,10 @@ export function 
AgenticPointDetail({ id }: Props) { Loading request timeline…
) : timelineQuery.data ? ( - + ) : (
No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts new file mode 100644 index 00000000..a7ebbd8c --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts @@ -0,0 +1,27 @@ +import { describe, expect, it } from 'vitest'; + +import { datasetConvId } from './request-timeline'; + +describe('datasetConvId', () => { + it('returns a plain conversation id unchanged', () => { + expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602')).toBe( + '002001296e8a8c38ad9d7cc436d691afc602', + ); + }); + + it('strips a ::sa: subagent suffix to the parent conv id', () => { + expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe( + '002001296e8a8c38ad9d7cc436d691afc602', + ); + }); + + it('strips a ::fa: forked-agent suffix', () => { + expect(datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBe( + '02bc0afb13f7a2d9efa86c28511261d85c0e', + ); + }); + + it('strips at the first :: even with a trailing stream index', () => { + expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc'); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 7c5fdab0..655556fb 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -1,9 +1,21 @@ 'use client'; import { useCallback, useMemo, useRef, useState } from 'react'; +import { useRouter } from 'next/navigation'; import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline'; import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from 
'@/lib/analytics'; + +/** + * The dataset conversation id for a request: the cid with any subagent/forked + * suffix (`::sa:…`, `::fa:…`) stripped. This is exactly the `conv_id` stored in + * dataset_conversations, so it deep-links into /datasets//conversations/. + */ +export function datasetConvId(cid: string): string { + const i = cid.indexOf('::'); + return i === -1 ? cid : cid.slice(0, i); +} /** * Gantt-style request timeline for one agentic benchmark point. @@ -317,7 +329,7 @@ interface TooltipData { req: RequestRecord; } -function Tooltip({ data }: { data: TooltipData }) { +function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) { const { row, req } = data; const totalMs = (req.end - req.start) / 1e6; const queueMs = (req.start - req.credit) / 1e6; @@ -377,14 +389,37 @@ function Tooltip({ data }: { data: TooltipData }) {
Started at {formatTickLabel(req.start)}
+ {linkable && ( +
+ Click to view this conversation in the dataset → +
+ )}
); } -export function RequestTimelineView({ data }: { data: RequestTimeline }) { +export function RequestTimelineView({ + data, + datasetSlug, +}: { + data: RequestTimeline; + /** Source dataset slug for this run; enables click-to-conversation deep links. */ + datasetSlug?: string | null; +}) { + const router = useRouter(); const [rowMode, setRowMode] = useState('conversation'); const [phaseFilter, setPhaseFilter] = useState('profiling'); const [tooltip, setTooltip] = useState(null); + + const openConversation = useCallback( + (cid: string) => { + if (!datasetSlug) return; + const convId = datasetConvId(cid); + track('agentic_timeline_to_dataset', { slug: datasetSlug }); + router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`); + }, + [datasetSlug, router], + ); // Which multi-stream subagents currently have their per-stream rows // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set()); @@ -798,6 +833,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { key={`${req.cid}-${req.ti}-${req.start}`} onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })} onMouseLeave={() => setTooltip(null)} + onClick={datasetSlug ? () => openConversation(req.cid) : undefined} + style={datasetSlug ? { cursor: 'pointer' } : undefined} > {/* Queue lead-in (faint) — only drawn when noticeable. */} {queueW >= 1 && ( @@ -910,7 +947,7 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { )} {/* Tooltip */} - {tooltip && } + {tooltip && }
); } diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts index 55720bdf..f8bef99e 100644 --- a/packages/app/src/hooks/api/use-benchmark-siblings.ts +++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts @@ -31,6 +31,7 @@ export interface BenchmarkSku { benchmark_type: string; github_run_id: number; date: string; + dataset_slug: string | null; } export interface BenchmarkSiblings { diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql new file mode 100644 index 00000000..58dd9f88 --- /dev/null +++ b/packages/db/migrations/012_run_datasets.sql @@ -0,0 +1,19 @@ +-- Maps a benchmark workflow_run to the source dataset it replayed, so the +-- agentic detail page can deep-link each request in the timeline to the exact +-- conversation in the /datasets viewer (the request's conversation_id, with any +-- ::sa:/::fa: suffix stripped, is the dataset conv_id). +-- +-- One row per workflow_run (every benchmark in a run replays the same dataset). +-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/ +-- URL) rather than an FK, so the mapping can be recorded before/independent of +-- the dataset being ingested; the UI degrades gracefully if the slug is absent. +-- +-- Additive only. 
To revert: +-- drop table if exists run_datasets; +-- delete from schema_migrations where filename = '012_run_datasets.sql'; + +create table run_datasets ( + workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, + dataset_slug text not null, + created_at timestamptz not null default now() +); diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts index c7e4a317..2d36eb22 100644 --- a/packages/db/src/queries/benchmark-siblings.ts +++ b/packages/db/src/queries/benchmark-siblings.ts @@ -47,6 +47,8 @@ export interface BenchmarkSku { /** Human-readable workflow_run summary so the page header can hint at provenance. */ github_run_id: number; date: string; + /** Slug of the source dataset this run replayed (run_datasets), or null. */ + dataset_slug: string | null; } export interface BenchmarkSiblings { @@ -63,10 +65,11 @@ export async function getBenchmarkSiblings( select c.hardware, c.framework, c.model, c.precision, c.spec_method, br.benchmark_type, br.workflow_run_id, br.date::text, - wr.github_run_id + wr.github_run_id, rd.dataset_slug from benchmark_results br join configs c on c.id = br.config_id join workflow_runs wr on wr.id = br.workflow_run_id + left join run_datasets rd on rd.workflow_run_id = br.workflow_run_id where br.id = ${benchmarkResultId} `) as unknown as { hardware: string; @@ -78,6 +81,7 @@ export async function getBenchmarkSiblings( workflow_run_id: number; date: string; github_run_id: number; + dataset_slug: string | null; }[]; const root = seed[0]; if (!root) return null; @@ -158,6 +162,7 @@ export async function getBenchmarkSiblings( benchmark_type: root.benchmark_type, github_run_id: Number(root.github_run_id), date: root.date, + dataset_slug: root.dataset_slug ?? 
null, }, siblings, }; From 6b700a3ccbc53fbc7e109360a2e5baa582e588c9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 17:31:14 -0500 Subject: [PATCH 083/111] feat(datasets): deep-link request-timeline bar to the exact turn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timeline link now carries ?turn= (and &sa= for subagent requests). The flamegraph resolves the target node — main turns by ordinal, subagent turns by matching the group's agentId then the ti-th child — expands the subagent group if needed, scrolls the row into view, and flashes a ring. subagentIdOf strips the harness stream suffix (:s and :aux:) so the cid's agent id matches the dataset SubagentNode.agentId. Verified end-to-end: clicking a subagent bar on /inference/agentic/422083 opens the conversation, expands the right group, and highlights the exact subturn. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/datasets/conversation-view.tsx | 18 +++++- .../components/datasets/trace-flamegraph.tsx | 60 +++++++++++++++++-- .../agentic-point/dataset-conv-id.test.ts | 28 ++++++++- .../agentic-point/request-timeline.tsx | 30 ++++++++-- 4 files changed, 125 insertions(+), 11 deletions(-) diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index 43992c41..ba1d0532 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -1,5 +1,6 @@ 'use client'; +import { useState } from 'react'; import Link from 'next/link'; import { Card } from '@/components/ui/card'; @@ -17,6 +18,17 @@ function compact(n: number): string { export function ConversationView({ slug, convId }: { slug: string; convId: string }) { const { data, isLoading, isError } = useDatasetConversation(slug, convId); + // Deep-link target from a request-timeline click: ?turn=[&sa=]. 
+ // Read once from the URL on mount (matches the app's window-based url-state + // reads; avoids a Suspense boundary for useSearchParams). + const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => { + if (typeof window === 'undefined') return { turn: null, agent: null }; + const p = new URLSearchParams(window.location.search); + const turnRaw = p.get('turn'); + const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null; + return { turn, agent: p.get('sa') }; + }); + if (isLoading) { return (
Loading conversation…
@@ -85,7 +97,11 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin click a group to expand it. Each bar splits input into cached prefix and uncached suffix, plus generated output.

- +
); diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 12cc14ec..3995a9c5 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -1,6 +1,6 @@ 'use client'; -import { useCallback, useMemo, useState } from 'react'; +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; @@ -52,12 +52,58 @@ interface TooltipState { * children (collapsed by default). Each bar stacks cached-prefix + uncached * input + output, scaled to the widest visible turn. */ -export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) { +export function TraceFlamegraph({ + structure, + highlightTurn, + highlightAgentId, +}: { + structure: ConversationStructure; + /** Turn index to scroll to / highlight (from a request-timeline deep link). */ + highlightTurn?: number | null; + /** Subagent id when the highlighted turn is inside a subagent group. */ + highlightAgentId?: string | null; +}) { const nodes = structure.nodes; - // Subagent groups collapsed by default. - const [expanded, setExpanded] = useState>(() => new Set()); + // Resolve the deep-link target to a row key (+ the group that must be open to + // show it). Main turns match by their main-turn ordinal; subagent turns match + // the group by agentId, then the ti-th child. 
+ const target = useMemo(() => { + if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null; + if (highlightAgentId) { + const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + if (highlightTurn >= group.children.length) return null; + return { rowKey: `g-${gi}-c-${highlightTurn}`, expandGroup: gi }; + } + let ordinal = 0; + for (let i = 0; i < nodes.length; i++) { + if (nodes[i].kind === 'turn') { + if (ordinal === highlightTurn) return { rowKey: `t-${i}`, expandGroup: null }; + ordinal += 1; + } + } + return null; + }, [nodes, highlightTurn, highlightAgentId]); + + // Subagent groups collapsed by default — except the deep-link target's group. + const [expanded, setExpanded] = useState>(() => + typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(), + ); const [tooltip, setTooltip] = useState(null); + const scrollRef = useRef(null); + + // Scroll the target row into view and flash a highlight once it's rendered. + useEffect(() => { + if (!target) return; + const el = scrollRef.current?.querySelector(`[data-rowkey="${target.rowKey}"]`); + if (!el) return; + el.scrollIntoView({ block: 'center', behavior: 'smooth' }); + el.classList.add('ring-2', 'ring-primary', 'rounded-sm'); + const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600); + return () => clearTimeout(t); + }, [target]); const groupIndexes = useMemo(() => { const out: number[] = []; @@ -187,7 +233,10 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur )}
-
+
{rows.map((row) => { // Group headers use the group axis; turns/subturns use the per-turn @@ -200,6 +249,7 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur return (
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts index a7ebbd8c..f55d6131 100644 --- a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts +++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { datasetConvId } from './request-timeline'; +import { datasetConvId, subagentIdOf } from './request-timeline'; describe('datasetConvId', () => { it('returns a plain conversation id unchanged', () => { @@ -25,3 +25,29 @@ describe('datasetConvId', () => { expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc'); }); }); + +describe('subagentIdOf', () => { + it('returns null for a main-conversation cid', () => { + expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602')).toBeNull(); + }); + + it('extracts the subagent id from a ::sa: cid', () => { + expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe( + 'subagent_004_27c95af7', + ); + }); + + it('drops a trailing :s index from the subagent id', () => { + expect(subagentIdOf('abc::sa:subagent_001_f552fe6f:s3')).toBe('subagent_001_f552fe6f'); + }); + + it('drops an :aux: stream suffix from the subagent id', () => { + expect(subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011')).toBe( + 'subagent_001_b00fdc12', + ); + }); + + it('returns null for a ::fa: forked-agent cid (no matching subagent group)', () => { + expect(subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBeNull(); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 655556fb..baf3dc1f 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ 
-17,6 +17,21 @@ export function datasetConvId(cid: string): string { return i === -1 ? cid : cid.slice(0, i); } +/** + * The subagent id encoded in a cid (`…::sa:[:s|:aux:]`), or null + * for a main-conversation request. The harness fans a single subagent into + * parallel streams with a `:s` or `:aux:` suffix; the dataset + * SubagentNode.agentId is the bare base (e.g. `subagent_001_b00fdc12`). Agent + * ids never contain a colon, so the base is everything up to the first one. + */ +export function subagentIdOf(cid: string): string | null { + const i = cid.indexOf('::sa:'); + if (i === -1) return null; + const raw = cid.slice(i + '::sa:'.length); + const colon = raw.indexOf(':'); + return colon === -1 ? raw : raw.slice(0, colon); +} + /** * Gantt-style request timeline for one agentic benchmark point. * @@ -412,11 +427,18 @@ export function RequestTimelineView({ const [tooltip, setTooltip] = useState(null); const openConversation = useCallback( - (cid: string) => { + (req: RequestRecord) => { if (!datasetSlug) return; - const convId = datasetConvId(cid); + const convId = datasetConvId(req.cid); + // Carry the turn (and, for subagent requests, the subagent id) so the + // flamegraph can scroll to / highlight the exact node this bar maps to. + const params = new URLSearchParams({ turn: String(req.ti) }); + const sa = subagentIdOf(req.cid); + if (sa) params.set('sa', sa); track('agentic_timeline_to_dataset', { slug: datasetSlug }); - router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`); + router.push( + `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`, + ); }, [datasetSlug, router], ); @@ -833,7 +855,7 @@ export function RequestTimelineView({ key={`${req.cid}-${req.ti}-${req.start}`} onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })} onMouseLeave={() => setTooltip(null)} - onClick={datasetSlug ? () => openConversation(req.cid) : undefined} + onClick={datasetSlug ? 
() => openConversation(req) : undefined} style={datasetSlug ? { cursor: 'pointer' } : undefined} > {/* Queue lead-in (faint) — only drawn when noticeable. */} From 83fcd04e16649ca7a8fb3b1b78231c8588f274e8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 17:44:05 -0500 Subject: [PATCH 084/111] fix(datasets): visible turn highlight + pointer-tracking flamegraph tooltip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Deep-link highlight is now state-driven (bg-primary/20 + ring, fades over 700ms) instead of fragile classList mutation, so it's clearly visible and survives re-renders. Subagent groups still auto-expand and scroll into view. - Portal the hover tooltip to document.body so its position:fixed is viewport-relative — an ancestor transform was offsetting it away from the cursor. Now it sits at pointer+12px. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/datasets/trace-flamegraph.tsx | 96 +++++++++++-------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 3995a9c5..53f13b6a 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -1,6 +1,7 @@ 'use client'; import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { createPortal } from 'react-dom'; import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; @@ -94,14 +95,23 @@ export function TraceFlamegraph({ const [tooltip, setTooltip] = useState(null); const scrollRef = useRef(null); - // Scroll the target row into view and flash a highlight once it's rendered. + // Portal target only exists after mount (the tooltip is portaled to body so + // its position:fixed is viewport-relative, immune to ancestor transforms). 
+ const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + + // The deep-link target row gets a state-driven highlight (ring + bg flash) + // that fades out — state-driven so a re-render can't clobber it, and so the + // fade is a real CSS transition rather than an abrupt classList removal. + const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null); + + // Scroll the target row into view once it's rendered, then fade the highlight. useEffect(() => { if (!target) return; + setHighlightKey(target.rowKey); const el = scrollRef.current?.querySelector(`[data-rowkey="${target.rowKey}"]`); - if (!el) return; - el.scrollIntoView({ block: 'center', behavior: 'smooth' }); - el.classList.add('ring-2', 'ring-primary', 'rounded-sm'); - const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600); + el?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + const t = setTimeout(() => setHighlightKey(null), 2200); return () => clearTimeout(t); }, [target]); @@ -246,11 +256,14 @@ export function TraceFlamegraph({ const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; + const isHighlighted = row.key === highlightKey; return (
{/* label / group toggle */} @@ -300,39 +313,44 @@ export function TraceFlamegraph({
- {tooltip && ( -
-
- {tooltip.row.label} - {tooltip.row.sublabel ? ( - {tooltip.row.sublabel} - ) : null} -
-
- Cached prefix - - {compact(tooltip.row.cached)} - - Uncached input - - {compact(tooltip.row.uncached)} - - Output - - {compact(tooltip.row.output)} - - Cached % - - {tooltip.row.cached + tooltip.row.uncached > 0 - ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%` - : '—'} - -
-
- )} + {tooltip && + mounted && + createPortal( +
+
+ {tooltip.row.label} + {tooltip.row.sublabel ? ( + + {tooltip.row.sublabel} + + ) : null} +
+
+ Cached prefix + + {compact(tooltip.row.cached)} + + Uncached input + + {compact(tooltip.row.uncached)} + + Output + + {compact(tooltip.row.output)} + + Cached % + + {tooltip.row.cached + tooltip.row.uncached > 0 + ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%` + : '—'} + +
+
, + document.body, + )}
); } From 3c40d31172cce46f5e150223bcfa092ff573288f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 17:58:47 -0500 Subject: [PATCH 085/111] fix(datasets): deep-link highlight fires on first navigation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conversation page read ?turn/&sa from window.location.search in a useState initializer, which captures stale/empty params during a client-side navigation — so scroll+highlight+expand only worked after a manual reload. Switch to the reactive useSearchParams (page wrapped in Suspense) so the params are present on the first nav. Also make the flamegraph expand the target subagent group via an effect (reacting to target changes), and defer the scroll one frame so the just-expanded child row exists. Verified via a real timeline click — no reload. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../[slug]/conversations/[convId]/page.tsx | 5 ++++- .../components/datasets/conversation-view.tsx | 19 ++++++++-------- .../components/datasets/trace-flamegraph.tsx | 22 +++++++++++++++---- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx index 75702c1b..83eb56a0 100644 --- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx +++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx @@ -1,3 +1,4 @@ +import { Suspense } from 'react'; import type { Metadata } from 'next'; import { ConversationView } from '@/components/datasets/conversation-view'; @@ -25,7 +26,9 @@ export default async function ConversationPage({ params }: Props) { return (
- + + +
); diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index ba1d0532..739d3bb2 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -1,7 +1,7 @@ 'use client'; -import { useState } from 'react'; import Link from 'next/link'; +import { useSearchParams } from 'next/navigation'; import { Card } from '@/components/ui/card'; import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; @@ -19,15 +19,14 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin const { data, isLoading, isError } = useDatasetConversation(slug, convId); // Deep-link target from a request-timeline click: ?turn=[&sa=]. - // Read once from the URL on mount (matches the app's window-based url-state - // reads; avoids a Suspense boundary for useSearchParams). - const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => { - if (typeof window === 'undefined') return { turn: null, agent: null }; - const p = new URLSearchParams(window.location.search); - const turnRaw = p.get('turn'); - const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null; - return { turn, agent: p.get('sa') }; - }); + // useSearchParams (not a one-shot window.location read) so the params are + // present on the very first client-side navigation, not just after a reload. + const params = useSearchParams(); + const turnRaw = params.get('turn'); + const highlight = { + turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? 
Number(turnRaw) : null, + agent: params.get('sa'), + }; if (isLoading) { return ( diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 53f13b6a..a577193b 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -105,14 +105,28 @@ export function TraceFlamegraph({ // fade is a real CSS transition rather than an abrupt classList removal. const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null); - // Scroll the target row into view once it's rendered, then fade the highlight. + // When the deep-link target resolves/changes: expand its subagent group, then + // (after the row renders) scroll it into view and flash the highlight. Runs on + // first load and on any later target change (e.g. clicking another bar into + // the same conversation). The row query/scroll is deferred to the next frame + // so the just-expanded child row exists in the DOM. useEffect(() => { if (!target) return; + if (typeof target.expandGroup === 'number') { + const gi = target.expandGroup; + setExpanded((prev) => (prev.has(gi) ? 
prev : new Set(prev).add(gi))); + } setHighlightKey(target.rowKey); - const el = scrollRef.current?.querySelector(`[data-rowkey="${target.rowKey}"]`); - el?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + const raf = requestAnimationFrame(() => { + scrollRef.current + ?.querySelector(`[data-rowkey="${target.rowKey}"]`) + ?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + }); const t = setTimeout(() => setHighlightKey(null), 2200); - return () => clearTimeout(t); + return () => { + cancelAnimationFrame(raf); + clearTimeout(t); + }; }, [target]); const groupIndexes = useMemo(() => { From e460ea2300f57912eff46d92fbb6fb447fc435e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 22:34:55 -0500 Subject: [PATCH 086/111] fix(high-contrast): stable line colors when deselecting legend items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In HC mode the iwanthue palette is sized and indexed by the key set it's generated over. ScatterGraph generated it from the *active* (selected) hw set, so deselecting a line shrank the set, re-sized the palette, and shifted every remaining line's hue — most visible on single-vendor agentic runs (which span the full hue wheel since 2c06009), where deselecting B300 could recolor B200 from red to blue. Pass the stable full set of hw-types-with-data as hcKeys so the palette and per-key index are fixed; toggling now only hides/shows lines without recoloring the rest. Adds a useThemeColors regression test asserting a line's HC color is identical across active subsets when hcKeys is the full set. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 8 ++++++ packages/app/src/hooks/useThemeColors.test.ts | 28 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 76231522..77770ec0 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -381,10 +381,18 @@ const ScatterGraph = React.memo( () => [...effectiveOfficialHwTypes], [effectiveOfficialHwTypes], ); + // High-contrast palette is keyed off the FULL set of official hw types with + // data, not the active subset. Otherwise deselecting a line shrinks the key + // set, which re-sizes the iwanthue palette and shifts every remaining line's + // hue (most visible for single-vendor agentic runs that span the full wheel — + // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the + // stable full set fixes each hw's color so toggling only hides/shows lines. + const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]); const { resolveColor, getCssColor } = useThemeColors({ highContrast, identifiers: activeHwKeys, activeKeys: activeOfficialKeys, + hcKeys: stableHcKeys, }); // --- Changelog --- diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts index 7275e384..11050d19 100644 --- a/packages/app/src/hooks/useThemeColors.test.ts +++ b/packages/app/src/hooks/useThemeColors.test.ts @@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => { } unmountOn(); }); + + // Regression: deselecting a legend line must not recolor the remaining lines. 
+ // The HC palette is sized/indexed by the key set it's generated over, so when + // it was generated over the *active* subset (no hcKeys), shrinking the + // selection re-sized the palette and shifted every remaining line's hue (most + // visible on single-vendor agentic runs spanning the full wheel). Passing a + // stable `hcKeys` (the full set with data) fixes each line's color. + it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => { + const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison + + const all = renderHook(() => + useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }), + ); + const b200WithBoth = all.result.current.resolveColor('b200'); + const b300Color = all.result.current.resolveColor('b300'); + all.unmount(); + + // b300 deselected → only b200 active, but hcKeys is still the full set. + const subset = renderHook(() => + useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }), + ); + const b200Alone = subset.result.current.resolveColor('b200'); + subset.unmount(); + + expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu); + expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues + expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200 + }); }); From a912eab780a76ba015b21590d3c162e0fd4c37ea Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:04:28 -0500 Subject: [PATCH 087/111] chore(security): bump dompurify override to >=3.4.11 (GHSA-cmwh-pvxp-8882) --- pnpm-lock.yaml | 52 ++++++++++++++++++++++++++++++++------------- pnpm-workspace.yaml | 2 +- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cdd8a01d..bb7bb824 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -5,7 +5,7 @@ settings: excludeLinksFromLockfile: false overrides: - dompurify@<3.4.9: '>=3.4.9' + dompurify@<=3.4.10: '>=3.4.11' 
esbuild@>=0.27.3 <0.28.1: '>=0.28.1' form-data@>=4.0.0 <4.0.6: '>=4.0.6' hono@<4.12.21: '>=4.12.21' @@ -20,7 +20,7 @@ importers: devDependencies: '@babel/core': specifier: ^7.29.6 - version: 7.29.7 + version: 7.29.7(supports-color@8.1.1) audit-ci: specifier: ^7.1.0 version: 7.1.0 @@ -2994,9 +2994,6 @@ packages: resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==} engines: {node: '>=8'} - dompurify@3.4.10: - resolution: {integrity: sha512-0xzNv0e7oYC6yyuOGZIABPM4qtg3QxLFniDNPP4ZP90wR8Yq3zgwpRbrNiT4N3IKqDbbYFEJLV+JWEs19aZ//w==} - dompurify@3.4.11: resolution: {integrity: sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==} @@ -5538,7 +5535,27 @@ snapshots: '@babel/helpers': 7.29.7 '@babel/parser': 7.29.7 '@babel/template': 7.29.7 - '@babel/traverse': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) + '@babel/types': 7.29.7 + '@jridgewell/remapping': 2.3.5 + convert-source-map: 2.0.0 + debug: 4.4.3(supports-color@8.1.1) + gensync: 1.0.0-beta.2 + json5: 2.2.3 + semver: 6.3.1 + transitivePeerDependencies: + - supports-color + + '@babel/core@7.29.7(supports-color@8.1.1)': + dependencies: + '@babel/code-frame': 7.29.7 + '@babel/generator': 7.29.7 + '@babel/helper-compilation-targets': 7.29.7 + '@babel/helper-module-transforms': 7.29.7(@babel/core@7.29.7(supports-color@8.1.1)) + '@babel/helpers': 7.29.7 + '@babel/parser': 7.29.7 + '@babel/template': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) '@babel/types': 7.29.7 '@jridgewell/remapping': 2.3.5 convert-source-map: 2.0.0 @@ -5569,17 +5586,26 @@ snapshots: '@babel/helper-module-imports@7.29.7': dependencies: - '@babel/traverse': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) '@babel/types': 7.29.7 transitivePeerDependencies: - supports-color + '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7(supports-color@8.1.1))': + dependencies: + '@babel/core': 
7.29.7(supports-color@8.1.1) + '@babel/helper-module-imports': 7.29.7 + '@babel/helper-validator-identifier': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) + transitivePeerDependencies: + - supports-color + '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7)': dependencies: '@babel/core': 7.29.7 '@babel/helper-module-imports': 7.29.7 '@babel/helper-validator-identifier': 7.29.7 - '@babel/traverse': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) transitivePeerDependencies: - supports-color @@ -5621,7 +5647,7 @@ snapshots: '@babel/parser': 7.29.7 '@babel/types': 7.29.7 - '@babel/traverse@7.29.7': + '@babel/traverse@7.29.7(supports-color@8.1.1)': dependencies: '@babel/code-frame': 7.29.7 '@babel/generator': 7.29.7 @@ -7981,10 +8007,6 @@ snapshots: dependencies: path-type: 4.0.0 - dompurify@3.4.10: - optionalDependencies: - '@types/trusted-types': 2.0.7 - dompurify@3.4.11: optionalDependencies: '@types/trusted-types': 2.0.7 @@ -8812,7 +8834,7 @@ snapshots: jest-worker@27.5.1: dependencies: - '@types/node': 25.9.3 + '@types/node': 26.0.0 merge-stream: 2.0.0 supports-color: 8.1.1 @@ -9790,7 +9812,7 @@ snapshots: '@posthog/core': 1.35.3 '@posthog/types': 1.390.2 core-js: 3.49.0 - dompurify: 3.4.10 + dompurify: 3.4.11 fflate: 0.4.8 preact: 10.29.2 query-selector-shadow-dom: 1.0.1 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index c6ea723c..361059bb 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -25,7 +25,7 @@ auditConfig: - GHSA-h67p-54hq-rp68 overrides: - dompurify@<3.4.9: '>=3.4.9' + dompurify@<=3.4.10: '>=3.4.11' esbuild@>=0.27.3 <0.28.1: '>=0.28.1' form-data@>=4.0.0 <4.0.6: '>=4.0.6' hono@<4.12.21: '>=4.12.21' From ba6bc1ce6cedce56d45c8fcd96a74c3cd53879dc Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:28:18 -0500 Subject: [PATCH 088/111] test(e2e): align selector testid with scenario-selector rename; rewrite x-axis toggle test for single-chart mode 
buttons --- .../app/cypress/e2e/dropdown-switching.cy.ts | 4 +- .../app/cypress/e2e/historical-trends.cy.ts | 4 +- .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 64 +++++++++---------- packages/app/cypress/e2e/url-params.cy.ts | 10 +-- 4 files changed, 39 insertions(+), 43 deletions(-) diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts index ac88dc84..4bc8b695 100644 --- a/packages/app/cypress/e2e/dropdown-switching.cy.ts +++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts @@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => { cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false'); - cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true'); + cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); }); diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts index f0a70a56..55b0e274 100644 --- a/packages/app/cypress/e2e/historical-trends.cy.ts +++ b/packages/app/cypress/e2e/historical-trends.cy.ts @@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => { delete doc.body.dataset.scrollLocked; doc.body.style.removeProperty('pointer-events'); }); - cy.get('[data-testid="sequence-selector"]').should('be.visible'); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[role="option"]').should('have.length.greaterThan', 0); cy.get('body').type('{esc}'); }); diff --git 
a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index e17a4aff..636a7ccf 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -1,46 +1,42 @@ -describe('TTFT X-Axis Toggle (E2E chart)', () => { +describe('X-Axis Mode Toggle (inference chart)', () => { before(() => { - cy.window().then((win) => { - win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + cy.visit('/inference', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }, }); - cy.visit('/inference'); - cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2); + cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible'); + cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('shows the x-axis dropdown in the e2e chart heading', () => { - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2 button') - .should('contain.text', 'vs.') - .and('contain.text', 'Latency'); + it('shows the x-axis mode buttons with Interactivity active by default', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-interactivity"]') + .should('be.visible') + .and('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. 
Interactivity'); }); - it('opens popover with three x-axis options', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').within(() => { - cy.contains('End-to-end Latency').should('exist'); - cy.contains('P99 TTFT').should('exist'); - cy.contains('Median TTFT').should('exist'); - }); - }); - - it('switches x-axis to P99 TTFT and updates the heading', () => { - cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT'); + it('switches the x-axis to TTFT and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').click(); + cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token'); }); - it('switches x-axis to Median TTFT and updates the heading', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('Median TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT'); + it('switches the x-axis to E2E Latency and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-e2e"]').click(); + cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency'); }); - it('switches back to End-to-end Latency', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click(); - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2') - .should('contain.text', 'End-to-end Latency'); + it('switches back to Interactivity', () => { + cy.get('[data-testid="x-axis-mode-interactivity"]').click(); + 
cy.get('[data-testid="x-axis-mode-interactivity"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity'); }); }); diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 3c480686..927aee5f 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => { }; const assertNoHydrationMismatch = () => { - cy.get('[data-testid="sequence-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); cy.get('@consoleError').then((spy) => { const calls = (spy as unknown as { args: unknown[][] }).args; const hydration = calls.filter((args) => @@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => { it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => { visitWithErrorSpy('/inference?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); @@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => { // Visit the canonical model-prefixed slug so the assertion is directly // about the rendered page, not about a bare-slug redirect interleaving. 
visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => { visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk'); - cy.get('[data-testid="sequence-selector"]') + cy.get('[data-testid="scenario-selector"]') .invoke('text') .should('not.contain', 'junk') .and('match', /[18]K . [18]K/u); @@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => { // `effectivePrecisions` intersects the selection with available precisions // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported. visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek'); cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8'); assertNoHydrationMismatch(); From ada19b54e41ea3ad87cdfc22dd3d27e1a3d7df44 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:41:03 -0500 Subject: [PATCH 089/111] test(datasets): component tests for distribution card, trace flamegraph (incl deep-link), and dataset list states --- .../app/cypress/component/dataset-list.cy.tsx | 93 +++++++++++++++++++ .../component/distribution-card.cy.tsx | 45 +++++++++ .../cypress/component/trace-flamegraph.cy.tsx | 86 +++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 packages/app/cypress/component/dataset-list.cy.tsx create mode 100644 packages/app/cypress/component/distribution-card.cy.tsx create mode 100644 packages/app/cypress/component/trace-flamegraph.cy.tsx diff 
--git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx new file mode 100644 index 00000000..f7cfcb9a --- /dev/null +++ b/packages/app/cypress/component/dataset-list.cy.tsx @@ -0,0 +1,93 @@ +import { QueryClient, QueryClientProvider } from '@tanstack/react-query'; +import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime'; + +import { DatasetList } from '@/components/datasets/dataset-list'; +import type { DatasetRecord } from '@/hooks/api/use-datasets'; + +const datasets: DatasetRecord[] = [ + { + id: 'ds-1', + slug: 'cc-traces-weka-full', + label: 'cc-traces-weka (full)', + variant: 'full', + description: 'Every captured request, unmodified.', + hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full', + license: 'apache-2.0', + conversation_count: 1234, + summary: { + totalIn: 5_000_000, + totalOut: 250_000, + cachedPct: 0.82, + mainTurns: 9800, + subagentGroups: 540, + }, + ingested_at: '2026-06-20T00:00:00Z', + }, + { + id: 'ds-2', + slug: 'cc-traces-weka-256k', + label: 'cc-traces-weka (256k)', + variant: '256k', + description: 'Turns trimmed to a 256k context window.', + hf_url: null, + license: 'apache-2.0', + conversation_count: 980, + summary: { + totalIn: 3_200_000, + totalOut: 180_000, + cachedPct: 0.79, + mainTurns: 7600, + subagentGroups: 410, + }, + ingested_at: '2026-06-19T00:00:00Z', + }, +]; + +function createMockRouter() { + return { + push: cy.stub(), + replace: cy.stub(), + refresh: cy.stub(), + back: cy.stub(), + forward: cy.stub(), + prefetch: cy.stub().resolves(), + }; +} + +function mountList() { + const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } }); + cy.mount( + + + + + , + ); +} + +describe('DatasetList', () => { + it('renders a card per dataset with its summary stats', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list'); + mountList(); + 
cy.wait('@list'); + cy.contains('cc-traces-weka (full)').should('be.visible'); + cy.contains('cc-traces-weka (256k)').should('be.visible'); + cy.contains('1,234').should('be.visible'); // conversation_count, localized + cy.contains('82%').should('be.visible'); // cachedPct + cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist'); + }); + + it('shows the empty state when no datasets are ingested', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty'); + mountList(); + cy.wait('@empty'); + cy.contains('No datasets ingested yet.').should('be.visible'); + }); + + it('shows the error state when the request fails', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err'); + mountList(); + cy.wait('@err'); + cy.contains('Failed to load datasets.').should('be.visible'); + }); +}); diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx new file mode 100644 index 00000000..fb7e5461 --- /dev/null +++ b/packages/app/cypress/component/distribution-card.cy.tsx @@ -0,0 +1,45 @@ +import { DistributionCard } from '@/components/datasets/distribution-card'; +import type { Distribution } from '@/hooks/api/use-datasets'; + +const distribution: Distribution = { + bins: [ + { x0: 0, x1: 100, count: 5 }, + { x0: 100, x1: 200, count: 20 }, + { x0: 200, x1: 300, count: 12 }, + { x0: 300, x1: 400, count: 3 }, + ], + stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 }, +}; + +describe('DistributionCard', () => { + it('renders the title, summary stats, and one bar per bin', () => { + cy.mount( + , + ); + cy.contains('Input tokens per turn').should('be.visible'); + cy.contains('n=40').should('be.visible'); + cy.contains('median 175').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + // One filled bar rect per bin (ChartHover may add a transparent overlay rect). 
+ cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length); + }); + + it('shows a "No data" placeholder when no distribution is provided', () => { + cy.mount(); + cy.contains('Empty metric').should('be.visible'); + cy.contains('No data').should('be.visible'); + cy.get('rect[class*="fill-primary"]').should('not.exist'); + }); + + it('marks the chart as log scale when scale="log"', () => { + cy.mount( + , + ); + cy.contains('log scale').should('be.visible'); + }); +}); diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx new file mode 100644 index 00000000..1be90e0c --- /dev/null +++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx @@ -0,0 +1,86 @@ +import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; +import type { ConversationStructure } from '@/hooks/api/use-datasets'; + +// Two main turns followed by one subagent group with two child turns. +// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`). 
+const structure: ConversationStructure = { + blockSize: 64, + nodes: [ + { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2000, + out: 300, + cached: 1500, + uncached: 500, + }, + { + kind: 'subagent', + label: 'Subagent: search', + agentId: 'agent-1', + durationMs: 12000, + in: 5000, + out: 800, + cached: 3000, + uncached: 2000, + children: [ + { + kind: 'turn', + turnIndex: 0, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + ], + }, + ], + totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 }, +}; + +describe('TraceFlamegraph', () => { + it('renders the legend, main-turn rows, and the subagent group header', () => { + cy.mount(); + cy.contains('Cached prefix').should('be.visible'); + cy.contains('Uncached input').should('be.visible'); + cy.contains('Output').should('be.visible'); + cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1'); + cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2'); + cy.contains('Subagent: search').should('be.visible'); + }); + + it('keeps subagent children collapsed until the group is expanded', () => { + cy.mount(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + cy.contains('button', 'Subagent: search').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible'); + }); + + it('expand all / collapse all toggles every subagent group', () => { + cy.mount(); + cy.contains('button', 'Expand all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.contains('button', 'Collapse all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + }); + + it('auto-expands and highlights the target group child for a 
request-timeline deep link', () => { + cy.mount( + , + ); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary'); + }); +}); From 1c61ee3f597e22d33e891b73f7f95511a73844d3 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:47:02 -0500 Subject: [PATCH 090/111] refactor(datasets): extract shared compact() formatter, dedupe 5 local copies --- .../src/components/datasets/conversation-view.tsx | 9 +-------- .../app/src/components/datasets/dataset-detail.tsx | 9 +-------- .../app/src/components/datasets/dataset-list.tsx | 9 +-------- .../src/components/datasets/distribution-card.tsx | 11 +---------- packages/app/src/components/datasets/format.ts | 12 ++++++++++++ .../app/src/components/datasets/trace-flamegraph.tsx | 9 +-------- 6 files changed, 17 insertions(+), 42 deletions(-) create mode 100644 packages/app/src/components/datasets/format.ts diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index 739d3bb2..d39b83d9 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -6,14 +6,7 @@ import { useSearchParams } from 'next/navigation'; import { Card } from '@/components/ui/card'; import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; import { useDatasetConversation } from '@/hooks/api/use-datasets'; - -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; export function ConversationView({ slug, convId }: { slug: string; convId: string }) { const { data, isLoading, isError } = useDatasetConversation(slug, convId); diff --git 
a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index 57c50649..9410a505 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -18,14 +18,7 @@ import { type ConversationSort, } from '@/hooks/api/use-datasets'; import { track } from '@/lib/analytics'; - -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; const PAGE = 50; diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx index 5fcc0dfe..84b279db 100644 --- a/packages/app/src/components/datasets/dataset-list.tsx +++ b/packages/app/src/components/datasets/dataset-list.tsx @@ -5,14 +5,7 @@ import Link from 'next/link'; import { Card } from '@/components/ui/card'; import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets'; import { track } from '@/lib/analytics'; - -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; function DatasetCard({ d }: { d: DatasetRecord }) { const s = d.summary ?? 
{}; diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx index 7abc367f..d0c0f166 100644 --- a/packages/app/src/components/datasets/distribution-card.tsx +++ b/packages/app/src/components/datasets/distribution-card.tsx @@ -5,16 +5,7 @@ import { useMemo } from 'react'; import { Card } from '@/components/ui/card'; import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover'; import type { Distribution } from '@/hooks/api/use-datasets'; - -/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - if (abs > 0 && abs < 1) return n.toFixed(2); - return String(Math.round(n)); -} +import { compact } from './format'; interface DistributionCardProps { title: string; diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts new file mode 100644 index 00000000..f6f5530c --- /dev/null +++ b/packages/app/src/components/datasets/format.ts @@ -0,0 +1,12 @@ +/** + * Compact number formatter for dataset token/count displays: + * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82". 
+ */ +export function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + if (abs > 0 && abs < 1) return n.toFixed(2); + return String(Math.round(n)); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index a577193b..12ecb4a4 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -4,14 +4,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { createPortal } from 'react-dom'; import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; - -/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; // Stacked-bar segment colors. Cached prefix vs uncached input vs output — // fixed hues (theme-independent) so the meaning is stable in light/dark. 
From e2e5424e7071d380d05b7c1bcfddfc5bccfc3c5b Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 10:26:34 -0500 Subject: [PATCH 091/111] refactor(db): squash agentic migrations into 007_agentic.sql so numbering doesn't collide with master --- .claude/agents/ingest.md | 2 +- .../db/migrations/002_agentic_scenario.sql | 30 -- .../migrations/003_agentic_availability.sql | 21 -- packages/db/migrations/004_offload_mode.sql | 42 --- .../migrations/006_agentic_trace_replay.sql | 34 -- packages/db/migrations/007_agentic.sql | 326 ++++++++++++++++++ .../007_agentic_trace_server_metrics_json.sql | 17 - .../008_agentic_aggregate_stats.sql | 18 - .../migrations/009_agentic_chart_series.sql | 19 - .../010_agentic_request_timeline.sql | 15 - packages/db/migrations/011_datasets.sql | 55 --- packages/db/migrations/012_run_datasets.sql | 19 - 12 files changed, 327 insertions(+), 271 deletions(-) delete mode 100644 packages/db/migrations/002_agentic_scenario.sql delete mode 100644 packages/db/migrations/003_agentic_availability.sql delete mode 100644 packages/db/migrations/004_offload_mode.sql delete mode 100644 packages/db/migrations/006_agentic_trace_replay.sql create mode 100644 packages/db/migrations/007_agentic.sql delete mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql delete mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql delete mode 100644 packages/db/migrations/009_agentic_chart_series.sql delete mode 100644 packages/db/migrations/010_agentic_request_timeline.sql delete mode 100644 packages/db/migrations/011_datasets.sql delete mode 100644 packages/db/migrations/012_run_datasets.sql diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md index aa0099ac..4ecbc1dd 100644 --- a/.claude/agents/ingest.md +++ b/.claude/agents/ingest.md @@ -178,7 +178,7 @@ cd packages/db && DATABASE_WRITE_URL='' \ [--label "…"] [--variant full|256k] [--description "…"] [--limit N] ``` 
-It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). +It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). ## Don't diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql deleted file mode 100644 index c143914e..00000000 --- a/packages/db/migrations/002_agentic_scenario.sql +++ /dev/null @@ -1,30 +0,0 @@ --- Support agentic scenarios in benchmark_results. --- --- Scenarios are discriminated by benchmark_type: --- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. --- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. --- --- conc retains its meaning (concurrent users/requests) for both. - --- 1) isl/osl become nullable for agentic rows -alter table benchmark_results - alter column isl drop not null, - alter column osl drop not null; - --- 2) CHECK constraints: positive-or-null -alter table benchmark_results - drop constraint benchmark_results_isl_positive, - drop constraint benchmark_results_osl_positive; - -alter table benchmark_results - add constraint benchmark_results_isl_positive check (isl is null or isl > 0), - add constraint benchmark_results_osl_positive check (osl is null or osl > 0); - --- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows --- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). 
-alter table benchmark_results - drop constraint benchmark_results_unique; - -alter table benchmark_results - add constraint benchmark_results_unique unique nulls not distinct - (workflow_run_id, config_id, benchmark_type, isl, osl, conc); diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql deleted file mode 100644 index e96cbd50..00000000 --- a/packages/db/migrations/003_agentic_availability.sql +++ /dev/null @@ -1,21 +0,0 @@ --- Extend the availability table to cover agentic scenarios. --- --- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same --- for availability and add benchmark_type so the frontend can enumerate --- agentic vs single_turn scenarios per model/date. --- --- Postgres primary keys require every column to be NOT NULL, so we drop the PK --- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally --- equivalent except it allows isl/osl to be NULL for agentic rows. - -alter table availability - drop constraint availability_pkey; - -alter table availability - alter column isl drop not null, - alter column osl drop not null, - add column benchmark_type text not null default 'single_turn'; - -alter table availability - add constraint availability_natural_key unique nulls not distinct - (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql deleted file mode 100644 index 24b617f1..00000000 --- a/packages/db/migrations/004_offload_mode.sql +++ /dev/null @@ -1,42 +0,0 @@ --- Add offload_mode as a first-class dimension on benchmark_results. --- --- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace --- runs: a single run may emit two rows for the same (config, isl, osl, conc) --- — one with offload disabled, one enabled. 
The pre-existing unique key --- collapsed those into one row, forcing the ingest to skip variants. --- --- For fixed-seq runs `offload_mode` defaults to 'off', which matches the --- assumption baked into the existing 5,500+ rows. - -alter table benchmark_results - add column offload_mode text not null default 'off'; - --- Backfill agentic rows from the offload_mode value already living in metrics --- JSONB (set during the earlier agentic ingest backfill). -update benchmark_results - set offload_mode = metrics->>'offload_mode' - where benchmark_type = 'agentic_traces' - and metrics ? 'offload_mode'; - --- Replace the unique constraint so on/off variants can coexist. -alter table benchmark_results - drop constraint benchmark_results_unique; - -alter table benchmark_results - add constraint benchmark_results_unique unique nulls not distinct - (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); - --- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. -drop materialized view if exists latest_benchmarks cascade; - -create materialized view latest_benchmarks as -select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) - br.* -from benchmark_results br -join latest_workflow_runs wr on wr.id = br.workflow_run_id -where br.error is null -order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; - -create unique index latest_benchmarks_pk - on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; -create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql deleted file mode 100644 index 398bc725..00000000 --- a/packages/db/migrations/006_agentic_trace_replay.sql +++ /dev/null @@ -1,34 +0,0 @@ --- Capture raw aiperf trace files per agentic benchmark point. 
--- --- The aiperf harness produces two per-point export files inside each --- `agentic_` artifact: --- - profile_export.jsonl (~2 MB raw, per-request data) --- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots) --- --- We persist them so the dashboard can later show per-request distributions, --- KV cache utilization over time, and conversation traces without needing to --- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at --- ~500 KB per point post-gzip the total fits comfortably without a separate --- blob service. --- --- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK --- column on benchmark_results). Older, non-aiperf agentic runs simply have a --- NULL `trace_replay_id`. - -create table agentic_trace_replay ( - id bigserial primary key, - -- gzip(profile_export.jsonl); null when only the server metrics file existed - profile_export_jsonl_gz bytea, - profile_export_uncompressed_size bigint, - -- raw csv bytes; null when only the profile file existed - server_metrics_csv bytea, - server_metrics_csv_size bigint, - created_at timestamptz not null default now() -); - -alter table benchmark_results - add column trace_replay_id bigint references agentic_trace_replay(id); - -create index benchmark_results_trace_replay_idx - on benchmark_results (trace_replay_id) - where trace_replay_id is not null; diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/007_agentic.sql new file mode 100644 index 00000000..eceea82e --- /dev/null +++ b/packages/db/migrations/007_agentic.sql @@ -0,0 +1,326 @@ +-- 007_agentic.sql +-- +-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx +-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts +-- after master's highest migration (006_benchmark_results_workers), so the +-- branch's numbering no longer collides with master's 002-006. 
None of the +-- collapsed migrations had been applied to any deployed database. +-- +-- Statement order is preserved exactly. The latest_benchmarks recreate uses +-- 'select br.*', so it retains every benchmark_results column added earlier +-- (including master's 'workers' from 006) and re-keys the view on offload_mode. + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 002_agentic_scenario.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Support agentic scenarios in benchmark_results. +-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. + +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 003_agentic_availability.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Extend the availability table to cover agentic scenarios. 
+-- +-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. + +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 004_offload_mode.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. 
+alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. +drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 006_agentic_trace_replay.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Capture raw aiperf trace files per agentic benchmark point. +-- +-- The aiperf harness produces two per-point export files inside each +-- `agentic_` artifact: +-- - profile_export.jsonl (~2 MB raw, per-request data) +-- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots) +-- +-- We persist them so the dashboard can later show per-request distributions, +-- KV cache utilization over time, and conversation traces without needing to +-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at +-- ~500 KB per point post-gzip the total fits comfortably without a separate +-- blob service. +-- +-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK +-- column on benchmark_results). Older, non-aiperf agentic runs simply have a +-- NULL `trace_replay_id`. 
+ +create table agentic_trace_replay ( + id bigserial primary key, + -- gzip(profile_export.jsonl); null when only the server metrics file existed + profile_export_jsonl_gz bytea, + profile_export_uncompressed_size bigint, + -- raw csv bytes; null when only the profile file existed + server_metrics_csv bytea, + server_metrics_csv_size bigint, + created_at timestamptz not null default now() +); + +alter table benchmark_results + add column trace_replay_id bigint references agentic_trace_replay(id); + +create index benchmark_results_trace_replay_idx + on benchmark_results (trace_replay_id) + where trace_replay_id is not null; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 007_agentic_trace_server_metrics_json.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Add the full server-metrics time-series JSON to agentic_trace_replay. +-- +-- The existing `server_metrics_csv` column holds aiperf's summary export — +-- one row per metric with avg/min/max/std/p1..p99 across the entire run. +-- That's enough for the cumulative cache-hit number but not for any +-- "metric over time" view (KV cache utilization curve, queue depth, prefix +-- hit rate per interval, cumulative prefill token source). +-- +-- The harness also writes `server_metrics_export.json` which contains the +-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole +-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x +-- to ~6 MB gzipped (text with repeated metric names + numeric values). +-- That's the file we store here for any future time-series chart. 
+ +alter table agentic_trace_replay + add column server_metrics_json_gz bytea, + add column server_metrics_json_uncompressed_size bigint; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 008_agentic_aggregate_stats.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed aggregate stats for each agentic_trace_replay row. +-- +-- Previously the agentic detail page parsed the (huge) profile_export.jsonl +-- and server_metrics_json blobs on every request to compute distribution +-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived +-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the +-- worst rows (high-conc TP+EP server_metrics blobs that decompress past +-- Node's 512 MB string cap) couldn't be parsed without a stream fallback. +-- +-- This column holds the computed stats so the API serves the page from a +-- single SQL row read. Shape mirrors the existing benchmark_results.metrics +-- JSONB convention; an inner `version` field lets the backfill script +-- detect rows whose stats were computed by an older algorithm and +-- recompute them. Null when stats haven't been computed yet (existing +-- rows pre-backfill; the API has a slow-path fallback for that case). + +alter table agentic_trace_replay + add column aggregate_stats jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 009_agentic_chart_series.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed time-series for the agentic detail page chart. +-- +-- Sibling to `aggregate_stats` (migration 008): that column stores +-- per-row percentile/derived *summaries*, this one stores the full +-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate, +-- queueDepth, prefillTps, decodeTps, promptTokensBySource). 
+-- +-- Without this, the detail page parsed the entire `server_metrics_json_gz` +-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc +-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length). +-- With pre-computed series the page is a single SQL row read. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored series were produced by an older algorithm. +-- Null when the series haven't been computed yet; the API has a slow-path +-- fallback (with stream-parse for oversized blobs) for that case. + +alter table agentic_trace_replay + add column chart_series jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 010_agentic_request_timeline.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed per-request timeline for the agentic detail page. +-- +-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one +-- holds a thin per-request array extracted from `profile_export_jsonl_gz` +-- so the detail page can render a Gantt-style swimlane of every request +-- (one bar per conversation turn) without re-parsing the JSONL on every +-- page load. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored timeline was produced by an older +-- algorithm. Null when the timeline hasn't been computed yet; the API +-- falls back to parsing the blob in that case. + +alter table agentic_trace_replay + add column request_timeline jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 011_datasets.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora +-- the agentic benchmarks replay) + their per-conversation trace structure. 
+-- +-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but +-- not the source traces. These two tables back the new /datasets area: a +-- registry of ingested dataset versions with precomputed summary + chart data, +-- and one row per conversation holding a flamegraph-ready `structure` (turns + +-- subagent groups with input split into cached-prefix vs uncached-suffix). The +-- raw hash_ids are NOT stored — they're only needed at ingest to derive the +-- cached/uncached split, so the runtime read is a single small JSONB. +-- +-- Additive only. To revert this migration: +-- drop table if exists dataset_conversations; +-- drop table if exists datasets; +-- (and see the run_datasets revert below; this is all one migration now: +-- delete from schema_migrations where filename = '007_agentic.sql';) + +create table datasets ( + -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'. + id text primary key, + -- URL key, e.g. 'cc-traces-weka-062126'. + slug text not null unique, + label text not null, + -- 'full' | '256k' | 'no-subagents' (the published variants). + variant text not null default 'full', + description text, + hf_url text, + license text, + conversation_count integer not null default 0, + -- Token totals, main_turns, subagent_groups, model mix, date range, etc. + summary jsonb not null default '{}'::jsonb, + -- Precomputed distributions for the dataset-detail cards (input/output length, + -- turns per conversation, subagent fan-out, …). Versioned via an inner field. + chart_data jsonb not null default '{}'::jsonb, + dataset_version integer not null default 1, + ingested_at timestamptz not null default now() +); + +create table dataset_conversations ( + id bigserial primary key, + dataset_id text not null references datasets(id) on delete cascade, + -- The conversation id from the dataset record (trace id). 
+ conv_id text not null, + models text[] not null default '{}', + num_turns integer not null default 0, + num_subagent_groups integer not null default 0, + total_in bigint not null default 0, + total_out bigint not null default 0, + total_cached bigint not null default 0, + -- Flamegraph-ready ordered node tree (turns + subagent groups, each with + -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts. + structure jsonb not null, + unique (dataset_id, conv_id) +); + +create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 012_run_datasets.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Maps a benchmark workflow_run to the source dataset it replayed, so the +-- agentic detail page can deep-link each request in the timeline to the exact +-- conversation in the /datasets viewer (the request's conversation_id, with any +-- ::sa:/::fa: suffix stripped, is the dataset conv_id). +-- +-- One row per workflow_run (every benchmark in a run replays the same dataset). +-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/ +-- URL) rather than an FK, so the mapping can be recorded before/independent of +-- the dataset being ingested; the UI degrades gracefully if the slug is absent. +-- +-- Additive only. 
To revert this whole squashed migration: +-- drop table if exists run_datasets; +-- drop table if exists dataset_conversations; +-- drop table if exists datasets; +-- drop table if exists agentic_trace_replay cascade; +-- (plus the benchmark_results/availability column + constraint changes above) +-- delete from schema_migrations where filename = '007_agentic.sql'; + +create table run_datasets ( + workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, + dataset_slug text not null, + created_at timestamptz not null default now() +); diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql deleted file mode 100644 index ba7bd095..00000000 --- a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql +++ /dev/null @@ -1,17 +0,0 @@ --- Add the full server-metrics time-series JSON to agentic_trace_replay. --- --- The existing `server_metrics_csv` column holds aiperf's summary export — --- one row per metric with avg/min/max/std/p1..p99 across the entire run. --- That's enough for the cumulative cache-hit number but not for any --- "metric over time" view (KV cache utilization curve, queue depth, prefix --- hit rate per interval, cumulative prefill token source). --- --- The harness also writes `server_metrics_export.json` which contains the --- raw per-scrape (~1Hz) values for every Prometheus metric over the whole --- benchmark window. Raw size is ~250 MB per point but it compresses ~42x --- to ~6 MB gzipped (text with repeated metric names + numeric values). --- That's the file we store here for any future time-series chart. 
- -alter table agentic_trace_replay - add column server_metrics_json_gz bytea, - add column server_metrics_json_uncompressed_size bigint; diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql deleted file mode 100644 index d55533b9..00000000 --- a/packages/db/migrations/008_agentic_aggregate_stats.sql +++ /dev/null @@ -1,18 +0,0 @@ --- Pre-computed aggregate stats for each agentic_trace_replay row. --- --- Previously the agentic detail page parsed the (huge) profile_export.jsonl --- and server_metrics_json blobs on every request to compute distribution --- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived --- metrics (session-time, p90 prefill TPS). That took ~20s per row and the --- worst rows (high-conc TP+EP server_metrics blobs that decompress past --- Node's 512 MB string cap) couldn't be parsed without a stream fallback. --- --- This column holds the computed stats so the API serves the page from a --- single SQL row read. Shape mirrors the existing benchmark_results.metrics --- JSONB convention; an inner `version` field lets the backfill script --- detect rows whose stats were computed by an older algorithm and --- recompute them. Null when stats haven't been computed yet (existing --- rows pre-backfill; the API has a slow-path fallback for that case). - -alter table agentic_trace_replay - add column aggregate_stats jsonb; diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql deleted file mode 100644 index b42718b9..00000000 --- a/packages/db/migrations/009_agentic_chart_series.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Pre-computed time-series for the agentic detail page chart. 
--- --- Sibling to `aggregate_stats` (migration 008): that column stores --- per-row percentile/derived *summaries*, this one stores the full --- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate, --- queueDepth, prefillTps, decodeTps, promptTokensBySource). --- --- Without this, the detail page parsed the entire `server_metrics_json_gz` --- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc --- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length). --- With pre-computed series the page is a single SQL row read. --- --- Shape includes an inner `version` field so the backfill script can --- recompute rows whose stored series were produced by an older algorithm. --- Null when the series haven't been computed yet; the API has a slow-path --- fallback (with stream-parse for oversized blobs) for that case. - -alter table agentic_trace_replay - add column chart_series jsonb; diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql deleted file mode 100644 index 756b775e..00000000 --- a/packages/db/migrations/010_agentic_request_timeline.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Pre-computed per-request timeline for the agentic detail page. --- --- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one --- holds a thin per-request array extracted from `profile_export_jsonl_gz` --- so the detail page can render a Gantt-style swimlane of every request --- (one bar per conversation turn) without re-parsing the JSONL on every --- page load. --- --- Shape includes an inner `version` field so the backfill script can --- recompute rows whose stored timeline was produced by an older --- algorithm. Null when the timeline hasn't been computed yet; the API --- falls back to parsing the blob in that case. 
- -alter table agentic_trace_replay - add column request_timeline jsonb; diff --git a/packages/db/migrations/011_datasets.sql b/packages/db/migrations/011_datasets.sql deleted file mode 100644 index 7a70d83f..00000000 --- a/packages/db/migrations/011_datasets.sql +++ /dev/null @@ -1,55 +0,0 @@ --- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora --- the agentic benchmarks replay) + their per-conversation trace structure. --- --- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but --- not the source traces. These two tables back the new /datasets area: a --- registry of ingested dataset versions with precomputed summary + chart data, --- and one row per conversation holding a flamegraph-ready `structure` (turns + --- subagent groups with input split into cached-prefix vs uncached-suffix). The --- raw hash_ids are NOT stored — they're only needed at ingest to derive the --- cached/uncached split, so the runtime read is a single small JSONB. --- --- Additive only. To revert this migration: --- drop table if exists dataset_conversations; --- drop table if exists datasets; --- delete from schema_migrations where filename = '011_datasets.sql'; - -create table datasets ( - -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'. - id text primary key, - -- URL key, e.g. 'cc-traces-weka-062126'. - slug text not null unique, - label text not null, - -- 'full' | '256k' | 'no-subagents' (the published variants). - variant text not null default 'full', - description text, - hf_url text, - license text, - conversation_count integer not null default 0, - -- Token totals, main_turns, subagent_groups, model mix, date range, etc. - summary jsonb not null default '{}'::jsonb, - -- Precomputed distributions for the dataset-detail cards (input/output length, - -- turns per conversation, subagent fan-out, …). Versioned via an inner field. 
- chart_data jsonb not null default '{}'::jsonb, - dataset_version integer not null default 1, - ingested_at timestamptz not null default now() -); - -create table dataset_conversations ( - id bigserial primary key, - dataset_id text not null references datasets(id) on delete cascade, - -- The conversation id from the dataset record (trace id). - conv_id text not null, - models text[] not null default '{}', - num_turns integer not null default 0, - num_subagent_groups integer not null default 0, - total_in bigint not null default 0, - total_out bigint not null default 0, - total_cached bigint not null default 0, - -- Flamegraph-ready ordered node tree (turns + subagent groups, each with - -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts. - structure jsonb not null, - unique (dataset_id, conv_id) -); - -create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id); diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql deleted file mode 100644 index 58dd9f88..00000000 --- a/packages/db/migrations/012_run_datasets.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Maps a benchmark workflow_run to the source dataset it replayed, so the --- agentic detail page can deep-link each request in the timeline to the exact --- conversation in the /datasets viewer (the request's conversation_id, with any --- ::sa:/::fa: suffix stripped, is the dataset conv_id). --- --- One row per workflow_run (every benchmark in a run replays the same dataset). --- dataset_slug is a plain slug (matches datasets.slug / the /datasets/ --- URL) rather than an FK, so the mapping can be recorded before/independent of --- the dataset being ingested; the UI degrades gracefully if the slug is absent. --- --- Additive only. 
To revert: --- drop table if exists run_datasets; --- delete from schema_migrations where filename = '012_run_datasets.sql'; - -create table run_datasets ( - workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, - dataset_slug text not null, - created_at timestamptz not null default now() -); From 772dfef5cde7a79d02963a9f151cb43b6592920e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 23 Jun 2026 10:57:37 -0500 Subject: [PATCH 092/111] add agentic time-series and dataset timing --- .../e2e/agentic-point-time-series.cy.ts | 98 +++++++++++++++++++ .../e2e/datasets-flamegraph-time.cy.ts | 85 ++++++++++++++++ .../components/datasets/conversation-view.tsx | 3 +- .../datasets/trace-flamegraph.test.ts | 16 +++ .../components/datasets/trace-flamegraph.tsx | 35 +++++++ .../agentic-point/agentic-point-detail.tsx | 97 +++++++++++++++++- .../agentic-point/expandable-chart.tsx | 30 ++++-- .../agentic-point/time-series-chart.test.ts | 73 +++++++++++++- .../agentic-point/time-series-chart.tsx | 60 ++++++++++++ .../app/src/hooks/api/use-request-timeline.ts | 2 + .../src/etl/compute-request-timeline.test.ts | 25 ++++- .../db/src/etl/compute-request-timeline.ts | 12 ++- packages/db/src/etl/weka-structure.test.ts | 28 +++++- packages/db/src/etl/weka-structure.ts | 40 ++++++++ 14 files changed, 586 insertions(+), 18 deletions(-) create mode 100644 packages/app/cypress/e2e/agentic-point-time-series.cy.ts create mode 100644 packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts create mode 100644 packages/app/src/components/datasets/trace-flamegraph.test.ts diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts new file mode 100644 index 00000000..b0cfb60d --- /dev/null +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -0,0 +1,98 @@ +const timelineRequest = ( + index: number, + ttftMs: number, + tpotMs: number, + overrides: Record = {}, +) => ({ + cid: 
'conversation-1', + ti: index, + wid: 'worker-1', + ad: 0, + phase: 'profiling', + credit: index * 1_000_000_000, + start: index * 1_000_000_000, + ack: null, + end: (index + 1) * 1_000_000_000, + ttftMs, + tpotMs, + isl: 1024, + osl: 128, + cancelled: false, + ...overrides, +}); + +describe('Agentic point request metric time series', () => { + before(() => { + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/request-timeline*', { + body: { + version: 3, + startNs: 0, + endNs: 7_000_000_000, + durationS: 7, + requests: [ + timelineRequest(0, 100, 10), + timelineRequest(1, 200, 20), + timelineRequest(2, 400, 25), + timelineRequest(3, 800, 40), + timelineRequest(4, 1600, 80), + timelineRequest(5, 3200, 160, { phase: 'warmup' }), + timelineRequest(6, 6400, 320, { cancelled: true }), + ], + }, + }); + cy.visit('/inference/agentic/206885'); + }); + + it('renders rolling P75 interactivity and TTFT using profiling requests only', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('h2', 'Interactivity over time').should('be.visible'); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P75'); + cy.get('svg circle').should('have.length', 5); + cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); + cy.get('svg').should('contain.text', '1 / cumulative mean TPOT'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.contains('h2', 'TTFT over time').should('be.visible'); + cy.get('svg circle').should('have.length', 5); + cy.get('svg').should('contain.text', 'TTFT (s)'); + cy.get('svg').should('contain.text', 'Cumulative mean TTFT'); + cy.get('svg 
path[stroke="#ef4444"]').should('have.length', 1); + }); + }); + + it('switches each chart independently from P75 to P90', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('svg', 'P75 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .as('p75Path'); + cy.contains('button', 'P90').click(); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P90'); + cy.contains('svg', 'P90 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .then(function (p90Path) { + expect(p90Path).not.to.equal(this.p75Path); + }); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="ttft-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P75'); + cy.contains('button', 'P90').click(); + cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + }); + }); +}); diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts new file mode 100644 index 00000000..672675a3 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts @@ -0,0 +1,85 @@ +describe('Dataset conversation flamegraph timing', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', { + body: { + conv_id: 'conversation-1', + models: ['model-a'], + num_turns: 2, + num_subagent_groups: 1, + total_in: 1000, + total_out: 100, + total_cached: 500, + structure: { + blockSize: 64, + totals: { + in: 1000, + out: 100, + cached: 500, + uncached: 500, + numTurns: 2, + numSubagentGroups: 1, + }, + nodes: [ + { + kind: 'turn', + turnIndex: 0, + startS: 0, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + { + kind: 'subagent', + label: 'Explore', + agentId: 'agent-1', + startS: 3661.2, + endS: 3782.6, + durationMs: 121_400, + in: 800, + 
out: 80, + cached: 500, + uncached: 300, + children: [ + { + kind: 'turn', + turnIndex: 1, + startS: 3661.2, + model: 'model-a', + in: 800, + out: 80, + cached: 500, + uncached: 300, + }, + ], + }, + { + kind: 'turn', + turnIndex: 2, + startS: 65.4, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + ], + }, + }, + }); + cy.visit('/datasets/test-dataset/conversations/conversation-1'); + }); + + it('shows turn offsets and a collapsed subagent time range', () => { + cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00'); + cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05'); + cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03'); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist'); + }); + + it('shows subturn offsets when the subagent group is expanded', () => { + cy.contains('button', 'Explore').click(); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01'); + }); +}); diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index d39b83d9..57aaa0c3 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -87,7 +87,8 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin

One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default — click a group to expand it. Each bar splits input into cached prefix and uncached suffix, - plus generated output. + plus generated output. Timestamps are elapsed from conversation start; subagent headers + show their full active range.

/**
 * Format an offset from conversation start as a compact elapsed timestamp.
 *
 * The offset is rounded to the nearest whole second and rendered as `MM:SS`,
 * switching to `H:MM:SS` once it reaches one hour. Hours are not wrapped at
 * 24, so an offset of 24h 2m 21s renders as `24:02:21`.
 *
 * @param seconds - Offset in seconds; fractional values are rounded.
 * @returns The formatted timestamp. Negative and non-finite offsets (NaN,
 *   ±Infinity) are pinned to the conversation origin, `00:00`, instead of
 *   leaking `NaN:NaN` into the UI.
 */
export function formatElapsedTime(seconds: number): string {
  // Math.round(NaN) is NaN and would poison every field below, so guard first.
  const total = Number.isFinite(seconds) ? Math.max(0, Math.round(seconds)) : 0;
  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  const secs = total % 60;
  const mm = String(minutes).padStart(2, '0');
  const ss = String(secs).padStart(2, '0');
  return hours > 0 ? `${hours}:${mm}:${ss}` : `${mm}:${ss}`;
}

/**
 * Build the `+start` / `+start–end` label shown next to a flamegraph row.
 *
 * @param startS - Row start, in seconds from conversation start.
 * @param endS - Optional row end, in seconds from conversation start.
 * @returns `undefined` when the start is missing or non-finite; `+start` when
 *   there is no usable end (missing, non-finite, or not after the start);
 *   otherwise `+start–end`.
 */
function timeLabel(startS?: number, endS?: number): string | undefined {
  if (startS === undefined || !Number.isFinite(startS)) return undefined;
  const start = formatElapsedTime(startS);
  if (endS === undefined || !Number.isFinite(endS) || endS <= startS) return `+${start}`;
  const end = formatElapsedTime(endS);
  // Both ends are rounded to whole seconds; a sub-second span would otherwise
  // print a degenerate range such as "+01:05–01:05".
  return end === start ? `+${start}` : `+${start}–${end}`;
}
undefined, + timeLabel: timeLabel(node.startS), cached: node.cached, uncached: node.uncached, output: node.out, @@ -168,6 +188,7 @@ export function TraceFlamegraph({ sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${ node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '' }`, + timeLabel: timeLabel(node.startS, node.endS), cached: node.cached, uncached: node.uncached, output: node.out, @@ -183,6 +204,7 @@ export function TraceFlamegraph({ key: `g-${i}-c-${ci}`, label: `↳ subturn ${ci + 1}`, sublabel: child.model ?? undefined, + timeLabel: timeLabel(child.startS), cached: child.cached, uncached: child.uncached, output: child.out, @@ -291,6 +313,15 @@ export function TraceFlamegraph({ )}
+ {/* Offset from conversation start. Group rows span the full + subagent lifetime; leaf rows show their start instant. */} +
+ {row.timeLabel ?? '—'} +
+ {/* stacked bar — group headers render as a slim muted summary strip so they read as aggregates, not individual turns. */}
+ From start + + {tooltip.row.timeLabel ?? '—'} +
, document.body, diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 4a076955..e24b7e6b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -6,7 +6,7 @@ import { useState } from 'react'; import { ArrowLeft } from 'lucide-react'; import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates'; -import { useRequestTimeline } from '@/hooks/api/use-request-timeline'; +import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-request-timeline'; import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; import { useTraceServerMetrics, @@ -16,6 +16,7 @@ import { } from '@/hooks/api/use-trace-server-metrics'; import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings'; import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from '@/lib/analytics'; import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart'; import { Distribution } from './distribution'; @@ -30,8 +31,11 @@ import { cumulativeUniqueInputTokens, inflightUniqueTokens, rollingAverage, + rollingRequestMetric, sumSeries, timeRollingAverage, + type RequestMetric, + type RequestPercentile, } from './time-series-chart'; interface Props { @@ -114,6 +118,83 @@ const VIEW_OPTIONS: SegmentedToggleOption[] = [ { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' }, ]; +const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'p75', label: 'P75' }, + { value: 'p90', label: 'P90' }, +]; + +// Unofficial-run overlays cannot open this persisted point-detail route: they +// have no benchmark_results id or stored request timeline. 
These charts are +// therefore intentionally limited to DB-backed agentic points. +function RequestMetricOverTime({ + title, + metric, + timeline, + isLoading, +}: { + title: string; + metric: RequestMetric; + timeline: RequestTimeline | null | undefined; + isLoading: boolean; +}) { + const [percentile, setPercentile] = useState('p75'); + const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null; + const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity'; + const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4'; + + const controls = ( + { + setPercentile(value); + track('inference_agentic_percentile_changed', { metric, percentile: value }); + }} + ariaLabel={`${metricLabel} percentile`} + testId={`${metric}-percentile-toggle`} + /> + ); + + return ( + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!timeline) return isLoading ? : ; + return ( + `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s` + : (value) => `${value.toFixed(0)}` + } + yAxisLabel={metric === 'ttft' ? 'TTFT (s)' : 'Interactivity (tok/s/user)'} + {...size} + /> + ); + }} + /> + ); +} + /** Bundle per-percentile values for one sibling into the shape AggregateChart wants. 
*/ function toAggPoint( sibling: { id: number; label: string }, @@ -254,6 +335,20 @@ export function AgenticPointDetail({ id }: Props) { }} /> + + + + { diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx index 7c8e4538..cb5987ec 100644 --- a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx @@ -13,30 +13,40 @@ import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/u export function ExpandableChart({ title, render, + controls, + testId, }: { title: string; render: (expanded: boolean) => ReactNode; + controls?: ReactNode; + testId?: string; }) { const [open, setOpen] = useState(false); return ( -
+

{title}

- +
+ {controls} + +
{render(false)} - {title} +
+ {title} + {controls} +
{render(true)}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts index 64deace4..926772db 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts @@ -1,6 +1,77 @@ import { describe, expect, it } from 'vitest'; -import { cumulativeUniqueInputTokens } from './time-series-chart'; +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { cumulativeUniqueInputTokens, rollingRequestMetric } from './time-series-chart'; + +const request = ( + endS: number, + ttftMs: number | null, + tpotMs: number | null, + overrides: Partial = {}, +): RequestRecord => ({ + cid: 'conversation', + ti: endS, + wid: 'worker', + ad: 0, + phase: 'profiling', + credit: 0, + start: 0, + ack: null, + end: endS * 1e9, + ttftMs, + tpotMs, + isl: 100, + osl: 10, + cancelled: false, + ...overrides, +}); + +describe('rollingRequestMetric', () => { + it('computes a trailing P75 TTFT over the requested window', () => { + const result = rollingRequestMetric( + [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)], + 'ttft', + 'p75', + 3, + ); + + expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 }); + expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]); + expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]); + }); + + it('inverts the rolling TPOT percentile for interactivity', () => { + const result = rollingRequestMetric( + [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)], + 'interactivity', + 'p90', + 3, + ); + + expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]); + expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8); + expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]); + }); + 
+ it('drops warmup, cancelled, missing, and non-positive samples', () => { + const result = rollingRequestMetric( + [ + request(1, 100, 10), + request(2, 200, 20, { phase: 'warmup' }), + request(3, 300, 30, { cancelled: true }), + request(4, null, null), + request(5, 0, 0), + ], + 'ttft', + 'p90', + ); + + expect(result.raw).toEqual([{ t: 1, value: 0.1 }]); + expect(result.trend).toEqual([{ t: 1, value: 0.1 }]); + expect(result.cumulative).toEqual([{ t: 1, value: 0.1 }]); + }); +}); describe('cumulativeUniqueInputTokens', () => { it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => { diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 6b00b1e6..749a17e4 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -3,6 +3,7 @@ import { useMemo } from 'react'; import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics'; +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; import { ChartHover, type HoverItem } from './chart-hover'; @@ -32,6 +33,65 @@ interface TimeSeriesChartProps { height?: number; } +export type RequestMetric = 'interactivity' | 'ttft'; +export type RequestPercentile = 'p75' | 'p90'; + +/** Linear-interpolated percentile (matches numpy's default method). */ +function quantile(sortedAsc: number[], q: number): number { + if (sortedAsc.length === 1) return sortedAsc[0]!; + const pos = (sortedAsc.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sortedAsc[lo]!; + return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo); +} + +/** + * Build raw request samples plus a trailing request-count percentile. + * + * The percentile is computed in latency space. 
Interactivity then inverts + * the selected TPOT percentile, matching the aggregate chart convention: + * P90 interactivity = 1 / P90 TPOT (a conservative tail-latency view). + */ +export function rollingRequestMetric( + requests: readonly RequestRecord[], + metric: RequestMetric, + percentile: RequestPercentile, + windowSize = 50, +): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } { + const q = percentile === 'p75' ? 0.75 : 0.9; + const samples = requests + .filter((request) => request.phase === 'profiling' && !request.cancelled) + .flatMap((request) => { + const latencyMs = metric === 'ttft' ? request.ttftMs : request.tpotMs; + if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return []; + return [{ t: request.end / 1e9, latencyMs }]; + }) + .toSorted((a, b) => a.t - b.t); + + const raw = samples.map(({ t, latencyMs }) => ({ + t, + value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs, + })); + const trend = samples.map(({ t }, i) => { + const start = Math.max(0, i - Math.max(1, windowSize) + 1); + const sorted = samples + .slice(start, i + 1) + .map((sample) => sample.latencyMs) + .toSorted((a, b) => a - b); + const latencyMs = quantile(sorted, q); + return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs }; + }); + let latencySumMs = 0; + const cumulative = samples.map(({ t, latencyMs }, i) => { + latencySumMs += latencyMs; + const meanLatencyMs = latencySumMs / (i + 1); + return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs }; + }); + + return { raw, trend, cumulative }; +} + /** * Time-weighted rolling average over a `windowS`-second trailing window. 
* Treats the input as a step function (value held constant between diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts index d3ceaab8..094d2230 100644 --- a/packages/app/src/hooks/api/use-request-timeline.ts +++ b/packages/app/src/hooks/api/use-request-timeline.ts @@ -20,6 +20,8 @@ export interface RequestRecord { /** ns offset from timeline.startNs. Last byte received. */ end: number; ttftMs: number | null; + /** Time per output token in ms. */ + tpotMs: number | null; isl: number | null; osl: number | null; cancelled: boolean; diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts index 64512aca..61e69fe8 100644 --- a/packages/db/src/etl/compute-request-timeline.test.ts +++ b/packages/db/src/etl/compute-request-timeline.test.ts @@ -15,6 +15,8 @@ interface SyntheticRequest { end: number; ack?: number | null; ttftMs?: number | null; + tpotMs?: number | null; + tpotKey?: 'inter_token_latency' | 'time_per_output_token'; isl?: number | null; osl?: number | null; cancelled?: boolean; @@ -37,6 +39,8 @@ function makeBlob(requests: SyntheticRequest[]) { }, metrics: { time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' }, + [r.tpotKey ?? 'inter_token_latency']: + r.tpotMs === null ? null : { value: r.tpotMs ?? 10, unit: 'ms' }, input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' }, output_sequence_length: { value: r.osl ?? 
10, unit: 'tokens' }, }, @@ -115,7 +119,7 @@ describe('computeRequestTimeline', () => { expect(r.phase).toBe('profiling'); }); - it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => { + it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => { const tl = computeRequestTimeline( makeBlob([ { @@ -125,6 +129,7 @@ describe('computeRequestTimeline', () => { start: 10, end: 100, ttftMs: 25.5, + tpotMs: 12.5, isl: 1024, osl: 256, cancelled: true, @@ -134,10 +139,28 @@ describe('computeRequestTimeline', () => { const r = tl?.requests[0]!; expect(r.cancelled).toBe(true); expect(r.ttftMs).toBeCloseTo(25.5, 6); + expect(r.tpotMs).toBeCloseTo(12.5, 6); expect(r.isl).toBe(1024); expect(r.osl).toBe(256); }); + it('accepts time_per_output_token as a TPOT alias', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'a', + ti: 0, + credit: 0, + start: 10, + end: 100, + tpotMs: 8.25, + tpotKey: 'time_per_output_token', + }, + ]), + ); + expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6); + }); + it('skips records missing both credit_issued_ns and request_start_ns', () => { // Build a record with only request_end_ns — the helper rejects it. const broken = gzipSync( diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts index a1134f7a..707e8c54 100644 --- a/packages/db/src/etl/compute-request-timeline.ts +++ b/packages/db/src/etl/compute-request-timeline.ts @@ -14,7 +14,7 @@ import { gunzipSync } from 'node:zlib'; /** Bump when the extraction algorithm changes — backfill recomputes anything older. */ -export const REQUEST_TIMELINE_VERSION = 1; +export const REQUEST_TIMELINE_VERSION = 3; export interface RequestRecord { /** Conversation id (groups turns of one agent session). */ @@ -37,6 +37,8 @@ export interface RequestRecord { end: number; /** Time-to-first-token in ms. */ ttftMs: number | null; + /** Time per output token in ms. 
*/ + tpotMs: number | null; /** Input sequence length (tokens). */ isl: number | null; /** Output sequence length (tokens). */ @@ -76,6 +78,8 @@ interface RawRecord { metadata?: RawMetadata; metrics?: { time_to_first_token?: RawMetricValue | number; + time_per_output_token?: RawMetricValue | number; + inter_token_latency?: RawMetricValue | number; input_sequence_length?: RawMetricValue | number; output_sequence_length?: RawMetricValue | number; }; @@ -108,6 +112,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n const raw: { meta: RawMetadata; ttftMs: number | null; + tpotMs: number | null; isl: number | null; osl: number | null; }[] = []; @@ -135,6 +140,10 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n raw.push({ meta, ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null, + tpotMs: + readNum(rec.metrics?.time_per_output_token) ?? + readNum(rec.metrics?.inter_token_latency) ?? + null, isl: readNum(rec.metrics?.input_sequence_length) ?? null, osl: readNum(rec.metrics?.output_sequence_length) ?? 
null, }); @@ -163,6 +172,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n ack, end, ttftMs: r.ttftMs, + tpotMs: r.tpotMs, isl: r.isl, osl: r.osl, cancelled: m.was_cancelled === true, diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index 95bfef38..5287b682 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -86,17 +86,18 @@ describe('buildConversationStructure', () => { id: 'c4', block_size: 64, requests: [ - { type: 'n', model: 'main', in: 64, out: 10, hash_ids: [1] }, + { type: 'n', model: 'main', t: 0, in: 64, out: 10, hash_ids: [1] }, { type: 'subagent', agent_id: 'a1', subagent_type: 'Explore', + t: 12.5, duration_ms: 1234, requests: [ // sees parent block 1 (snapshot at spawn) → 1 block cached - { type: 'n', model: 'sub', in: 128, out: 7, hash_ids: [1, 5] }, + { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] }, // now block 5 is also seen within the subagent → 2 cached - { type: 'n', model: 'sub', in: 128, out: 3, hash_ids: [1, 5] }, + { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] }, ], }, // Parent turn after subagent: block 5 must NOT be cached (subagent @@ -113,7 +114,10 @@ describe('buildConversationStructure', () => { expect(sub.label).toBe('Explore'); expect(sub.agentId).toBe('a1'); expect(sub.durationMs).toBe(1234); + expect(sub.startS).toBe(12.5); + expect(sub.endS).toBeCloseTo(13.734, 6); expect(sub.children).toHaveLength(2); + expect(sub.children.map((child) => child.startS)).toEqual([12.5, 13.1]); expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child expect(sub.in).toBe(256); @@ -132,6 +136,24 @@ describe('buildConversationStructure', () => { expect(s.blockSize).toBe(64); expect((s.nodes[0] as SubagentNode).label).toBe('Subagent'); }); + + it('derives a subagent time 
range from child timings when group timing is absent', () => { + const conv: RawWekaConversation = { + id: 'c6', + requests: [ + { + type: 'subagent', + requests: [ + { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 }, + { type: 'n', t: 9, api_time: 3, in: 10, out: 1 }, + ], + }, + ], + }; + const sub = buildConversationStructure(conv).nodes[0] as SubagentNode; + expect(sub.startS).toBe(5); + expect(sub.endS).toBe(12); + }); }); describe('histograms', () => { diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index e4113c68..33e222b4 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -48,6 +48,8 @@ export interface RawWekaConversation { export interface TurnNode { kind: 'turn'; turnIndex: number; + /** Seconds from the start of the conversation. */ + startS?: number; model?: string; in: number; out: number; @@ -61,6 +63,10 @@ export interface SubagentNode { kind: 'subagent'; label: string; agentId?: string; + /** Seconds from the start of the conversation. */ + startS?: number; + /** Seconds from the start of the conversation. */ + endS?: number; durationMs?: number; in: number; out: number; @@ -130,6 +136,35 @@ function subagentLabel(s: RawWekaSubagent): string { return base && base.length > 0 ? base : 'Subagent'; } +function finiteTime(value: number | undefined): number | undefined { + return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined; +} + +function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } { + const children = entry.requests ?? []; + const childStarts = children + .map((child) => finiteTime(child.t)) + .filter((value): value is number => value !== undefined); + const startS = + finiteTime(entry.t) ?? (childStarts.length > 0 ? 
Math.min(...childStarts) : undefined); + const durationMs = finiteTime(entry.duration_ms); + if (startS !== undefined && durationMs !== undefined) { + return { startS, endS: startS + durationMs / 1000 }; + } + + const childEnds = children + .map((child) => { + const childStart = finiteTime(child.t); + if (childStart === undefined) return undefined; + return childStart + (finiteTime(child.api_time) ?? 0); + }) + .filter((value): value is number => value !== undefined); + return { + startS, + endS: childEnds.length > 0 ? Math.max(...childEnds) : startS, + }; +} + /** * Build the flamegraph structure for one conversation. Main turns share a single * accumulating prefix-cache `seen` set; each subagent group runs against a @@ -153,6 +188,7 @@ export function buildConversationStructure( for (const entry of conv.requests ?? []) { if (isSubagent(entry)) { + const { startS, endS } = subagentTimeRange(entry); const childSeen = new Set(seen); // snapshot at spawn; not merged back const children: TurnNode[] = []; let gin = 0; @@ -165,6 +201,7 @@ export function buildConversationStructure( children.push({ kind: 'turn', turnIndex: turnIndex++, + startS: finiteTime(inner.t), model: inner.model, in: split.in, out, @@ -180,6 +217,8 @@ export function buildConversationStructure( kind: 'subagent', label: subagentLabel(entry), agentId: entry.agent_id, + startS, + endS, durationMs: entry.duration_ms, in: gin, out: gout, @@ -198,6 +237,7 @@ export function buildConversationStructure( nodes.push({ kind: 'turn', turnIndex: turnIndex++, + startS: finiteTime(entry.t), model: entry.model, in: split.in, out, From 13471d75072d574d42be008a462dbfce9467c95d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 23 Jun 2026 13:44:55 -0500 Subject: [PATCH 093/111] add dataset percentile distributions --- .../component/distribution-card.cy.tsx | 41 ++++++++- .../cypress/e2e/datasets-distributions.cy.ts | 90 +++++++++++++++++++ .../components/datasets/dataset-detail.tsx | 6 ++ 
.../components/datasets/distribution-card.tsx | 23 +++-- packages/app/src/hooks/api/use-datasets.ts | 5 ++ packages/db/src/etl/weka-structure.test.ts | 18 ++++ packages/db/src/etl/weka-structure.ts | 46 ++++++++++ packages/db/src/ingest-weka-dataset.ts | 50 ++++------- 8 files changed, 235 insertions(+), 44 deletions(-) create mode 100644 packages/app/cypress/e2e/datasets-distributions.cy.ts diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx index fb7e5461..511505b9 100644 --- a/packages/app/cypress/component/distribution-card.cy.tsx +++ b/packages/app/cypress/component/distribution-card.cy.tsx @@ -8,7 +8,16 @@ const distribution: Distribution = { { x0: 200, x1: 300, count: 12 }, { x0: 300, x1: 400, count: 3 }, ], - stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 }, + stats: { + count: 40, + min: 10, + max: 390, + mean: 180, + median: 175, + p75: 250, + p90: 320, + p95: 360, + }, }; describe('DistributionCard', () => { @@ -18,8 +27,13 @@ describe('DistributionCard', () => { ); cy.contains('Input tokens per turn').should('be.visible'); cy.contains('n=40').should('be.visible'); - cy.contains('median 175').should('be.visible'); + cy.contains('p50 175').should('be.visible'); + cy.contains('p75 250').should('be.visible'); cy.contains('p90 320').should('be.visible'); + cy.contains('p95 360').should('be.visible'); + cy.get( + 'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]', + ).should('have.length', 8); // One filled bar rect per bin (ChartHover may add a transparent overlay rect). 
cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length); }); @@ -42,4 +56,27 @@ describe('DistributionCard', () => { ); cy.contains('log scale').should('be.visible'); }); + + it('renders older v1 stats without unavailable percentile guides', () => { + cy.mount( + , + ); + cy.contains('p50 175').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + cy.contains('NaN').should('not.exist'); + }); }); diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts new file mode 100644 index 00000000..7edda341 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts @@ -0,0 +1,90 @@ +const distribution = (values: { + median: number; + p75: number; + p90: number; + p95: number; + max: number; +}) => ({ + bins: [ + { x0: 0, x1: 10, count: 5 }, + { x0: 10, x1: 100, count: 15 }, + ], + stats: { + count: 20, + min: 0, + mean: 40, + ...values, + }, +}); + +describe('Dataset distribution percentiles', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset', { + body: { + id: 'test-dataset', + slug: 'test-dataset', + label: 'Test dataset', + variant: 'full', + description: null, + hf_url: null, + license: 'apache-2.0', + conversation_count: 1, + summary: { + mainTurns: 20, + subagentGroups: 0, + subagentTurns: 0, + cachedPct: 0.5, + totalIn: 1000, + totalOut: 200, + }, + chart_data: { + version: 2, + inputTokensPerTurn: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), + outputTokensPerTurn: distribution({ + median: 10, + p75: 20, + p90: 30, + p95: 40, + max: 50, + }), + uncachedInputTokensPerTurn: distribution({ + median: 0, + p75: 64, + p90: 128, + p95: 256, + max: 512, + }), + }, + ingested_at: '2026-06-23T00:00:00Z', + }, + }); + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', { + body: { total: 0, items: [] }, + }); + cy.visit('/datasets/test-dataset'); + }); + + it('shows 
P50/P75/P90/P95 for ISL, OSL, and uncached input', () => { + const expected = [ + ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']], + ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']], + ] as const; + + for (const [title, percentiles] of expected) { + cy.contains('[data-slot="card"]', title).within(() => { + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + cy.get('svg line[stroke="#3b82f6"]').should('exist'); + cy.get('svg line[stroke="#22c55e"]').should('exist'); + cy.get('svg line[stroke="#f59e0b"]').should('exist'); + cy.get('svg line[stroke="#ef4444"]').should('exist'); + }); + } + }); +}); diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index 9410a505..ac8b2de5 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -145,6 +145,12 @@ export function DatasetDetail({ slug }: { slug: string }) { scale="log" distribution={cd.outputTokensPerTurn} /> + {subtitle}
} {stats && (
- n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '} - {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit} + n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)} + {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}} · p90{' '} + {formatValue(stats.p90)} + {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}} · max{' '} + {formatValue(stats.max)} {unit}
)}
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts index 3ce61a85..96b0f59f 100644 --- a/packages/app/src/hooks/api/use-datasets.ts +++ b/packages/app/src/hooks/api/use-datasets.ts @@ -46,7 +46,11 @@ export interface DistributionStats { max: number; mean: number; median: number; + /** Added in chart_data v2. */ + p75?: number; p90: number; + /** Added in chart_data v2. */ + p95?: number; } export interface Distribution { @@ -57,6 +61,7 @@ export interface Distribution { export interface DatasetChartData { version?: number; inputTokensPerTurn?: Distribution; + uncachedInputTokensPerTurn?: Distribution; outputTokensPerTurn?: Distribution; turnsPerConversation?: Distribution; subagentGroupsPerConversation?: Distribution; diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index 5287b682..4debf1ae 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -4,6 +4,8 @@ import { buildConversationStructure, linearHistogram, logHistogram, + logHistogramWithZero, + summarizeValues, type RawWekaConversation, type SubagentNode, type TurnNode, @@ -177,4 +179,20 @@ describe('histograms', () => { expect(linearHistogram([])).toEqual([]); expect(logHistogram([])).toEqual([]); }); + + it('preserves zero-valued samples in a dedicated log histogram bin', () => { + const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4); + expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 }); + expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5); + }); +}); + +describe('summarizeValues', () => { + it('computes the same linearly-interpolated percentile set as request distributions', () => { + const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1)); + expect(summary.median).toBeCloseTo(50.5, 6); + expect(summary.p75).toBeCloseTo(75.25, 6); + expect(summary.p90).toBeCloseTo(90.1, 6); + expect(summary.p95).toBeCloseTo(95.05, 
6); + }); }); diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index 33e222b4..ac7a6eab 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -274,6 +274,42 @@ export interface HistogramBin { count: number; } +export interface NumberSummary { + count: number; + min: number; + max: number; + mean: number; + median: number; + p75: number; + p90: number; + p95: number; +} + +/** Distribution summary with linear-interpolated percentiles. */ +export function summarizeValues(values: readonly number[]): NumberSummary { + if (values.length === 0) { + return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 }; + } + const sorted = [...values].toSorted((a, b) => a - b); + const quantile = (q: number): number => { + const pos = (sorted.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sorted[lo]!; + return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo); + }; + return { + count: sorted.length, + min: sorted[0]!, + max: sorted.at(-1)!, + mean: sorted.reduce((sum, value) => sum + value, 0) / sorted.length, + median: quantile(0.5), + p75: quantile(0.75), + p90: quantile(0.9), + p95: quantile(0.95), + }; +} + /** Linear-width histogram over [0, max]. Empty input → []. */ export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] { if (values.length === 0) return []; @@ -313,3 +349,13 @@ export function logHistogram(values: readonly number[], bins = 40): HistogramBin } return out; } + +/** Log-width histogram that preserves zero as a dedicated first bin. 
*/ +export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] { + const zeroCount = values.filter((value) => value === 0).length; + const positive = values.filter((value) => value > 0); + if (zeroCount === 0) return logHistogram(positive, bins); + if (positive.length === 0) return [{ x0: 0, x1: 1, count: zeroCount }]; + const positiveBins = logHistogram(positive, Math.max(1, bins - 1)); + return [{ x0: 0, x1: positiveBins[0]?.x0 ?? 1, count: zeroCount }, ...positiveBins]; +} diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts index 22069419..e00471d7 100644 --- a/packages/db/src/ingest-weka-dataset.ts +++ b/packages/db/src/ingest-weka-dataset.ts @@ -24,6 +24,8 @@ import { buildConversationStructure, linearHistogram, logHistogram, + logHistogramWithZero, + summarizeValues, type ConversationStructure, type RawWekaConversation, type TurnNode, @@ -140,6 +142,7 @@ async function* iterRows( interface Accumulator { inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children) + uncachedInputPerTurn: number[]; outputPerTurn: number[]; cachedFractionPerTurn: number[]; // cached/in, for turns with in>0 turnsPerConv: number[]; // main (top-level) turns @@ -157,6 +160,7 @@ interface Accumulator { function newAccumulator(): Accumulator { return { inputPerTurn: [], + uncachedInputPerTurn: [], outputPerTurn: [], cachedFractionPerTurn: [], turnsPerConv: [], @@ -174,6 +178,7 @@ function newAccumulator(): Accumulator { function recordTurn(acc: Accumulator, t: TurnNode): void { acc.inputPerTurn.push(t.in); + acc.uncachedInputPerTurn.push(t.uncached); acc.outputPerTurn.push(t.out); if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in); if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 
0) + 1; @@ -198,57 +203,32 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void { } } -interface NumberSummary { - count: number; - min: number; - max: number; - mean: number; - median: number; - p90: number; -} - -function summarize(values: number[]): NumberSummary { - if (values.length === 0) { - return { count: 0, min: 0, max: 0, mean: 0, median: 0, p90: 0 }; - } - const sorted = [...values].toSorted((a, b) => a - b); - const n = sorted.length; - // Quantile by position; q(0)=min, q(1)=max — avoids array-tail indexing that - // the linter rewrites to `.at(-1)` (which widens the type to `| undefined`). - const q = (p: number) => sorted[Math.min(n - 1, Math.max(0, Math.floor(p * (n - 1))))]; - const sum = sorted.reduce((a, b) => a + b, 0); - return { - count: n, - min: q(0), - max: q(1), - mean: sum / n, - median: q(0.5), - p90: q(0.9), - }; -} - function buildChartData(acc: Accumulator) { return { - version: 1, + version: 2, inputTokensPerTurn: { bins: logHistogram(acc.inputPerTurn), - stats: summarize(acc.inputPerTurn), + stats: summarizeValues(acc.inputPerTurn), + }, + uncachedInputTokensPerTurn: { + bins: logHistogramWithZero(acc.uncachedInputPerTurn), + stats: summarizeValues(acc.uncachedInputPerTurn), }, outputTokensPerTurn: { bins: logHistogram(acc.outputPerTurn), - stats: summarize(acc.outputPerTurn), + stats: summarizeValues(acc.outputPerTurn), }, turnsPerConversation: { bins: linearHistogram(acc.turnsPerConv), - stats: summarize(acc.turnsPerConv), + stats: summarizeValues(acc.turnsPerConv), }, subagentGroupsPerConversation: { bins: linearHistogram(acc.subagentGroupsPerConv), - stats: summarize(acc.subagentGroupsPerConv), + stats: summarizeValues(acc.subagentGroupsPerConv), }, cachedFractionPerTurn: { bins: linearHistogram(acc.cachedFractionPerTurn, 20), - stats: summarize(acc.cachedFractionPerTurn), + stats: summarizeValues(acc.cachedFractionPerTurn), }, }; } From 8bfe66408d6b8514031e47af1b94ede19c369d97 Mon Sep 17 00:00:00 2001 From: 
Cam Quilici Date: Tue, 23 Jun 2026 16:10:02 -0500 Subject: [PATCH 094/111] use cumulative percentiles for agentic charts --- .../e2e/agentic-point-time-series.cy.ts | 34 ++++++++++--------- .../agentic-point/agentic-point-detail.tsx | 7 ++-- .../agentic-point/time-series-chart.test.ts | 4 +-- .../agentic-point/time-series-chart.tsx | 20 ++++++++--- 4 files changed, 40 insertions(+), 25 deletions(-) diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts index b0cfb60d..db59dda2 100644 --- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -46,15 +46,15 @@ describe('Agentic point request metric time series', () => { cy.visit('/inference/agentic/206885'); }); - it('renders rolling P75 interactivity and TTFT using profiling requests only', () => { + it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => { cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { cy.contains('h2', 'Interactivity over time').should('be.visible'); cy.get('[data-testid="interactivity-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') - .should('have.text', 'P75'); + .should('have.text', 'P90'); cy.get('svg circle').should('have.length', 5); - cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); - cy.get('svg').should('contain.text', '1 / cumulative mean TPOT'); + cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT'); cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); }); @@ -62,37 +62,39 @@ describe('Agentic point request metric time series', () => { cy.contains('h2', 'TTFT over time').should('be.visible'); cy.get('svg circle').should('have.length', 5); cy.get('svg').should('contain.text', 'TTFT (s)'); - cy.get('svg').should('contain.text', 'Cumulative mean TTFT'); + 
cy.get('svg').should('contain.text', 'Cumulative P90 TTFT'); cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); }); }); - it('switches each chart independently from P75 to P90', () => { + it('switches each chart independently from P90 to P75', () => { cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { - cy.contains('svg', 'P75 (rolling 50 req)') + cy.contains('svg', 'P90 (rolling 50 req)') .find('path') .first() .invoke('attr', 'd') - .as('p75Path'); - cy.contains('button', 'P90').click(); + .as('p90Path'); + cy.contains('button', 'P75').click(); cy.get('[data-testid="interactivity-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') - .should('have.text', 'P90'); - cy.contains('svg', 'P90 (rolling 50 req)') + .should('have.text', 'P75'); + cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT'); + cy.contains('svg', 'P75 (rolling 50 req)') .find('path') .first() .invoke('attr', 'd') - .then(function (p90Path) { - expect(p90Path).not.to.equal(this.p75Path); + .then(function (p75Path) { + expect(p75Path).not.to.equal(this.p90Path); }); }); cy.get('[data-testid="ttft-over-time-chart"]').within(() => { cy.get('[data-testid="ttft-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') - .should('have.text', 'P75'); - cy.contains('button', 'P90').click(); - cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + .should('have.text', 'P90'); + cy.contains('button', 'P75').click(); + cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); + cy.get('svg').should('contain.text', 'Cumulative P75 TTFT'); }); }); }); diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index e24b7e6b..e1bc1524 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -137,7 +137,7 @@ 
function RequestMetricOverTime({ timeline: RequestTimeline | null | undefined; isLoading: boolean; }) { - const [percentile, setPercentile] = useState('p75'); + const [percentile, setPercentile] = useState('p90'); const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null; const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity'; const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4'; @@ -174,7 +174,10 @@ function RequestMetricOverTime({ strokeWidth: 2.5, }, { - name: metric === 'ttft' ? 'Cumulative mean TTFT' : '1 / cumulative mean TPOT', + name: + metric === 'ttft' + ? `Cumulative ${percentile.toUpperCase()} TTFT` + : `1 / cumulative ${percentile.toUpperCase()} TPOT`, data: result?.cumulative ?? [], color: '#ef4444', strokeWidth: 3, diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts index 926772db..3506ff45 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts @@ -38,7 +38,7 @@ describe('rollingRequestMetric', () => { expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 }); expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]); - expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]); + expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.325]); }); it('inverts the rolling TPOT percentile for interactivity', () => { @@ -51,7 +51,7 @@ describe('rollingRequestMetric', () => { expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]); expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8); - expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]); + expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]); }); 
it('drops warmup, cancelled, missing, and non-positive samples', () => { diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 749a17e4..0c0b5739 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -82,11 +82,21 @@ export function rollingRequestMetric( const latencyMs = quantile(sorted, q); return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs }; }); - let latencySumMs = 0; - const cumulative = samples.map(({ t, latencyMs }, i) => { - latencySumMs += latencyMs; - const meanLatencyMs = latencySumMs / (i + 1); - return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs }; + const prefixLatencies: number[] = []; + const cumulative = samples.map(({ t, latencyMs }) => { + let lo = 0; + let hi = prefixLatencies.length; + while (lo < hi) { + const mid = (lo + hi) >> 1; + if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1; + else hi = mid; + } + prefixLatencies.splice(lo, 0, latencyMs); + const cumulativeLatencyMs = quantile(prefixLatencies, q); + return { + t, + value: metric === 'ttft' ? 
cumulativeLatencyMs / 1000 : 1000 / cumulativeLatencyMs, + }; }); return { raw, trend, cumulative }; From e3e0bf43ddec5dd8c1d4f21e1c3f9baff469f8f9 Mon Sep 17 00:00:00 2001 From: Alec Ibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 18:34:16 -0500 Subject: [PATCH 095/111] fix(db): build each chart line from a single run, no cross-run/date stitching (#491) --- ..._latest_benchmarks_single_run_per_line.sql | 49 +++++ .../src/json-provider.line-single-run.test.ts | 203 ++++++++++++++++++ packages/db/src/json-provider.ts | 50 +++-- packages/db/src/queries/benchmarks.ts | 58 +++-- 4 files changed, 323 insertions(+), 37 deletions(-) create mode 100644 packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql create mode 100644 packages/db/src/json-provider.line-single-run.test.ts diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql new file mode 100644 index 00000000..039dfe09 --- /dev/null +++ b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql @@ -0,0 +1,49 @@ +-- ============================================================ +-- LATEST_BENCHMARKS — one run per line (no cross-run stitching) +-- ============================================================ +-- +-- Previously the view did `distinct on (config_id, conc, isl, osl)` ordered by +-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run +-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it +-- skipped fell back to an older run that did measure them, and a single chart line +-- ended up stitched from points produced by different runs on different dates. +-- +-- A line is one config + sequence + offload mode +-- (config_id, benchmark_type, isl, osl, offload_mode) plotted +-- across concurrencies, and it must come from a SINGLE workflow run. 
We pick the +-- newest run per line (newest date, then latest sweep by run_started_at, then +-- highest workflow_run_id so exactly one run wins even on a same-day / null tie), +-- then keep EVERY concurrency that one run measured. A partial re-sweep therefore +-- truncates the line to its own concurrencies rather than borrowing an older run's. + +drop materialized view if exists latest_benchmarks; + +create materialized view latest_benchmarks as +with winners as ( + select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode) + br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.workflow_run_id as winning_run_id + from benchmark_results br + join latest_workflow_runs wr on wr.id = br.workflow_run_id + where br.error is null + order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc +) +select br.* +from benchmark_results br +join winners w + on w.config_id = br.config_id + and w.benchmark_type = br.benchmark_type + and w.isl is not distinct from br.isl + and w.osl is not distinct from br.osl + and w.offload_mode = br.offload_mode + and w.winning_run_id = br.workflow_run_id +where br.error is null; + +-- Unique key now includes benchmark_type (part of the line key). One run per line +-- guarantees one row per concurrency, so this stays unique and keeps +-- REFRESH MATERIALIZED VIEW CONCURRENTLY working. 
+create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode) + nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/src/json-provider.line-single-run.test.ts b/packages/db/src/json-provider.line-single-run.test.ts new file mode 100644 index 00000000..b75fa26a --- /dev/null +++ b/packages/db/src/json-provider.line-single-run.test.ts @@ -0,0 +1,203 @@ +import { mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import type { getLatestBenchmarks as GetLatestBenchmarks } from './json-provider.js'; + +/** + * A chart line is one config + sequence + offload mode + * (config_id, benchmark_type, isl, osl, offload_mode) plotted across concurrencies, and it must + * come from a SINGLE workflow run. getLatestBenchmarks picks the + * newest run per line (date, then run_started_at, then workflow_run_id) and returns EVERY + * concurrency that one run measured — never stitching skipped concurrencies from an older run. + * + * These fixtures exercise the multi-concurrency cases the as-of test can't (it is single-conc): + * a partial re-sweep that must truncate the line, per-sequence line independence, and the + * same-day workflow_run_id tiebreak. 
+ */ + +const cfg = (id: number) => ({ + id, + hardware: 'h100', + framework: 'vllm', + model: 'testm', + precision: 'fp8', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 1, + prefill_ep: 1, + prefill_dp_attention: false, + prefill_num_workers: 1, + decode_tp: 1, + decode_ep: 1, + decode_dp_attention: false, + decode_num_workers: 1, + num_prefill_gpu: 0, + num_decode_gpu: 8, +}); + +const run = (id: number, githubId: number, startedAt: string | null, date: string) => ({ + id, + github_run_id: githubId, + run_attempt: 1, + name: `run ${githubId}`, + status: 'completed', + conclusion: 'success', + head_sha: 'sha', + head_branch: 'main', + html_url: `https://github.com/x/runs/${githubId}`, + created_at: startedAt ?? `${date}T00:00:00Z`, + run_started_at: startedAt, + date, +}); + +let nextResultId = 1000; +const result = ( + runDbId: number, + configId: number, + date: string, + conc: number, + tpot: number, + isl = 1024, + osl = 1024, + offloadMode = 'off', +) => ({ + id: nextResultId++, + workflow_run_id: runDbId, + config_id: configId, + benchmark_type: 'latency', + date, + isl, + osl, + conc, + offload_mode: offloadMode, + image: null, + metrics: { median_tpot: tpot }, + error: null, + server_log_id: null, +}); + +const OLD = '2026-06-10'; +const NEW = '2026-06-14'; +let getLatestBenchmarks: typeof GetLatestBenchmarks; + +beforeAll(async () => { + const dir = mkdtempSync(join(tmpdir(), 'infx-line-')); + writeFileSync(join(dir, 'configs.json'), JSON.stringify([cfg(1), cfg(2)])); + writeFileSync( + join(dir, 'workflow_runs.json'), + JSON.stringify([ + run(10, 100, `${OLD}T04:00:00Z`, OLD), // run A: older full sweep + run(11, 101, `${NEW}T05:00:00Z`, NEW), // run B: newer partial re-sweep + run(20, 200, `${NEW}T07:00:00Z`, NEW), // run E: same-day, lower run id + run(21, 201, `${NEW}T07:00:00Z`, NEW), // run F: same-day, SAME timestamp, higher run id + ]), + ); + writeFileSync( + join(dir, 'benchmark_results.json'), + JSON.stringify([ + 
// config 1, seq (1024,1024): run A full sweep, run B partial re-sweep. + result(10, 1, OLD, 1, 0.1), + result(10, 1, OLD, 8, 0.18), + result(10, 1, OLD, 64, 0.5), + result(11, 1, NEW, 1, 0.09), + result(11, 1, NEW, 8, 0.16), + // config 1, seq (8192,1024): only run A measured it (run B skipped this sequence). + result(10, 1, OLD, 1, 0.2, 8192, 1024), + result(10, 1, OLD, 8, 0.3, 8192, 1024), + // Offload mode is an independent line dimension. A newer off-mode run must not hide + // the older on-mode line for the same config and sequence. + result(10, 1, OLD, 4, 0.25, 4096, 4096, 'on'), + result(11, 1, NEW, 4, 0.2, 4096, 4096, 'off'), + // config 2, seq (1024,1024): two same-day runs with identical run_started_at. + result(20, 2, NEW, 1, 0.5), + result(20, 2, NEW, 8, 0.6), + result(20, 2, NEW, 64, 0.7), + result(21, 2, NEW, 1, 0.4), + result(21, 2, NEW, 8, 0.45), + ]), + ); + process.env.DUMP_DIR = dir; + const mod = await import('./json-provider.js'); + getLatestBenchmarks = mod.getLatestBenchmarks; +}); + +afterAll(() => { + delete process.env.DUMP_DIR; +}); + +/** Concurrencies + their run urls for one (config, sequence) line, sorted by conc. The config is picked out by matching run_url against configRunUrlRe. */ +function line( + rows: { isl: number | null; osl: number | null; conc: number; run_url: string | null }[], + configRunUrlRe: RegExp, + isl: number, + osl: number, +) { + return rows + .filter((r) => r.isl === isl && r.osl === osl && r.run_url?.match(configRunUrlRe)) + .toSorted((a, b) => a.conc - b.conc) + .map((r) => ({ conc: r.conc, runUrl: r.run_url })); +} + +describe('getLatestBenchmarks — one run per line', () => { + it('truncates a line to the newest run: a partial re-sweep hides the older run’s extra concs', () => { + const rows = getLatestBenchmarks('testm', NEW, false); + // config 1 / seq (1024,1024): run B (101) measured only conc 1 & 8. conc 64 from run A is gone. 
+ const seq = line(rows, /runs\/(?:100|101)\//u, 1024, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/101/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/101/attempts/1' }, + ]); + expect(seq.some((p) => p.conc === 64)).toBe(false); + }); + + it('keeps a different sequence of the same config on its own winning run', () => { + const rows = getLatestBenchmarks('testm', NEW, false); + // seq (8192,1024) was only in run A; run B winning the other sequence must not erase it. + const seq = line(rows, /runs\/100\//u, 8192, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); + + it('selects winning runs independently for each offload mode', () => { + const rows = getLatestBenchmarks('testm', NEW, false).filter( + (r) => r.isl === 4096 && r.osl === 4096, + ); + + expect( + rows + .map((r) => ({ offloadMode: r.offload_mode, runUrl: r.run_url })) + .toSorted((a, b) => a.offloadMode.localeCompare(b.offloadMode)), + ).toEqual([ + { offloadMode: 'off', runUrl: 'https://github.com/x/runs/101/attempts/1' }, + { offloadMode: 'on', runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); + + it('breaks a same-day, same-timestamp tie by workflow_run_id (higher id wins the whole line)', () => { + const rows = getLatestBenchmarks('testm', NEW, false); + // config 2: run E (200, id 20) and run F (201, id 21) share run_started_at; F wins by id. + const seq = line(rows, /runs\/(?:200|201)\//u, 1024, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/201/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/201/attempts/1' }, + ]); + // run E's extra conc 64 must not bleed into run F's line. 
+ expect(seq.some((p) => p.conc === 64)).toBe(false); + }); + + it('as of the older run, shows that run’s full sweep (no truncation by a later run)', () => { + const rows = getLatestBenchmarks('testm', NEW, false, '100'); + const seq = line(rows, /runs\/100\//u, 1024, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + { conc: 64, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); +}); diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index c23e5f48..4e548efe 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -72,6 +72,8 @@ interface RawBenchmarkResult { isl: number; osl: number; conc: number; + /** Added by the AgentX schema; older dumps omit it and are treated as off. */ + offload_mode?: string; image: string | null; metrics: Record; /** Added in migration 006; older dumps omit this field — surfaced as undefined. */ @@ -333,12 +335,11 @@ const STRIP_HISTORY_KEYS = new Set([ ]); /** - * Comparator for DISTINCT ON (config, conc, isl, osl) selection: latest calendar - * day first, then — for sweeps on the same day — the latest workflow run first by - * `run_started_at` (NULLS LAST). Mirrors the SQL date-filtered query and the - * `latest_benchmarks` view (migration 003): a calendar day alone ties two same-day - * sweeps, so without this an older run's points can shadow a same-day re-sweep. - * `run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically. + * Run-recency comparator used to pick the newest run per line: latest calendar day first, + * then — for sweeps on the same day — the latest workflow run first by `run_started_at` + * (NULLS LAST). Mirrors the `br.date DESC, wr.run_started_at DESC NULLS LAST` portion of the + * SQL ORDER BY; callers apply a `workflow_run_id` DESC final tiebreak on top so exactly one + * run wins. 
`run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically. * Exported so the same-day tiebreak is unit-tested in parity with the SQL. */ export function compareBenchmarkRecency( @@ -355,6 +356,10 @@ export function compareBenchmarkRecency( return bStarted.localeCompare(aStarted); } +/** Chart-line identity: one config + sequence + offload mode. */ +const lineKey = (br: RawBenchmarkResult): string => + `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`; + export function getLatestBenchmarks( modelKey: string | string[], date?: string, @@ -390,27 +395,32 @@ export function getLatestBenchmarks( return true; }); - // DISTINCT ON (config_id, conc, isl, osl) — keep the one with the latest date, - // tiebreaking same-day runs by run_started_at so the latest sweep wins. - const seen = new Map(); - candidates.sort((a, b) => - compareBenchmarkRecency( + // Single run per LINE (config_id, benchmark_type, isl, osl, offload_mode): pick the newest run that + // produced data for the line, then keep EVERY concurrency that one run measured. Sort by + // recency (date, then run_started_at) with a final workflow_run_id DESC tiebreak so exactly + // one run wins even when run_started_at is equal/null — matching the SQL ORDER BY. + candidates.sort((a, b) => { + const recency = compareBenchmarkRecency( toDateString(a.date), toDateString(b.date), s.latestRunsById.get(a.workflow_run_id)?.run_started_at ?? null, s.latestRunsById.get(b.workflow_run_id)?.run_started_at ?? null, - ), - ); + ); + return recency === 0 ? 
b.workflow_run_id - a.workflow_run_id : recency; + }); + const winningRun = new Map(); for (const br of candidates) { - const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`; - if (!seen.has(key)) seen.set(key, br); + const key = lineKey(br); + if (!winningRun.has(key)) winningRun.set(key, br.workflow_run_id); } - return [...seen.values()].map((br) => { - const c = s.configs.get(br.config_id)!; - const wr = s.latestRunsById.get(br.workflow_run_id)!; - return toBenchmarkRow(br, c, wr); - }); + return candidates + .filter((br) => winningRun.get(lineKey(br)) === br.workflow_run_id) + .map((br) => { + const c = s.configs.get(br.config_id)!; + const wr = s.latestRunsById.get(br.workflow_run_id)!; + return toBenchmarkRow(br, c, wr); + }); } /** In-memory mirror of {@link import('./queries/benchmarks.js').getBenchmarksForRun}. */ diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 6833756a..37301e2b 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -51,9 +51,14 @@ export interface BenchmarkRow { /** * Fetch the latest benchmark results for one or more model DB keys across ALL sequences, * up to a given date. Multiple keys support point-release grouping — e.g. passing - * `['glm5', 'glm5.1']` unions both buckets under the one display. Returns the most recent - * result per (config, concurrency, isl, osl) — so every GPU/framework + sequence combo - * that has been benchmarked appears, with the newest data winning. + * `['glm5', 'glm5.1']` unions both buckets under the one display. + * + * Selection unit is the LINE, not the point: for each line + * `(config_id, benchmark_type, isl, osl, offload_mode)` we pick the single newest workflow run that + * produced data for it (newest date, then latest sweep, then highest run id) and return + * EVERY concurrency that one run measured — and nothing from any other run. 
A partial + * re-sweep therefore truncates the line to its own concurrencies rather than stitching the + * skipped ones from an older run. This guarantees a line never mixes runs/dates. * * The frontend filters by sequence client-side. This eliminates API round-trips when * switching sequences — the data is already cached by React Query. @@ -74,13 +79,8 @@ export async function getLatestBenchmarks( ): Promise { const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; if (date) { - // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest) - // exact=true: only return data from this exact date (for GPU comparison) - // exact=false (default): return latest data as of this date (for main chart) - // Same-day tiebreak by wr.run_started_at (latest sweep wins), mirroring the - // latest_benchmarks view (migration 003). br.date is a calendar day, so two - // sweeps on the same day tie on date alone and Postgres would otherwise pick - // an arbitrary one — leaving an older run's points shadowing a same-day re-sweep. + // Date-filtered: use the base table (the view only has the absolute latest). + // exact=true: only this exact date (GPU comparison); exact=false (default): as of this date. const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`; // "As of run" filter (main chart only): keep results whose run started no later // than the selected run. run_started_at is an absolute timestamp, so this also @@ -97,8 +97,29 @@ export async function getLatestBenchmarks( ) )` : sql``; + // winners: the single newest run per LINE + // (config_id, benchmark_type, isl, osl, offload_mode) under the + // date/run cutoff. br.date is a calendar day, so two same-day sweeps tie on date — break + // by wr.run_started_at (latest sweep wins), then br.workflow_run_id so exactly one run wins + // even when run_started_at is equal/null. 
The outer join then pulls EVERY concurrency that + // winning run measured for the line, so the line is built from one run only (no carry-forward + // of concurrencies a partial re-sweep skipped). const rows = await sql` - SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + WITH winners AS ( + SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode) + br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.workflow_run_id AS winning_run_id + FROM benchmark_results br + JOIN configs c ON c.id = br.config_id + JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id + WHERE c.model = ANY(${modelKeys}) + AND br.error IS NULL + AND ${dateFilter} + ${runFilter} + ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.date DESC, wr.run_started_at DESC NULLS LAST, br.workflow_run_id DESC + ) + SELECT br.id, c.hardware, c.framework, @@ -130,12 +151,15 @@ export async function getLatestBenchmarks( FROM benchmark_results br JOIN configs c ON c.id = br.config_id JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id - WHERE c.model = ANY(${modelKeys}) - AND br.error IS NULL - AND ${dateFilter} - ${runFilter} - ORDER BY br.config_id, br.conc, br.isl, br.osl, - br.date DESC, wr.run_started_at DESC NULLS LAST + JOIN winners w + ON w.config_id = br.config_id + AND w.benchmark_type = br.benchmark_type + AND w.isl IS NOT DISTINCT FROM br.isl + AND w.osl IS NOT DISTINCT FROM br.osl + AND w.offload_mode = br.offload_mode + AND w.winning_run_id = br.workflow_run_id + WHERE br.error IS NULL + ORDER BY br.config_id, br.conc, br.isl, br.osl `; return rows as unknown as BenchmarkRow[]; } From 2c3bb6dcaaff6c04ec56928cc08843b267c464bb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 23 Jun 2026 23:08:36 -0500 Subject: [PATCH 096/111] Default agentic charts to interactivity --- packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 7 ++++--- packages/app/src/components/inference/InferenceContext.tsx | 4 
++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index 636a7ccf..df199b81 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -9,13 +9,14 @@ describe('X-Axis Mode Toggle (inference chart)', () => { cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('shows the x-axis mode buttons with Interactivity active by default', () => { + it('shows Interactivity by default for the agentic view', () => { + cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces'); cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); cy.get('[data-testid="x-axis-mode-interactivity"]') .should('be.visible') .and('have.attr', 'aria-selected', 'true'); - cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); it('switches the x-axis to TTFT and updates the heading', () => { @@ -37,6 +38,6 @@ describe('X-Axis Mode Toggle (inference chart)', () => { 'aria-selected', 'true', ); - cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. 
Interactivity'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); }); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 839afeed..ddb923b8 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -533,7 +533,7 @@ export function InferenceProvider({ // Reconcile the x-axis mode with the scenario kind: // - On mount with no `i_xmode` URL param: snap to the kind's natural default - // (agentic → ttft, fixed → interactivity). The state itself was initialized + // (interactivity for both agentic and fixed-sequence scenarios). The state was initialized // to a SSR-stable constant so server and client render the same DOM; this // effect fixes it up after hydration. // - When the user later switches sequence kinds: snap to the new kind's @@ -565,7 +565,7 @@ export function InferenceProvider({ // — fall through to the default snap below. return; } - handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity'); + handleSetXAxisMode('interactivity'); }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]); // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or From 28d007f28df8dfa3a1f826fd0f04876722f0e324 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 25 Jun 2026 15:55:08 -0500 Subject: [PATCH 097/111] feat(datasets): bracket grouping for parallel requests in flamegraph Replace the per-row P# badges with a colored left-gutter bracket that groups requests in the same main-agent or subagent scope whose original execution intervals overlapped (ran in parallel). Non-transitive overlap chains get their own side-by-side lanes; the gutter only renders when an overlap group exists, so non-parallel traces have no extra whitespace. 
Legend swatch and conversation-view copy updated to describe the bracket; e2e assertions check data-overlap-group on bracket segments. Co-Authored-By: Claude Opus 4.8 --- .../e2e/datasets-flamegraph-time.cy.ts | 56 ++- .../components/datasets/conversation-view.tsx | 4 +- .../components/datasets/trace-flamegraph.tsx | 405 +++++++++++++++--- 3 files changed, 407 insertions(+), 58 deletions(-) diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts index 672675a3..58d95c27 100644 --- a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts +++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts @@ -24,6 +24,7 @@ describe('Dataset conversation flamegraph timing', () => { kind: 'turn', turnIndex: 0, startS: 0, + endS: 1.2, model: 'model-a', in: 100, out: 10, @@ -46,11 +47,34 @@ describe('Dataset conversation flamegraph timing', () => { kind: 'turn', turnIndex: 1, startS: 3661.2, + endS: 3668.2, model: 'model-a', - in: 800, - out: 80, - cached: 500, - uncached: 300, + in: 300, + out: 30, + cached: 150, + uncached: 150, + }, + { + kind: 'turn', + turnIndex: 2, + startS: 3665.2, + endS: 3671.2, + model: 'model-a', + in: 300, + out: 30, + cached: 200, + uncached: 100, + }, + { + kind: 'turn', + turnIndex: 3, + startS: 3670.2, + endS: 3675.2, + model: 'model-a', + in: 200, + out: 20, + cached: 150, + uncached: 50, }, ], }, @@ -58,6 +82,7 @@ describe('Dataset conversation flamegraph timing', () => { kind: 'turn', turnIndex: 2, startS: 65.4, + endS: 67.4, model: 'model-a', in: 100, out: 10, @@ -72,14 +97,31 @@ describe('Dataset conversation flamegraph timing', () => { }); it('shows turn offsets and a collapsed subagent time range', () => { - cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00'); - cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05'); + cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00–00:01'); + 
cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05–01:07'); cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03'); cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist'); }); it('shows subturn offsets when the subagent group is expanded', () => { cy.contains('button', 'Explore').click(); - cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01'); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01–1:01:08'); + // Parallel groups render as left-gutter brackets; each member row carries + // one bracket segment per group it belongs to (non-transitive chains keep + // their own segments/lanes). + cy.get('[data-testid="flamegraph-overlap-g-1-c-0"]') + .should('have.length', 1) + .and('have.attr', 'data-overlap-group', 'subagent-1-1'); + cy.get('[data-testid="flamegraph-overlap-g-1-c-1"]') + .should('have.length', 2) + .then(($segs) => { + expect([...$segs].map((seg) => seg.dataset.overlapGroup).toSorted()).to.deep.equal([ + 'subagent-1-1', + 'subagent-1-2', + ]); + }); + cy.get('[data-testid="flamegraph-overlap-g-1-c-2"]') + .should('have.length', 1) + .and('have.attr', 'data-overlap-group', 'subagent-1-2'); }); }); diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index 57aaa0c3..ce10241a 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -88,7 +88,9 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default — click a group to expand it. Each bar splits input into cached prefix and uncached suffix, plus generated output. Timestamps are elapsed from conversation start; subagent headers - show their full active range. + show their full active range. 
A colored bracket on the left groups requests in the same + main-agent or subagent scope whose original execution intervals overlapped (ran in + parallel).

+ Number.isFinite(request.startS) && + Number.isFinite(request.endS) && + request.endS! > request.startS!, + ); + const boundaries = [ + ...new Set(valid.flatMap((request) => [request.startS, request.endS])), + ].toSorted((a, b) => a - b); + const candidates = new Map>(); + + for (let i = 0; i < boundaries.length - 1; i++) { + const startS = boundaries[i]!; + const endS = boundaries[i + 1]!; + if (endS <= startS) continue; + const requestKeys = valid + .filter((request) => request.startS <= startS && request.endS >= endS) + .map((request) => request.key) + .toSorted(); + if (requestKeys.length < 2) continue; + const key = requestKeys.join('\u0000'); + const existing = candidates.get(key); + candidates.set(key, { + requestKeys, + startS: existing ? Math.min(existing.startS, startS) : startS, + endS: existing ? Math.max(existing.endS, endS) : endS, + }); + } + + const maximal = [...candidates.values()].filter( + (candidate, _, all) => + !all.some( + (other) => + other.requestKeys.length > candidate.requestKeys.length && + candidate.requestKeys.every((key) => other.requestKeys.includes(key)), + ), + ); + + return maximal + .toSorted( + (a, b) => + a.startS - b.startS || + a.endS - b.endS || + a.requestKeys.join(',').localeCompare(b.requestKeys.join(',')), + ) + .map((group, index) => ({ ...group, id: `${scopeKey}-${index + 1}` })); +} + +interface RowOverlap { + id: string; + label: string; + color: string; + startS: number; + endS: number; + peerCount: number; +} + interface VisibleRow { key: string; label: string; @@ -33,6 +124,7 @@ interface VisibleRow { isGroup: boolean; isExpanded: boolean; groupIndex?: number; + overlaps: RowOverlap[]; } /** Format seconds from conversation start as a compact elapsed timestamp. 
*/ @@ -161,6 +253,42 @@ export function TraceFlamegraph({ const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]); const collapseAll = useCallback(() => setExpanded(new Set()), []); + const overlapsByRow = useMemo(() => { + const mainGroups = findRequestOverlapGroups( + nodes.flatMap((node, i) => + node.kind === 'turn' ? [{ key: `t-${i}`, startS: node.startS, endS: node.endS }] : [], + ), + 'main', + ); + const subagentGroups = nodes.flatMap((node, i) => + node.kind === 'subagent' + ? findRequestOverlapGroups( + node.children.map((child, ci) => ({ + key: `g-${i}-c-${ci}`, + startS: child.startS, + endS: child.endS, + })), + `subagent-${i}`, + ) + : [], + ); + const groups: RequestOverlapGroup[] = [...mainGroups, ...subagentGroups]; + + const byRow = new Map(); + groups.forEach((group, groupIndex) => { + const overlap = { + id: group.id, + label: `P${groupIndex + 1}`, + color: OVERLAP_COLORS[groupIndex % OVERLAP_COLORS.length]!, + startS: group.startS, + endS: group.endS, + peerCount: group.requestKeys.length - 1, + }; + group.requestKeys.forEach((key) => byRow.set(key, [...(byRow.get(key) ?? []), overlap])); + }); + return byRow; + }, [nodes]); + const rows = useMemo(() => { const out: VisibleRow[] = []; let turnNo = 0; @@ -171,7 +299,7 @@ export function TraceFlamegraph({ key: `t-${i}`, label: `Turn ${turnNo}`, sublabel: node.model ?? undefined, - timeLabel: timeLabel(node.startS), + timeLabel: timeLabel(node.startS, node.endS), cached: node.cached, uncached: node.uncached, output: node.out, @@ -179,6 +307,7 @@ export function TraceFlamegraph({ indent: 0, isGroup: false, isExpanded: false, + overlaps: overlapsByRow.get(`t-${i}`) ?? 
[], }); } else { const isExpanded = expanded.has(i); @@ -197,6 +326,7 @@ export function TraceFlamegraph({ isGroup: true, isExpanded, groupIndex: i, + overlaps: [], }); if (isExpanded) { node.children.forEach((child, ci) => { @@ -204,7 +334,7 @@ export function TraceFlamegraph({ key: `g-${i}-c-${ci}`, label: `↳ subturn ${ci + 1}`, sublabel: child.model ?? undefined, - timeLabel: timeLabel(child.startS), + timeLabel: timeLabel(child.startS, child.endS), cached: child.cached, uncached: child.uncached, output: child.out, @@ -212,13 +342,14 @@ export function TraceFlamegraph({ indent: 1, isGroup: false, isExpanded: false, + overlaps: overlapsByRow.get(`g-${i}-c-${ci}`) ?? [], }); }); } } }); return out; - }, [nodes, expanded]); + }, [nodes, expanded, overlapsByRow]); // Two scales: leaf turns/subturns share a per-turn axis (the primary signal — // how cached/uncached evolves), while subagent group headers carry aggregates @@ -234,6 +365,90 @@ export function TraceFlamegraph({ [rows], ); + // Geometry for the parallel-group brackets drawn in the left gutter. Each + // overlap group becomes a vertical bracket spanning from its first to its last + // visible member row, with a right-pointing tick on the exact member rows. + // Non-transitive chains (a row in two groups) get separate lanes so their + // brackets sit side by side. `through` = a row inside a group's span that is + // NOT itself a member (the aux-stream edge case) — drawn as a faint connector + // with no tick. + const braces = useMemo(() => { + interface Seg { + role: 'first' | 'middle' | 'last' | 'through'; + isMember: boolean; + color: string; + groupId: string; + peerCount: number; + startS: number; + endS: number; + } + const groupMap = new Map< + string, + { id: string; color: string; peerCount: number; startS: number; endS: number; idxs: number[] } + >(); + rows.forEach((r, idx) => { + for (const ov of r.overlaps) { + const g = groupMap.get(ov.id) ?? 
{ + id: ov.id, + color: ov.color, + peerCount: ov.peerCount, + startS: ov.startS, + endS: ov.endS, + idxs: [], + }; + g.idxs.push(idx); + groupMap.set(ov.id, g); + } + }); + const groups = [...groupMap.values()] + .filter((g) => g.idxs.length >= 2) // need ≥2 visible members to bracket + .map((g) => ({ + ...g, + min: Math.min(...g.idxs), + max: Math.max(...g.idxs), + members: new Set(g.idxs), + })) + .toSorted((a, b) => a.min - b.min || a.max - b.max); + + // Greedy lane assignment: a group reuses a lane whose previous group ended + // before this one starts. + const laneEnd: number[] = []; + const laneOf = new Map(); + for (const g of groups) { + let lane = laneEnd.findIndex((end) => end < g.min); + if (lane === -1) { + lane = laneEnd.length; + laneEnd.push(g.max); + } else { + laneEnd[lane] = g.max; + } + laneOf.set(g.id, lane); + } + const laneCount = laneEnd.length; + + const rowSegs: (Seg | null)[][] = rows.map(() => + Array.from({ length: laneCount }, () => null as Seg | null), + ); + for (const g of groups) { + const lane = laneOf.get(g.id)!; + for (let idx = g.min; idx <= g.max; idx++) { + const isMember = g.members.has(idx); + const role = + idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 'middle' : 'through'; + rowSegs[idx]![lane] = { + role, + isMember, + color: g.color, + groupId: g.id, + peerCount: g.peerCount, + startS: g.startS, + endS: g.endS, + }; + } + } + return { laneCount, rowSegs }; + }, [rows]); + const onMove = (e: React.MouseEvent, row: VisibleRow) => { setTooltip({ x: e.clientX, y: e.clientY, row }); }; @@ -251,19 +466,32 @@ export function TraceFlamegraph({ {l.label} ))} + + + Bracketed rows ran in parallel +
{groupIndexes.length > 0 && (
- ) : ( - {row.label} - )} -
+ {/* Parallel-group bracket gutter (only rendered when the + conversation has any overlaps, so non-overlap traces keep a + flush-left layout with no dead space). */} + {braces.laneCount > 0 && ( +
+ {segs.map((seg, lane) => { + if (!seg) return
; + const top = seg.role === 'first' ? '50%' : '0'; + const bottom = seg.role === 'last' ? '50%' : '0'; + return ( +
+ {/* vertical rail */} +
+ {/* right-pointing tick marking an actual member row */} + {seg.isMember && ( +
+ )} +
+ ); + })} +
+ )} - {/* Offset from conversation start. Group rows span the full - subagent lifetime; leaf rows show their start instant. */} + {/* row content (indented for subagent children) */}
- {row.timeLabel ?? '—'} -
+ {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
- {/* stacked bar — group headers render as a slim muted summary - strip so they read as aggregates, not individual turns. */} -
onMove(e, row)} - onMouseLeave={() => setTooltip(null)} - > + {/* Original interval, measured from conversation start. */}
-
-
-
+ {row.timeLabel ?? '—'}
-
- {/* total */} -
- {compact(row.total)} + {/* stacked bar — group headers render as a slim muted summary + strip so they read as aggregates, not individual turns. */} +
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} + > +
+
+
+
+
+
+ + {/* total */} +
+ {compact(row.total)} +
); From f7f82d40fda392c3b1dfa8ebe0de6227e2e5c6a4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 25 Jun 2026 16:04:21 -0500 Subject: [PATCH 098/111] fix(datasets): bound flamegraph bracket gutter for high-parallelism traces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A pathological conversation (1621 turns, a subagent fanning out into 622 children with 17-way concurrency) produced 49 bracket lanes — a 686px gutter that pushed the bars off-screen, plus one DOM node per lane per row (~110k empty divs, 157k total nodes on Expand all). Cap displayed lanes at MAX_LANES (6): overflow groups fold into the last "dense" lane, so every parallel row still carries a marker but the gutter width stays bounded. Render the gutter sparsely (only lanes a row touches, absolutely positioned) instead of a dense lane-per-row matrix. A subtle note surfaces when lanes are capped so the fold isn't silent. Outlier now: gutter 686px -> 84px, DOM on Expand all 157k -> 35k nodes. Normal multi-lane traces are unchanged (<=6 lanes hit the identity path). Co-Authored-By: Claude Opus 4.8 --- .../components/datasets/trace-flamegraph.tsx | 61 ++++++++++++++----- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 1af65216..158c03c3 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -30,6 +30,15 @@ const OVERLAP_COLORS = ['#06b6d4', '#ec4899', '#6366f1', '#84cc16', '#f97316'] a // side-by-side instead of stacking visually. const LANE_W = 14; +// Cap on simultaneously-drawn bracket lanes. A pathological conversation (e.g. 
a +// long-running session whose subagent fans out into hundreds of children with +// 15+ concurrent requests) can require dozens of lanes; left unbounded the +// gutter grows wide enough to push the bars off-screen AND emits one DOM node +// per lane per row (tens of thousands of empty divs). We bound it: lanes beyond +// the cap fold into the last "dense" lane, which stays readable for the common +// case (≤6 concurrent) and degrades gracefully for the outliers. +const MAX_LANES = 6; + export interface TimedRequest { key: string; startS?: number; @@ -424,18 +433,25 @@ export function TraceFlamegraph({ } laneOf.set(g.id, lane); } - const laneCount = laneEnd.length; - - const rowSegs: (Seg | null)[][] = rows.map(() => - Array.from({ length: laneCount }, () => null as Seg | null), - ); + const rawLaneCount = laneEnd.length; + // Bound the gutter (see MAX_LANES). Lanes past the cap collapse onto the last + // visible lane, so every parallel row still carries a marker but the gutter + // width and DOM-node count stay bounded regardless of how parallel the + // conversation is. + const laneCount = Math.min(rawLaneCount, MAX_LANES); + const displayLane = (lane: number) => Math.min(lane, laneCount - 1); + + // Sparse per-row segments: only lanes that actually carry a bracket on a row + // are stored (and later rendered). The previous dense matrix emitted one DOM + // node per lane per row — catastrophic at 49 lanes × 2k rows. + const rowSegs: { lane: number; seg: Seg }[][] = rows.map(() => []); for (const g of groups) { - const lane = laneOf.get(g.id)!; + const lane = displayLane(laneOf.get(g.id)!); for (let idx = g.min; idx <= g.max; idx++) { const isMember = g.members.has(idx); const role = idx === g.min ? 'first' : idx === g.max ? 'last' : isMember ? 
'middle' : 'through'; - rowSegs[idx]![lane] = { + const seg: Seg = { role, isMember, color: g.color, @@ -444,9 +460,15 @@ export function TraceFlamegraph({ startS: g.startS, endS: g.endS, }; + const cell = rowSegs[idx]!; + const existing = cell.find((c) => c.lane === lane); + // Collisions only happen in the folded overflow lane. Prefer a real + // member marker over a faint pass-through connector. + if (!existing) cell.push({ lane, seg }); + else if (seg.isMember && !existing.seg.isMember) existing.seg = seg; } } - return { laneCount, rowSegs }; + return { laneCount, overflowLanes: rawLaneCount - laneCount, rowSegs }; }, [rows]); const onMove = (e: React.MouseEvent, row: VisibleRow) => { @@ -500,6 +522,14 @@ export function TraceFlamegraph({ )}
+ {braces.overflowLanes > 0 && ( +

+ Dense parallel region — bracket lanes capped at {MAX_LANES}; {braces.overflowLanes}{' '} + further {braces.overflowLanes === 1 ? 'lane is' : 'lanes are'} folded into + the last lane.

+ )} +
{/* Parallel-group bracket gutter (only rendered when the conversation has any overlaps, so non-overlap traces keep a - flush-left layout with no dead space). */} + flush-left layout with no dead space). Segments are sparse and + absolutely positioned per lane so a row only pays for the + lanes it actually touches. */} {braces.laneCount > 0 && (
- {segs.map((seg, lane) => { - if (!seg) return
; + {segs.map(({ lane, seg }) => { const top = seg.role === 'first' ? '50%' : '0'; const bottom = seg.role === 'last' ? '50%' : '0'; return (
Date: Fri, 26 Jun 2026 04:13:55 +0000 Subject: [PATCH 099/111] fix(db): add endS to TurnNode so flamegraph timing typechecks Co-authored-by: Alec Ibarra Co-Authored-By: Claude Opus 4.8 --- packages/db/src/etl/weka-structure.ts | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index ac7a6eab..26cc8da1 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -50,6 +50,8 @@ export interface TurnNode { turnIndex: number; /** Seconds from the start of the conversation. */ startS?: number; + /** Seconds from the start of the conversation (startS + api_time). */ + endS?: number; model?: string; in: number; out: number; @@ -140,6 +142,13 @@ function finiteTime(value: number | undefined): number | undefined { return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined; } +/** End of a turn = its start plus the request's api_time (seconds). */ +function turnEndS(req: RawWekaRequest): number | undefined { + const startS = finiteTime(req.t); + if (startS === undefined) return undefined; + return startS + (finiteTime(req.api_time) ?? 0); +} + function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } { const children = entry.requests ?? []; const childStarts = children @@ -153,11 +162,7 @@ function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: nu } const childEnds = children - .map((child) => { - const childStart = finiteTime(child.t); - if (childStart === undefined) return undefined; - return childStart + (finiteTime(child.api_time) ?? 
0); - }) + .map((child) => turnEndS(child)) .filter((value): value is number => value !== undefined); return { startS, @@ -202,6 +207,7 @@ export function buildConversationStructure( kind: 'turn', turnIndex: turnIndex++, startS: finiteTime(inner.t), + endS: turnEndS(inner), model: inner.model, in: split.in, out, @@ -238,6 +244,7 @@ export function buildConversationStructure( kind: 'turn', turnIndex: turnIndex++, startS: finiteTime(entry.t), + endS: turnEndS(entry), model: entry.model, in: split.in, out, From e3a6d41d92349ac824196aae503ec4ed02d0e21e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 26 Jun 2026 01:27:30 -0500 Subject: [PATCH 100/111] fix(agentic): enforce slow-tail interactivity (intvty = 1/itl) end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agentic artifacts ship *_intvty under two harness definitions: slow-tail 1/p(ITL) (what the charts assume) vs fast-tail p(1/ITL), which inverts percentile order (p90 lands at ~1/p10(ITL)). Ingest stored the artifact value verbatim and the frontend only filled intvty when missing, so newer "timing fix" runs landed with the wrong definition — e.g. p90 reading 23.9 instead of 11.2 — silently contaminating cross-run Pareto comparisons. Enforce the invariant in every path: - ingest mapper: derive agentic mean/median/p75/p90/p95/p99 *_intvty from *_itl, discarding the artifact value (self-correcting ingest). - frontend agenticAliases: always derive intvty = 1/itl (override, not fill-if-missing) so overlay / ?unofficialrun= rows match. - backfill-agentic-intvty script: one-time fix for stored rows (already run against the DB: 164 rows / 656 values rewritten, 0 contaminated after). - ingest agent doc: note the invariant + the backfill escape hatch. std_intvty is intentionally left alone (reciprocal of a std is meaningless; the API strips it). Unit tests added on both the mapper and the transform. 
Co-Authored-By: Claude Opus 4.8 --- .claude/agents/ingest.md | 3 + .../app/src/lib/benchmark-transform.test.ts | 32 ++++++ packages/app/src/lib/benchmark-transform.ts | 15 ++- packages/db/package.json | 2 + packages/db/src/backfill-agentic-intvty.ts | 107 ++++++++++++++++++ packages/db/src/etl/benchmark-mapper.test.ts | 44 +++++++ packages/db/src/etl/benchmark-mapper.ts | 19 ++++ 7 files changed, 217 insertions(+), 5 deletions(-) create mode 100644 packages/db/src/backfill-agentic-intvty.ts diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md index 4ecbc1dd..59045378 100644 --- a/.claude/agents/ingest.md +++ b/.claude/agents/ingest.md @@ -157,6 +157,7 @@ If user doesn't specify a description, ask for one OR derive from the run name. - **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the `__` suffix. - **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection. - **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with prior full sweep via `UPDATE benchmark_results.date = ''` so the frontend's max-date-per-group dedup doesn't drop the older sweep. +- **Agentic interactivity normalization (`*_intvty`)**: for `agentic_traces` runs, interactivity MUST be the slow-tail reciprocal of the ITL percentile — `*_intvty = 1/*_itl` (so `p90_intvty = 1/p90_itl`). Some harness versions emit `*_intvty` as `p(1/ITL)` instead (fast-tail — inverts percentile order, e.g. p90 shows ~`1/p10(ITL)`), which silently contaminates cross-run Pareto comparisons. The ingest mapper (`benchmark-mapper.ts`) now **derives `*_intvty` from `*_itl` and discards the artifact's value** for agentic rows, so a normal ingest is self-correcting — no manual step needed. 
The frontend `agenticAliases` does the same for overlay / `?unofficialrun=` rows. If you ever load agentic data through a path that bypasses the mapper, run `pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes` (idempotent; rewrites `mean/p75/p90/p95 _intvty = 1/_itl`) then refresh the MV + purge cache. `std_intvty` is intentionally left alone (the reciprocal of a std is meaningless; the API strips it anyway). ## Process @@ -180,6 +181,8 @@ cd packages/db && DATABASE_WRITE_URL='' \ It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). +New agentic benchmark artifacts preserve AIPerf's `metadata.dataset` provenance as a top-level `dataset` object. Standard benchmark ingest automatically derives the dataset slug from `dataset.hf_dataset_name` and upserts `run_datasets`; do not manually backfill that mapping for new-format runs. Manual mapping is only needed for legacy artifacts that do not contain dataset provenance. + ## Don't - Don't push to git unless the user asked. diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index 88fb6a8b..648ebaae 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -854,3 +854,35 @@ describe('mergeRunScopedRows', () => { expect(mergeRunScopedRows([], baseRows)).toBe(baseRows); }); }); + +describe('rowToAggDataEntry — agentic interactivity invariant', () => { + // Agentic artifacts have shipped *_intvty under two definitions across harness + // versions (slow-tail 1/p(ITL) vs fast-tail p(1/ITL)). The chart's + // interactivity selector is slow-tail, so we always derive intvty = 1/itl and + // discard the artifact value. Mirrors the ingest mapper + backfill. 
+ const agentic = (metrics: Record) => + rowToAggDataEntry(makeRow({ benchmark_type: 'agentic_traces', isl: null, osl: null, metrics })); + + it('overrides an artifact-supplied (fast-tail) *_intvty with 1/*_itl', () => { + const entry = agentic({ + p90_itl: 0.0893, // slow-tail 1/itl ≈ 11.198 + p90_intvty: 23.91, // fast-tail contamination — must be discarded + p75_itl: 0.0692, + p75_intvty: 19, // must be discarded + }); + expect(entry.p90_intvty).toBeCloseTo(1 / 0.0893, 6); + expect(entry.p75_intvty).toBeCloseTo(1 / 0.0692, 6); + expect(entry.p90_intvty).not.toBeCloseTo(23.91, 1); + }); + + it('derives intvty from itl when the artifact omits intvty entirely', () => { + const entry = agentic({ p90_itl: 0.1, p95_itl: 0.2 }); + expect(entry.p90_intvty).toBeCloseTo(10, 6); + expect(entry.p95_intvty).toBeCloseTo(5, 6); + }); + + it('does not invert interactivity for single_turn rows', () => { + const entry = rowToAggDataEntry(makeRow({ metrics: { p90_itl: 0.05, p90_intvty: 999 } })); + expect(entry.p90_intvty).toBe(999); + }); +}); diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index a1c86776..cb8e3ceb 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -21,18 +21,23 @@ import type { BenchmarkRow } from '@/lib/api'; * e2el ≡ ttlt (time-to-last-token == end-to-end latency) * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output) * intvty ≡ 1/itl (tok/s from the user's perspective) - * Existing fields win if present; we only fill in the gaps. + * + * e2el/tpot only fill gaps (existing fields win). `intvty` is ALWAYS derived from + * itl, overriding any artifact-supplied value: the harness definition of + * `*_intvty` has drifted (some versions emit `p(1/ITL)`, which inverts percentile + * order), so for a slow-tail selector interactivity must be `1/p(ITL)`. 
This + * matches the ingest mapper + backfill-agentic-intvty for official rows; doing it + * here keeps overlay / `?unofficialrun=` rows (transformed live from raw + * artifacts, never through the DB) on the same definition. */ function agenticAliases(m: Record): Record { const out: Record = {}; - for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) { + for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { const itl = m[`${suffix}_itl`]; const ttlt = m[`${suffix}_ttlt`]; if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl; - if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) { - out[`${suffix}_intvty`] = 1 / itl; - } + if (itl !== undefined && itl > 0) out[`${suffix}_intvty`] = 1 / itl; } return out; } diff --git a/packages/db/package.json b/packages/db/package.json index 8b97c2c3..17d6f627 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -19,8 +19,10 @@ "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts", "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts", "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts", + "db:backfill-agentic-intvty": "dotenv -e ../../.env -- tsx src/backfill-agentic-intvty.ts", "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts", "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts", + "db:backfill-dataset-stats": "dotenv -e ../../.env -- tsx src/backfill-dataset-stats.ts", "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts", "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts", "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts", diff --git a/packages/db/src/backfill-agentic-intvty.ts b/packages/db/src/backfill-agentic-intvty.ts new file mode 
100644 index 00000000..a8eebdba --- /dev/null +++ b/packages/db/src/backfill-agentic-intvty.ts @@ -0,0 +1,107 @@ +/** + * Backfill: enforce the slow-tail interactivity invariant on agentic rows. + * + * Agentic trace-replay artifacts emit both `*_itl` and `*_intvty`. Historically + * the harness wrote `*_intvty = 1/p(ITL)` (slow-tail — "interactivity at the + * p-th latency"), which is what the inference chart's interactivity selector + * and the detail time-series both assume. A later "timing fix" harness started + * emitting `*_intvty = p(1/ITL)` instead (fast-tail — equivalent to + * `1/p(100-x)(ITL)`), because taking the reciprocal reverses percentile order. + * Ingest stores every metric verbatim, so those runs landed in the DB with the + * opposite definition — e.g. p90 reading 23.9 instead of 11.2 for the same + * point — contaminating cross-run Pareto comparisons. + * + * This rewrites `mean/p75/p90/p95 _intvty = 1/_itl` for every agentic row so the + * stored value always matches the slow-tail definition the charts use. It is + * idempotent: rows already on the correct definition are left untouched (guarded + * by a relative-deviation check). `std_intvty` is intentionally NOT touched — + * the reciprocal of a standard deviation is meaningless, and the API strips it. + * The prior fast-tail value is discarded on purpose (p10_itl isn't stored, so it + * isn't recoverable anyway, and per project policy fast-tail must not back a + * slow-tail selector). + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-intvty --yes + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js'; + +// Percentile-style keys whose interactivity is the reciprocal of the matching +// ITL percentile. `std` is excluded by design (not a reciprocal); `median`/`p99` +// are absent from agentic artifacts so they never appear here. 
+const KEYS = ['mean', 'p75', 'p90', 'p95'] as const; + +// Relative tolerance: skip rows already within 1e-6 of 1/itl so correct rows +// keep their original full-precision value and the change counts are accurate. +const REL_TOL = 1e-6; + +const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); + +async function contaminationCounts(): Promise> { + const out: Record = {}; + for (const k of KEYS) { + const rows = await sql.unsafe(` + SELECT count(*)::int AS n + FROM benchmark_results + WHERE benchmark_type = 'agentic_traces' + AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0 + AND metrics ? '${k}_intvty' + AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric) + > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric) + `); + out[k] = (rows[0] as unknown as { n: number }).n; + } + return out; +} + +async function main(): Promise { + const total = await sql<{ n: number }[]>` + SELECT count(*)::int AS n FROM benchmark_results WHERE benchmark_type = 'agentic_traces' + `; + console.log(`Agentic rows: ${total[0]!.n}`); + + const before = await contaminationCounts(); + console.log('Contaminated (intvty != 1/itl) before:', JSON.stringify(before)); + if (KEYS.every((k) => before[k] === 0)) { + console.log('Nothing to backfill — all agentic rows already satisfy intvty = 1/itl.'); + await sql.end(); + return; + } + + if (!hasYesFlag() && !(await confirm('Rewrite *_intvty = 1/*_itl for these rows? (y/N) '))) { + await sql.end(); + return; + } + + let totalUpdated = 0; + for (const k of KEYS) { + // keys are from a fixed trusted const — safe to interpolate. + const res = await sql.unsafe(` + UPDATE benchmark_results + SET metrics = jsonb_set(metrics, '{${k}_intvty}', to_jsonb(1.0 / (metrics->>'${k}_itl')::numeric)) + WHERE benchmark_type = 'agentic_traces' + AND metrics ? '${k}_itl' AND (metrics->>'${k}_itl')::numeric > 0 + AND metrics ? 
'${k}_intvty' + AND abs((metrics->>'${k}_intvty')::numeric - 1.0 / (metrics->>'${k}_itl')::numeric) + > ${REL_TOL} * (1.0 / (metrics->>'${k}_itl')::numeric) + `); + console.log(` ${k}_intvty: updated ${res.count} row(s)`); + totalUpdated += res.count; + } + + const after = await contaminationCounts(); + console.log('Contaminated after:', JSON.stringify(after)); + if (!KEYS.every((k) => after[k] === 0)) { + throw new Error('Backfill incomplete — some rows still deviate. Aborting before MV refresh.'); + } + + await refreshLatestBenchmarks(sql); + console.log(`Done. Rewrote ${totalUpdated} metric value(s) across agentic rows.`); + await sql.end(); +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts index 65fb3e39..5fe9ffde 100644 --- a/packages/db/src/etl/benchmark-mapper.test.ts +++ b/packages/db/src/etl/benchmark-mapper.test.ts @@ -22,6 +22,20 @@ function makeV1Row(overrides: Record = {}): Record { }; } +/** Minimal valid agentic row: scenario_type triggers the agentic path; `users` → conc. */ +function makeAgenticRow(overrides: Record = {}): Record { + return { + infmax_model_prefix: 'dsv4', + hw: 'b200-nv', + framework: 'vllm', + precision: 'fp4', + scenario_type: 'agentic-coding', + users: 72, + tput_per_gpu: 20000, + ...overrides, + }; +} + /** Minimal valid v2 benchmark row (disaggregated prefill/decode parallelism). 
*/ function makeV2Row(overrides: Record = {}): Record { return { @@ -570,3 +584,33 @@ describe('extractWorkers', () => { expect(extractWorkers([null, 'bad', 0, undefined])).toBeUndefined(); }); }); + +describe('mapBenchmarkRow — agentic interactivity normalization', () => { + it('derives *_intvty from 1/*_itl, discarding the artifact value', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeAgenticRow({ + p90_itl: 0.0893, + p90_intvty: 23.91, // fast-tail contamination — must be overwritten + p75_itl: 0.0692, + p75_intvty: 19, + }), + tracker, + ); + expect(result!.benchmarkType).toBe('agentic_traces'); + expect(result!.metrics.p90_intvty).toBeCloseTo(1 / 0.0893, 6); + expect(result!.metrics.p75_intvty).toBeCloseTo(1 / 0.0692, 6); + }); + + it('derives *_intvty even when the artifact omits it', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeAgenticRow({ p90_itl: 0.1 }), tracker); + expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6); + }); + + it('does not touch *_intvty for single_turn rows', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV1Row({ p90_itl: 0.05, p90_intvty: 999 }), tracker); + expect(result!.metrics.p90_intvty).toBe(999); + }); +}); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 258a5ecc..5ec3343c 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -63,6 +63,9 @@ const NON_METRIC_KEYS = new Set([ 'offload_mode', 'num_requests_total', 'num_requests_successful', + // Public-dataset provenance emitted by aiperf. The ingest runner uses this + // object to populate run_datasets; it is not a benchmark metric. + 'dataset', // per-worker measured-power array (not a numeric scalar). Surfaced as a // sibling of the metrics JSONB by mapBenchmarkRow so the metrics column // stays Record for the index signature on BenchmarkRow. 
@@ -266,6 +269,22 @@ export function mapBenchmarkRow( (metrics as Record).offload_mode = offloadModeRaw; } + // Slow-tail interactivity invariant. Agentic artifacts ship `*_intvty`, but the + // definition has drifted across harness versions: some emit `1/p(ITL)` + // (slow-tail), others `p(1/ITL)` — which inverts percentile order, so p90 comes + // out as ~1/p10(ITL) instead. The inference chart's interactivity selector and + // the detail time-series both treat interactivity as the reciprocal of the ITL + // percentile, so we derive it from `*_itl` here rather than trust the artifact, + // keeping every agentic row on one definition. `std` is excluded — the + // reciprocal of a standard deviation is meaningless. Mirrored in the frontend + // overlay path (agenticAliases) and the one-time backfill-agentic-intvty script. + if (isAgentic) { + for (const k of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) { + const itl = metrics[`${k}_itl`]; + if (typeof itl === 'number' && itl > 0) metrics[`${k}_intvty`] = 1 / itl; + } + } + // Artifact names encode '/' as '#' to avoid path separators; restore the URI. const image = row.image ? String(row.image).replaceAll('#', '/') : null; From 3ab43e6443a42aefdc21e505ad5673e018b9dc2c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 26 Jun 2026 17:51:17 -0500 Subject: [PATCH 101/111] feat(agentic): agentic-point detail, datasets, and trace-replay metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Frontend (packages/app): agentic-point detail page (server-metrics time series, derived metrics, request timeline, histograms, aggregates), datasets list/detail, and supporting hooks/charts/utilities. 
Backing (packages/db, packages/constants): trace-replay ingest + ETL (server-metrics adapters, trace-artifact discovery, dataset provenance, chart-series/aggregate-stats compute), queries (derived-agentic-metrics, trace-histograms, trace-server-metrics, request-timeline, datasets, agentic-aggregates), migration 009, and shared agentic constants. Committed together because the frontend API routes import the db query functions — a frontend-only commit would not build. Co-Authored-By: Claude Opus 4.8 --- docs/data-pipeline.md | 12 + .../kv-cache-hit-rate-anomaly.md | 113 +++++ .../e2e/agentic-point-time-series.cy.ts | 220 ++++++++ .../cypress/e2e/datasets-distributions.cy.ts | 43 ++ .../e2e/gpu-compare-agentic-detail.cy.ts | 54 ++ .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 47 ++ .../api/v1/derived-agentic-metrics/route.ts | 6 +- .../components/datasets/dataset-detail.tsx | 34 +- .../src/components/datasets/dataset-list.tsx | 10 +- .../app/src/components/datasets/format.ts | 6 + .../datasets/trace-flamegraph.test.ts | 41 +- .../components/inference/InferenceContext.tsx | 13 +- .../agentic-point/agentic-point-detail.tsx | 475 +++++++++++++++--- .../agentic-point/request-timeline.test.ts | 101 ++++ .../agentic-point/request-timeline.tsx | 169 +++++-- .../agentic-point/time-series-chart.test.ts | 175 ++++++- .../agentic-point/time-series-chart.tsx | 169 ++++++- .../inference/hooks/useChartData.ts | 8 +- .../app/src/components/inference/types.ts | 11 +- .../components/inference/ui/ChartDisplay.tsx | 81 ++- .../src/components/inference/ui/GPUGraph.tsx | 47 ++ .../components/inference/ui/ScatterGraph.tsx | 8 +- .../src/components/inference/utils.test.ts | 21 +- .../app/src/components/inference/utils.ts | 14 + .../inference/utils/tooltip-utils.test.ts | 32 ++ .../inference/utils/tooltipUtils.ts | 34 +- packages/app/src/hooks/api/use-datasets.ts | 6 + .../api/use-derived-agentic-metrics.test.ts | 13 + .../hooks/api/use-derived-agentic-metrics.ts | 26 +- 
.../src/hooks/api/use-trace-server-metrics.ts | 28 ++ .../d3-chart/layers/scatter-points.test.ts | 50 +- .../src/lib/d3-chart/layers/scatter-points.ts | 17 +- packages/constants/src/agentic.ts | 2 + packages/constants/src/index.ts | 1 + .../migrations/009_dataset_request_stats.sql | 55 ++ packages/db/src/backfill-aggregate-stats.ts | 33 +- packages/db/src/backfill-chart-series.ts | 27 +- packages/db/src/backfill-dataset-stats.ts | 115 +++++ .../src/etl/compute-aggregate-stats.test.ts | 31 +- .../db/src/etl/compute-aggregate-stats.ts | 27 + .../db/src/etl/compute-chart-series.test.ts | 89 ++++ packages/db/src/etl/compute-chart-series.ts | 105 +++- .../db/src/etl/dataset-provenance.test.ts | 40 ++ packages/db/src/etl/dataset-provenance.ts | 30 ++ .../db/src/etl/server-metrics-adapters.ts | 100 ++++ .../src/etl/trace-artifact-discovery.test.ts | 66 +++ .../db/src/etl/trace-artifact-discovery.ts | 89 ++++ packages/db/src/etl/trace-replay-ingest.ts | 6 +- packages/db/src/etl/weka-structure.test.ts | 46 +- packages/db/src/etl/weka-structure.ts | 51 +- packages/db/src/ingest-ci-run.ts | 77 ++- packages/db/src/ingest-weka-dataset.ts | 31 +- packages/db/src/queries/agentic-aggregates.ts | 4 +- packages/db/src/queries/datasets.ts | 4 + .../queries/derived-agentic-metrics.test.ts | 15 + .../db/src/queries/derived-agentic-metrics.ts | 40 +- .../db/src/queries/request-timeline.test.ts | 45 ++ packages/db/src/queries/request-timeline.ts | 28 +- .../db/src/queries/trace-histograms.test.ts | 78 +++ packages/db/src/queries/trace-histograms.ts | 67 ++- .../src/queries/trace-server-metrics.test.ts | 104 ++++ .../db/src/queries/trace-server-metrics.ts | 36 +- 62 files changed, 3231 insertions(+), 295 deletions(-) create mode 100644 docs/investigations/kv-cache-hit-rate-anomaly.md create mode 100644 packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.test.ts create mode 100644 
packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts create mode 100644 packages/constants/src/agentic.ts create mode 100644 packages/db/migrations/009_dataset_request_stats.sql create mode 100644 packages/db/src/backfill-dataset-stats.ts create mode 100644 packages/db/src/etl/dataset-provenance.test.ts create mode 100644 packages/db/src/etl/dataset-provenance.ts create mode 100644 packages/db/src/etl/server-metrics-adapters.ts create mode 100644 packages/db/src/etl/trace-artifact-discovery.test.ts create mode 100644 packages/db/src/etl/trace-artifact-discovery.ts create mode 100644 packages/db/src/queries/request-timeline.test.ts create mode 100644 packages/db/src/queries/trace-histograms.test.ts create mode 100644 packages/db/src/queries/trace-server-metrics.test.ts diff --git a/docs/data-pipeline.md b/docs/data-pipeline.md index 38e7d471..bc439e8a 100644 --- a/docs/data-pipeline.md +++ b/docs/data-pipeline.md @@ -62,6 +62,18 @@ Configs are preloaded into an in-memory Map at ingest start. `getOrCreateConfig( Unmapped models/hardware are tracked (not silently dropped) so operators can see what new GPU or model names appeared in CI artifacts. This is how new GPUs get added to the system — the skip tracker acts as a change detection mechanism. +### Server-Metric Orchestrator Adapters + +AIPerf defines the `server_metrics_export.json` envelope, but labels such as worker role and rank belong to the serving orchestrator. The chart-series ETL therefore normalizes raw series through an orchestrator-specific adapter before exposing per-worker metrics. For example, the Dynamo adapter maps `dynamo_component=prefill|backend` to canonical `prefill|decode` roles and uses the endpoint, worker ID, DP rank, and engine together as the source identity. + +Adapters are selected from the benchmark's canonical framework, and per-worker series are only emitted for disaggregated configs with a recognized adapter. 
Unknown orchestrators and non-disaggregated configs retain their aggregate-only series; roles are never guessed from ports or metric names. The frontend only consumes the canonical source identity and never interprets orchestrator-native labels. + +### Agentic Dataset Provenance + +AIPerf exports public-dataset provenance in `metadata.dataset`, including the Hugging Face dataset ID. InferenceX preserves that object as `dataset` on each agentic aggregate benchmark row. During benchmark ingest, `ingest-ci-run.ts` derives the dashboard slug from `hf_dataset_name` (for example, `semianalysisai/cc-traces-weka-062126` becomes `cc-traces-weka-062126`) and upserts `run_datasets` for the workflow run. + +Legacy artifacts without provenance leave any existing mapping untouched. A workflow run can map to only one dataset; conflicting dataset IDs fail ingest rather than silently linking the run to an arbitrary dataset. + ## Frontend Transform Pipeline ### Why transformBenchmarkRows Exists diff --git a/docs/investigations/kv-cache-hit-rate-anomaly.md b/docs/investigations/kv-cache-hit-rate-anomaly.md new file mode 100644 index 00000000..61ffee42 --- /dev/null +++ b/docs/investigations/kv-cache-hit-rate-anomaly.md @@ -0,0 +1,113 @@ +# KV cache hit-rate anomaly on agentic benchmarks (dsv4, b200, vllm) + +## Core issue + +vLLM's prefix cache should be hitting at ~98% on multi-turn agentic conversation replay (each turn extends the prior turn's context). It isn't. Something in the **dataset definition** or **aiperf replay** is producing requests whose token streams aren't actually prefix-compatible turn-to-turn. + +| Concurrency | Theoretical max hit % | vLLM actual hit % | +| ----------: | --------------------: | ----------------: | +| 1 | 97.45% | 83.15% | +| 2 | 98.34% | 46.78% | +| 4 | 97.99% | 12.43% | + +This is **not** a capacity problem. KV cache is sized at 3.29M tokens (12,868 blocks × 256). 
The conc=4 workload's unique-content footprint is **~1.11M DSV4 tokens** — would fit in ~34% util. Observed peak util is 49.8%, so the cache is holding more blocks than the workload needs, yet vLLM can't find them on lookup. + +## Data sources + +- **Benchmark points**: + - http://localhost:3002/inference/agentic/206252 (conc=1) + - http://localhost:3002/inference/agentic/206245 (conc=2) + - http://localhost:3002/inference/agentic/206247 (conc=4) +- **Neon DB**: project `silent-pond-29172997`, branch `br-cold-sky-ai0c09cy` (agentx-dev). Connection via `DATABASE_WRITE_URL` in `.env`. Console: https://console.neon.tech/app/projects/silent-pond-29172997/branches/br-cold-sky-ai0c09cy + - `agentic_trace_replay.profile_export_jsonl_gz` — gzipped aiperf per-request records + - `agentic_trace_replay.server_metrics_json_gz` — gzipped vllm per-scrape prometheus metrics + - `agentic_trace_replay.request_timeline` (jsonb) — pre-computed per-request timeline used by the simulation +- **Trace replay dataset** (the source-of-truth for "what should be cacheable"): https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-051926. Each row has pre-computed 64-token block `hash_ids` per turn; `hash_id_scope: 'local'` (per-conversation). + +## Theoretical max simulation + +For each replayed request, look up the matching turn in the HF dataset and walk a per-conversation trie of 64-token block hash IDs. Hits = longest contiguous prefix from block 0 that has appeared in any prior request (mirrors vLLM's chained-hash semantics). + +Confirms: the workload IS prefix-cacheable end-to-end. Theoretical max ≈ 98% across all three concurrency levels — same dataset, same conversations, just different dispatch order. + +## Why this points at the dataset/replay, not vLLM + +- **Capacity is not the bottleneck.** Cache holds ~3× the unique content of the workload. Cache util tops out below capacity. 
+- **The metric isn't lying.** vLLM's own counters cross-check: `prefill_kv_computed_tokens + prefix_cache_hits ≈ request_prompt_tokens` (67.85M + 9.61M ≈ 77.47M for conc=4). +- **It's not a tokenizer artifact.** DSV4 tokens are ~54% of the count of Claude tokens, but BPE is left-monotonic on stable text — hit-rate ratio is invariant to tokenizer choice for prefix-growth workloads. +- **It's not the multi-engine DP bug** we found earlier (commit `f2618f4`) — this deployment has 1 engine. + +What's left: the bytes that vLLM actually receives turn-to-turn are not the same prefix + delta that the dataset's `hash_ids` describe. Most likely culprits: + +1. **aiperf isn't sending the cumulative chat history** the way the dataset assumes — each turn is being assembled differently than the previous, breaking the byte-level prefix. +2. **Something in the request payload varies per request** (timestamps, request IDs, tool result serialization order, etc.) — invalidates block 0's hash, cascades to every subsequent block via vLLM's chained hashing. +3. **BPE re-merging across message boundaries** when aiperf re-tokenizes the full history each turn instead of appending tokens. + +## Root cause: `ConversationReconstructor` strips the prev user's `partial_tail` every turn + +The bug is in `utils/aiperf/src/aiperf/dataset/loader/weka_synth_buf.py` — specifically the **boundary case** in `truncate_synth_buf_at_block` (lines 453–464) combined with `turn_delta`'s reset logic (lines 354–360). + +What happens turn-to-turn: + +1. `init_turn_0` builds a trailing user segment whose `tokens` = `[block_aligned_tokens] + [partial_tail_tokens]` where `partial_tail_n = in_tokens % bs`. The wire prompt for turn 0 includes these tail tokens. +2. `advance_turn` computes `lcp = longest_common_prefix(prev_hash_ids, curr_hash_ids)`.
When the LCP equals the prev turn's total block count (the normal append-only case), `truncate_synth_buf_at_block` hits its boundary branch: `cursor + seg.block_count == target_blocks`. +3. That branch **strips `prev_partial_tail` tokens off the trailing user segment in place** and re-decodes its `content`. This sets `_last_disturbance_at = i` (the index of the prev trailing user segment). +4. New `assistant` + `user` segments are appended. +5. `turn_delta` sees `_last_disturbance_at < _emitted_segment_count` and forces `reset_context=True`, re-emitting **the whole conversation** with the now-stripped trailing user. + +The endpoint (`utils/aiperf/src/aiperf/endpoints/base_endpoint.py:110-140`) honors `reset_context=True` via `messages = list(turn.raw_messages)` instead of `messages.extend(...)`. + +Result: every turn sends the full chat history, but the bytes of the prev user message differ from what was sent the turn before — the trailing `partial_tail` chars are missing. vLLM tokenizes the new prompt, hashes 256-token blocks, and the chained-hash invariant breaks at the first block containing the trimmed boundary. That block + every subsequent block of the new turn miss the cache. + +### Empirical confirmation + +Reproducer at `/tmp/test-reconstructor.py` instantiates `ConversationReconstructor` with mock decoders and walks a synthetic 3-turn conversation: + +``` +=== Turn 0 === + delta msgs: 2, reset=False + wire len: 21683 + +=== Turn 1 === + delta msgs: 4, reset=True ← every turn resets + wire len: 25307 + +=== DIFF turn 0 vs turn 1 (wire-level) === + common prefix chars: 21549 / wire0 21683 (99.4%) + wire0[...] = '... 983406 12 1 133 184 16 57 71 155 37 ' ← partial_tail decoded + wire1[...] = '... 
983406<|im_end|>\n<|im_start|>assista' ← stripped, template marker next + turn0 user content len: 19812, turn1 user[0] content len: 19711 ← 101 chars stripped +``` + +Across the conc=1 run (point 206252), **280/280 (100%)** consecutive turn-pairs have `prev_in_tokens % bs != 0` — i.e., every single turn hits this boundary disturbance. + +### Why the gap widens with concurrency + +At conc=1 the gap (97.45% − 83.15% = 14pp) is roughly the fraction of each turn's blocks lost to the trimmed-tail invalidation (last user block + chat-template delta). At higher conc: + +- `reset_context=True` makes every request re-send the **entire** conversation prompt, so wire bandwidth + prefill work scale superlinearly per turn. +- Concurrent conversations all do this simultaneously; each writes long sequences of "new" blocks past their respective divergence points, evicting other conversations' usable prefix blocks even though aggregate unique content (1.11M tokens) fits comfortably in the 3.29M-token cache. + +### Fix sketch + +The boundary-cut strip exists to keep the next turn's `assistant` segment block-aligned. Two viable fixes: + +1. **Don't mutate the prev trailing user segment.** Leave its `partial_tail` tokens intact; append the new asst+user as strict-append (no reset_context). The wire-prefix becomes byte-stable turn-to-turn. Cost: the new asst content's block_start no longer aligns to the prev_hash_ids tail, so hash_id accounting for asst blocks loses 1 block of fidelity per turn. +2. **Track `partial_tail` separately** from the prev user segment so the segment's emitted content stays byte-stable, and only the trailing tail (which is regenerated each turn anyway) is allowed to vary. + +Option 1 is the minimal change. Validate with the reproducer above — remove the strip in `truncate_synth_buf_at_block`'s boundary case and re-run; turn N+1's wire prefix should equal turn N's wire byte-for-byte up to the end of the prev assistant template. 
+ +## Re-running the simulation + +```bash +# 1. dump request timelines from DB +pnpm --filter @semianalysisai/inferencex-db exec dotenv -e ../../.env -- tsx /tmp/dump-rt-multi.ts + +# 2. run analysis (needs `pip3 install --break-system-packages --user datasets`) +python3 /tmp/cache-sim-multi.py + +# 3. reproduce the partial_tail strip +python3 /tmp/test-reconstructor.py +``` + +Scripts live in `/tmp/` from this session; recreate from inline code in the previous version of this doc if missing. diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts index db59dda2..4a450f7c 100644 --- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -40,6 +40,26 @@ describe('Agentic point request metric time series', () => { timelineRequest(4, 1600, 80), timelineRequest(5, 3200, 160, { phase: 'warmup' }), timelineRequest(6, 6400, 320, { cancelled: true }), + timelineRequest(7, 0, 0, { + cid: 'conversation-1::sa:subagent_001_abcd', + credit: 1_100_000_000, + start: 1_100_000_000, + end: 1_900_000_000, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + }), + timelineRequest(8, 0, 0, { + cid: 'conversation-1::sa:subagent_001_abcd:aux:011', + credit: 1_200_000_000, + start: 1_200_000_000, + end: 1_800_000_000, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + }), ], }, }); @@ -52,6 +72,7 @@ describe('Agentic point request metric time series', () => { cy.get('[data-testid="interactivity-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') .should('have.text', 'P90'); + cy.get('[data-testid="interactivity-point-count"]').should('have.text', '5 points'); cy.get('svg circle').should('have.length', 5); cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT'); @@ -60,6 +81,7 @@ describe('Agentic point request metric time series', () => { 
cy.get('[data-testid="ttft-over-time-chart"]').within(() => { cy.contains('h2', 'TTFT over time').should('be.visible'); + cy.get('[data-testid="ttft-point-count"]').should('have.text', '5 points'); cy.get('svg circle').should('have.length', 5); cy.get('svg').should('contain.text', 'TTFT (s)'); cy.get('svg').should('contain.text', 'Cumulative P90 TTFT'); @@ -67,6 +89,34 @@ describe('Agentic point request metric time series', () => { }); }); + it('switches ISL and OSL cards from distributions to in-flight averages', () => { + cy.get('[data-testid="isl-metric-chart"]').within(() => { + cy.get('[data-testid="isl-metric-inflight"]').click(); + cy.contains('h2', 'Average ISL in flight').should('be.visible'); + cy.get('svg').should('contain.text', 'Average ISL in flight (30s avg)'); + }); + cy.get('[data-testid="osl-metric-chart"]').within(() => { + cy.get('[data-testid="osl-metric-inflight"]').click(); + cy.contains('h2', 'Average OSL in flight').should('be.visible'); + cy.contains('Retrospective: final observed OSL').should('be.visible'); + cy.get('svg').should('contain.text', 'Average OSL in flight (30s avg)'); + }); + }); + + it('switches the TTFT chart to E2E request latency over time', () => { + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="latency-metric-e2e"]').click(); + cy.contains('h2', 'E2E latency over time').should('be.visible'); + cy.get('[data-testid="e2e-point-count"]').should('have.text', '7 points'); + cy.get('svg circle').should('have.length', 7); + cy.get('svg').should('contain.text', 'E2E latency (s)'); + cy.get('svg').should('contain.text', 'Cumulative P90 E2E latency'); + + cy.get('[data-testid="latency-metric-ttft"]').click(); + cy.contains('h2', 'TTFT over time').should('be.visible'); + }); + }); + it('switches each chart independently from P90 to P75', () => { cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { cy.contains('svg', 'P90 (rolling 50 req)') @@ -97,4 +147,174 @@ 
describe('Agentic point request metric time series', () => { cy.get('svg').should('contain.text', 'Cumulative P75 TTFT'); }); }); + + it('switches the request activity card from queue depth to cumulative completions', () => { + cy.get('[data-testid="request-activity-chart"]').within(() => { + cy.contains('h2', 'Request queue depth').should('be.visible'); + cy.get('[data-testid="request-activity-completed"]').click(); + cy.contains('h2', 'Cumulative completed requests').should('be.visible'); + cy.get('svg').should('contain.text', 'Completed requests'); + cy.get('svg').should('contain.text', 'Requests'); + cy.get('[data-testid="request-activity-queue"]').click(); + cy.contains('h2', 'Request queue depth').should('be.visible'); + }); + }); + + it('shows total time with no requests in flight on the request timeline', () => { + cy.get('[data-testid="detail-view-timeline"]').click(); + cy.location('search').should('contain', 'view=timeline'); + cy.get('[data-testid="timeline-total-idle-time"]').should('have.text', 'idle 1.00s (14.3%)'); + cy.get('[data-timeline-row-kind="aux"]') + .should('have.css', 'padding-left', '24px') + .and('contain.text', 'aux 011 · parallel'); + }); + + it('restores the request timeline view after browser Back from a dataset route', () => { + cy.window().then((win) => { + win.history.pushState({}, '', '/datasets/test-dataset/conversations/conversation-1'); + }); + cy.go('back'); + cy.location('pathname').should('eq', '/inference/agentic/206885'); + cy.location('search').should('contain', 'view=timeline'); + cy.get('[data-testid="detail-view-timeline"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="timeline-total-idle-time"]').should('be.visible'); + }); + + it('shows a cumulative average for unique input tokens in flight', () => { + cy.get('[data-testid="detail-view-point"]').click(); + cy.get('[data-testid="unique-input-inflight-chart"]').within(() => { + cy.get('svg').should('contain.text', 'Cumulative average'); + 
cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + }); +}); + +const pointMeta = { + id: 206885, + hardware: 'gb200', + framework: 'dynamo-vllm', + model: 'deepseek-r1-0528', + precision: 'fp8', + spec_method: 'none', + disagg: true, + conc: 128, + offload_mode: 'off', + isl: null, + osl: null, + benchmark_type: 'agentic_traces', + date: '2026-06-23', + run_url: null, + server_gpu_cache_hit_rate: 0.5, + server_cpu_cache_hit_rate: null, +}; + +const sourceSeries = (source: Record, prompt: number, generation: number) => ({ + source, + kvCacheUsage: [ + { t: 0, value: 0.25 }, + { t: 1, value: 0.5 }, + ], + prefixCacheHitRate: [{ t: 0, value: 0.5 }], + queueDepth: [{ t: 0, running: 2, waiting: 1, total: 3 }], + promptTokensBySource: { miss: [{ t: 0, value: prompt }] }, + promptTps: [{ t: 0, value: prompt }], + generationTps: [{ t: 0, value: generation }], + prefixCacheHitsTps: [{ t: 0, value: prompt / 2 }], + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], +}); + +describe('Agentic point orchestrator metric sources', () => { + beforeEach(() => { + const prefill = sourceSeries( + { + id: 'dynamo|prefill|10.30.1.56:7500|prefill-a|0|0', + adapter: 'dynamo', + role: 'prefill', + endpointUrl: '10.30.1.56:7500', + nativeRole: 'prefill', + workerId: 'prefill-a', + dpRank: '0', + engine: '0', + }, + 100, + 1, + ); + const decode = sourceSeries( + { + id: 'dynamo|decode|10.30.1.206:7516|decode-a|0|0', + adapter: 'dynamo', + role: 'decode', + endpointUrl: '10.30.1.206:7516', + nativeRole: 'backend', + workerId: 'decode-a', + dpRank: '0', + engine: '0', + }, + 300, + 400, + ); + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/request-timeline*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { + body: { + meta: pointMeta, + startNs: 0, + endNs: 2_000_000_000, + durationS: 2, + timeslicesCount: 2, + 
kvCacheUsage: prefill.kvCacheUsage, + prefixCacheHitRate: prefill.prefixCacheHitRate, + queueDepth: prefill.queueDepth, + promptTokensBySource: prefill.promptTokensBySource, + prefillTps: prefill.promptTps, + decodeTps: decode.generationTps, + prefixCacheHitsTps: prefill.prefixCacheHitsTps, + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], + metricSources: [prefill, decode], + }, + }); + cy.visit('/inference/agentic/206885'); + }); + + it('switches every server chart to an orchestrator-normalized worker', () => { + cy.get('[data-testid="metric-source-toolbar"]') + .should('have.css', 'position', 'sticky') + .and('have.css', 'top', '64px'); + cy.get('[data-testid="metric-source-select"]').should('contain.text', 'All endpoints').click(); + cy.contains('[role="option"]', 'Decode · decode-a').click(); + + cy.get('[data-testid="metric-source-select"]').should('contain.text', 'Decode · decode-a'); + cy.contains('h2', 'Throughput · Decode · decode-a').should('be.visible'); + cy.contains('svg', 'Decode (avg n=50)').should('be.visible'); + + cy.get('[data-testid="metric-source-select"]').click(); + cy.contains('[role="option"]', 'Prefill · prefill-a').click(); + cy.contains('h2', 'Throughput · Prefill · prefill-a').should('be.visible'); + }); + + it('toggles input and decode independently while keeping one visible', () => { + cy.get('[data-testid="throughput-series-input"]') + .should('have.attr', 'aria-pressed', 'true') + .and('not.be.disabled'); + cy.get('[data-testid="throughput-series-decode"]') + .should('have.attr', 'aria-pressed', 'true') + .and('not.be.disabled'); + cy.contains('svg', 'Input (avg n=50)').should('be.visible'); + cy.contains('svg', 'Decode (avg n=50)').should('be.visible'); + cy.contains('svg', 'Total running avg (60s burn-in)').should('be.visible'); + + cy.get('[data-testid="throughput-series-input"]').click(); + cy.get('[data-testid="throughput-series-input"]').should('have.attr', 'aria-pressed', 'false'); + 
cy.get('[data-testid="throughput-series-decode"]').should('be.disabled'); + cy.contains('svg', 'Input (avg n=50)').should('not.exist'); + cy.contains('svg', 'Total running avg (60s burn-in)').should('not.exist'); + + cy.get('[data-testid="throughput-series-input"]').click(); + cy.get('[data-testid="throughput-series-decode"]').click(); + cy.get('[data-testid="throughput-series-input"]').should('be.disabled'); + cy.get('[data-testid="throughput-series-decode"]').should('have.attr', 'aria-pressed', 'false'); + }); }); diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts index 7edda341..6ce4bc34 100644 --- a/packages/app/cypress/e2e/datasets-distributions.cy.ts +++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts @@ -33,6 +33,10 @@ describe('Dataset distribution percentiles', () => { mainTurns: 20, subagentGroups: 0, subagentTurns: 0, + medianRequestsPerConversation: 12, + meanRequestsPerConversation: 14.6, + medianSubagentsPerTrace: 3, + meanSubagentsPerTrace: 4.8, cachedPct: 0.5, totalIn: 1000, totalOut: 200, @@ -60,6 +64,20 @@ describe('Dataset distribution percentiles', () => { p95: 256, max: 512, }), + subagentInputTokensPerRequest: distribution({ + median: 1000, + p75: 2000, + p90: 3000, + p95: 4000, + max: 5000, + }), + subagentOutputTokensPerRequest: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), }, ingested_at: '2026-06-23T00:00:00Z', }, @@ -87,4 +105,29 @@ describe('Dataset distribution percentiles', () => { }); } }); + + it('shows median and mean model requests per conversation', () => { + cy.contains('dt', 'Median requests / convo').next('dd').should('have.text', '12'); + cy.contains('dt', 'Mean requests / convo').next('dd').should('have.text', '14.6'); + }); + + it('summarizes subagents per trace instead of charting group counts', () => { + cy.contains('dt', 'Median subagents / trace').next('dd').should('have.text', '3'); + cy.contains('dt', 
'Mean subagents / trace').next('dd').should('have.text', '4.8'); + cy.contains('Subagent groups per conversation').should('not.exist'); + }); + + it('shows ISL and OSL distributions for inner subagent requests only', () => { + const expected = [ + ['Subagent request ISL', ['p50 1.0k', 'p75 2.0k', 'p90 3.0k', 'p95 4.0k']], + ['Subagent request OSL', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ] as const; + + for (const [title, percentiles] of expected) { + cy.contains('[data-slot="card"]', title).within(() => { + cy.contains('Inner subagent requests only').should('be.visible'); + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + }); + } + }); }); diff --git a/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts new file mode 100644 index 00000000..d574dd2a --- /dev/null +++ b/packages/app/cypress/e2e/gpu-compare-agentic-detail.cy.ts @@ -0,0 +1,54 @@ +describe('GPU comparison agentic point detail', () => { + it('exposes the per-point charts as a normal browser link', () => { + cy.intercept('GET', '/api/v1/trace-availability*', (request) => { + const ids = new URL(request.url).searchParams.get('ids')?.split(',') ?? 
[]; + if (ids.length < 20) request.alias = 'gpuTraceAvailability'; + request.continue(); + }); + + cy.visit('/inference?g_model=DeepSeek-V4-Pro&i_seq=agentic-traces&i_prec=fp4', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }, + }); + + cy.get('[data-testid="gpu-multiselect"] [role="combobox"]').click({ force: true }); + cy.get('[role="option"]').first().click(); + cy.contains('button', 'Select date range').click(); + cy.get('body').then(($body) => { + if ($body.text().includes('View anyway')) { + cy.contains('button', 'View anyway').click(); + } else { + cy.contains('button', 'Max Range').click(); + cy.contains('button', 'Apply').click(); + } + }); + + cy.get('[data-testid="gpu-graph"]').first().should('be.visible'); + cy.wait('@gpuTraceAvailability'); + cy.wait(100); + cy.get('[data-testid="gpu-graph"]') + .first() + .find('svg .dot-group') + .should('have.length.greaterThan', 0) + .first() + .then(($point) => { + const point = $point[0] as unknown as SVGElement & { + __data__: { benchmark_type?: string; id?: number }; + }; + expect(point.__data__.benchmark_type).to.equal('agentic_traces'); + expect(point.__data__.id).to.be.a('number'); + cy.wrap($point).find('.visible-shape').click({ force: true }); + }); + + cy.get('[data-chart-tooltip]:visible').should('have.length', 1); + cy.get('[data-chart-tooltip]:visible [data-action="view-charts"]') + .should('be.visible') + .then(($link) => { + expect($link).to.match('a'); + expect($link).not.to.have.attr('target'); + expect($link.attr('href')).to.match(/^\/inference\/agentic\/\d+$/u); + }); + cy.location('pathname').should('eq', '/inference'); + }); +}); diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index df199b81..924ff9a9 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -1,3 +1,23 @@ +const interceptDerivedMetrics = 
() => { + cy.intercept('GET', '/api/v1/derived-agentic-metrics*', (request) => { + const ids = new URL(request.url).searchParams.get('ids')?.split(',').filter(Boolean) ?? []; + request.reply({ + body: Object.fromEntries( + ids.map((id, index) => [ + id, + { + id: Number(id), + normalized_session_time_s: 60 + index, + p90_prefill_tps_per_user: 100 + index, + p75_normalized_e2e_400_s: 8 + index, + p90_normalized_e2e_400_s: 12 + index, + }, + ]), + ), + }); + }).as('derivedAgenticMetrics'); +}; + describe('X-Axis Mode Toggle (inference chart)', () => { before(() => { cy.visit('/inference', { @@ -13,6 +33,7 @@ describe('X-Axis Mode Toggle (inference chart)', () => { cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces'); cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should('be.visible'); cy.get('[data-testid="x-axis-mode-interactivity"]') .should('be.visible') .and('have.attr', 'aria-selected', 'true'); @@ -31,6 +52,32 @@ describe('X-Axis Mode Toggle (inference chart)', () => { cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency'); }); + it('switches to request-level normalized E2E at 400 output tokens', () => { + interceptDerivedMetrics(); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').click(); + cy.wait('@derivedAgenticMetrics'); + cy.get('[data-testid="x-axis-mode-normalized-e2e"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should( + 'contain.text', + 'P90 Normalized E2E @ 400 output tokens', + ); + cy.get('[data-testid="chart-figure"] svg').should( + 'contain.text', + 'P90 Normalized E2E @ 400 output tokens (s)', + ); + + cy.get('[data-testid="percentile-selector"]').click(); + cy.contains('[role="option"]', 'p75').click(); + cy.get('[data-testid="chart-figure"] h2').should( + 'contain.text', + 'P75 
Normalized E2E @ 400 output tokens', + ); + }); + it('switches back to Interactivity', () => { cy.get('[data-testid="x-axis-mode-interactivity"]').click(); cy.get('[data-testid="x-axis-mode-interactivity"]').should( diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts index 6ce7c017..6f7ab1ce 100644 --- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts +++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts @@ -13,12 +13,12 @@ export const dynamic = 'force-dynamic'; // blobOnly: the response is one entry per id with two numbers, but the // derivation work parses thousands of JSONL records per blob — cache the // computed result so a chart-refresh hits the warm path. -// Bumped to v2 when mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user. +// Bumped to v3 for per-request normalized-E2E @ 400 output tokens. // Stale v1 cache entries return undefined for the new field and silently // blank the chart with "No data available". const getCachedDerivedAgenticMetrics = cachedQuery( (ids: number[]): Promise => getDerivedAgenticMetrics(getDb(), ids), - 'derived-agentic-metrics-v2', + 'derived-agentic-metrics-v3', { blobOnly: true }, ); @@ -33,6 +33,8 @@ const MAX_IDS_PER_REQUEST = 200; * (Σ per-turn request_latency) rescaled by mean_load / session_load. * - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT) * across every turn in every session. + * - p75/p90_normalized_e2e_400_s: percentile of per-request + * TTFT + 399 × observed ITL. * * Ids without a trace_replay blob or with unparseable records are omitted. 
*/ diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index ac8b2de5..573e2f6b 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -18,7 +18,7 @@ import { type ConversationSort, } from '@/hooks/api/use-datasets'; import { track } from '@/lib/analytics'; -import { compact } from './format'; +import { compact, perConversation } from './format'; const PAGE = 50; @@ -97,11 +97,22 @@ export function DatasetDetail({ slug }: { slug: string }) { {/* summary stats */} -
+
+ + - - + + + + + diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts index f6f5530c..f42dceb6 100644 --- a/packages/app/src/components/datasets/format.ts +++ b/packages/app/src/components/datasets/format.ts @@ -10,3 +10,9 @@ export function compact(n: number): string { if (abs > 0 && abs < 1) return n.toFixed(2); return String(Math.round(n)); } + +/** Format a per-conversation count without hiding a meaningful fractional mean. */ +export function perConversation(n: number | undefined): string { + if (typeof n !== 'number' || !Number.isFinite(n)) return '—'; + return n.toLocaleString(undefined, { maximumFractionDigits: 1 }); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts index 00293c00..2ead726b 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.test.ts +++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { formatElapsedTime } from './trace-flamegraph'; +import { findRequestOverlapGroups, formatElapsedTime } from './trace-flamegraph'; describe('formatElapsedTime', () => { it('formats elapsed seconds below and above one hour', () => { @@ -14,3 +14,42 @@ describe('formatElapsedTime', () => { expect(formatElapsedTime(-5)).toBe('00:00'); }); }); + +describe('findRequestOverlapGroups', () => { + it('keeps non-transitive overlap chains as separate groups', () => { + const groups = findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 8 }, + { key: 'B', startS: 5, endS: 11 }, + { key: 'C', startS: 9, endS: 15 }, + ]); + + expect(groups.map((group) => group.requestKeys)).toEqual([ + ['A', 'B'], + ['B', 'C'], + ]); + expect(groups.map(({ startS, endS }) => [startS, endS])).toEqual([ + [5, 8], + [9, 11], + ]); + }); + + it('does not consider touching or invalid intervals parallel', () => { + expect( + 
findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 5 }, + { key: 'B', startS: 5, endS: 8 }, + { key: 'missing-end', startS: 3 }, + { key: 'zero-duration', startS: 4, endS: 4 }, + ]), + ).toEqual([]); + }); + + it('returns only the maximal simultaneous set for nested intervals', () => { + const groups = findRequestOverlapGroups([ + { key: 'A', startS: 1, endS: 10 }, + { key: 'B', startS: 2, endS: 8 }, + { key: 'C', startS: 3, endS: 7 }, + ]); + expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]); + }); +}); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index ddb923b8..3dddb5dd 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -159,10 +159,17 @@ export function InferenceProvider({ // computing a kind-based default here would diverge between server and client // and cause a hydration mismatch. The scenario-kind default is applied in a // post-mount effect below (and a ref tracks whether the user has overridden). 
- type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; + type XAxisMode = + | 'ttft' + | 'e2e' + | 'normalized-e2e' + | 'interactivity' + | 'session-time' + | 'prefill-tps'; const VALID_X_MODES: XAxisMode[] = [ 'ttft', 'e2e', + 'normalized-e2e', 'interactivity', 'session-time', 'prefill-tps', @@ -544,7 +551,9 @@ export function InferenceProvider({ const kind = sequenceKind(effectiveSequence); const isInitialMount = lastSeqKindRef.current === null; const isAgenticOnlyMode = - selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps'; + selectedXAxisMode === 'normalized-e2e' || + selectedXAxisMode === 'session-time' || + selectedXAxisMode === 'prefill-tps'; // On a stale render where kind hasn't changed, bail unless the current // mode is agentic-only and we just landed on a fixed-seq scenario — in // that case force the snap so the chart doesn't try to plot trace-derived diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index e1bc1524..77d87997 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -1,8 +1,8 @@ 'use client'; import Link from 'next/link'; -import { useRouter } from 'next/navigation'; -import { useState } from 'react'; +import { usePathname, useRouter, useSearchParams } from 'next/navigation'; +import { useCallback, useState } from 'react'; import { ArrowLeft } from 'lucide-react'; import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates'; @@ -10,12 +10,20 @@ import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-reques import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; import { useTraceServerMetrics, + type MetricSource, type PointMeta, type QueueDepthPoint, type TimeSeriesPoint, } from 
'@/hooks/api/use-trace-server-metrics'; import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings'; import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; import { track } from '@/lib/analytics'; import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart'; @@ -26,16 +34,20 @@ import { SiblingNav, chipLabel } from './sibling-nav'; import { StackedAreaChart, TimeSeriesChart, - cumulativeAverage, + averageSequenceLengthInFlight, + buildThroughputChartSeries, + cumulativeCompletedRequests, cumulativeDifferenceMonotonic, + cumulativeTimeAverage, cumulativeUniqueInputTokens, inflightUniqueTokens, rollingAverage, rollingRequestMetric, - sumSeries, timeRollingAverage, + toggleThroughputSeries, type RequestMetric, type RequestPercentile, + type ThroughputSeriesKey, } from './time-series-chart'; interface Props { @@ -112,17 +124,56 @@ const DP_RANK_PALETTE = [ ]; type DetailView = 'point' | 'timeline' | 'aggregates'; +type RequestActivityView = 'queue' | 'completed'; +type SequenceMetricView = 'distribution' | 'inflight'; const VIEW_OPTIONS: SegmentedToggleOption[] = [ { value: 'point', label: 'Per-point', testId: 'detail-view-point' }, { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' }, { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' }, ]; +const isDetailView = (value: string | null): value is DetailView => + value === 'point' || value === 'timeline' || value === 'aggregates'; + const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption[] = [ { value: 'p75', label: 'P75' }, { value: 'p90', label: 'P90' }, ]; +const LATENCY_METRIC_OPTIONS: SegmentedToggleOption<'ttft' | 'e2e'>[] = [ + { value: 'ttft', label: 'TTFT', testId: 'latency-metric-ttft' }, + { value: 'e2e', label: 'E2E', testId: 
'latency-metric-e2e' }, +]; + +const REQUEST_ACTIVITY_OPTIONS: SegmentedToggleOption[] = [ + { value: 'queue', label: 'Queue depth', testId: 'request-activity-queue' }, + { value: 'completed', label: 'Completed', testId: 'request-activity-completed' }, +]; + +const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption[] = [ + { value: 'distribution', label: 'Distribution' }, + { value: 'inflight', label: 'In-flight avg' }, +]; + +const SOURCE_ROLE_LABEL: Record = { + router: 'Router', + prefill: 'Prefill', + decode: 'Decode', + combined: 'Combined', + unknown: 'Unknown', +}; + +export function metricSourceLabel(source: MetricSource): string { + const instance = + source.workerId ?? + (source.dpRank ? `DP ${source.dpRank}` : null) ?? + source.endpointUrl ?? + (source.engine ? `engine ${source.engine}` : null); + return instance + ? `${SOURCE_ROLE_LABEL[source.role]} · ${instance}` + : SOURCE_ROLE_LABEL[source.role]; +} + // Unofficial-run overlays cannot open this persisted point-detail route: they // have no benchmark_results id or stored request timeline. These charts are // therefore intentionally limited to DB-backed agentic points. @@ -131,33 +182,68 @@ function RequestMetricOverTime({ metric, timeline, isLoading, + latencySelector = false, }: { title: string; metric: RequestMetric; timeline: RequestTimeline | null | undefined; isLoading: boolean; + latencySelector?: boolean; }) { const [percentile, setPercentile] = useState('p90'); - const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null; - const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity'; - const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4'; + const [latencyMetric, setLatencyMetric] = useState<'ttft' | 'e2e'>('ttft'); + const selectedMetric = latencySelector ? latencyMetric : metric; + const result = timeline + ? rollingRequestMetric(timeline.requests, selectedMetric, percentile, 50) + : null; + const metricLabel = + selectedMetric === 'ttft' ? 
'TTFT' : selectedMetric === 'e2e' ? 'E2E latency' : 'Interactivity'; + const color = + selectedMetric === 'ttft' ? '#f59e0b' : selectedMetric === 'e2e' ? '#a855f7' : '#06b6d4'; + const pointCount = result?.raw.length; + const isLatency = selectedMetric !== 'interactivity'; const controls = ( - { - setPercentile(value); - track('inference_agentic_percentile_changed', { metric, percentile: value }); - }} - ariaLabel={`${metricLabel} percentile`} - testId={`${metric}-percentile-toggle`} - /> +
+ {latencySelector && ( + { + setLatencyMetric(value); + track('inference_agentic_latency_metric_changed', { metric: value }); + }} + ariaLabel="Latency metric" + testId="latency-metric-toggle" + /> + )} + + {pointCount === undefined + ? '— points' + : `${pointCount.toLocaleString()} ${pointCount === 1 ? 'point' : 'points'}`} + + { + setPercentile(value); + track('inference_agentic_percentile_changed', { + metric: selectedMetric, + percentile: value, + }); + }} + ariaLabel={`${metricLabel} percentile`} + testId={`${selectedMetric}-percentile-toggle`} + /> +
); return ( { @@ -174,10 +260,9 @@ function RequestMetricOverTime({ strokeWidth: 2.5, }, { - name: - metric === 'ttft' - ? `Cumulative ${percentile.toUpperCase()} TTFT` - : `1 / cumulative ${percentile.toUpperCase()} TPOT`, + name: isLatency + ? `Cumulative ${percentile.toUpperCase()} ${metricLabel}` + : `1 / cumulative ${percentile.toUpperCase()} TPOT`, data: result?.cumulative ?? [], color: '#ef4444', strokeWidth: 3, @@ -185,11 +270,11 @@ function RequestMetricOverTime({ ]} durationS={timeline.durationS} yFmt={ - metric === 'ttft' + isLatency ? (value) => `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s` : (value) => `${value.toFixed(0)}` } - yAxisLabel={metric === 'ttft' ? 'TTFT (s)' : 'Interactivity (tok/s/user)'} + yAxisLabel={isLatency ? `${metricLabel} (s)` : 'Interactivity (tok/s/user)'} {...size} /> ); @@ -198,6 +283,79 @@ function RequestMetricOverTime({ ); } +function SequenceMetricCard({ + metric, + values, + timeline, + histogramLoading, + timelineLoading, +}: { + metric: 'isl' | 'osl'; + values: readonly number[] | undefined; + timeline: RequestTimeline | null | undefined; + histogramLoading: boolean; + timelineLoading: boolean; +}) { + const [view, setView] = useState('distribution'); + const acronym = metric.toUpperCase(); + const fullName = metric === 'isl' ? 'Input sequence length' : 'Output sequence length'; + const testPrefix = `${metric}-metric`; + return ( + ({ + ...option, + testId: `${testPrefix}-${option.value}`, + }))} + onValueChange={(value) => { + setView(value); + track('inference_agentic_sequence_metric_view_changed', { metric, view: value }); + }} + ariaLabel={`${acronym} chart view`} + testId={`${testPrefix}-toggle`} + buttonClassName="px-2 py-1 text-xs" + /> + } + render={(expanded) => { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (view === 'distribution') { + if (values) return ; + return histogramLoading ? : ; + } + if (!timeline) return timelineLoading ? 
: ; + const raw = averageSequenceLengthInFlight(timeline.requests, metric); + return ( +
+ {metric === 'osl' && ( +

+ Retrospective: final observed OSL is assigned across each request's lifetime. +

+ )} + +
+ ); + }} + /> + ); +} + /** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */ function toAggPoint( sibling: { id: number; label: string }, @@ -216,6 +374,8 @@ function toAggPoint( export function AgenticPointDetail({ id }: Props) { const router = useRouter(); + const pathname = usePathname(); + const searchParams = useSearchParams(); const histQuery = useTraceHistograms([id], true); const metricsQuery = useTraceServerMetrics(id, true); const siblingsQuery = useBenchmarkSiblings(id); @@ -224,7 +384,24 @@ export function AgenticPointDetail({ id }: Props) { const metrics = metricsQuery.data; const siblingsData = siblingsQuery.data; - const [view, setView] = useState('point'); + const requestedView = searchParams.get('view'); + const view: DetailView = isDetailView(requestedView) ? requestedView : 'point'; + const setView = useCallback( + (nextView: DetailView) => { + const nextParams = new URLSearchParams(searchParams.toString()); + if (nextView === 'point') nextParams.delete('view'); + else nextParams.set('view', nextView); + const query = nextParams.toString(); + router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false }); + track('inference_agentic_detail_view_changed', { view: nextView }); + }, + [pathname, router, searchParams], + ); + const [metricSourceId, setMetricSourceId] = useState('all'); + const [requestActivityView, setRequestActivityView] = useState('queue'); + const [throughputSeries, setThroughputSeries] = useState>( + () => new Set(['input', 'decode']), + ); // Fetch aggregates only when the aggregates view is active. Uses the full // sibling set (across parallelism + concurrency configs) so each chart // shows how the metric varies across the SKU. @@ -234,6 +411,21 @@ export function AgenticPointDetail({ id }: Props) { // "Unique input tokens in flight" chart, so fetch whenever we're on // either view. 
const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point'); + const metricSources = metrics?.metricSources ?? []; + const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId); + const serverSeries = selectedMetricSource + ? { + kvCacheUsage: selectedMetricSource.kvCacheUsage, + prefixCacheHitRate: selectedMetricSource.prefixCacheHitRate, + queueDepth: selectedMetricSource.queueDepth, + promptTokensBySource: selectedMetricSource.promptTokensBySource, + prefillTps: selectedMetricSource.promptTps, + decodeTps: selectedMetricSource.generationTps, + prefixCacheHitsTps: selectedMetricSource.prefixCacheHitsTps, + hostKvCacheUsage: selectedMetricSource.hostKvCacheUsage, + kvCacheUsageByEngine: selectedMetricSource.kvCacheUsageByEngine, + } + : metrics; return (
@@ -297,6 +489,48 @@ export function AgenticPointDetail({ id }: Props) { )}
+ {view === 'point' && metricSources.length > 1 && ( +
+ Server metrics + +
+ )} + {view === 'aggregates' ? ( - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (hist) return ; - return histQuery.isLoading ? : ; - }} + - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (hist) return ; - return histQuery.isLoading ? : ; - }} + { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics) return ; + if (!metrics || !serverSeries) return ; // For SGLang hicache rows we have both GPU (HBM) util and // host (CPU offload pool) util — overlay them as two lines. - const hasHost = metrics.hostKvCacheUsage.length > 0; + const hasHost = serverSeries.hostKvCacheUsage.length > 0; // DEP runs report one series per engine. When there's more // than one, draw one line per rank in distinct colors so // load skew is visible at a glance; cluster-average sits on // top in white so it stands out. - const perEngine = metrics.kvCacheUsageByEngine ?? []; + const perEngine = serverSeries.kvCacheUsageByEngine ?? []; const hasPerEngine = perEngine.length > 1; // Render order matters: per-engine first → average drawn on top. const series = [ @@ -385,10 +618,10 @@ export function AgenticPointDetail({ id }: Props) { : hasPerEngine ? 'Avg' : 'GPU KV cache (avg n=50)', - data: rollingAverage(metrics.kvCacheUsage, 50), + data: rollingAverage(serverSeries.kvCacheUsage, 50), // Skip raw scatter when per-engine overlay is on — the // DP-rank lines already convey the spread, dots would be noise. - rawData: hasPerEngine ? undefined : metrics.kvCacheUsage, + rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage, // Bold red Avg sits on top of the translucent per-DP lines. // DP 1 in the palette is #ef4444 (lighter red); the darker // #dc2626 here plus the heavier stroke keeps it distinct. @@ -399,8 +632,8 @@ export function AgenticPointDetail({ id }: Props) { ? 
[ { name: 'CPU offload pool (avg n=50)', - data: rollingAverage(metrics.hostKvCacheUsage, 50), - rawData: metrics.hostKvCacheUsage, + data: rollingAverage(serverSeries.hostKvCacheUsage, 50), + rawData: serverSeries.hostKvCacheUsage, color: '#f97316', strokeWidth: 2, }, @@ -421,17 +654,55 @@ export function AgenticPointDetail({ id }: Props) { /> { + setRequestActivityView(value); + track('inference_agentic_request_activity_changed', { view: value }); + }} + ariaLabel="Request activity metric" + testId="request-activity-toggle" + buttonClassName="px-2 py-1 text-xs" + /> + } render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics) return ; + if (requestActivityView === 'completed') { + if (!timelineQuery.data) { + return timelineQuery.isLoading ? : ; + } + return ( + + ); + } + if (!metrics || !serverSeries) return ; return ( ({ + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ t: p.t, value: p.running, })), @@ -443,7 +714,7 @@ export function AgenticPointDetail({ id }: Props) { { name: 'Waiting (avg n=50)', data: rollingAverage( - metrics.queueDepth.map((p: QueueDepthPoint) => ({ + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ t: p.t, value: p.waiting, })), @@ -455,7 +726,7 @@ export function AgenticPointDetail({ id }: Props) { { name: 'Total (avg n=50)', data: rollingAverage( - metrics.queueDepth.map((p: QueueDepthPoint) => ({ + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ t: p.t, value: p.total, })), @@ -477,14 +748,14 @@ export function AgenticPointDetail({ id }: Props) { title="Prefix cache hit rate per interval" render={(expanded) => { const size = expanded ? 
CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics) return ; + if (!metrics || !serverSeries) return ; return ( + {( + [ + ['input', 'Input'], + ['decode', 'Decode'], + ] as const + ).map(([key, label]) => { + const active = throughputSeries.has(key); + const isOnlyActive = active && throughputSeries.size === 1; + return ( + + ); + })} +
+ } render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics) return ; - const total = sumSeries(metrics.prefillTps, metrics.decodeTps); + if (!metrics || !serverSeries) return ; return ( { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics) return ; + if (!metrics || !serverSeries) return ; return ( @@ -554,7 +853,7 @@ export function AgenticPointDetail({ id }: Props) { title="Total unique input tokens over time" render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics) return ; + if (!metrics || !serverSeries) return ; // Unique = total prompt tokens received minus tokens served from // any cache tier — i.e. the freshly prefill-computed tokens. Prefer // the promptTokensBySource breakdown (its buckets sum to the real @@ -564,11 +863,16 @@ export function AgenticPointDetail({ id }: Props) { // tokens across scheduler passes, so its cumulative can exceed the // prompt tokens received, driving the diff negative and freezing // the monotonic-clamped line after a few seconds. - const uniqueFromBreakdown = cumulativeUniqueInputTokens(metrics.promptTokensBySource); + const uniqueFromBreakdown = cumulativeUniqueInputTokens( + serverSeries.promptTokensBySource, + ); const uniqueData = uniqueFromBreakdown.length > 0 ? uniqueFromBreakdown - : cumulativeDifferenceMonotonic(metrics.prefillTps, metrics.prefixCacheHitsTps); + : cumulativeDifferenceMonotonic( + serverSeries.prefillTps, + serverSeries.prefixCacheHitsTps, + ); return ( { const size = expanded ? 
CHART_SIZES.expanded : CHART_SIZES.inline; if (!timelineQuery.data) { @@ -613,6 +918,12 @@ export function AgenticPointDetail({ id }: Props) { color: '#a855f7', strokeWidth: 2, }, + { + name: 'Cumulative average', + data: cumulativeTimeAverage(raw), + color: '#ef4444', + strokeWidth: 3, + }, ]} durationS={timelineQuery.data.durationS} yAxisLabel="Tokens" diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts new file mode 100644 index 00000000..d15da878 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -0,0 +1,101 @@ +import { describe, expect, it } from 'vitest'; + +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { buildRequestTimelineRows, requestIdleStats, splitTimelineCid } from './request-timeline'; + +const request = (start: number, end: number): RequestRecord => ({ + cid: 'conversation', + ti: start, + wid: 'worker', + ad: 0, + phase: 'profiling', + credit: start, + start, + ack: null, + end, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + cancelled: false, +}); + +describe('requestIdleStats', () => { + it('sums only gaps where no requests overlap', () => { + expect( + requestIdleStats([ + request(0, 10), + request(5, 20), + request(30, 40), + request(35, 50), + request(70, 80), + ]), + ).toEqual({ idleNs: 30, spanNs: 80 }); + }); + + it('handles unsorted and nested requests without double-counting busy time', () => { + expect(requestIdleStats([request(20, 30), request(0, 100), request(10, 40)])).toEqual({ + idleNs: 0, + spanNs: 100, + }); + }); + + it('does not count time before the first start or after the final end', () => { + expect(requestIdleStats([request(100, 200), request(300, 400)])).toEqual({ + idleNs: 100, + spanNs: 300, + }); + }); + + it('returns zeroes for an empty timeline', () => { + expect(requestIdleStats([])).toEqual({ idleNs: 0, 
spanNs: 0 }); + }); +}); + +describe('subagent timeline hierarchy', () => { + it('parses aux lanes separately from their parent subagent id', () => { + expect(splitTimelineCid('conv::sa:subagent_001_abcd:aux:011')).toEqual({ + parent: 'conv', + subagentBase: 'subagent_001_abcd', + stream: null, + aux: '011', + }); + }); + + it('renders aux requests as always-visible children of their subagent', () => { + const records = [ + { ...request(0, 10), cid: 'conv' }, + { ...request(10, 30), cid: 'conv::sa:subagent_001_abcd' }, + { ...request(12, 20), cid: 'conv::sa:subagent_001_abcd:aux:011' }, + { ...request(14, 24), cid: 'conv::sa:subagent_001_abcd:aux:012' }, + { ...request(40, 50), cid: 'conv::sa:subagent_002_ef01' }, + ]; + + const rows = buildRequestTimelineRows(records, 'conversation', new Set()); + expect(rows.map(({ kind, depth }) => ({ kind, depth }))).toEqual([ + { kind: 'parent', depth: 0 }, + { kind: 'subagent', depth: 1 }, + { kind: 'aux', depth: 2 }, + { kind: 'aux', depth: 2 }, + { kind: 'subagent', depth: 1 }, + ]); + expect(rows[1]!.requests.map((record) => record.cid)).toEqual(['conv::sa:subagent_001_abcd']); + expect(rows[1]!.auxCount).toBe(2); + expect(rows[2]!.label).toBe('aux 011 · parallel'); + expect(rows[3]!.label).toBe('aux 012 · parallel'); + }); + + it('keeps aux lanes visible while primary streams remain collapsed', () => { + const records = [ + { ...request(10, 20), cid: 'conv::sa:subagent_001_abcd:s0' }, + { ...request(12, 22), cid: 'conv::sa:subagent_001_abcd:s1' }, + { ...request(14, 18), cid: 'conv::sa:subagent_001_abcd:aux:001' }, + ]; + + const rows = buildRequestTimelineRows(records, 'conversation', new Set()); + expect(rows.map((row) => row.kind)).toEqual(['parent', 'subagent', 'aux']); + expect(rows[1]!.requests).toHaveLength(2); + expect(rows[2]!.requests).toHaveLength(1); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx 
b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index baf3dc1f..bdf0a9b9 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -32,6 +32,35 @@ export function subagentIdOf(cid: string): string | null { return colon === -1 ? raw : raw.slice(0, colon); } +export interface RequestIdleStats { + /** Total time between the first start and last end with no request running. */ + idleNs: number; + /** Wall-clock span from the first request start to the final request end. */ + spanNs: number; +} + +/** + * Merge request intervals and sum the gaps between them. Queue time before a + * request starts is intentionally excluded: "in flight" means [start, end]. + */ +export function requestIdleStats(requests: readonly RequestRecord[]): RequestIdleStats { + const intervals = requests + .filter(({ start, end }) => Number.isFinite(start) && Number.isFinite(end) && end >= start) + .map(({ start, end }) => ({ start, end })) + .toSorted((a, b) => a.start - b.start || a.end - b.end); + if (intervals.length === 0) return { idleNs: 0, spanNs: 0 }; + + const firstStart = intervals[0]!.start; + let mergedEnd = intervals[0]!.end; + let idleNs = 0; + for (let i = 1; i < intervals.length; i++) { + const interval = intervals[i]!; + if (interval.start > mergedEnd) idleNs += interval.start - mergedEnd; + if (interval.end > mergedEnd) mergedEnd = interval.end; + } + return { idleNs, spanNs: mergedEnd - firstStart }; +} + /** * Gantt-style request timeline for one agentic benchmark point. * @@ -95,10 +124,12 @@ const PHASE_COLORS: Record = { * when collapsed. * stream — one :sN stream of a multi-stream subagent (depth 2). * Hidden by default; toggled in via the parent's chevron. + * aux — one :aux:N parallel lane (depth 2). Always visible + * beneath its owning subagent. 
*/ -type RowKind = 'parent' | 'worker' | 'subagent' | 'stream'; +type RowKind = 'parent' | 'worker' | 'subagent' | 'stream' | 'aux'; -interface Row { +export interface RequestTimelineRow { key: string; label: string; color: string; @@ -109,28 +140,40 @@ interface Row { streamCount?: number; /** For stream rows: the parent subagent's row key (drives expand/collapse). */ parentRowKey?: string; + /** Number of always-visible auxiliary lanes under this subagent. */ + auxCount?: number; } /** * Conversation ids for subagent calls look like - * ::sa:[:s] + * ::sa:[:s|:aux:] * The optional `:s` suffix is set when the harness fans a single * subagent into multiple parallel "streams" (interval-graph * decomposition in weka_trace._pack_into_streams). We split it off so - * we can group all streams of one subagent under a single header row. + * we can group every parallel lane under a single subagent header row. */ -function splitCid(cid: string): { +export function splitTimelineCid(cid: string): { parent: string; subagentBase: string | null; stream: number | null; + aux: string | null; } { const sep = cid.indexOf('::sa:'); - if (sep === -1) return { parent: cid, subagentBase: null, stream: null }; + if (sep === -1) return { parent: cid, subagentBase: null, stream: null, aux: null }; const parent = cid.slice(0, sep); const raw = cid.slice(sep + 5); + const auxMatch = /^(?[^:]+):aux:(?.+)$/.exec(raw); + if (auxMatch) { + return { + parent, + subagentBase: auxMatch.groups!.base!, + stream: null, + aux: auxMatch.groups!.aux!, + }; + } const m = /^(?.*):s(?\d+)$/.exec(raw); - if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) }; - return { parent, subagentBase: raw, stream: null }; + if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]), aux: null }; + return { parent, subagentBase: raw, stream: null, aux: null }; } /** @@ -139,6 +182,7 @@ function splitCid(cid: string): { * subagent_001 (collapsed by default, container) * :s0 (hidden unless expanded) * 
:s1 + * aux 011 · parallel (always visible) * subagent_002 * ... * @@ -147,11 +191,11 @@ function splitCid(cid: string): { * streams' requests — overlapping bars visually communicate the * stream-level parallelism without expanding. */ -function buildRows( +export function buildRequestTimelineRows( requests: RequestRecord[], mode: RowMode, expandedSubagents: ReadonlySet, -): Row[] { +): RequestTimelineRow[] { if (mode !== 'conversation') { // Worker mode: flat rows, sorted by first activity. const groups = new Map(); @@ -163,7 +207,7 @@ function buildRows( } list.push(r); } - const rows: Row[] = []; + const rows: RequestTimelineRow[] = []; let i = 0; for (const [key, list] of groups) { list.sort((a, b) => a.start - b.start); @@ -181,17 +225,21 @@ function buildRows( return rows; } - // Conversation mode — tree: parent → subagent → stream. + // Conversation mode — tree: parent → subagent → stream/aux lane. + interface SubagentLanes { + streams: Map; + aux: Map; + } interface Tree { parentCid: string; parentReqs: RequestRecord[]; - // subagentBase → (streamIndex|null → requests) - subagents: Map>; + // subagentBase → primary streams + always-visible auxiliary lanes. 
+ subagents: Map; firstStart: number; } const trees = new Map(); for (const r of requests) { - const { parent, subagentBase, stream } = splitCid(r.cid); + const { parent, subagentBase, stream, aux } = splitTimelineCid(r.cid); let tree = trees.get(parent); if (!tree) { tree = { @@ -205,20 +253,26 @@ function buildRows( if (subagentBase === null) { tree.parentReqs.push(r); } else { - let saMap = tree.subagents.get(subagentBase); - if (!saMap) { - saMap = new Map(); - tree.subagents.set(subagentBase, saMap); + let lanes = tree.subagents.get(subagentBase); + if (!lanes) { + lanes = { streams: new Map(), aux: new Map() }; + tree.subagents.set(subagentBase, lanes); + } + if (aux === null) { + const list = lanes.streams.get(stream); + if (list) list.push(r); + else lanes.streams.set(stream, [r]); + } else { + const list = lanes.aux.get(aux); + if (list) list.push(r); + else lanes.aux.set(aux, [r]); } - const list = saMap.get(stream); - if (list) list.push(r); - else saMap.set(stream, [r]); } if (r.start < tree.firstStart) tree.firstStart = r.start; } const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart); - const rows: Row[] = []; + const rows: RequestTimelineRow[] = []; let colorIdx = 0; for (const tree of sortedTrees) { const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!; @@ -237,20 +291,25 @@ function buildRows( // One subagent row per base (which may contain N streams). const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => { const aStart = Math.min( - ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ...[...a[1].streams.values(), ...a[1].aux.values()].map( + (reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY, + ), ); const bStart = Math.min( - ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ...[...b[1].streams.values(), ...b[1].aux.values()].map( + (reqs) => reqs[0]?.start ?? 
Number.POSITIVE_INFINITY, + ), ); return aStart - bStart; }); - for (const [saBase, streams] of subagentEntries) { + for (const [saBase, lanes] of subagentEntries) { const subagentKey = `${tree.parentCid}::sa:${saBase}`; - // Union of all stream requests for collapsed-view bars. + // Union of primary stream requests for collapsed-view bars. Aux lanes + // stay separate so their overlap remains visible as parallel work. const allReqs: RequestRecord[] = []; - for (const reqs of streams.values()) allReqs.push(...reqs); + for (const reqs of lanes.streams.values()) allReqs.push(...reqs); allReqs.sort((a, b) => a.start - b.start); - const streamCount = streams.size; + const streamCount = lanes.streams.size; rows.push({ key: subagentKey, label: `↳ ${formatSubagentLabel(saBase)}`, @@ -259,12 +318,13 @@ function buildRows( depth: 1, kind: 'subagent', streamCount, + auxCount: lanes.aux.size, }); // Stream children only when expanded AND there's more than one // stream (a single-stream subagent has nothing extra to show). if (streamCount > 1 && expandedSubagents.has(subagentKey)) { - const streamEntries = [...streams.entries()].toSorted((a, b) => { + const streamEntries = [...lanes.streams.entries()].toSorted((a, b) => { // Sort by stream index (null first as the "default" stream) const ai = a[0] ?? -1; const bi = b[0] ?? -1; @@ -283,6 +343,27 @@ function buildRows( }); } } + + // Aux lanes encode concurrent requests within the subagent. Keep them + // visible even when primary streams are collapsed so parallelism is not + // hidden behind an interaction. + const auxEntries = [...lanes.aux.entries()].toSorted( + (a, b) => + (a[1][0]?.start ?? Number.POSITIVE_INFINITY) - + (b[1][0]?.start ?? 
Number.POSITIVE_INFINITY), + ); + for (const [auxId, reqs] of auxEntries) { + reqs.sort((a, b) => a.start - b.start); + rows.push({ + key: `${subagentKey}:aux:${auxId}`, + label: `aux ${auxId} · parallel`, + color, + requests: reqs, + depth: 2, + kind: 'aux', + parentRowKey: subagentKey, + }); + } } } return rows; @@ -340,7 +421,7 @@ function countLt(sorted: number[], target: number): number { interface TooltipData { x: number; y: number; - row: Row; + row: RequestTimelineRow; req: RequestRecord; } @@ -475,9 +556,10 @@ export function RequestTimelineView({ [data.requests, phaseFilter, hasWarmup], ); const rows = useMemo( - () => buildRows(filtered, rowMode, expandedSubagents), + () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents), [filtered, rowMode, expandedSubagents], ); + const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]); // Pre-sort the timestamp columns so the cursor-time stats popover can // count "running / waiting at time t" in O(log n). With a few hundred @@ -669,7 +751,16 @@ export function RequestTimelineView({ {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '} {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '} - {formatDuration((dataEnd - dataStart) / 1e6)} + {formatDuration((dataEnd - dataStart) / 1e6)} ·{' '} + + idle {formatDuration(idleStats.idleNs / 1e6)} + {idleStats.spanNs > 0 + ? ` (${((idleStats.idleNs / idleStats.spanNs) * 100).toFixed(1)}%)` + : ''} + {isZoomed && ( <> {' · '} @@ -705,12 +796,13 @@ export function RequestTimelineView({
{rows.map((row) => { const isSubagentRow = row.kind === 'subagent'; - const isStreamRow = row.kind === 'stream'; + const isChildRow = row.kind === 'stream' || row.kind === 'aux'; const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; const isExpanded = isExpandable && expandedSubagents.has(row.key); return (
{row.label} {isExpandable && ( ×{row.streamCount} )} + {isSubagentRow && (row.auxCount ?? 0) > 0 && ( + +{row.auxCount} aux + )} {row.requests.length > 0 ? row.requests.length : '—'} @@ -881,7 +976,7 @@ export function RequestTimelineView({ opacity={ req.cancelled ? 0.35 - : row.kind === 'stream' + : row.kind === 'stream' || row.kind === 'aux' ? 0.5 : row.kind === 'subagent' ? 0.6 diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts index 3506ff45..a9ece859 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts @@ -2,7 +2,16 @@ import { describe, expect, it } from 'vitest'; import type { RequestRecord } from '@/hooks/api/use-request-timeline'; -import { cumulativeUniqueInputTokens, rollingRequestMetric } from './time-series-chart'; +import { + averageSequenceLengthInFlight, + buildThroughputChartSeries, + cumulativeAverage, + cumulativeCompletedRequests, + cumulativeTimeAverage, + cumulativeUniqueInputTokens, + rollingRequestMetric, + toggleThroughputSeries, +} from './time-series-chart'; const request = ( endS: number, @@ -54,6 +63,22 @@ describe('rollingRequestMetric', () => { expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]); }); + it('computes E2E latency from request start through request end', () => { + const result = rollingRequestMetric( + [request(2, 100, 10, { start: 500_000_000 }), request(4, 200, 20, { start: 1_000_000_000 })], + 'e2e', + 'p90', + 50, + ); + + expect(result.raw).toEqual([ + { t: 2, value: 1.5 }, + { t: 4, value: 3 }, + ]); + expect(result.trend.at(-1)?.value).toBeCloseTo(2.85, 8); + expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8); + }); + it('drops warmup, cancelled, missing, and non-positive samples', () => { const result = rollingRequestMetric( [ 
@@ -73,6 +98,154 @@ describe('rollingRequestMetric', () => { }); }); +describe('cumulativeAverage', () => { + it('hides the startup interval without removing it from later averages', () => { + const result = cumulativeAverage( + [ + { t: 0, value: 300 }, + { t: 30, value: 0 }, + { t: 60, value: 0 }, + { t: 90, value: 100 }, + ], + 60, + ); + + expect(result).toEqual([ + { t: 60, value: 100 }, + { t: 90, value: 100 }, + ]); + }); + + it('preserves the original behavior when no burn-in is requested', () => { + expect( + cumulativeAverage([ + { t: 0, value: 10 }, + { t: 1, value: 20 }, + ]), + ).toEqual([ + { t: 0, value: 10 }, + { t: 1, value: 15 }, + ]); + }); +}); + +describe('cumulativeTimeAverage', () => { + it('computes a run-to-date time-weighted average for a step series', () => { + expect( + cumulativeTimeAverage([ + { t: 0, value: 100 }, + { t: 1, value: 300 }, + { t: 3, value: 100 }, + { t: 4, value: 0 }, + ]), + ).toEqual([ + { t: 0, value: 100 }, + { t: 1, value: 100 }, + { t: 3, value: 700 / 3 }, + { t: 4, value: 200 }, + ]); + }); + + it('coalesces same-time request events to their final step value', () => { + expect( + cumulativeTimeAverage([ + { t: 0, value: 0 }, + { t: 0, value: 100 }, + { t: 2, value: 0 }, + ]), + ).toEqual([ + { t: 0, value: 100 }, + { t: 2, value: 100 }, + ]); + }); +}); + +describe('cumulativeCompletedRequests', () => { + it('sorts profiling completions and excludes warmup and cancelled requests', () => { + expect( + cumulativeCompletedRequests([ + request(4, 100, 10), + request(2, 100, 10), + request(1, 100, 10, { phase: 'warmup' }), + request(3, 100, 10, { cancelled: true }), + ]), + ).toEqual([ + { t: 0, value: 0 }, + { t: 2, value: 1 }, + { t: 4, value: 2 }, + ]); + }); + + it('returns no series when there are no successful profiling completions', () => { + expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]); + }); +}); + +describe('averageSequenceLengthInFlight', () => { + it('computes 
the event-time average across overlapping profiling requests', () => { + expect( + averageSequenceLengthInFlight( + [ + request(4, 100, 10, { start: 0, end: 4_000_000_000, isl: 100 }), + request(3, 100, 10, { start: 1_000_000_000, end: 3_000_000_000, isl: 300 }), + ], + 'isl', + ), + ).toEqual([ + { t: 0, value: 100 }, + { t: 1, value: 200 }, + { t: 3, value: 100 }, + { t: 4, value: 0 }, + ]); + }); + + it('excludes cancelled, warmup, and missing sequence lengths', () => { + expect( + averageSequenceLengthInFlight( + [ + request(1, 100, 10, { osl: null }), + request(2, 100, 10, { osl: 20, cancelled: true }), + request(3, 100, 10, { osl: 30, phase: 'warmup' }), + ], + 'osl', + ), + ).toEqual([]); + }); +}); + +describe('toggleThroughputSeries', () => { + it('allows either series to be hidden when both are selected', () => { + expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'input')]).toEqual(['decode']); + expect([...toggleThroughputSeries(new Set(['input', 'decode']), 'decode')]).toEqual(['input']); + }); + + it('does not allow the final visible series to be hidden', () => { + const selected = new Set<'input' | 'decode'>(['decode']); + expect(toggleThroughputSeries(selected, 'decode')).toBe(selected); + }); + + it('allows the hidden series to be restored', () => { + expect([...toggleThroughputSeries(new Set(['decode']), 'input')]).toEqual(['decode', 'input']); + }); + + it('only includes the total running average when both series are visible', () => { + const input = [{ t: 0, value: 10 }]; + const decode = [{ t: 0, value: 20 }]; + + expect( + buildThroughputChartSeries(input, decode, new Set(['input', 'decode'])).map( + ({ name }) => name, + ), + ).toEqual(['Input (avg n=50)', 'Decode (avg n=50)', 'Total running avg (60s burn-in)']); + expect( + buildThroughputChartSeries(input, decode, new Set(['input'])).map(({ name }) => name), + ).toEqual(['Input (avg n=50)']); + expect( + buildThroughputChartSeries(input, decode, new Set(['decode'])).map(({ 
name }) => name), + ).toEqual(['Decode (avg n=50)']); + }); +}); + describe('cumulativeUniqueInputTokens', () => { it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => { const out = cumulativeUniqueInputTokens({ diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 0c0b5739..ab744286 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -33,8 +33,21 @@ interface TimeSeriesChartProps { height?: number; } -export type RequestMetric = 'interactivity' | 'ttft'; +export type RequestMetric = 'interactivity' | 'ttft' | 'e2e'; export type RequestPercentile = 'p75' | 'p90'; +export type ThroughputSeriesKey = 'input' | 'decode'; + +/** Toggle one throughput series while preserving the at-least-one invariant. */ +export function toggleThroughputSeries( + selected: ReadonlySet, + key: ThroughputSeriesKey, +): ReadonlySet { + if (selected.has(key) && selected.size === 1) return selected; + const next = new Set(selected); + if (next.has(key)) next.delete(key); + else next.add(key); + return next; +} /** Linear-interpolated percentile (matches numpy's default method). */ function quantile(sortedAsc: number[], q: number): number { @@ -47,7 +60,8 @@ function quantile(sortedAsc: number[], q: number): number { } /** - * Build raw request samples plus a trailing request-count percentile. + * Build raw request samples plus a trailing request-count percentile. E2E + * latency is measured from HTTP request start through final response byte. * * The percentile is computed in latency space. 
Interactivity then inverts * the selected TPOT percentile, matching the aggregate chart convention: @@ -63,7 +77,12 @@ export function rollingRequestMetric( const samples = requests .filter((request) => request.phase === 'profiling' && !request.cancelled) .flatMap((request) => { - const latencyMs = metric === 'ttft' ? request.ttftMs : request.tpotMs; + const latencyMs = + metric === 'ttft' + ? request.ttftMs + : metric === 'e2e' + ? (request.end - request.start) / 1e6 + : request.tpotMs; if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return []; return [{ t: request.end / 1e9, latencyMs }]; }) @@ -71,7 +90,7 @@ export function rollingRequestMetric( const raw = samples.map(({ t, latencyMs }) => ({ t, - value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs, + value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000, })); const trend = samples.map(({ t }, i) => { const start = Math.max(0, i - Math.max(1, windowSize) + 1); @@ -80,7 +99,7 @@ export function rollingRequestMetric( .map((sample) => sample.latencyMs) .toSorted((a, b) => a - b); const latencyMs = quantile(sorted, q); - return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs }; + return { t, value: metric === 'interactivity' ? 1000 / latencyMs : latencyMs / 1000 }; }); const prefixLatencies: number[] = []; const cumulative = samples.map(({ t, latencyMs }) => { @@ -95,7 +114,7 @@ export function rollingRequestMetric( const cumulativeLatencyMs = quantile(prefixLatencies, q); return { t, - value: metric === 'ttft' ? cumulativeLatencyMs / 1000 : 1000 / cumulativeLatencyMs, + value: metric === 'interactivity' ? 1000 / cumulativeLatencyMs : cumulativeLatencyMs / 1000, }; }); @@ -154,18 +173,60 @@ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): Tim return out; } -/** Expanding-window cumulative mean from index 0..i. 
*/ -export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { +/** + * Expanding-window cumulative mean from index 0..i. + * + * `burnInS` suppresses rendering during the unstable startup interval while + * retaining those samples in every later average. This avoids visually + * promoting a single bursty counter bucket without changing the run-to-date + * meaning of the line once it appears. + */ +export function cumulativeAverage(data: TimeSeriesPoint[], burnInS = 0): TimeSeriesPoint[] { if (data.length === 0) return data; - const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + const out: TimeSeriesPoint[] = []; + const firstT = data[0]!.t; let sum = 0; for (let i = 0; i < data.length; i++) { sum += data[i]!.value; - out[i] = { t: data[i]!.t, value: sum / (i + 1) }; + if (data[i]!.t - firstT >= burnInS) { + out.push({ t: data[i]!.t, value: sum / (i + 1) }); + } } return out; } +/** + * Run-to-date time-weighted average of a step series. + * + * Duplicate timestamps are coalesced to their final value before integration; + * this is important for request handoffs where several start/end events occur + * at the same instant. Each value is held until the next timestamp. 
+ */ +export function cumulativeTimeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { + if (data.length === 0) return []; + const points: TimeSeriesPoint[] = []; + for (const point of data.toSorted((a, b) => a.t - b.t)) { + if (!Number.isFinite(point.t) || !Number.isFinite(point.value)) continue; + const previous = points.at(-1); + if (previous?.t === point.t) previous.value = point.value; + else points.push({ ...point }); + } + if (points.length === 0) return []; + + const firstT = points[0]!.t; + let previousT = firstT; + let previousValue = points[0]!.value; + let area = 0; + return points.map((point, index) => { + if (index === 0) return { t: point.t, value: point.value }; + area += previousValue * (point.t - previousT); + const duration = point.t - firstT; + previousT = point.t; + previousValue = point.value; + return { t: point.t, value: duration > 0 ? area / duration : point.value }; + }); +} + /** * Running cumulative sum of a per-interval rate series. Each output point * is the integral of the rate from start to that point, assuming the rate @@ -183,6 +244,60 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** Cumulative count of successfully completed profiling requests by end time. */ +export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] { + const completionTimes = requests + .filter((request) => request.phase === 'profiling' && !request.cancelled) + .map((request) => request.end / 1e9) + .filter(Number.isFinite) + .toSorted((a, b) => a - b); + if (completionTimes.length === 0) return []; + return [{ t: 0, value: 0 }, ...completionTimes.map((t, index) => ({ t, value: index + 1 }))]; +} + +/** + * Retrospective average sequence length among requests active at each event. + * OSL uses the request's final observed length across its whole lifetime. 
+ */ +export function averageSequenceLengthInFlight( + requests: readonly RequestRecord[], + metric: 'isl' | 'osl', +): TimeSeriesPoint[] { + const events = new Map(); + const addEvent = (t: number, tokenDelta: number, countDelta: number) => { + const current = events.get(t) ?? { tokenDelta: 0, countDelta: 0 }; + current.tokenDelta += tokenDelta; + current.countDelta += countDelta; + events.set(t, current); + }; + + for (const request of requests) { + const tokens = request[metric]; + if ( + request.phase !== 'profiling' || + request.cancelled || + tokens === null || + !Number.isFinite(tokens) || + tokens < 0 || + request.end < request.start + ) { + continue; + } + addEvent(request.start / 1e9, tokens, 1); + addEvent(request.end / 1e9, -tokens, -1); + } + + let tokensInFlight = 0; + let requestsInFlight = 0; + return [...events.entries()] + .toSorted((a, b) => a[0] - b[0]) + .map(([t, event]) => { + tokensInFlight += event.tokenDelta; + requestsInFlight += event.countDelta; + return { t, value: requestsInFlight > 0 ? tokensInFlight / requestsInFlight : 0 }; + }); +} + // A promptTokensBySource bucket label denotes tokens served from some cache // tier (local prefix cache, offloaded/host KV, remote KV transfer) rather than // freshly computed. Matches vllm labels (`local_cache_hit`, @@ -340,6 +455,40 @@ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSerie return out; } +/** Build throughput lines from the currently visible input/decode signals. 
*/ +export function buildThroughputChartSeries( + input: TimeSeriesPoint[], + decode: TimeSeriesPoint[], + selected: ReadonlySet, +): Series[] { + const series: Series[] = []; + if (selected.has('input')) { + series.push({ + name: 'Input (avg n=50)', + data: rollingAverage(input, 50), + color: '#3b82f6', + strokeWidth: 1.6, + }); + } + if (selected.has('decode')) { + series.push({ + name: 'Decode (avg n=50)', + data: rollingAverage(decode, 50), + color: '#f97316', + strokeWidth: 1.6, + }); + } + if (selected.size === 2) { + series.push({ + name: 'Total running avg (60s burn-in)', + data: cumulativeAverage(sumSeries(input, decode), 60), + color: '#ef4444', + strokeWidth: 3, + }); + } + return series; +} + const fmtIntDefault = (n: number) => n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n)); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 0f3eedc7..654dd1b9 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -43,7 +43,13 @@ import { type QuickFilters, } from '@/components/inference/utils/quickFilters'; -type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; +type XAxisMode = + | 'ttft' + | 'e2e' + | 'normalized-e2e' + | 'interactivity' + | 'session-time' + | 'prefill-tps'; /** * Resolve the percentile-prefixed e2e-latency field name for the given diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index e0f5ae1a..5d0981b8 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -728,13 +728,20 @@ export interface InferenceChartContextType { * at a time, picked by the big buttons above the chart. 
* - 'ttft' → e2e chartType with x-axis forced to p90_ttft * - 'e2e' → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el) + * - 'normalized-e2e'→ agentic-only; x = per-request E2E normalized to 400 output tokens * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty) * - 'session-time' → agentic-only; x = mean-normalized session time (live-computed from trace blobs) * - 'prefill-tps' → agentic-only; x = mean of P90 prefill TPS/user per session */ - selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; + selectedXAxisMode: + | 'ttft' + | 'e2e' + | 'normalized-e2e' + | 'interactivity' + | 'session-time' + | 'prefill-tps'; setSelectedXAxisMode: ( - mode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps', + mode: 'ttft' | 'e2e' | 'normalized-e2e' | 'interactivity' | 'session-time' | 'prefill-tps', ) => void; scaleType: 'auto' | 'linear' | 'log'; setScaleType: (type: 'auto' | 'linear' | 'log') => void; diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index caf713cc..9ad3d881 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -1,5 +1,8 @@ 'use client'; -import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants'; +import { + DISPLAY_MODEL_TO_DB, + NORMALIZED_E2E_OUTPUT_TOKENS, +} from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import dynamic from 'next/dynamic'; import { useEffect, useMemo, useRef, useState } from 'react'; @@ -14,7 +17,10 @@ import type { OverlayData, TrendDataPoint, } from '@/components/inference/types'; -import { processOverlayChartData } from '@/components/inference/utils'; +import { + processOverlayChartData, + selectUnofficialOverlayForMode, +} from '@/components/inference/utils'; import { isRunComparisonEntry, makeRunComparisonEntry, @@ 
-70,7 +76,13 @@ import WorkflowInfoDisplay from './WorkflowInfoDisplay'; type InferenceViewMode = 'chart' | 'table'; -type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; +type XAxisMode = + | 'ttft' + | 'e2e' + | 'normalized-e2e' + | 'interactivity' + | 'session-time' + | 'prefill-tps'; interface XAxisModeButton { value: XAxisMode; @@ -81,6 +93,7 @@ interface XAxisModeButton { const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [ { value: 'ttft', label: 'TTFT' }, { value: 'e2e', label: 'E2E Latency' }, + { value: 'normalized-e2e', label: 'Normalized E2E', agenticOnly: true }, { value: 'interactivity', label: 'Interactivity' }, { value: 'session-time', label: 'Session Time', agenticOnly: true }, { value: 'prefill-tps', label: 'Prefill TPS / user', agenticOnly: true }, @@ -378,7 +391,9 @@ export default function ChartDisplay() { const useDerived = sequenceKind(selectedSequence) === 'agentic' && - (selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps'); + (selectedXAxisMode === 'normalized-e2e' || + selectedXAxisMode === 'session-time' || + selectedXAxisMode === 'prefill-tps'); const derivedTargetIds = useMemo(() => { if (!useDerived) return [] as number[]; const ids = new Set(); @@ -403,10 +418,14 @@ export default function ChartDisplay() { if (!useDerived) return visibleGraphs; if (!derivedMetrics) return visibleGraphs.map((graph) => ({ ...graph, data: [] })); const isSession = selectedXAxisMode === 'session-time'; + const isNormalizedE2e = selectedXAxisMode === 'normalized-e2e'; + const percentileLabel = selectedPercentile.toUpperCase(); const xLabel = isSession ? 'Mean Normalized Session Time (min)' - : 'P90 Prefill TPS per user (tok/s)'; - const rooflineCorner = isSession ? 'upper_right' : 'upper_left'; + : isNormalizedE2e + ? `${percentileLabel} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens (s)` + : 'P90 Prefill TPS per user (tok/s)'; + const rooflineCorner = isSession || isNormalizedE2e ? 
'upper_right' : 'upper_left'; return visibleGraphs.map((graph) => { const chartDefinition = { ...graph.chartDefinition, @@ -420,14 +439,25 @@ export default function ChartDisplay() { const metrics = derivedMetrics[point.id]; const raw = isSession ? metrics?.normalized_session_time_s - : metrics?.p90_prefill_tps_per_user; + : isNormalizedE2e + ? selectedPercentile === 'p75' + ? metrics?.p75_normalized_e2e_400_s + : metrics?.p90_normalized_e2e_400_s + : metrics?.p90_prefill_tps_per_user; if (raw === null || raw === undefined || !Number.isFinite(raw)) return null; return { ...point, x: isSession ? raw / 60 : raw }; }) .filter((point): point is NonNullable => point !== null); return { ...graph, chartDefinition, data }; }); - }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]); + }, [ + useDerived, + visibleGraphs, + derivedMetrics, + selectedXAxisMode, + selectedYAxisMetric, + selectedPercentile, + ]); const displayGraphs = isFirstLoad || isDerivedLoading @@ -488,10 +518,11 @@ export default function ChartDisplay() { ); // Match warnings against the same series the chart annotates, // including visible unofficial-run overlay series. - const overlay = - graph.chartDefinition.chartType === 'e2e' - ? overlayDataByChartType.e2e - : overlayDataByChartType.interactivity; + const overlay = selectUnofficialOverlayForMode( + selectedXAxisMode, + graph.chartDefinition.chartType, + overlayDataByChartType, + ); const visibleOverlayRows = isTimelineMode ? [] : (overlay?.data ?? []).filter( @@ -551,6 +582,9 @@ export default function ChartDisplay() { if (selectedXAxisMode === 'prefill-tps') { return 'vs. P90 Prefill TPS / user'; } + if (selectedXAxisMode === 'normalized-e2e') { + return `vs. 
${selectedPercentile.toUpperCase()} Normalized E2E @ ${NORMALIZED_E2E_OUTPUT_TOKENS} output tokens`; + } const isAgentic = sequenceKind(selectedSequence) === 'agentic'; if (selectedE2eXAxisMetric?.endsWith('_ttft')) { const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); @@ -597,15 +631,22 @@ export default function ChartDisplay() { )}

+ {isUnofficialRun && selectedXAxisMode === 'normalized-e2e' && ( +

+ Normalized E2E requires persisted per-request traces, so + unofficial-run overlays are unavailable for this experimental view. +

+ )} ); if (getViewMode(graphIndex) === 'table') { - const overlay = - graph.chartDefinition.chartType === 'e2e' - ? overlayDataByChartType.e2e - : overlayDataByChartType.interactivity; + const overlay = selectUnofficialOverlayForMode( + selectedXAxisMode, + graph.chartDefinition.chartType, + overlayDataByChartType, + ); const overlayRows = (overlay?.data ?? []).filter((p) => selectedPrecisions.includes(p.precision), ); @@ -657,9 +698,11 @@ export default function ChartDisplay() { chartDefinition={graph.chartDefinition} caption={chartCaption} overlayData={ - graph.chartDefinition.chartType === 'e2e' - ? (overlayDataByChartType.e2e ?? undefined) - : (overlayDataByChartType.interactivity ?? undefined) + selectUnofficialOverlayForMode( + selectedXAxisMode, + graph.chartDefinition.chartType, + overlayDataByChartType, + ) ?? undefined } /> {selectedGPUs.length > 0 && diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index f9031489..a8cfed48 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -12,6 +12,7 @@ import { getChartWatermark } from '@/lib/data-mappings'; import { generateGpuDateColors } from '@/lib/dynamic-colors'; import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils'; import { useThemeColors } from '@/hooks/useThemeColors'; +import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; import { D3Chart } from '@/lib/d3-chart/D3Chart'; import type { CustomLayerConfig, @@ -260,6 +261,20 @@ const GPUGraph = React.memo( return pts; }, [groupedData, activeDates, hideNonOptimal, optimalPointKeys]); + // GPU comparison currently renders official DB-backed points only. Unofficial + // overlays have no benchmark_results id or persisted trace, so they cannot + // open the dedicated per-point charts route. 
+ const agenticIds = useMemo( + () => + filteredData.flatMap((point) => + point.benchmark_type === 'agentic_traces' && typeof point.id === 'number' + ? [point.id] + : [], + ), + [filteredData], + ); + const { data: traceAvailability } = useTraceAvailability(agenticIds); + // Warning annotations for visible series with known upstream issues — // same treatment the scatter view gets, applied to the date-comparison view. // Lines here are colored per (gpu, date) pair, so take the first active @@ -799,6 +814,7 @@ const GPUGraph = React.memo( selectedYAxisMetric, hardwareConfig, runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined, + hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false, }), getRulerX: (d, xScale) => (xScale as d3.ScaleLinear)(d.x), getRulerY: (d, yScale) => (yScale as d3.ScaleLinear)(d.y), @@ -812,6 +828,37 @@ const GPUGraph = React.memo( sel.select('.visible-shape') as any, getShapeKeyForPrecision(d.precision, selectedPrecisions), ), + onPointClick: (d: InferenceData) => { + track('gpu_timeseries_data_point_clicked', { + id: d.id, + hw: String(d.hwKey), + x: d.x, + y: d.y, + }); + const tooltipEl = chartRef.current?.getTooltipElement(); + if (!tooltipEl) return; + const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]'); + if (!viewBtn || typeof d.id !== 'number') return; + viewBtn.addEventListener('click', (event) => { + event.stopPropagation(); + track('gpu_timeseries_view_charts_opened', { + id: d.id, + hwKey: String(d.hwKey), + conc: d.conc, + }); + }); + // Pinning updates D3Chart's React state. GPU comparison rebuilds + // several inline layer configs on that render, whose cleanup can + // briefly hide the otherwise-pinned portal tooltip. Restore its + // pinned visibility after that render settles. 
+ requestAnimationFrame(() => { + const pinnedTooltip = chartRef.current?.getTooltipElement(); + if (!pinnedTooltip || chartRef.current?.getPinnedPoint() !== d) return; + pinnedTooltip.style.opacity = '1'; + pinnedTooltip.style.display = 'block'; + pinnedTooltip.style.pointerEvents = 'auto'; + }); + }, attachToLayer: 1, }} onRender={(ctx: RenderContext) => { diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index d3e185d9..b7328acc 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -969,20 +969,16 @@ const ScatterGraph = React.memo( }); } - // ── "View charts" → navigate to dedicated detail page ──────────── + // ── "View charts" real link (supports browser open-in-new-tab) ─── const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]'); if (viewBtn && typeof d.id === 'number') { - const pointId = d.id; viewBtn.addEventListener('click', (btnEvent) => { btnEvent.stopPropagation(); track('latency_view_charts_opened', { - id: pointId, + id: d.id, hwKey: String(d.hwKey), conc: d.conc, }); - chartRef.current?.dismissTooltip(); - chartRef.current?.hideTooltip(); - window.location.assign(`/inference/agentic/${pointId}`); }); } }, diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts index 589ba580..7d5b1482 100644 --- a/packages/app/src/components/inference/utils.test.ts +++ b/packages/app/src/components/inference/utils.test.ts @@ -1,7 +1,26 @@ import { describe, it, expect } from 'vitest'; import type { ChartDefinition, InferenceData } from '@/components/inference/types'; -import { filterDataByCostLimit, processOverlayChartData } from '@/components/inference/utils'; +import { + filterDataByCostLimit, + processOverlayChartData, + selectUnofficialOverlayForMode, +} from '@/components/inference/utils'; + 
+describe('selectUnofficialOverlayForMode', () => { + const overlays = { e2e: { id: 'e2e' }, interactivity: { id: 'interactivity' } }; + + it('suppresses raw unofficial E2E data for normalized E2E mode', () => { + expect(selectUnofficialOverlayForMode('normalized-e2e', 'e2e', overlays)).toBeNull(); + }); + + it('preserves matching unofficial overlays for supported modes', () => { + expect(selectUnofficialOverlayForMode('e2e', 'e2e', overlays)).toBe(overlays.e2e); + expect(selectUnofficialOverlayForMode('interactivity', 'interactivity', overlays)).toBe( + overlays.interactivity, + ); + }); +}); // --------------------------------------------------------------------------- // fixture factories diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts index 4876c614..f6ebd0f8 100644 --- a/packages/app/src/components/inference/utils.ts +++ b/packages/app/src/components/inference/utils.ts @@ -8,6 +8,20 @@ import chartDefinitions from '@/components/inference/inference-chart-config.json import type { ChartDefinition, InferenceData, YAxisMetricKey } from './types'; +/** + * Select the matching unofficial-run overlay for a chart mode. Normalized E2E + * is intentionally excluded: unofficial benchmark rows do not include the + * persisted per-request trace needed to normalize before taking percentiles. + */ +export function selectUnofficialOverlayForMode( + xAxisMode: string, + chartType: 'e2e' | 'interactivity', + overlays: { e2e: T | null; interactivity: T | null }, +): T | null { + if (xAxisMode === 'normalized-e2e') return null; + return overlays[chartType]; +} + /** * Filters data points based on cost limits defined in the chart definition. 
* Only applies filtering for cost-related metrics, and only filters based on diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts index 5a5bd7e9..e4b9d31f 100644 --- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts +++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts @@ -150,6 +150,15 @@ describe('getPointLabel', () => { // generateTooltipContent // =========================================================================== describe('generateTooltipContent', () => { + it('renders View charts as a same-tab anchor so browsers offer open-in-new-tab', () => { + const html = generateTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }), + ); + expect(html).toContain(' { const html = generateTooltipContent(tooltipConfig()); expect(html).toContain('H100'); @@ -365,4 +374,27 @@ describe('generateGPUGraphTooltipContent', () => { ); expect(html).toContain('vllm-v0.6.0
abc123'); }); + + it('shows View charts only for pinned points with stored trace data', () => { + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }), + ), + ).toContain('data-action="view-charts"'); + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: true }), + ), + ).toContain('href="/inference/agentic/1"'); + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: false, hasTrace: true }), + ), + ).not.toContain('data-action="view-charts"'); + expect( + generateGPUGraphTooltipContent( + tooltipConfig({ data: pt({ id: 1 }), isPinned: true, hasTrace: false }), + ), + ).not.toContain('data-action="view-charts"'); + }); }); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index ea039336..e3f0de6d 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -134,15 +134,19 @@ const generateAgenticHTML = (d: InferenceData): string => { return parts.join(''); }; -/** "View charts" button — only visible when the tooltip is pinned and the - * point has stored trace data. Wired up by the ScatterGraph click handler. */ -const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string => { - if (!isPinned || !hasTraceData) return ''; - return `
`; + background: var(--accent); color: var(--accent-foreground); text-align: center; text-decoration: none; + ">View charts →`; }; const shortenSha = (image: string) => @@ -254,7 +258,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} - ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))} + ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)} ${ isPinned ? `
${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} + ${viewChartsButtonHTML(isPinned, Boolean(hasTrace), d.id)}
`; }; diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts index 96b0f59f..46491b4e 100644 --- a/packages/app/src/hooks/api/use-datasets.ts +++ b/packages/app/src/hooks/api/use-datasets.ts @@ -17,6 +17,10 @@ export interface DatasetSummary { mainTurns?: number; subagentGroups?: number; subagentTurns?: number; + meanRequestsPerConversation?: number; + medianRequestsPerConversation?: number; + meanSubagentsPerTrace?: number; + medianSubagentsPerTrace?: number; modelMix?: Record; [k: string]: unknown; } @@ -63,6 +67,8 @@ export interface DatasetChartData { inputTokensPerTurn?: Distribution; uncachedInputTokensPerTurn?: Distribution; outputTokensPerTurn?: Distribution; + subagentInputTokensPerRequest?: Distribution; + subagentOutputTokensPerRequest?: Distribution; turnsPerConversation?: Distribution; subagentGroupsPerConversation?: Distribution; cachedFractionPerTurn?: Distribution; diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts new file mode 100644 index 00000000..2e54f418 --- /dev/null +++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { chunkDerivedAgenticMetricIds } from './use-derived-agentic-metrics'; + +describe('chunkDerivedAgenticMetricIds', () => { + it('keeps every id while respecting the API limit', () => { + const ids = Array.from({ length: 401 }, (_, index) => index + 1); + const chunks = chunkDerivedAgenticMetricIds(ids); + + expect(chunks.map((chunk) => chunk.length)).toEqual([200, 200, 1]); + expect(chunks.flat()).toEqual(ids); + }); +}); diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts index 6bc7ae5e..c4f517f7 100644 --- a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts +++ 
b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts @@ -8,18 +8,38 @@ export interface DerivedAgenticMetric { /** P90 of per-turn ISL/TTFT across every turn in every session. * Null when no prefill rates could be computed. */ p90_prefill_tps_per_user: number | null; + /** P75 normalized per-request E2E at a fixed 400-token output length. */ + p75_normalized_e2e_400_s: number | null; + /** P90 normalized per-request E2E at a fixed 400-token output length. */ + p90_normalized_e2e_400_s: number | null; } export type DerivedAgenticMetricMap = Record; +const MAX_IDS_PER_REQUEST = 200; + +export function chunkDerivedAgenticMetricIds(ids: number[]): number[][] { + const chunks: number[][] = []; + for (let i = 0; i < ids.length; i += MAX_IDS_PER_REQUEST) { + chunks.push(ids.slice(i, i + MAX_IDS_PER_REQUEST)); + } + return chunks; +} + async function fetchDerivedAgenticMetrics( ids: number[], signal?: AbortSignal, ): Promise { if (ids.length === 0) return {}; - const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal }); - if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`); - return (await res.json()) as DerivedAgenticMetricMap; + const chunks = chunkDerivedAgenticMetricIds(ids); + const maps = await Promise.all( + chunks.map(async (chunk) => { + const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${chunk.join(',')}`, { signal }); + if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`); + return (await res.json()) as DerivedAgenticMetricMap; + }), + ); + return Object.assign({}, ...maps) as DerivedAgenticMetricMap; } /** diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts index 11905aaa..a16be558 100644 --- a/packages/app/src/hooks/api/use-trace-server-metrics.ts +++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts @@ -30,6 +30,32 @@ export interface PointMeta { server_cpu_cache_hit_rate: number | null; } 
+export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown'; + +export interface MetricSource { + id: string; + adapter: string; + role: MetricSourceRole; + endpointUrl: string | null; + nativeRole: string | null; + workerId: string | null; + dpRank: string | null; + engine: string | null; +} + +export interface MetricSourceSeries { + source: MetricSource; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + promptTps: TimeSeriesPoint[]; + generationTps: TimeSeriesPoint[]; + prefixCacheHitsTps: TimeSeriesPoint[]; + hostKvCacheUsage: TimeSeriesPoint[]; + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; +} + export interface TraceServerMetrics { meta: PointMeta; startNs: number; @@ -51,6 +77,8 @@ export interface TraceServerMetrics { * the cluster-average `kvCacheUsage` line covers that case alone. */ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** Orchestrator-normalized metrics grouped by endpoint/worker. 
*/ + metricSources: MetricSourceSeries[]; } async function fetchTraceServerMetrics( diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts index debbb788..8b691ee4 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.test.ts @@ -4,7 +4,7 @@ import { describe, expect, it } from 'vitest'; import type { ShapeKey } from '@/lib/chart-rendering'; -import { renderScatterPoints, syncPointShape } from './scatter-points'; +import { computeTooltipPosition, renderScatterPoints, syncPointShape } from './scatter-points'; interface TestPoint { hwKey: string; @@ -163,3 +163,51 @@ describe('syncPointShape', () => { expect(g.selectAll('.visible-shape').size()).toBe(1); }); }); + +describe('computeTooltipPosition', () => { + it('keeps a tall pinned tooltip inside the visible viewport', () => { + const tooltipNode = document.createElement('div'); + document.body.append(tooltipNode); + Object.defineProperty(tooltipNode, 'getBoundingClientRect', { + value: () => ({ + width: 300, + height: 400, + left: 0, + top: 0, + right: 300, + bottom: 400, + x: 0, + y: 0, + toJSON: () => ({}), + }), + }); + + const container = document.createElement('div'); + Object.defineProperties(container, { + clientWidth: { value: 800 }, + clientHeight: { value: 600 }, + getBoundingClientRect: { + value: () => ({ + width: 800, + height: 600, + left: 100, + top: 600, + right: 900, + bottom: 1200, + x: 100, + y: 600, + toJSON: () => ({}), + }), + }, + }); + Object.defineProperties(document.documentElement, { + clientWidth: { configurable: true, value: 1280 }, + clientHeight: { configurable: true, value: 720 }, + }); + + expect(computeTooltipPosition(450, 100, d3.select(tooltipNode), container)).toEqual({ + left: 560, + top: 316, + }); + }); +}); diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts 
b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index c73f1302..433ed6d1 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -323,8 +323,9 @@ export function attachScatterTooltipHandlers< * whole problem; we just need the coordinates in viewport space. * * Strategy: pick preferred side (right/below cursor), flip if it overflows the - * container, then clamp to container bounds. Tall tooltips that don't fit get - * clamped to the container edges. + * container, then clamp the final fixed coordinates to the viewport. The + * viewport clamp matters when a chart continues below the fold: container- + * local coordinates can otherwise place a pinned tooltip's actions offscreen. */ export function computeTooltipPosition( mx: number, @@ -357,8 +358,16 @@ export function computeTooltipPosition( let top = my + offset + th <= ch ? my + offset : my - offset - th; top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top)); - // Convert container-local coords → viewport coords for `position: fixed`. - return { left: left + rect.left, top: top + rect.top }; + // Convert container-local coords → viewport coords for `position: fixed`, + // then keep the complete tooltip visible when its dimensions permit it. + const viewportWidth = document.documentElement.clientWidth || window.innerWidth; + const viewportHeight = document.documentElement.clientHeight || window.innerHeight; + left += rect.left; + top += rect.top; + left = Math.max(EDGE_PAD, Math.min(viewportWidth - tw - EDGE_PAD, left)); + top = Math.max(EDGE_PAD, Math.min(viewportHeight - th - EDGE_PAD, top)); + + return { left, top }; } /** Update scatter point positions on zoom. 
*/ diff --git a/packages/constants/src/agentic.ts b/packages/constants/src/agentic.ts new file mode 100644 index 00000000..42eab306 --- /dev/null +++ b/packages/constants/src/agentic.ts @@ -0,0 +1,2 @@ +/** Fixed output length used by the experimental normalized-E2E chart metric. */ +export const NORMALIZED_E2E_OUTPUT_TOKENS = 400; diff --git a/packages/constants/src/index.ts b/packages/constants/src/index.ts index e767e500..7d3d6783 100644 --- a/packages/constants/src/index.ts +++ b/packages/constants/src/index.ts @@ -1,3 +1,4 @@ +export * from './agentic'; export * from './framework-aliases'; export * from './github'; export * from './gpu-keys'; diff --git a/packages/db/migrations/009_dataset_request_stats.sql b/packages/db/migrations/009_dataset_request_stats.sql new file mode 100644 index 00000000..0b7c11bb --- /dev/null +++ b/packages/db/migrations/009_dataset_request_stats.sql @@ -0,0 +1,55 @@ +-- Backfill dataset-level requests/conversation statistics. +-- A request is one actual model call: each top-level turn plus each child turn +-- inside a subagent group. The group container itself is not a request. 
+ +with per_conversation as ( + select + dc.dataset_id, + dc.num_subagent_groups, + ( + dc.num_turns + coalesce(( + select sum(jsonb_array_length(node.value->'children')) + from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) as node(value) + where node.value->>'kind' = 'subagent' + ), 0) + )::double precision as request_count + from dataset_conversations dc +), request_stats as ( + select + dataset_id, + avg(request_count) as mean_requests, + percentile_cont(0.5) within group (order by request_count) as median_requests, + avg(num_subagent_groups::double precision) as mean_subagents, + percentile_cont(0.5) within group (order by num_subagent_groups) as median_subagents + from per_conversation + group by dataset_id +) +update datasets d +set summary = jsonb_set( + jsonb_set( + jsonb_set( + jsonb_set( + jsonb_set( + d.summary, + '{meanRequestsPerConversation}', + to_jsonb(request_stats.mean_requests), + true + ), + '{medianRequestsPerConversation}', + to_jsonb(request_stats.median_requests), + true + ), + '{meanSubagentsPerTrace}', + to_jsonb(request_stats.mean_subagents), + true + ), + '{medianSubagentsPerTrace}', + to_jsonb(request_stats.median_subagents), + true + ), + '{version}', + '3'::jsonb, + true +) +from request_stats +where d.id = request_stats.dataset_id; diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts index 8dd42dce..5bd760b7 100644 --- a/packages/db/src/backfill-aggregate-stats.ts +++ b/packages/db/src/backfill-aggregate-stats.ts @@ -23,7 +23,12 @@ */ import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; -import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js'; +import { + computeAggregateStats, + mergeProfileStatsUpgrade, + STATS_VERSION, + type AggregateStats, +} from './etl/compute-aggregate-stats.js'; import { createAdminSql } from './etl/db-utils.js'; interface CliFlags { @@ -104,9 +109,9 @@ async function main(): Promise { try { 
// Fetch one row at a time — the json_gz blob is the heavy field. const [row] = await sql< - { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[] + { profile_export_jsonl_gz: Buffer | null; aggregate_stats: AggregateStats | null }[] >` - select profile_export_jsonl_gz, server_metrics_json_gz + select profile_export_jsonl_gz, aggregate_stats from agentic_trace_replay where id = ${id} `; @@ -115,10 +120,24 @@ async function main(): Promise { continue; } - const stats = await computeAggregateStats({ - profileBlob: row.profile_export_jsonl_gz, - serverBlob: row.server_metrics_json_gz, - }); + let stats: AggregateStats; + if (row.aggregate_stats?.version === 3) { + const profileStats = await computeAggregateStats({ + profileBlob: row.profile_export_jsonl_gz, + serverBlob: null, + }); + stats = mergeProfileStatsUpgrade(row.aggregate_stats, profileStats); + } else { + const [serverRow] = await sql<{ server_metrics_json_gz: Buffer | null }[]>` + select server_metrics_json_gz + from agentic_trace_replay + where id = ${id} + `; + stats = await computeAggregateStats({ + profileBlob: row.profile_export_jsonl_gz, + serverBlob: serverRow?.server_metrics_json_gz ?? 
null, + }); + } await sql` update agentic_trace_replay diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts index 66156b45..416904f9 100644 --- a/packages/db/src/backfill-chart-series.ts +++ b/packages/db/src/backfill-chart-series.ts @@ -108,17 +108,34 @@ async function main(): Promise { for (const { id } of candidates) { const start = Date.now(); try { - const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>` - select server_metrics_json_gz - from agentic_trace_replay - where id = ${id} + const [row] = await sql< + { + server_metrics_json_gz: Buffer | null; + framework: string | null; + disagg: boolean | null; + }[] + >` + select atr.server_metrics_json_gz, source.framework, source.disagg + from agentic_trace_replay atr + left join lateral ( + select c.framework, c.disagg + from benchmark_results br + join configs c on c.id = br.config_id + where br.trace_replay_id = atr.id + order by br.id + limit 1 + ) source on true + where atr.id = ${id} `; if (!row) { console.warn(` id=${id}: row vanished, skipping`); continue; } - const series = await computeChartSeries(row.server_metrics_json_gz); + const series = await computeChartSeries(row.server_metrics_json_gz, { + framework: row.framework, + disagg: row.disagg ?? false, + }); await sql` update agentic_trace_replay diff --git a/packages/db/src/backfill-dataset-stats.ts b/packages/db/src/backfill-dataset-stats.ts new file mode 100644 index 00000000..6dce6164 --- /dev/null +++ b/packages/db/src/backfill-dataset-stats.ts @@ -0,0 +1,115 @@ +/** + * Backfill dataset summary stats and subagent-only ISL/OSL distributions from + * the compact structures already stored in `dataset_conversations`. 
+ * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-dataset-stats --yes + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { logHistogram, summarizeValues } from './etl/weka-structure.js'; + +interface DatasetRow { + id: string; + slug: string; + summary: Record; + chart_data: Record; +} + +interface ConversationRow { + num_subagent_groups: number | string; + request_count: number | string; +} + +interface SubagentRequestRow { + input_tokens: number | string; + output_tokens: number | string; +} + +const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); + +async function main(): Promise { + const datasets = await sql` + select id, slug, summary, chart_data + from datasets + order by slug + `; + if (datasets.length === 0) { + console.log('No datasets found.'); + return; + } + + console.log(`Backfill subagent dataset stats for ${datasets.length} dataset(s).`); + if (!hasYesFlag() && !(await confirm('Continue? 
(y/N) '))) return; + + for (const dataset of datasets) { + const conversations = await sql` + select + num_subagent_groups, + ( + num_turns + coalesce(( + select sum(jsonb_array_length(node.value->'children')) + from jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value) + where node.value->>'kind' = 'subagent' + ), 0) + ) as request_count + from dataset_conversations dc + where dataset_id = ${dataset.id} + `; + const requests = await sql` + select + (child.value->>'in')::double precision as input_tokens, + (child.value->>'out')::double precision as output_tokens + from dataset_conversations dc + cross join lateral jsonb_array_elements(coalesce(dc.structure->'nodes', '[]'::jsonb)) node(value) + cross join lateral jsonb_array_elements(coalesce(node.value->'children', '[]'::jsonb)) child(value) + where dc.dataset_id = ${dataset.id} + and node.value->>'kind' = 'subagent' + `; + + const subagentsPerTrace = conversations.map((row) => Number(row.num_subagent_groups)); + const requestsPerConversation = conversations.map((row) => Number(row.request_count)); + const inputTokens = requests.map((row) => Number(row.input_tokens)); + const outputTokens = requests.map((row) => Number(row.output_tokens)); + const subagentStats = summarizeValues(subagentsPerTrace); + const requestStats = summarizeValues(requestsPerConversation); + const summary = { + ...dataset.summary, + version: 3, + meanSubagentsPerTrace: subagentStats.mean, + medianSubagentsPerTrace: subagentStats.median, + meanRequestsPerConversation: requestStats.mean, + medianRequestsPerConversation: requestStats.median, + }; + const chartData = { + ...dataset.chart_data, + version: 3, + subagentInputTokensPerRequest: { + bins: logHistogram(inputTokens), + stats: summarizeValues(inputTokens), + }, + subagentOutputTokensPerRequest: { + bins: logHistogram(outputTokens), + stats: summarizeValues(outputTokens), + }, + }; + + await sql` + update datasets + set summary = ${sql.json(summary)}, + chart_data 
= ${sql.json(structuredClone(chartData) as unknown as Parameters[0])} + where id = ${dataset.id} + `; + console.log( + ` ${dataset.slug}: ${requests.length.toLocaleString()} inner requests, median ${subagentStats.median}, mean ${subagentStats.mean.toFixed(1)} subagents/trace`, + ); + } +} + +main() + .catch((error) => { + console.error('backfill-dataset-stats failed:', error); + process.exitCode = 1; + }) + .finally(() => sql.end()); diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts index de0009de..7b745c09 100644 --- a/packages/db/src/etl/compute-aggregate-stats.test.ts +++ b/packages/db/src/etl/compute-aggregate-stats.test.ts @@ -2,7 +2,11 @@ import { gzipSync } from 'node:zlib'; import { describe, expect, it } from 'vitest'; -import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js'; +import { + STATS_VERSION, + computeAggregateStats, + mergeProfileStatsUpgrade, +} from './compute-aggregate-stats.js'; /** Build a minimal `profile_export.jsonl` from a few synthetic requests. 
*/ function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) { @@ -64,6 +68,7 @@ describe('computeAggregateStats', () => { expect(stats.prefixCacheHitRate).toBeNull(); expect(stats.normalizedSessionTimeS).toBeNull(); expect(stats.p90PrefillTpsPerUser).toBeNull(); + expect(stats.normalizedE2e400).toBeNull(); }); it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => { @@ -90,6 +95,8 @@ describe('computeAggregateStats', () => { // scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625] // mean ≈ 1.9653 expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3); + expect(stats.normalizedE2e400?.n).toBe(3); + expect(stats.normalizedE2e400?.p90).toBeGreaterThan(0); }); it('computes KV util + prefix hit rate from the server blob alone', async () => { @@ -107,6 +114,7 @@ describe('computeAggregateStats', () => { expect(stats.osl).toBeNull(); expect(stats.normalizedSessionTimeS).toBeNull(); expect(stats.p90PrefillTpsPerUser).toBeNull(); + expect(stats.normalizedE2e400).toBeNull(); }); it('tolerates a malformed profile blob by leaving its metrics null', async () => { @@ -117,7 +125,28 @@ describe('computeAggregateStats', () => { expect(stats.osl).toBeNull(); expect(stats.normalizedSessionTimeS).toBeNull(); expect(stats.p90PrefillTpsPerUser).toBeNull(); + expect(stats.normalizedE2e400).toBeNull(); // Version still set so the row is considered "computed". 
expect(stats.version).toBe(STATS_VERSION); }); }); + +describe('mergeProfileStatsUpgrade', () => { + it('updates profile metrics while preserving existing server distributions', async () => { + const existing = await computeAggregateStats({ + profileBlob: null, + serverBlob: makeServerBlob(), + }); + const profile = await computeAggregateStats({ + profileBlob: makeProfileBlob([{ isl: 100, osl: 100, rl: 2080, ttft: 100 }]), + serverBlob: null, + }); + + const merged = mergeProfileStatsUpgrade(existing, profile); + expect(merged.version).toBe(STATS_VERSION); + expect(merged.isl?.mean).toBe(100); + expect(merged.normalizedE2e400?.p90).toBeGreaterThan(0); + expect(merged.kvCacheUtil).toEqual(existing.kvCacheUtil); + expect(merged.prefixCacheHitRate).toEqual(existing.prefixCacheHitRate); + }); +}); diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts index a422cfec..15e5f1ba 100644 --- a/packages/db/src/etl/compute-aggregate-stats.ts +++ b/packages/db/src/etl/compute-aggregate-stats.ts @@ -39,6 +39,30 @@ export interface AggregateStats { normalizedSessionTimeS: number | null; /** P90 of per-turn ISL/TTFT pooled across every session's turns. */ p90PrefillTpsPerUser: number | null; + /** Per-request normalized E2E distribution at a fixed 400-token OSL. */ + normalizedE2e400: MetricPercentiles | null; +} + +/** + * Upgrade an existing stats bundle when only profile-derived fields changed. + * This avoids re-reading and decompressing the much larger server-metrics blob + * while preserving its already-computed KV/cache distributions. + */ +export function mergeProfileStatsUpgrade( + existing: Omit & { + normalizedE2e400?: MetricPercentiles | null; + }, + profile: AggregateStats, +): AggregateStats { + return { + ...profile, + isl: profile.isl ?? existing.isl, + osl: profile.osl ?? existing.osl, + normalizedSessionTimeS: profile.normalizedSessionTimeS ?? 
existing.normalizedSessionTimeS, + p90PrefillTpsPerUser: profile.p90PrefillTpsPerUser ?? existing.p90PrefillTpsPerUser, + kvCacheUtil: existing.kvCacheUtil, + prefixCacheHitRate: existing.prefixCacheHitRate, + }; } /** Metric subtrees we extract via stream-parse on oversized server blobs. */ @@ -93,6 +117,7 @@ export async function computeAggregateStats(args: { let oslPct: MetricPercentiles | null = null; let normalized: number | null = null; let prefillP90: number | null = null; + let normalizedE2e400: MetricPercentiles | null = null; if (args.profileBlob) { try { @@ -103,6 +128,7 @@ export async function computeAggregateStats(args: { const derived = computeDerivedFromBlob(jsonl); normalized = derived.normalized_session_time_s; prefillP90 = derived.p90_prefill_tps_per_user; + normalizedE2e400 = derived.normalized_e2e_400; } catch { // ignore malformed blob — leave nulls } @@ -143,5 +169,6 @@ export async function computeAggregateStats(args: { prefixCacheHitRate: prefixPct, normalizedSessionTimeS: normalized, p90PrefillTpsPerUser: prefillP90, + normalizedE2e400, }; } diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts index 4c6f8791..7d292207 100644 --- a/packages/db/src/etl/compute-chart-series.test.ts +++ b/packages/db/src/etl/compute-chart-series.test.ts @@ -105,6 +105,20 @@ function buildEngineSeries(engineId: number, baseRunning: number) { }; } +function buildDynamoSeries( + endpoint_url: string, + dynamo_component: 'prefill' | 'backend', + worker_id: string, + value: number, + field: 'rate' | 'avg' = 'rate', +) { + return { + endpoint_url, + labels: { dynamo_component, worker_id, dp_rank: '0', engine: '0' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, [field]: value }], + }; +} + describe('computeChartSeries', () => { it('returns null when the blob is null', async () => { expect(await computeChartSeries(null)).toBeNull(); @@ -206,4 +220,79 @@ describe('computeChartSeries', () => { { t: 1, value: 300 
}, ]); }); + + it('uses the Dynamo adapter to preserve workers and canonical prefill/decode roles', async () => { + const json = JSON.stringify({ + metrics: { + 'vllm:prompt_tokens': { + series: [ + buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 100), + buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 200), + buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 300), + ], + }, + 'vllm:generation_tokens': { + series: [ + buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 1), + buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 2), + buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 400), + ], + }, + 'vllm:num_requests_running': { + series: [ + buildDynamoSeries('10.30.1.56:7500', 'prefill', 'prefill-a', 3, 'avg'), + buildDynamoSeries('10.30.1.36:7508', 'prefill', 'prefill-b', 4, 'avg'), + buildDynamoSeries('10.30.1.206:7516', 'backend', 'decode-a', 5, 'avg'), + ], + }, + }, + }); + + const blob = gzipSync(Buffer.from(json)); + const result = await computeChartSeries(blob, { + framework: 'dynamo-vllm', + disagg: true, + }); + + expect(result?.metricSources).toHaveLength(3); + expect(result?.metricSources.map(({ source: s }) => [s.role, s.workerId, s.engine])).toEqual([ + ['prefill', 'prefill-b', '0'], + ['prefill', 'prefill-a', '0'], + ['decode', 'decode-a', '0'], + ]); + const prefillA = result?.metricSources.find(({ source: s }) => s.workerId === 'prefill-a'); + const decode = result?.metricSources.find(({ source: s }) => s.role === 'decode'); + expect(prefillA?.promptTps).toEqual([{ t: 0, value: 100 }]); + expect(prefillA?.queueDepth).toEqual([{ t: 0, running: 3, waiting: 0, total: 3 }]); + expect(decode?.generationTps).toEqual([{ t: 0, value: 400 }]); + + const nonDisagg = await computeChartSeries(blob, { + framework: 'dynamo-vllm', + disagg: false, + }); + expect(nonDisagg?.metricSources).toEqual([]); + }); + + it('does not interpret Dynamo-native labels without selecting the Dynamo adapter', async 
() => { + const json = JSON.stringify({ + metrics: { + 'vllm:prompt_tokens': { + series: [ + { + endpoint_url: '10.30.1.56:7500', + labels: { dynamo_component: 'prefill', worker_id: 'prefill-a', engine: '0' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 100 }], + }, + ], + }, + }, + }); + + const result = await computeChartSeries(gzipSync(Buffer.from(json)), { + framework: 'vllm', + disagg: true, + }); + + expect(result?.metricSources).toEqual([]); + }); }); diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 46600f7d..394a5826 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -17,6 +17,12 @@ import { parser } from 'stream-json'; import { pick } from 'stream-json/filters/pick.js'; import { streamObject } from 'stream-json/streamers/stream-object.js'; +import { + selectServerMetricsAdapter, + type MetricSource, + type ServerMetricsContext, +} from './server-metrics-adapters'; + /** * Bump when the extraction algorithm changes — backfill recomputes anything * older. @@ -49,8 +55,16 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average * line hides load skew on DEP configs; the detail page overlays the * per-rank lines so a hot rank is visible at a glance. + * + * v9: retain orchestrator-normalized per-source series. Dynamo labels are + * mapped to canonical router/prefill/decode roles, allowing the frontend to + * inspect individual workers without interpreting Dynamo-native labels. + * + * v10: only emit per-source series for disaggregated configs with a recognized + * orchestrator adapter. Non-disaggregated and unsupported configs retain the + * existing aggregate-only behavior. */ -export const CHART_SERIES_VERSION = 8; +export const CHART_SERIES_VERSION = 10; export interface TimeSeriesPoint { /** Seconds from benchmark start. 
*/ @@ -103,6 +117,26 @@ export interface ChartSeries { * visible without changing the headline number. */ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** + * The same metrics grouped by normalized server source. Existing aggregate + * fields above remain the default and preserve compatibility with old rows. + */ + metricSources: MetricSourceSeries[]; +} + +export interface MetricSourceSeries { + source: MetricSource; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + /** Raw prompt-token counter rate for this source. */ + promptTps: TimeSeriesPoint[]; + /** Raw generation-token counter rate for this source. */ + generationTps: TimeSeriesPoint[]; + prefixCacheHitsTps: TimeSeriesPoint[]; + hostKvCacheUsage: TimeSeriesPoint[]; + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -115,6 +149,7 @@ interface RawSlice { } interface RawSeries { + endpoint_url?: string; labels?: Record; timeslices?: RawSlice[]; } @@ -204,7 +239,10 @@ async function parseMetrics(buffer: Buffer): Promise { * The math mirrors `getTraceServerMetrics` — this helper exists so ingest, * backfill, and the API path produce byte-identical results. 
*/ -export async function computeChartSeries(blob: Buffer | null): Promise { +export async function computeChartSeries( + blob: Buffer | null, + context: ServerMetricsContext = {}, +): Promise { if (!blob) return null; let metrics: MetricsMap; try { @@ -213,7 +251,7 @@ export async function computeChartSeries(blob: Buffer | null): Promise): [number, number][] { return [...m.entries()].toSorted((a, b) => a[0] - b[0]); } -function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { +function buildSeriesFromMetrics( + metrics: MetricsMap, + context: ServerMetricsContext, + includeMetricSources = true, + originStartNs?: number, +): ChartSeries { // Timing reference: smallest start_ns and largest end_ns across every // timeslice we extracted. timeslicesCount is the length of any single // series (engines are scraped on the same cadence), so picking the max @@ -269,7 +312,7 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } if (!Number.isFinite(startNs)) startNs = 0; - const tOf = (ns: number) => (ns - startNs) / 1e9; + const tOf = (ns: number) => (ns - (originStartNs ?? startNs)) / 1e9; // Pick the first metric name whose series array has any data; fallback // chain lets the same code path serve both vllm:* and sglang:* blobs. @@ -465,6 +508,57 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } if (arr.length > 0) promptTokensBySource[source] = arr; } + + const metricSources: MetricSourceSeries[] = []; + const adapter = selectServerMetricsAdapter(context); + if (includeMetricSources && context.disagg && adapter.id !== 'generic') { + const grouped = new Map(); + for (const [metricName, metric] of Object.entries(metrics)) { + for (const series of metric.series ?? 
[]) { + const source = adapter.identifySource(series); + let group = grouped.get(source.id); + if (!group) { + group = { source, metrics: {} }; + grouped.set(source.id, group); + } + const groupedMetric = (group.metrics[metricName] ??= { series: [] }); + groupedMetric.series!.push(series); + } + } + for (const { source, metrics: sourceMetrics } of grouped.values()) { + const sourceSeries = buildSeriesFromMetrics( + sourceMetrics, + context, + false, + originStartNs ?? startNs, + ); + metricSources.push({ + source, + kvCacheUsage: sourceSeries.kvCacheUsage, + prefixCacheHitRate: sourceSeries.prefixCacheHitRate, + queueDepth: sourceSeries.queueDepth, + promptTokensBySource: sourceSeries.promptTokensBySource, + promptTps: sourceSeries.prefillTps, + generationTps: sourceSeries.decodeTps, + prefixCacheHitsTps: sourceSeries.prefixCacheHitsTps, + hostKvCacheUsage: sourceSeries.hostKvCacheUsage, + kvCacheUsageByEngine: sourceSeries.kvCacheUsageByEngine, + }); + } + const roleOrder: Record = { + router: 0, + prefill: 1, + decode: 2, + combined: 3, + unknown: 4, + }; + metricSources.sort( + (a, b) => + roleOrder[a.source.role] - roleOrder[b.source.role] || + (a.source.endpointUrl ?? '').localeCompare(b.source.endpointUrl ?? 
'') || + a.source.id.localeCompare(b.source.id), + ); + } return { version: CHART_SERIES_VERSION, startNs, @@ -480,5 +574,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { prefixCacheHitsTps, hostKvCacheUsage, kvCacheUsageByEngine, + metricSources, }; } diff --git a/packages/db/src/etl/dataset-provenance.test.ts b/packages/db/src/etl/dataset-provenance.test.ts new file mode 100644 index 00000000..4022546e --- /dev/null +++ b/packages/db/src/etl/dataset-provenance.test.ts @@ -0,0 +1,40 @@ +import { describe, expect, it } from 'vitest'; + +import { datasetSlugFromBenchmarkRow } from './dataset-provenance'; + +describe('datasetSlugFromBenchmarkRow', () => { + it('maps aiperf public-dataset provenance to the dashboard dataset slug', () => { + expect( + datasetSlugFromBenchmarkRow({ + dataset: { + source_type: 'public_dataset', + loader: 'semianalysis_cc_traces_weka_with_subagents', + hf_dataset_name: 'semianalysisai/cc-traces-weka-062126', + hf_split: 'train', + num_dataset_entries: 393, + }, + }), + ).toBe('cc-traces-weka-062126'); + }); + + it('supports an unnamespaced Hugging Face dataset id', () => { + expect( + datasetSlugFromBenchmarkRow({ + dataset: { + source_type: 'public_dataset', + hf_dataset_name: 'cc-traces-weka-062126', + }, + }), + ).toBe('cc-traces-weka-062126'); + }); + + it.each([ + {}, + { dataset: null }, + { dataset: { source_type: 'synthetic', hf_dataset_name: 'owner/data' } }, + { dataset: { source_type: 'public_dataset', hf_dataset_name: '' } }, + { dataset: { source_type: 'public_dataset' } }, + ])('ignores rows without usable public-dataset provenance: %j', (row) => { + expect(datasetSlugFromBenchmarkRow(row)).toBeNull(); + }); +}); diff --git a/packages/db/src/etl/dataset-provenance.ts b/packages/db/src/etl/dataset-provenance.ts new file mode 100644 index 00000000..7c30716c --- /dev/null +++ b/packages/db/src/etl/dataset-provenance.ts @@ -0,0 +1,30 @@ +/** Dataset provenance emitted by aiperf and preserved in agentic 
benchmark rows. */ +export interface DatasetProvenance { + source_type?: unknown; + loader?: unknown; + hf_dataset_name?: unknown; + hf_split?: unknown; + hf_subset?: unknown; + num_dataset_entries?: unknown; +} + +/** + * Resolve the dashboard dataset slug from a benchmark row's provenance. + * + * Dataset ingest uses the final path component of the Hugging Face dataset id + * as `datasets.slug`, so `semianalysisai/cc-traces-weka-062126` maps to + * `cc-traces-weka-062126` here as well. + */ +export function datasetSlugFromBenchmarkRow(row: Record): string | null { + const dataset = row.dataset; + if (!dataset || typeof dataset !== 'object' || Array.isArray(dataset)) return null; + + const provenance = dataset as DatasetProvenance; + if (provenance.source_type !== 'public_dataset') return null; + if (typeof provenance.hf_dataset_name !== 'string') return null; + + const datasetId = provenance.hf_dataset_name.trim().replace(/\/+$/u, ''); + if (!datasetId) return null; + const slug = datasetId.slice(datasetId.lastIndexOf('/') + 1); + return slug || null; +} diff --git a/packages/db/src/etl/server-metrics-adapters.ts b/packages/db/src/etl/server-metrics-adapters.ts new file mode 100644 index 00000000..f123d9f8 --- /dev/null +++ b/packages/db/src/etl/server-metrics-adapters.ts @@ -0,0 +1,100 @@ +/** + * Normalize orchestrator-specific server-metric labels into a stable source + * identity consumed by the API and frontend. AIPerf owns the export envelope; + * the serving orchestrator owns the meaning of labels inside each series. + */ + +export type MetricSourceRole = 'router' | 'prefill' | 'decode' | 'combined' | 'unknown'; + +export interface RawMetricSourceSeries { + endpoint_url?: string; + labels?: Record; +} + +export interface ServerMetricsContext { + /** Canonical framework stored in configs, for example `dynamo-vllm`. */ + framework?: string | null; + /** Per-worker role series are only meaningful for disaggregated configs. 
*/ + disagg?: boolean; +} + +export interface MetricSource { + /** Stable key used to join this source across different metric names. */ + id: string; + adapter: string; + role: MetricSourceRole; + endpointUrl: string | null; + nativeRole: string | null; + workerId: string | null; + dpRank: string | null; + engine: string | null; +} + +interface ServerMetricsAdapter { + id: string; + matches: (context: ServerMetricsContext) => boolean; + identifySource: (series: RawMetricSourceSeries) => MetricSource; +} + +function stableId(adapter: string, parts: (string | null | undefined)[]): string { + return [adapter, ...parts.map((part) => part ?? '')].join('|'); +} + +const dynamoAdapter: ServerMetricsAdapter = { + id: 'dynamo', + matches: ({ framework }) => framework?.startsWith('dynamo-') ?? false, + identifySource(series) { + const labels = series.labels ?? {}; + const nativeRole = labels['dynamo_component'] ?? null; + const role: MetricSourceRole = + nativeRole === 'prefill' + ? 'prefill' + : nativeRole === 'backend' + ? 'decode' + : nativeRole === 'frontend' || nativeRole === 'router' + ? 'router' + : 'unknown'; + const endpointUrl = series.endpoint_url ?? labels['dynamo_endpoint'] ?? null; + const workerId = labels['worker_id'] ?? null; + const dpRank = labels['dp_rank'] ?? null; + const engine = labels['engine'] ?? labels['engine_idx'] ?? null; + return { + id: stableId('dynamo', [role, endpointUrl, workerId, dpRank, engine]), + adapter: 'dynamo', + role, + endpointUrl, + nativeRole, + workerId, + dpRank, + engine, + }; + }, +}; + +const genericAdapter: ServerMetricsAdapter = { + id: 'generic', + matches: () => true, + identifySource(series) { + const labels = series.labels ?? {}; + const endpointUrl = series.endpoint_url ?? null; + const workerId = labels['worker_id'] ?? null; + const dpRank = labels['dp_rank'] ?? null; + const engine = labels['engine'] ?? labels['engine_idx'] ?? 
null; + return { + id: stableId('generic', [endpointUrl, workerId, dpRank, engine]), + adapter: 'generic', + role: endpointUrl || workerId || dpRank || engine ? 'unknown' : 'combined', + endpointUrl, + nativeRole: null, + workerId, + dpRank, + engine, + }; + }, +}; + +const ADAPTERS: readonly ServerMetricsAdapter[] = [dynamoAdapter, genericAdapter]; + +export function selectServerMetricsAdapter(context: ServerMetricsContext): ServerMetricsAdapter { + return ADAPTERS.find((adapter) => adapter.matches(context)) ?? genericAdapter; +} diff --git a/packages/db/src/etl/trace-artifact-discovery.test.ts b/packages/db/src/etl/trace-artifact-discovery.test.ts new file mode 100644 index 00000000..2bb1d51b --- /dev/null +++ b/packages/db/src/etl/trace-artifact-discovery.test.ts @@ -0,0 +1,66 @@ +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { discoverTraceReplayArtifacts } from './trace-artifact-discovery'; + +const tempDirs: string[] = []; + +function tempDir(): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'trace-artifacts-test-')); + tempDirs.push(dir); + return dir; +} + +function writeTraceFiles(dir: string): void { + fs.mkdirSync(path.join(dir, 'aiperf_artifacts'), { recursive: true }); + fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'profile_export.jsonl'), '{}\n'); + fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.csv'), 'x,y\n'); + fs.writeFileSync(path.join(dir, 'aiperf_artifacts', 'server_metrics_export.json'), '{}'); +} + +afterEach(() => { + for (const dir of tempDirs.splice(0)) fs.rmSync(dir, { recursive: true, force: true }); +}); + +describe('discoverTraceReplayArtifacts', () => { + it('discovers the existing single-node sibling layout', () => { + const root = tempDir(); + writeTraceFiles(path.join(root, 'agentic_config-a')); + + const found = 
discoverTraceReplayArtifacts(root); + + expect(found.get('config-a')).toMatchObject({ + profileJsonl: expect.stringContaining('profile_export.jsonl'), + serverMetricsCsv: expect.stringContaining('server_metrics_export.csv'), + serverMetricsJson: expect.stringContaining('server_metrics_export.json'), + }); + }); + + it('extracts and indexes multinode traces by concurrency', () => { + const root = tempDir(); + const artifactDir = path.join(root, 'multinode_server_logs_config-b'); + const archiveSource = path.join(root, 'archive-source'); + writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_96')); + writeTraceFiles(path.join(archiveSource, 'agentic', 'conc_128')); + fs.mkdirSync(artifactDir, { recursive: true }); + execFileSync('tar', [ + '-czf', + path.join(artifactDir, 'multinode_server_logs.tar.gz'), + '-C', + archiveSource, + '.', + ]); + fs.rmSync(archiveSource, { recursive: true, force: true }); + + const found = discoverTraceReplayArtifacts(root); + + expect([...found.keys()].toSorted()).toEqual(['config-b|128', 'config-b|96']); + expect(found.get('config-b|96')?.profileJsonl).toContain( + 'multinode_server_logs/agentic/conc_96/aiperf_artifacts/profile_export.jsonl', + ); + }); +}); diff --git a/packages/db/src/etl/trace-artifact-discovery.ts b/packages/db/src/etl/trace-artifact-discovery.ts new file mode 100644 index 00000000..cea0269e --- /dev/null +++ b/packages/db/src/etl/trace-artifact-discovery.ts @@ -0,0 +1,89 @@ +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; + +export interface TraceReplayArtifactPaths { + profileJsonl: string | null; + serverMetricsCsv: string | null; + serverMetricsJson: string | null; +} + +const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay']; + +function traceFilesIn(dir: string): TraceReplayArtifactPaths | null { + let profileJsonl: string | null = null; + let serverMetricsCsv: string | null = null; + let serverMetricsJson: string | null = null; + + for 
(const subdir of TRACE_SUBDIRS) { + const traceDir = path.join(dir, subdir); + if (!fs.existsSync(traceDir) || !fs.statSync(traceDir).isDirectory()) continue; + + const profilePath = path.join(traceDir, 'profile_export.jsonl'); + const csvPath = path.join(traceDir, 'server_metrics_export.csv'); + const jsonPath = path.join(traceDir, 'server_metrics_export.json'); + if (!profileJsonl && fs.existsSync(profilePath)) profileJsonl = profilePath; + if (!serverMetricsCsv && fs.existsSync(csvPath)) serverMetricsCsv = csvPath; + if (!serverMetricsJson && fs.existsSync(jsonPath)) serverMetricsJson = jsonPath; + } + + if (!profileJsonl && !serverMetricsCsv && !serverMetricsJson) return null; + return { profileJsonl, serverMetricsCsv, serverMetricsJson }; +} + +function extractMultinodeArchive(artifactDir: string): string | null { + const archivePath = path.join(artifactDir, 'multinode_server_logs.tar.gz'); + const extractedDir = path.join(artifactDir, 'multinode_server_logs'); + + if (!fs.existsSync(extractedDir) && fs.existsSync(archivePath)) { + fs.mkdirSync(extractedDir, { recursive: true }); + execFileSync('tar', ['-xzf', archivePath, '-C', extractedDir], { stdio: 'ignore' }); + } + + return fs.existsSync(extractedDir) ? extractedDir : null; +} + +/** + * Discover trace-replay siblings in both artifact layouts: + * + * - Single-node: `agentic_/aiperf_artifacts/*` + * - Multinode: `multinode_server_logs_/multinode_server_logs.tar.gz`, + * containing `agentic/conc_/aiperf_artifacts/*` + * + * Multinode keys include concurrency (`|`) because one artifact + * contains several points, each with a distinct trace payload. 
+ */ +export function discoverTraceReplayArtifacts( + artifactsDir: string, +): Map { + const discovered = new Map(); + if (!fs.existsSync(artifactsDir)) return discovered; + + for (const entry of fs.readdirSync(artifactsDir)) { + const artifactDir = path.join(artifactsDir, entry); + if (!fs.statSync(artifactDir).isDirectory()) continue; + + if (entry.startsWith('agentic_')) { + const trace = traceFilesIn(artifactDir); + if (trace) discovered.set(entry.replace(/^agentic_/u, ''), trace); + continue; + } + + if (!entry.startsWith('multinode_server_logs_')) continue; + const extractedDir = extractMultinodeArchive(artifactDir); + if (!extractedDir) continue; + + const agenticDir = path.join(extractedDir, 'agentic'); + if (!fs.existsSync(agenticDir) || !fs.statSync(agenticDir).isDirectory()) continue; + + const suffix = entry.replace(/^multinode_server_logs_/u, ''); + for (const concEntry of fs.readdirSync(agenticDir)) { + const match = concEntry.match(/^conc_(?\d+)$/u); + if (!match?.groups?.conc) continue; + const trace = traceFilesIn(path.join(agenticDir, concEntry)); + if (trace) discovered.set(`${suffix}|${match.groups.conc}`, trace); + } + } + + return discovered; +} diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index cb022ca9..b50168db 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -15,6 +15,7 @@ import type postgres from 'postgres'; import { computeAggregateStats } from './compute-aggregate-stats.js'; import { computeChartSeries } from './compute-chart-series.js'; import { computeRequestTimeline } from './compute-request-timeline.js'; +import type { ServerMetricsContext } from './server-metrics-adapters'; type Sql = ReturnType; @@ -33,6 +34,8 @@ type Sql = ReturnType; * @param serverMetricsJson Raw bytes of `server_metrics_export.json` — * per-scrape time-series of every Prometheus metric. * Optional, gzipped before storage (~42x ratio). 
+ * @param metricsContext Canonical framework used to select the + * orchestrator-specific metric-label adapter. */ export async function insertTraceReplay( sql: Sql, @@ -40,6 +43,7 @@ export async function insertTraceReplay( profileExportJsonl: Buffer | null, serverMetricsCsv: Buffer | null, serverMetricsJson: Buffer | null = null, + metricsContext: ServerMetricsContext = {}, ): Promise { if (benchmarkResultIds.length === 0) return; if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return; @@ -65,7 +69,7 @@ export async function insertTraceReplay( // a streaming parser for oversized server_metrics blobs. const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([ computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }), - computeChartSeries(metricsJsonGz), + computeChartSeries(metricsJsonGz, metricsContext), Promise.resolve(computeRequestTimeline(profileGz)), ]); diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index 4debf1ae..97e8759d 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -2,9 +2,11 @@ import { describe, it, expect } from 'vitest'; import { countSeenPrefixBlocks, buildConversationStructure, + countConversationRequests, linearHistogram, logHistogram, logHistogramWithZero, + subagentRequestTurns, summarizeValues, type RawWekaConversation, type SubagentNode, @@ -88,7 +90,7 @@ describe('buildConversationStructure', () => { id: 'c4', block_size: 64, requests: [ - { type: 'n', model: 'main', t: 0, in: 64, out: 10, hash_ids: [1] }, + { type: 'n', model: 'main', t: 0, api_time: 1, in: 64, out: 10, hash_ids: [1] }, { type: 'subagent', agent_id: 'a1', @@ -119,7 +121,12 @@ describe('buildConversationStructure', () => { expect(sub.startS).toBe(12.5); expect(sub.endS).toBeCloseTo(13.734, 6); expect(sub.children).toHaveLength(2); - expect(sub.children.map((child) => child.startS)).toEqual([12.5, 13.1]); + 
expect(countConversationRequests(s)).toBe(4); + expect(subagentRequestTurns(s).map((turn) => turn.model)).toEqual(['sub', 'sub']); + expect(sub.children.map((child) => [child.startS, child.endS])).toEqual([ + [12.5, 12.5], + [13.1, 13.1], + ]); expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child expect(sub.in).toBe(256); @@ -127,6 +134,26 @@ describe('buildConversationStructure', () => { const afterSub = s.nodes[2] as TurnNode; expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back + expect((s.nodes[0] as TurnNode).endS).toBe(1); + }); + + it('counts top-level and subagent child turns as requests, but not subagent groups', () => { + const structure = buildConversationStructure({ + id: 'request-count', + requests: [ + { type: 'n', in: 1, out: 1 }, + { + type: 'subagent', + requests: [ + { type: 'n', in: 1, out: 1 }, + { type: 'n', in: 1, out: 1 }, + ], + }, + ], + }); + + expect(countConversationRequests(structure)).toBe(3); + expect(subagentRequestTurns(structure)).toHaveLength(2); }); it('falls back to the default block size and a generic subagent label', () => { @@ -156,6 +183,21 @@ describe('buildConversationStructure', () => { expect(sub.startS).toBe(5); expect(sub.endS).toBe(12); }); + + it('normalizes legacy subagent-relative request intervals', () => { + const structure = buildConversationStructure({ + id: 'legacy-relative', + requests: [ + { + type: 'subagent', + t: 100, + requests: [{ type: 'n', t: 2, api_time: 3, in: 10, out: 1 }], + }, + ], + }); + const child = (structure.nodes[0] as SubagentNode).children[0]!; + expect(child).toMatchObject({ startS: 102, endS: 105 }); + }); }); describe('histograms', () => { diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index 26cc8da1..f6cea1c1 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -50,7 +50,7 @@ 
export interface TurnNode { turnIndex: number; /** Seconds from the start of the conversation. */ startS?: number; - /** Seconds from the start of the conversation (startS + api_time). */ + /** End of the original request interval (`startS + api_time`). */ endS?: number; model?: string; in: number; @@ -92,6 +92,16 @@ export interface ConversationStructure { }; } +/** Actual model requests in a conversation: main turns plus subagent child turns. */ +export function countConversationRequests(structure: ConversationStructure): number { + return structure.totals.numTurns + subagentRequestTurns(structure).length; +} + +/** Model requests issued by inner subagents, excluding all parent-agent turns. */ +export function subagentRequestTurns(structure: ConversationStructure): TurnNode[] { + return structure.nodes.flatMap((node) => (node.kind === 'subagent' ? node.children : [])); +} + const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent => (e as RawWekaSubagent).type === 'subagent'; @@ -142,17 +152,30 @@ function finiteTime(value: number | undefined): number | undefined { return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined; } -/** End of a turn = its start plus the request's api_time (seconds). */ -function turnEndS(req: RawWekaRequest): number | undefined { - const startS = finiteTime(req.t); +function requestEndS(startS: number | undefined, apiTime: number | undefined): number | undefined { if (startS === undefined) return undefined; - return startS + (finiteTime(req.api_time) ?? 0); + const duration = finiteTime(apiTime) ?? 0; + return startS + duration; +} + +/** Mirror aiperf's legacy-relative/current-absolute subagent timestamp handling. 
*/ +function subagentRequestStartS( + entry: RawWekaSubagent, + request: RawWekaRequest, +): number | undefined { + const requestStart = finiteTime(request.t); + if (requestStart === undefined) return undefined; + const groupStart = finiteTime(entry.t); + if (groupStart !== undefined && requestStart + 1e-6 < groupStart) { + return groupStart + requestStart; + } + return requestStart; } function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } { const children = entry.requests ?? []; const childStarts = children - .map((child) => finiteTime(child.t)) + .map((child) => subagentRequestStartS(entry, child)) .filter((value): value is number => value !== undefined); const startS = finiteTime(entry.t) ?? (childStarts.length > 0 ? Math.min(...childStarts) : undefined); @@ -162,7 +185,11 @@ function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: nu } const childEnds = children - .map((child) => turnEndS(child)) + .map((child) => { + const childStart = subagentRequestStartS(entry, child); + if (childStart === undefined) return undefined; + return childStart + (finiteTime(child.api_time) ?? 0); + }) .filter((value): value is number => value !== undefined); return { startS, @@ -203,11 +230,12 @@ export function buildConversationStructure( for (const inner of entry.requests ?? []) { const split = splitInput(inner, childSeen, blockSize); const out = Math.max(0, Math.round(inner.out ?? 0)); + const childStartS = subagentRequestStartS(entry, inner); children.push({ kind: 'turn', turnIndex: turnIndex++, - startS: finiteTime(inner.t), - endS: turnEndS(inner), + startS: childStartS, + endS: requestEndS(childStartS, inner.api_time), model: inner.model, in: split.in, out, @@ -240,11 +268,12 @@ export function buildConversationStructure( } else { const split = splitInput(entry, seen, blockSize); const out = Math.max(0, Math.round(entry.out ?? 
0)); + const startS = finiteTime(entry.t); nodes.push({ kind: 'turn', turnIndex: turnIndex++, - startS: finiteTime(entry.t), - endS: turnEndS(entry), + startS, + endS: requestEndS(startS, entry.api_time), model: entry.model, in: split.in, out, diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 127522c8..2a5f15f0 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -46,6 +46,8 @@ import { insertServerLog, } from './etl/benchmark-ingest'; import { insertTraceReplay } from './etl/trace-replay-ingest'; +import { discoverTraceReplayArtifacts } from './etl/trace-artifact-discovery'; +import { datasetSlugFromBenchmarkRow } from './etl/dataset-provenance'; import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper'; import { ingestEvalRow } from './etl/eval-ingest'; import { mapEvalSamples } from './etl/eval-samples-mapper'; @@ -337,6 +339,7 @@ async function main(): Promise { let totalSampleFiles = 0; let totalChangelogs = 0; let totalTraceReplayLinked = 0; + const datasetSlugs = new Set(); // ── Check for evals-only flag in changelog ──────────────────────────── const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog); @@ -397,46 +400,7 @@ async function main(): Promise { // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current). // Older non-aiperf agentic runs don't ship this sibling. Key on the bare // suffix so both names map to the same Map entry. 
- const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay']; - const traceReplayPaths = new Map< - string, - { - profileJsonl: string | null; - serverMetricsCsv: string | null; - serverMetricsJson: string | null; - } - >(); - if (fs.existsSync(artifactsDir)) { - for (const d of fs.readdirSync(artifactsDir)) { - if (!d.startsWith('agentic_')) continue; - let profile: string | null = null; - let metrics: string | null = null; - let metricsJson: string | null = null; - for (const sub of TRACE_SUBDIRS) { - const dir = path.join(artifactsDir, d, sub); - if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue; - if (!profile) { - const p = path.join(dir, 'profile_export.jsonl'); - if (fs.existsSync(p)) profile = p; - } - if (!metrics) { - const m = path.join(dir, 'server_metrics_export.csv'); - if (fs.existsSync(m)) metrics = m; - } - if (!metricsJson) { - const j = path.join(dir, 'server_metrics_export.json'); - if (fs.existsSync(j)) metricsJson = j; - } - } - if (!profile && !metrics && !metricsJson) continue; - const suffix = stripBmkAndAgenticPrefix(d); - traceReplayPaths.set(suffix, { - profileJsonl: profile, - serverMetricsCsv: metrics, - serverMetricsJson: metricsJson, - }); - } - } + const traceReplayPaths = discoverTraceReplayArtifacts(artifactsDir); if (traceReplayPaths.size > 0) { console.log(` Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`); } @@ -452,6 +416,12 @@ async function main(): Promise { ? data : [data as Record]; + for (const rawRow of rawRows) { + if (!rawRow || typeof rawRow !== 'object') continue; + const datasetSlug = datasetSlugFromBenchmarkRow(rawRow); + if (datasetSlug) datasetSlugs.add(datasetSlug); + } + const rows = rawRows .filter((r) => typeof r === 'object' && r !== null) .map((r) => mapBenchmarkRow(r, tracker)) @@ -514,7 +484,11 @@ async function main(): Promise { // `bmk_agentic_` artifact we just ingested. 
if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) { const suffix = stripBmkAndAgenticPrefix(parentDir); - const trace = traceReplayPaths.get(suffix); + const concMatch = path.basename(file).match(/_conc(?\d+)\.json$/u); + const trace = + (concMatch?.groups?.conc + ? traceReplayPaths.get(`${suffix}|${concMatch.groups.conc}`) + : undefined) ?? traceReplayPaths.get(suffix); if (trace) { try { const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null; @@ -524,7 +498,10 @@ async function main(): Promise { const metricsJson = trace.serverMetricsJson ? fs.readFileSync(trace.serverMetricsJson) : null; - await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson); + await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson, { + framework: toInsert[0]?.config.framework, + disagg: toInsert[0]?.config.disagg, + }); totalTraceReplayLinked += insertedIds.length; } catch (error: any) { tracker.recordDbError(`trace_replay for ${suffix}`, error); @@ -553,6 +530,22 @@ async function main(): Promise { tracker.recordDbError('availability', error); } } + + if (datasetSlugs.size > 1) { + throw new Error( + `Conflicting dataset provenance in workflow run ${runId}: ${[...datasetSlugs].toSorted().join(', ')}`, + ); + } + const [datasetSlug] = datasetSlugs; + if (datasetSlug) { + await sql` + insert into run_datasets (workflow_run_id, dataset_slug) + values (${workflowRunId}, ${datasetSlug}) + on conflict (workflow_run_id) do update + set dataset_slug = excluded.dataset_slug + `; + console.log(` Dataset: linked workflow run to ${datasetSlug}`); + } } // ── Ingest run stats ────────────────────────────────────────────────── diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts index e00471d7..ed6774c0 100644 --- a/packages/db/src/ingest-weka-dataset.ts +++ b/packages/db/src/ingest-weka-dataset.ts @@ -22,9 +22,11 @@ import { createAdminSql } from './etl/db-utils'; import { hasNoSslFlag } 
from './cli-utils'; import { buildConversationStructure, + countConversationRequests, linearHistogram, logHistogram, logHistogramWithZero, + subagentRequestTurns, summarizeValues, type ConversationStructure, type RawWekaConversation, @@ -146,6 +148,9 @@ interface Accumulator { outputPerTurn: number[]; cachedFractionPerTurn: number[]; // cached/in, for turns with in>0 turnsPerConv: number[]; // main (top-level) turns + requestsPerConv: number[]; // main turns + subagent child turns + subagentInputPerRequest: number[]; + subagentOutputPerRequest: number[]; subagentGroupsPerConv: number[]; subagentTurnsPerGroup: number[]; totalIn: number; @@ -164,6 +169,9 @@ function newAccumulator(): Accumulator { outputPerTurn: [], cachedFractionPerTurn: [], turnsPerConv: [], + requestsPerConv: [], + subagentInputPerRequest: [], + subagentOutputPerRequest: [], subagentGroupsPerConv: [], subagentTurnsPerGroup: [], totalIn: 0, @@ -191,6 +199,11 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void { acc.mainTurns += s.totals.numTurns; acc.subagentGroups += s.totals.numSubagentGroups; acc.turnsPerConv.push(s.totals.numTurns); + acc.requestsPerConv.push(countConversationRequests(s)); + for (const turn of subagentRequestTurns(s)) { + acc.subagentInputPerRequest.push(turn.in); + acc.subagentOutputPerRequest.push(turn.out); + } acc.subagentGroupsPerConv.push(s.totals.numSubagentGroups); for (const node of s.nodes) { if (node.kind === 'turn') { @@ -205,7 +218,7 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void { function buildChartData(acc: Accumulator) { return { - version: 2, + version: 3, inputTokensPerTurn: { bins: logHistogram(acc.inputPerTurn), stats: summarizeValues(acc.inputPerTurn), @@ -218,6 +231,14 @@ function buildChartData(acc: Accumulator) { bins: logHistogram(acc.outputPerTurn), stats: summarizeValues(acc.outputPerTurn), }, + subagentInputTokensPerRequest: { + bins: logHistogram(acc.subagentInputPerRequest), + stats: 
summarizeValues(acc.subagentInputPerRequest), + }, + subagentOutputTokensPerRequest: { + bins: logHistogram(acc.subagentOutputPerRequest), + stats: summarizeValues(acc.subagentOutputPerRequest), + }, turnsPerConversation: { bins: linearHistogram(acc.turnsPerConv), stats: summarizeValues(acc.turnsPerConv), @@ -235,8 +256,10 @@ function buildChartData(acc: Accumulator) { function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | null) { const cachedPct = acc.totalIn > 0 ? acc.totalCached / acc.totalIn : 0; + const requestsPerConversation = summarizeValues(acc.requestsPerConv); + const subagentsPerTrace = summarizeValues(acc.subagentGroupsPerConv); return { - version: 1, + version: 3, blockSize, hashIdScope, totalIn: acc.totalIn, @@ -246,6 +269,10 @@ function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | mainTurns: acc.mainTurns, subagentGroups: acc.subagentGroups, subagentTurns: acc.subagentTurns, + meanRequestsPerConversation: requestsPerConversation.mean, + medianRequestsPerConversation: requestsPerConversation.median, + meanSubagentsPerTrace: subagentsPerTrace.mean, + medianSubagentsPerTrace: subagentsPerTrace.median, modelMix: acc.modelCounts, }; } diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts index da5d18a0..4493b7dc 100644 --- a/packages/db/src/queries/agentic-aggregates.ts +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -36,8 +36,10 @@ import type { DbClient } from '../connection.js'; * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way * they do for vllm runs. + * + * v4: add per-request normalized E2E percentiles at a fixed 400-token OSL. 
*/ -export const STATS_VERSION = 3; +export const STATS_VERSION = 4; export interface MetricPercentiles { mean: number; diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts index 89c6ca5e..cfefe391 100644 --- a/packages/db/src/queries/datasets.ts +++ b/packages/db/src/queries/datasets.ts @@ -20,6 +20,10 @@ export interface DatasetSummary { mainTurns?: number; subagentGroups?: number; subagentTurns?: number; + meanRequestsPerConversation?: number; + medianRequestsPerConversation?: number; + meanSubagentsPerTrace?: number; + medianSubagentsPerTrace?: number; modelMix?: Record; [k: string]: unknown; } diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts index 321434be..afc5b22d 100644 --- a/packages/db/src/queries/derived-agentic-metrics.test.ts +++ b/packages/db/src/queries/derived-agentic-metrics.test.ts @@ -24,6 +24,21 @@ describe('computeDerivedFromBlob', () => { const out = computeDerivedFromBlob(''); expect(out.normalized_session_time_s).toBeNull(); expect(out.p90_prefill_tps_per_user).toBeNull(); + expect(out.normalized_e2e_400).toBeNull(); + }); + + it('normalizes each request to 400 output tokens before taking percentiles', () => { + const jsonl = [ + // Both requests have TTFT=2s and ITL=20ms, despite very different OSL/E2E. + rec('s1', 0, { isl: 100, osl: 100, ttft_ms: 2000, latency_ms: 3980 }), + rec('s2', 0, { isl: 100, osl: 1000, ttft_ms: 2000, latency_ms: 21_980 }), + ].join('\n'); + + const out = computeDerivedFromBlob(jsonl); + // 2s TTFT + 399 × 20ms ITL = 9.98s for both requests. 
+ expect(out.normalized_e2e_400?.n).toBe(2); + expect(out.normalized_e2e_400?.p75).toBeCloseTo(9.98, 8); + expect(out.normalized_e2e_400?.p90).toBeCloseTo(9.98, 8); }); it('rescales single-session time and computes P90 prefill', () => { diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts index 35a4b76c..fda44280 100644 --- a/packages/db/src/queries/derived-agentic-metrics.ts +++ b/packages/db/src/queries/derived-agentic-metrics.ts @@ -20,8 +20,10 @@ import { gunzipSync } from 'node:zlib'; +import { NORMALIZED_E2E_OUTPUT_TOKENS } from '@semianalysisai/inferencex-constants'; + import type { DbClient } from '../connection.js'; -import { STATS_VERSION } from './agentic-aggregates'; +import { percentilesOf, STATS_VERSION, type MetricPercentiles } from './agentic-aggregates'; export interface DerivedAgenticMetric { /** benchmark_results.id this entry belongs to. */ @@ -30,6 +32,10 @@ export interface DerivedAgenticMetric { normalized_session_time_s: number | null; /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */ p90_prefill_tps_per_user: number | null; + /** P75 normalized per-request E2E at a fixed 400-token output length. */ + p75_normalized_e2e_400_s: number | null; + /** P90 normalized per-request E2E at a fixed 400-token output length. */ + p90_normalized_e2e_400_s: number | null; } export type DerivedAgenticMetricMap = Record; @@ -111,6 +117,7 @@ function meanOf(xs: number[]): number { export function computeDerivedFromBlob(jsonl: string): { normalized_session_time_s: number | null; p90_prefill_tps_per_user: number | null; + normalized_e2e_400: MetricPercentiles | null; } { // Group records by conversation_id, filter to the profiling phase. 
const bySession = new Map(); @@ -135,7 +142,11 @@ export function computeDerivedFromBlob(jsonl: string): { list.push(turn); } if (bySession.size === 0) { - return { normalized_session_time_s: null, p90_prefill_tps_per_user: null }; + return { + normalized_session_time_s: null, + p90_prefill_tps_per_user: null, + normalized_e2e_400: null, + }; } // Per-session aggregates for session time; per-turn prefill rates pool into @@ -143,6 +154,7 @@ export function computeDerivedFromBlob(jsonl: string): { const sessionTimesS: number[] = []; const sessionLoads: number[] = []; const allPrefillRates: number[] = []; + const allNormalizedE2eS: number[] = []; for (const turns of bySession.values()) { let timeMs = 0; let load = 0; @@ -151,6 +163,21 @@ export function computeDerivedFromBlob(jsonl: string): { load += t.isl + t.osl; const ttftSec = t.ttft_ms / 1000; if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec); + + // Keep the observed TTFT, then project the request's mean decode + // interval to a fixed output length. Do this per request before taking + // percentiles so long original outputs do not dominate the tail. 
+ const observedDecodeIntervals = Math.max(t.osl - 1, 1); + const itlMs = (t.request_latency_ms - t.ttft_ms) / observedDecodeIntervals; + const normalizedMs = t.ttft_ms + (NORMALIZED_E2E_OUTPUT_TOKENS - 1) * itlMs; + if ( + Number.isFinite(itlMs) && + itlMs >= 0 && + Number.isFinite(normalizedMs) && + normalizedMs > 0 + ) { + allNormalizedE2eS.push(normalizedMs / 1000); + } } if (load > 0) { sessionTimesS.push(timeMs / 1000); @@ -182,6 +209,7 @@ export function computeDerivedFromBlob(jsonl: string): { return { normalized_session_time_s: normalized, p90_prefill_tps_per_user: prefill, + normalized_e2e_400: percentilesOf(allNormalizedE2eS), }; } @@ -210,6 +238,7 @@ export async function getDerivedAgenticMetrics( version?: number; normalizedSessionTimeS?: number | null; p90PrefillTpsPerUser?: number | null; + normalizedE2e400?: MetricPercentiles | null; } | null; }[]; @@ -221,6 +250,8 @@ export async function getDerivedAgenticMetrics( id, normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null, p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null, + p75_normalized_e2e_400_s: row.stats.normalizedE2e400?.p75 ?? null, + p90_normalized_e2e_400_s: row.stats.normalizedE2e400?.p90 ?? null, }; } else { idsNeedingBlob.push(id); @@ -250,11 +281,14 @@ export async function getDerivedAgenticMetrics( for (const row of rows) { try { const jsonl = gunzipSync(row.blob).toString('utf8'); - const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl); + const { normalized_session_time_s, p90_prefill_tps_per_user, normalized_e2e_400 } = + computeDerivedFromBlob(jsonl); result[Number(row.benchmark_result_id)] = { id: Number(row.benchmark_result_id), normalized_session_time_s, p90_prefill_tps_per_user, + p75_normalized_e2e_400_s: normalized_e2e_400?.p75 ?? null, + p90_normalized_e2e_400_s: normalized_e2e_400?.p90 ?? null, }; } catch { // Skip malformed blobs silently — frontend treats missing ids as "no data". 
diff --git a/packages/db/src/queries/request-timeline.test.ts b/packages/db/src/queries/request-timeline.test.ts new file mode 100644 index 00000000..62ba5385 --- /dev/null +++ b/packages/db/src/queries/request-timeline.test.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; +import type { DbClient } from '../connection.js'; + +import { getRequestTimeline } from './request-timeline'; + +function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } { + const responses = [...queue]; + const calls: string[] = []; + const sql = ((strings: TemplateStringsArray) => { + calls.push(strings.join('?')); + return Promise.resolve(responses.shift() ?? []); + }) as unknown as DbClient; + return { sql, calls }; +} + +const timeline: RequestTimeline = { + version: REQUEST_TIMELINE_VERSION, + startNs: 100, + endNs: 200, + durationS: 0.0000001, + requests: [], +}; + +describe('getRequestTimeline', () => { + it('returns the current precomputed timeline without selecting the raw profile blob', async () => { + const { sql, calls } = mockSql([ + [{ trace_replay_id: 870, has_blob: true, request_timeline: timeline }], + ]); + + await expect(getRequestTimeline(sql, 422991)).resolves.toEqual(timeline); + expect(calls).toHaveLength(1); + expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob'); + }); + + it('does not fetch a blob when neither a current timeline nor a blob exists', async () => { + const { sql, calls } = mockSql([ + [{ trace_replay_id: 870, has_blob: false, request_timeline: null }], + ]); + + await expect(getRequestTimeline(sql, 422991)).resolves.toBeNull(); + expect(calls).toHaveLength(1); + }); +}); diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts index 2bd3e251..2a6bb40c 100644 --- a/packages/db/src/queries/request-timeline.ts +++ b/packages/db/src/queries/request-timeline.ts @@ 
-18,23 +18,29 @@ import type { DbClient } from '../connection.js'; export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline'; -interface RawRow { - blob: Buffer | null; +interface RawMetaRow { + trace_replay_id: number; + has_blob: boolean; request_timeline: RequestTimeline | null; } +interface RawBlobRow { + blob: Buffer | null; +} + export async function getRequestTimeline( sql: DbClient, benchmarkResultId: number, ): Promise { const rows = (await sql` select - atr.profile_export_jsonl_gz as blob, + atr.id as trace_replay_id, + (atr.profile_export_jsonl_gz is not null) as has_blob, atr.request_timeline from benchmark_results br join agentic_trace_replay atr on atr.id = br.trace_replay_id where br.id = ${benchmarkResultId} - `) as unknown as RawRow[]; + `) as unknown as RawMetaRow[]; const row = rows[0]; if (!row) return null; @@ -43,6 +49,16 @@ export async function getRequestTimeline( return row.request_timeline; } - // Slow path: recompute from the blob (rare — only stale/missing rows). - return computeRequestTimeline(row.blob); + if (!row.has_blob) return null; + + // Slow path only: fetch the large profile blob after establishing that the + // pre-computed timeline is stale or missing. Long trace runs can have blobs + // large enough to exceed Neon's 64 MiB encoded-response limit, so the fast + // path must never select the blob alongside request_timeline. + const blobRows = (await sql` + select profile_export_jsonl_gz as blob + from agentic_trace_replay + where id = ${row.trace_replay_id} + `) as unknown as RawBlobRow[]; + return computeRequestTimeline(blobRows[0]?.blob ?? 
null); } diff --git a/packages/db/src/queries/trace-histograms.test.ts b/packages/db/src/queries/trace-histograms.test.ts new file mode 100644 index 00000000..c3c6ec8a --- /dev/null +++ b/packages/db/src/queries/trace-histograms.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; +import type { DbClient } from '../connection.js'; + +import { getTraceHistograms } from './trace-histograms'; + +function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } { + const responses = [...queue]; + const calls: string[] = []; + const sql = ((strings: TemplateStringsArray) => { + calls.push(strings.join('?')); + return Promise.resolve(responses.shift() ?? []); + }) as unknown as DbClient; + return { sql, calls }; +} + +const timeline: RequestTimeline = { + version: REQUEST_TIMELINE_VERSION, + startNs: 0, + endNs: 10, + durationS: 0.00000001, + requests: [ + { + cid: 'session-1', + ti: 0, + wid: '0', + ad: 0, + phase: 'profiling', + credit: 0, + start: 1, + ack: 2, + end: 3, + ttftMs: 1, + tpotMs: 2, + isl: 4096, + osl: 512, + cancelled: false, + }, + { + cid: 'session-1', + ti: 1, + wid: '0', + ad: 0, + phase: 'profiling', + credit: 4, + start: 5, + ack: 6, + end: 7, + ttftMs: 1, + tpotMs: 2, + isl: null, + osl: 128, + cancelled: false, + }, + ], +}; + +describe('getTraceHistograms', () => { + it('builds distributions from the precomputed timeline without selecting the raw blob', async () => { + const { sql, calls } = mockSql([ + [ + { + benchmark_result_id: 422991, + trace_replay_id: 870, + request_timeline: timeline, + has_blob: true, + }, + ], + ]); + + await expect(getTraceHistograms(sql, [422991])).resolves.toEqual({ + 422991: { id: 422991, isl: [4096], osl: [512, 128] }, + }); + expect(calls).toHaveLength(1); + expect(calls[0]).not.toContain('profile_export_jsonl_gz as blob'); + }); +}); diff --git 
a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts index 20ebc0d5..24b96c35 100644 --- a/packages/db/src/queries/trace-histograms.ts +++ b/packages/db/src/queries/trace-histograms.ts @@ -14,6 +14,8 @@ import { gunzipSync } from 'node:zlib'; +import { REQUEST_TIMELINE_VERSION, type RequestTimeline } from '../etl/compute-request-timeline'; + import type { DbClient } from '../connection.js'; export interface TraceHistogramPoint { @@ -27,13 +29,28 @@ export interface TraceHistogramPoint { export type TraceHistogramMap = Record; -/** - * Cap the number of blobs we pull in a single Neon HTTP query — the serverless - * driver returns 507 ("response is too large, max 64 MB") if the combined gzip - * payload exceeds that. Each profile_export.jsonl blob can be ~1-2 MB - * compressed, so we stay well below the cap at 12. - */ const QUERY_CHUNK_SIZE = 12; +// Bytea values expand in Neon's JSON-over-HTTP response. Keep raw fallback +// reads comfortably below its 64 MiB response cap; current ingests should use +// request_timeline instead and never need this path. 
+const MAX_FALLBACK_BLOB_BYTES = 24 * 1024 * 1024; + +interface TimelineRow { + benchmark_result_id: number; + trace_replay_id: number; + request_timeline: RequestTimeline | null; + has_blob: boolean; +} + +function histogramFromTimeline(id: number, timeline: RequestTimeline): TraceHistogramPoint { + const isl: number[] = []; + const osl: number[] = []; + for (const request of timeline.requests) { + if (typeof request.isl === 'number' && Number.isFinite(request.isl)) isl.push(request.isl); + if (typeof request.osl === 'number' && Number.isFinite(request.osl)) osl.push(request.osl); + } + return { id, isl, osl }; +} export async function getTraceHistograms( sql: DbClient, @@ -41,25 +58,47 @@ export async function getTraceHistograms( ): Promise { if (benchmarkResultIds.length === 0) return {}; - const rows: { benchmark_result_id: number; blob: Buffer }[] = []; + const result: TraceHistogramMap = {}; + const fallbackRows: TimelineRow[] = []; for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) { const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE); const chunkRows = (await sql` select br.id as benchmark_result_id, - atr.profile_export_jsonl_gz as blob + atr.id as trace_replay_id, + atr.request_timeline, + (atr.profile_export_jsonl_gz is not null) as has_blob from benchmark_results br join agentic_trace_replay atr on atr.id = br.trace_replay_id where br.id = any(${chunk}::bigint[]) - and atr.profile_export_jsonl_gz is not null - `) as { benchmark_result_id: number; blob: Buffer }[]; - rows.push(...chunkRows); + `) as unknown as TimelineRow[]; + for (const row of chunkRows) { + const id = Number(row.benchmark_result_id); + if ( + row.request_timeline && + Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION + ) { + result[id] = histogramFromTimeline(id, row.request_timeline); + } else if (row.has_blob) { + fallbackRows.push(row); + } + } } - const result: TraceHistogramMap = {}; - for (const row of rows) { + // Compatibility 
fallback for pre-timeline rows. Fetch one small blob at a + // time; oversized legacy rows are omitted instead of turning the whole API + // response into a 507. + for (const row of fallbackRows) { + const blobRows = (await sql` + select profile_export_jsonl_gz as blob + from agentic_trace_replay + where id = ${row.trace_replay_id} + and octet_length(profile_export_jsonl_gz) <= ${MAX_FALLBACK_BLOB_BYTES} + `) as unknown as { blob: Buffer }[]; + const blob = blobRows[0]?.blob; + if (!blob) continue; try { - const jsonl = gunzipSync(row.blob).toString('utf8'); + const jsonl = gunzipSync(blob).toString('utf8'); const isl: number[] = []; const osl: number[] = []; for (const line of jsonl.split('\n')) { diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts new file mode 100644 index 00000000..61d21d35 --- /dev/null +++ b/packages/db/src/queries/trace-server-metrics.test.ts @@ -0,0 +1,104 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { CHART_SERIES_VERSION, type ChartSeries } from '../etl/compute-chart-series'; +import type { DbClient } from '../connection.js'; + +import { getTraceServerMetrics } from './trace-server-metrics'; + +function currentSeries(): ChartSeries { + return { + version: CHART_SERIES_VERSION, + startNs: 0, + endNs: 1e9, + durationS: 1, + timeslicesCount: 1, + kvCacheUsage: [], + prefixCacheHitRate: [], + queueDepth: [], + promptTokensBySource: {}, + prefillTps: [{ t: 0, value: 100 }], + decodeTps: [], + prefixCacheHitsTps: [], + hostKvCacheUsage: [], + kvCacheUsageByEngine: [], + metricSources: [], + }; +} + +function metaRow(overrides: Record = {}) { + return { + id: 42, + trace_replay_id: 7, + has_blob: true, + chart_series: currentSeries(), + hardware: 'gb200', + framework: 'dynamo-vllm', + model: 'deepseek-r1-0528', + precision: 'fp8', + spec_method: 'none', + disagg: true, + conc: 128, + offload_mode: 'off', + isl: null, + 
osl: null, + benchmark_type: 'agentic_traces', + date: '2026-06-23', + run_url: null, + server_gpu_cache_hit_rate: null, + server_cpu_cache_hit_rate: null, + ...overrides, + }; +} + +function mockSql(queue: unknown[][]): { sql: DbClient; calls: string[] } { + const responses = [...queue]; + const calls: string[] = []; + const sql = ((strings: TemplateStringsArray) => { + calls.push(strings.join('?')); + return Promise.resolve(responses.shift() ?? []); + }) as unknown as DbClient; + return { sql, calls }; +} + +describe('getTraceServerMetrics', () => { + it('returns current precomputed series without selecting the raw blob', async () => { + const { sql, calls } = mockSql([[metaRow()]]); + + const result = await getTraceServerMetrics(sql, 42); + + expect(result?.prefillTps).toEqual([{ t: 0, value: 100 }]); + expect(calls).toHaveLength(1); + expect(calls[0]).not.toContain('server_metrics_json_gz as blob'); + }); + + it('fetches and computes the raw blob only when chart_series is stale', async () => { + const raw = gzipSync( + Buffer.from( + JSON.stringify({ + metrics: { + 'vllm:prompt_tokens': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 321 }] }], + }, + }, + }), + ), + ); + const stale = { ...currentSeries(), version: CHART_SERIES_VERSION - 1 }; + const { sql, calls } = mockSql([[metaRow({ chart_series: stale })], [{ blob: raw }]]); + + const result = await getTraceServerMetrics(sql, 42); + + expect(result?.prefillTps).toEqual([{ t: 0, value: 321 }]); + expect(calls).toHaveLength(2); + expect(calls[1]).toContain('server_metrics_json_gz as blob'); + }); + + it('returns null without a blob and does not issue a second query', async () => { + const { sql, calls } = mockSql([[metaRow({ has_blob: false, chart_series: null })]]); + + await expect(getTraceServerMetrics(sql, 42)).resolves.toBeNull(); + expect(calls).toHaveLength(1); + }); +}); diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 
5594d514..61cacaae 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -14,6 +14,7 @@ import { CHART_SERIES_VERSION, computeChartSeries, type ChartSeries, + type MetricSourceSeries, type QueueDepthPoint, type TimeSeriesPoint, } from '../etl/compute-chart-series'; @@ -80,13 +81,20 @@ export interface TraceServerMetrics { * the cluster-average `kvCacheUsage` line covers that case alone. */ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** Orchestrator-normalized metrics grouped by endpoint/worker. */ + metricSources: MetricSourceSeries[]; } interface RawMetaRow extends PointMeta { - blob: Buffer | null; + trace_replay_id: number | null; + has_blob: boolean; chart_series: ChartSeries | null; } +interface RawBlobRow { + blob: Buffer | null; +} + function buildMeta(row: RawMetaRow): PointMeta { return { id: Number(row.id), @@ -128,6 +136,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { hostKvCacheUsage: series.hostKvCacheUsage ?? [], // v8+ field; older chart_series rows lack it → omit per-engine overlay. kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [], + // v9+ field; old rows are served without a source selector until backfilled. + metricSources: series.metricSources ?? 
[], }; } @@ -137,7 +147,8 @@ export async function getTraceServerMetrics( ): Promise { const rows = (await sql` select - atr.server_metrics_json_gz as blob, + br.trace_replay_id, + (atr.server_metrics_json_gz is not null) as has_blob, atr.chart_series, br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg, br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type, @@ -153,7 +164,7 @@ export async function getTraceServerMetrics( `) as unknown as RawMetaRow[]; const row = rows[0]; if (!row) return null; - if (!row.blob) return null; + if (!row.has_blob || row.trace_replay_id === null) return null; const meta = buildMeta(row); // Fast path: pre-computed chart_series at the current version. @@ -161,10 +172,25 @@ export async function getTraceServerMetrics( return merge(meta, row.chart_series); } - // Slow path: compute from the blob. `computeChartSeries` handles + // Slow path only: fetch the large raw blob after establishing that the + // pre-computed series is missing or stale. Disaggregated blobs can be tens + // of MB compressed, so selecting this in the metadata query defeats the + // fast path even when chart_series is current. + const blobRows = (await sql` + select server_metrics_json_gz as blob + from agentic_trace_replay + where id = ${row.trace_replay_id} + `) as unknown as RawBlobRow[]; + const blob = blobRows[0]?.blob; + if (!blob) return null; + + // `computeChartSeries` handles // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP // rows succeed even before the backfill drains them. 
- const series = await computeChartSeries(row.blob); + const series = await computeChartSeries(blob, { + framework: row.framework, + disagg: row.disagg, + }); if (!series) return null; return merge(meta, series); } From 8b243e47e96465adcde20dbd00f551830af61bc2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 30 Jun 2026 18:38:14 -0500 Subject: [PATCH 102/111] feat(agentic): KV-cache pool ceiling + warmup/profiling phase split Agentic point-detail enhancements (shared files across workstreams): - KV-cache pool size: derive total pool tokens from the authoritative vLLM "GPU KV cache size: N tokens" server-log line (summed across DP engine cores; TP already aggregated), stored on benchmark_results.metrics. The vllm:cache_config_info metric is unreliable for MLA models, so the log is the source of truth. Drawn as a horizontal ceiling on the "unique input tokens in flight" chart via a reusable TimeSeriesChart refLines prop. - Fix ingest: link agentic server logs (bmk_agentic_ -> server_logs_ key mismatch meant agentic rows never got a server log). - New: server-log-metrics parser (line-based, robust to multi-MB log lines), db:backfill-agentic-server-logs and db:backfill-kv-pool scripts. - Warmup/profiling phase split: chart_series now merges the warmup_metrics block; per-request phase tags drive timeline + per-point phase slicing (phase-slice). - Request timeline: restore zoom/scroll/filter position on browser back via a one-shot sessionStorage snapshot; phase toggle is Profiling/Warmup. Tests added for the parser, snapshot (de)serialization, and chart-series. 
Co-Authored-By: Claude Opus 4.8 --- .../agentic-point/agentic-point-detail.tsx | 959 ++++++++++-------- .../agentic-point/phase-slice.test.ts | 212 ++++ .../inference/agentic-point/phase-slice.ts | 188 ++++ .../agentic-point/request-timeline.test.ts | 75 +- .../agentic-point/request-timeline.tsx | 181 +++- .../agentic-point/time-series-chart.test.ts | 33 +- .../agentic-point/time-series-chart.tsx | 71 +- .../src/hooks/api/use-trace-server-metrics.ts | 5 + packages/db/package.json | 2 + .../db/src/backfill-agentic-server-logs.ts | 267 +++++ packages/db/src/backfill-kv-pool.ts | 137 +++ packages/db/src/etl/benchmark-ingest.ts | 12 +- .../db/src/etl/compute-chart-series.test.ts | 43 + packages/db/src/etl/compute-chart-series.ts | 58 +- .../db/src/etl/server-log-metrics.test.ts | 43 + packages/db/src/etl/server-log-metrics.ts | 65 ++ packages/db/src/ingest-ci-run.ts | 9 +- .../src/queries/trace-server-metrics.test.ts | 1 + .../db/src/queries/trace-server-metrics.ts | 23 +- 19 files changed, 1924 insertions(+), 460 deletions(-) create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.test.ts create mode 100644 packages/app/src/components/inference/agentic-point/phase-slice.ts create mode 100644 packages/db/src/backfill-agentic-server-logs.ts create mode 100644 packages/db/src/backfill-kv-pool.ts create mode 100644 packages/db/src/etl/server-log-metrics.test.ts create mode 100644 packages/db/src/etl/server-log-metrics.ts diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 77d87997..c6697442 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -2,12 +2,11 @@ import Link from 'next/link'; import { usePathname, useRouter, useSearchParams } from 'next/navigation'; -import { useCallback, useState } from 
'react'; +import { useCallback, useMemo, useState } from 'react'; import { ArrowLeft } from 'lucide-react'; import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates'; import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-request-timeline'; -import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; import { useTraceServerMetrics, type MetricSource, @@ -29,6 +28,14 @@ import { track } from '@/lib/analytics'; import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart'; import { Distribution } from './distribution'; import { ExpandableChart } from './expandable-chart'; +import { + phaseBoundarySec, + sliceServerSeriesByPhase, + sliceTimelineByPhase, + timelineHasWarmup, + type ServerSeriesLike, + type StagePhase, +} from './phase-slice'; import { RequestTimelineView } from './request-timeline'; import { SiblingNav, chipLabel } from './sibling-nav'; import { @@ -57,6 +64,13 @@ interface Props { const fmtPct = (v: number | null | undefined): string => v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`; +/** Compact token count for chart labels: 306808 → "307K tok", 3.2e6 → "3.2M tok". */ +const fmtTokensCompact = (n: number): string => { + if (n >= 1e6) return `${(n / 1e6).toFixed(1)}M tok`; + if (n >= 1e3) return `${Math.round(n / 1e3)}K tok`; + return `${Math.round(n)} tok`; +}; + function MetaLine({ label, value }: { label: string; value: React.ReactNode }) { return (
@@ -155,6 +169,14 @@ const SEQUENCE_METRIC_OPTIONS: SegmentedToggleOption[] = [ { value: 'inflight', label: 'In-flight avg' }, ]; +// Warmup vs profiling stage selector. Drives the server-metric charts AND the +// request-derived charts (ISL/OSL, latency-over-time, in-flight). Only shown +// when the point actually has a warmup phase. +const STAGE_PHASE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'profiling', label: 'Profiling', testId: 'stage-phase-profiling' }, + { value: 'warmup', label: 'Warmup', testId: 'stage-phase-warmup' }, +]; + const SOURCE_ROLE_LABEL: Record = { router: 'Router', prefill: 'Prefill', @@ -285,21 +307,25 @@ function RequestMetricOverTime({ function SequenceMetricCard({ metric, - values, timeline, - histogramLoading, timelineLoading, }: { metric: 'isl' | 'osl'; - values: readonly number[] | undefined; + /** Phase-scoped timeline — distribution values + in-flight are both derived from it. */ timeline: RequestTimeline | null | undefined; - histogramLoading: boolean; timelineLoading: boolean; }) { const [view, setView] = useState('distribution'); const acronym = metric.toUpperCase(); const fullName = metric === 'isl' ? 'Input sequence length' : 'Output sequence length'; const testPrefix = `${metric}-metric`; + // Per-request ISL/OSL for the selected phase (request_timeline carries both, + // so the distribution honours the warmup/profiling toggle for free). + const values = timeline + ? timeline.requests + .map((r) => r[metric]) + .filter((v): v is number => typeof v === 'number' && Number.isFinite(v)) + : undefined; return ( { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (view === 'distribution') { - if (values) return ; - return histogramLoading ? : ; + if (values && values.length > 0) + return ; + return timelineLoading ? : ; } if (!timeline) return timelineLoading ? 
: ; const raw = averageSequenceLengthInFlight(timeline.requests, metric); @@ -376,11 +403,9 @@ export function AgenticPointDetail({ id }: Props) { const router = useRouter(); const pathname = usePathname(); const searchParams = useSearchParams(); - const histQuery = useTraceHistograms([id], true); const metricsQuery = useTraceServerMetrics(id, true); const siblingsQuery = useBenchmarkSiblings(id); - const hist = histQuery.data?.[id]; const metrics = metricsQuery.data; const siblingsData = siblingsQuery.data; @@ -407,25 +432,73 @@ export function AgenticPointDetail({ id }: Props) { // shows how the metric varies across the SKU. const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? []; const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates'); - // Per-request timeline used by both the timeline view AND the per-point - // "Unique input tokens in flight" chart, so fetch whenever we're on - // either view. + // Per-request timeline used by the timeline view AND every per-point + // request-derived chart (ISL/OSL, latency-over-time, in-flight), so fetch + // whenever we're on either view. const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point'); + const timeline = timelineQuery.data; + + // Warmup vs profiling stage. Only meaningful when the point actually has a + // warmup phase (older runs are profiling-only) — when absent the toggle is + // hidden and everything falls back to the full (profiling) run. + const [phase, setPhase] = useState('profiling'); + const hasWarmup = useMemo(() => timelineHasWarmup(timeline), [timeline]); + const effectivePhase: StagePhase = hasWarmup ? phase : 'profiling'; + + // Server-metric boundary on the chart's own t-axis (rebased through absolute + // ns — see phase-slice header for the origin-gap invariant). Request charts + // get a phase-scoped timeline (filtered + rebased) so they share a 0-based + // axis with the server charts for the selected phase. 
+ const boundarySec = useMemo(() => phaseBoundarySec(metrics, timeline), [metrics, timeline]); + const phaseTimeline = useMemo( + () => (timeline ? sliceTimelineByPhase(timeline, effectivePhase) : null), + [timeline, effectivePhase], + ); + const metricSources = metrics?.metricSources ?? []; const selectedMetricSource = metricSources.find(({ source }) => source.id === metricSourceId); - const serverSeries = selectedMetricSource - ? { - kvCacheUsage: selectedMetricSource.kvCacheUsage, - prefixCacheHitRate: selectedMetricSource.prefixCacheHitRate, - queueDepth: selectedMetricSource.queueDepth, - promptTokensBySource: selectedMetricSource.promptTokensBySource, - prefillTps: selectedMetricSource.promptTps, - decodeTps: selectedMetricSource.generationTps, - prefixCacheHitsTps: selectedMetricSource.prefixCacheHitsTps, - hostKvCacheUsage: selectedMetricSource.hostKvCacheUsage, - kvCacheUsageByEngine: selectedMetricSource.kvCacheUsageByEngine, - } - : metrics; + const baseServerSeries: ServerSeriesLike | undefined = useMemo(() => { + const src = metrics?.metricSources?.find((m) => m.source.id === metricSourceId); + if (src) { + return { + kvCacheUsage: src.kvCacheUsage, + prefixCacheHitRate: src.prefixCacheHitRate, + queueDepth: src.queueDepth, + promptTokensBySource: src.promptTokensBySource, + prefillTps: src.promptTps, + decodeTps: src.generationTps, + prefixCacheHitsTps: src.prefixCacheHitsTps, + hostKvCacheUsage: src.hostKvCacheUsage, + kvCacheUsageByEngine: src.kvCacheUsageByEngine, + }; + } + return metrics ?? undefined; + }, [metrics, metricSourceId]); + // Phase-sliced server series (+ matching durationS) consumed by every server + // chart. Null only when there are no server metrics at all. Each chart reads + // `sliced.series` (locally aliased to `serverSeries`) and `sliced.durationS`. + const sliced = useMemo( + () => + baseServerSeries + ? sliceServerSeriesByPhase( + baseServerSeries, + effectivePhase, + boundarySec, + metrics?.durationS ?? 
0, + ) + : null, + [baseServerSeries, effectivePhase, boundarySec, metrics?.durationS], + ); + // Some runs only scrape server metrics during profiling — `chart_series` + // starts at the profiling boundary, so the warmup slice collapses to ~0–1 + // points (just the t=0 origin) even though request-level warmup data exists. + // Require ≥2 points in some series to count as real warmup coverage; otherwise + // show an explanatory note instead of six silently-blank charts. + const slicedHasServerData = + (sliced?.series.kvCacheUsage.length ?? 0) > 1 || + (sliced?.series.queueDepth.length ?? 0) > 1 || + (sliced?.series.prefillTps.length ?? 0) > 1 || + (sliced?.series.prefixCacheHitRate.length ?? 0) > 1; return (
@@ -489,45 +562,67 @@ export function AgenticPointDetail({ id }: Props) { )}
- {view === 'point' && metricSources.length > 1 && ( + {view === 'point' && (metricSources.length > 1 || hasWarmup) && (
- Server metrics - { + setMetricSourceId(value); + const source = metricSources.find((entry) => entry.source.id === value)?.source; + track('inference_agentic_metric_source_changed', { + source: value, + role: source?.role ?? 'all', + adapter: source?.adapter ?? metrics?.meta.framework ?? 'unknown', + }); + }} + > + - {metricSourceLabel(source)} - - ))} - - + + + + All endpoints + {metricSources.map(({ source }) => ( + + {metricSourceLabel(source)} + + ))} + + +
+ ) : null}
)} @@ -546,6 +641,7 @@ export function AgenticPointDetail({ id }: Props) { ) : (
@@ -554,385 +650,410 @@ export function AgenticPointDetail({ id }: Props) {
) ) : ( -
- - + <> + {effectivePhase === 'warmup' && ( +

+ Showing the warmup phase — a + cache-warming pass whose outputs are capped at 1 token. Warmup OSL ≈ 1, and + interactivity/decode are blank (single-token outputs have no inter-token latency). + {!slicedHasServerData && + ' Warmup server-side metrics aren’t available for this point, so the server charts below are empty — the request-level charts above still reflect warmup.'} +

+ )} +
+ + - + - + - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics || !serverSeries) return ; - // For SGLang hicache rows we have both GPU (HBM) util and - // host (CPU offload pool) util — overlay them as two lines. - const hasHost = serverSeries.hostKvCacheUsage.length > 0; - // DEP runs report one series per engine. When there's more - // than one, draw one line per rank in distinct colors so - // load skew is visible at a glance; cluster-average sits on - // top in white so it stands out. - const perEngine = serverSeries.kvCacheUsageByEngine ?? []; - const hasPerEngine = perEngine.length > 1; - // Render order matters: per-engine first → average drawn on top. - const series = [ - ...(hasPerEngine - ? perEngine.map((e, i) => ({ - name: `DP ${e.engineLabel}`, - data: rollingAverage(e.points, 50), - color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!, - // Thin + translucent so the Avg line on top reads as - // the headline number, not just one more series. - strokeWidth: 1, - strokeOpacity: 0.5, - })) - : []), - { - name: hasHost - ? 'GPU HBM (avg n=50)' - : hasPerEngine - ? 'Avg' - : 'GPU KV cache (avg n=50)', - data: rollingAverage(serverSeries.kvCacheUsage, 50), - // Skip raw scatter when per-engine overlay is on — the - // DP-rank lines already convey the spread, dots would be noise. - rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage, - // Bold red Avg sits on top of the translucent per-DP lines. - // DP 1 in the palette is #ef4444 (lighter red); the darker - // #dc2626 here plus the heavier stroke keeps it distinct. - color: hasPerEngine ? '#dc2626' : '#3b82f6', - strokeWidth: hasPerEngine ? 3.5 : 2, - }, - ...(hasHost - ? 
[ - { - name: 'CPU offload pool (avg n=50)', - data: rollingAverage(serverSeries.hostKvCacheUsage, 50), - rawData: serverSeries.hostKvCacheUsage, - color: '#f97316', - strokeWidth: 2, - }, - ] - : []), - ]; - return ( - `${(v * 100).toFixed(0)}%`} - yAxisLabel="KV cache (%)" - {...size} - /> - ); - }} - /> + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics || !sliced) return ; + const serverSeries = sliced.series; + // For SGLang hicache rows we have both GPU (HBM) util and + // host (CPU offload pool) util — overlay them as two lines. + const hasHost = serverSeries.hostKvCacheUsage.length > 0; + // DEP runs report one series per engine. When there's more + // than one, draw one line per rank in distinct colors so + // load skew is visible at a glance; cluster-average sits on + // top in white so it stands out. + const perEngine = serverSeries.kvCacheUsageByEngine ?? []; + const hasPerEngine = perEngine.length > 1; + // Render order matters: per-engine first → average drawn on top. + const series = [ + ...(hasPerEngine + ? perEngine.map((e, i) => ({ + name: `DP ${e.engineLabel}`, + data: rollingAverage(e.points, 50), + color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!, + // Thin + translucent so the Avg line on top reads as + // the headline number, not just one more series. + strokeWidth: 1, + strokeOpacity: 0.5, + })) + : []), + { + name: hasHost + ? 'GPU HBM (avg n=50)' + : hasPerEngine + ? 'Avg' + : 'GPU KV cache (avg n=50)', + data: rollingAverage(serverSeries.kvCacheUsage, 50), + // Skip raw scatter when per-engine overlay is on — the + // DP-rank lines already convey the spread, dots would be noise. + rawData: hasPerEngine ? undefined : serverSeries.kvCacheUsage, + // Bold red Avg sits on top of the translucent per-DP lines. + // DP 1 in the palette is #ef4444 (lighter red); the darker + // #dc2626 here plus the heavier stroke keeps it distinct. + color: hasPerEngine ? 
'#dc2626' : '#3b82f6', + strokeWidth: hasPerEngine ? 3.5 : 2, + }, + ...(hasHost + ? [ + { + name: 'CPU offload pool (avg n=50)', + data: rollingAverage(serverSeries.hostKvCacheUsage, 50), + rawData: serverSeries.hostKvCacheUsage, + color: '#f97316', + strokeWidth: 2, + }, + ] + : []), + ]; + return ( + `${(v * 100).toFixed(0)}%`} + yAxisLabel="KV cache (%)" + {...size} + /> + ); + }} + /> - { - setRequestActivityView(value); - track('inference_agentic_request_activity_changed', { view: value }); - }} - ariaLabel="Request activity metric" - testId="request-activity-toggle" - buttonClassName="px-2 py-1 text-xs" - /> - } - render={(expanded) => { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (requestActivityView === 'completed') { - if (!timelineQuery.data) { - return timelineQuery.isLoading ? : ; + { + setRequestActivityView(value); + track('inference_agentic_request_activity_changed', { view: value }); + }} + ariaLabel="Request activity metric" + testId="request-activity-toggle" + buttonClassName="px-2 py-1 text-xs" + /> + } + render={(expanded) => { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (requestActivityView === 'completed') { + if (!phaseTimeline) { + return timelineQuery.isLoading ? 
: ; + } + return ( + + ); } + if (!metrics || !sliced) return ; + const serverSeries = sliced.series; return ( ({ + t: p.t, + value: p.running, + })), + 50, + ), + color: '#22c55e', + strokeWidth: 2, + }, + { + name: 'Waiting (avg n=50)', + data: rollingAverage( + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ + t: p.t, + value: p.waiting, + })), + 50, + ), + color: '#ef4444', + strokeWidth: 2, + }, + { + name: 'Total (avg n=50)', + data: rollingAverage( + serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ + t: p.t, + value: p.total, + })), + 50, + ), color: '#3b82f6', - strokeWidth: 2.5, + strokeWidth: 2, }, ]} - durationS={timelineQuery.data.durationS} + durationS={sliced.durationS} yAxisLabel="Requests" {...size} /> ); - } - if (!metrics || !serverSeries) return ; - return ( - ({ - t: p.t, - value: p.running, - })), - 50, - ), - color: '#22c55e', - strokeWidth: 2, - }, - { - name: 'Waiting (avg n=50)', - data: rollingAverage( - serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ - t: p.t, - value: p.waiting, - })), - 50, - ), - color: '#ef4444', - strokeWidth: 2, - }, - { - name: 'Total (avg n=50)', - data: rollingAverage( - serverSeries.queueDepth.map((p: QueueDepthPoint) => ({ - t: p.t, - value: p.total, - })), - 50, - ), - color: '#3b82f6', - strokeWidth: 2, - }, - ]} - durationS={metrics.durationS} - yAxisLabel="Requests" - {...size} - /> - ); - }} - /> + }} + /> - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics || !serverSeries) return ; - return ( - `${(v * 100).toFixed(0)}%`} - yAxisLabel="Hit rate (%)" - {...size} - /> - ); - }} - /> + { + const size = expanded ? 
CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics || !sliced) return ; + const serverSeries = sliced.series; + return ( + `${(v * 100).toFixed(0)}%`} + yAxisLabel="Hit rate (%)" + {...size} + /> + ); + }} + /> - - {( - [ - ['input', 'Input'], - ['decode', 'Decode'], - ] as const - ).map(([key, label]) => { - const active = throughputSeries.has(key); - const isOnlyActive = active && throughputSeries.size === 1; - return ( - - ); - })} -
- } - render={(expanded) => { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics || !serverSeries) return ; - return ( - - ); - }} - /> + + {( + [ + ['input', 'Input'], + ['decode', 'Decode'], + ] as const + ).map(([key, label]) => { + const active = throughputSeries.has(key); + const isOnlyActive = active && throughputSeries.size === 1; + return ( + + ); + })} +
+ } + render={(expanded) => { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics || !sliced) return ; + const serverSeries = sliced.series; + return ( + + ); + }} + /> - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics || !serverSeries) return ; - return ( - - ); - }} - /> + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics || !sliced) return ; + const serverSeries = sliced.series; + return ( + + ); + }} + /> - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!metrics || !serverSeries) return ; - // Unique = total prompt tokens received minus tokens served from - // any cache tier — i.e. the freshly prefill-computed tokens. Prefer - // the promptTokensBySource breakdown (its buckets sum to the real - // prompt-token total, so subtracting cache tiers is exact). Fall - // back to cumsum(prefillTps - prefixCacheHitsTps) only for older - // data without the breakdown: vllm:prefix_cache_hits re-counts - // tokens across scheduler passes, so its cumulative can exceed the - // prompt tokens received, driving the diff negative and freezing - // the monotonic-clamped line after a few seconds. - const uniqueFromBreakdown = cumulativeUniqueInputTokens( - serverSeries.promptTokensBySource, - ); - const uniqueData = - uniqueFromBreakdown.length > 0 - ? uniqueFromBreakdown - : cumulativeDifferenceMonotonic( - serverSeries.prefillTps, - serverSeries.prefixCacheHitsTps, - ); - return ( - - ); - }} - /> + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics || !sliced) return ; + const serverSeries = sliced.series; + // Unique = total prompt tokens received minus tokens served from + // any cache tier — i.e. the freshly prefill-computed tokens. Prefer + // the promptTokensBySource breakdown (its buckets sum to the real + // prompt-token total, so subtracting cache tiers is exact). 
Fall + // back to cumsum(prefillTps - prefixCacheHitsTps) only for older + // data without the breakdown: vllm:prefix_cache_hits re-counts + // tokens across scheduler passes, so its cumulative can exceed the + // prompt tokens received, driving the diff negative and freezing + // the monotonic-clamped line after a few seconds. + const uniqueFromBreakdown = cumulativeUniqueInputTokens( + serverSeries.promptTokensBySource, + ); + const uniqueData = + uniqueFromBreakdown.length > 0 + ? uniqueFromBreakdown + : cumulativeDifferenceMonotonic( + serverSeries.prefillTps, + serverSeries.prefixCacheHitsTps, + ); + return ( + + ); + }} + /> - { - const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; - if (!timelineQuery.data) { - return timelineQuery.isLoading ? : ; - } - // Step function: at each request start/end, sum the ISLs of - // currently-active requests across distinct cids. Within one - // cid turns are sequential so each cid contributes at most - // one in-flight ISL; across cids we treat content as - // independent (cross-conv prefix sharing adds <1pp in - // practice). Smooth with a 30s time-weighted rolling average - // so brief turn-handoff dips don't dominate the chart. - const raw = inflightUniqueTokens(timelineQuery.data.requests); - const smoothed = timeRollingAverage(raw, 30); - return ( - - ); - }} - /> -
+ { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!phaseTimeline) { + return timelineQuery.isLoading ? : ; + } + // Step function: at each request start/end, sum the ISLs of + // currently-active requests across distinct cids. Within one + // cid turns are sequential so each cid contributes at most + // one in-flight ISL; across cids we treat content as + // independent (cross-conv prefix sharing adds <1pp in + // practice). Smooth with a 30s time-weighted rolling average + // so brief turn-handoff dips don't dominate the chart. + const raw = inflightUniqueTokens(phaseTimeline.requests); + const smoothed = timeRollingAverage(raw, 30); + // KV-cache pool size (vLLM only) drawn as a constant ceiling so + // you can see how close the working set gets to eviction + // pressure. Phase-independent — it's a static config value. + const pool = metrics?.kvCachePoolTokens ?? null; + return ( + 0 + ? [{ value: pool, label: `KV cache pool · ${fmtTokensCompact(pool)}` }] + : undefined + } + {...size} + /> + ); + }} + /> +
+ )}
); diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.test.ts b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts new file mode 100644 index 00000000..ef6cdaab --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/phase-slice.test.ts @@ -0,0 +1,212 @@ +import { describe, expect, it } from 'vitest'; + +import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline'; +import { + phaseBoundaryNs, + phaseBoundarySec, + requestsForPhase, + sliceServerSeriesByPhase, + sliceTimelineByPhase, + timelineHasWarmup, + type ServerSeriesLike, +} from './phase-slice'; + +function req(overrides: Partial): RequestRecord { + return { + cid: 'c', + ti: 0, + wid: 'w', + ad: 0, + phase: 'profiling', + credit: 0, + start: 0, + ack: null, + end: 1, + ttftMs: null, + tpotMs: null, + isl: null, + osl: null, + cancelled: false, + ...overrides, + }; +} + +function timeline(requests: RequestRecord[], startNs = 1_000): RequestTimeline { + return { version: 3, startNs, endNs: startNs + 1, durationS: 1, requests }; +} + +function makeSeries(ts: number[]): ServerSeriesLike { + const pts = ts.map((t) => ({ t, value: t * 10 })); + return { + kvCacheUsage: pts, + prefixCacheHitRate: pts, + queueDepth: ts.map((t) => ({ t, running: t, waiting: t + 1, total: 2 * t + 1 })), + promptTokensBySource: { src: pts }, + prefillTps: pts, + decodeTps: pts, + prefixCacheHitsTps: pts, + hostKvCacheUsage: pts, + kvCacheUsageByEngine: [{ engineLabel: 'e0', points: pts }], + }; +} + +describe('phaseBoundaryNs', () => { + it('returns null when there are no profiling requests', () => { + expect(phaseBoundaryNs(timeline([req({ phase: 'warmup', start: 5 })]))).toBeNull(); + }); + + it('returns null when there are no warmup requests', () => { + expect(phaseBoundaryNs(timeline([req({ phase: 'profiling', start: 5 })]))).toBeNull(); + }); + + it('returns startNs + earliest profiling start when both phases present', () => { + const t = 
timeline( + [ + req({ phase: 'warmup', start: 0 }), + req({ phase: 'profiling', start: 900 }), + req({ phase: 'profiling', start: 700 }), + ], + 1_000, + ); + expect(phaseBoundaryNs(t)).toBe(1_700); + }); + + it('returns null for nullish timeline', () => { + expect(phaseBoundaryNs(null)).toBeNull(); + expect(phaseBoundaryNs(undefined)).toBeNull(); + }); +}); + +describe('phaseBoundarySec', () => { + it('rebases through absolute ns by subtracting serverMetrics.startNs (origin gap)', () => { + // timeline origin and server-metrics origin differ — the classic ~124s gap. + const tl = timeline( + [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 600 * 1e9 })], + 200 * 1e9, // timeline.startNs + ); + // boundaryNs = 200e9 + 600e9 = 800e9 ; serverMetrics origin = 124e9 earlier + const boundarySec = phaseBoundarySec({ startNs: 76 * 1e9 }, tl); + // (800e9 - 76e9)/1e9 = 724 + expect(boundarySec).toBe(724); + }); + + it('clamps a negative mapping to 0', () => { + const tl = timeline( + [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 0 })], + 0, + ); + expect(phaseBoundarySec({ startNs: 5 * 1e9 }, tl)).toBe(0); + }); + + it('returns null when serverMetrics missing or no split', () => { + const tl = timeline( + [req({ phase: 'warmup', start: 0 }), req({ phase: 'profiling', start: 1e9 })], + 0, + ); + expect(phaseBoundarySec(null, tl)).toBeNull(); + expect(phaseBoundarySec({ startNs: 0 }, timeline([req({ phase: 'profiling' })]))).toBeNull(); + }); +}); + +describe('timelineHasWarmup', () => { + it('detects warmup presence', () => { + expect(timelineHasWarmup(timeline([req({ phase: 'profiling' })]))).toBe(false); + expect(timelineHasWarmup(timeline([req({ phase: 'warmup' })]))).toBe(true); + expect(timelineHasWarmup(null)).toBe(false); + }); +}); + +describe('sliceServerSeriesByPhase', () => { + it('is an identity passthrough (full duration) when boundary is null', () => { + const s = makeSeries([0, 1, 2]); + const out = 
sliceServerSeriesByPhase(s, 'profiling', null, 99); + expect(out.series).toBe(s); + expect(out.durationS).toBe(99); + }); + + it('warmup keeps t < boundary, no rebase, durationS = boundary', () => { + const s = makeSeries([0, 1, 2, 3, 4]); + const out = sliceServerSeriesByPhase(s, 'warmup', 2, 5); + expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1]); // excludes t===2 + expect(out.durationS).toBe(2); + }); + + it('profiling keeps t >= boundary and rebases to start at 0', () => { + const s = makeSeries([0, 1, 2, 3, 4]); + const out = sliceServerSeriesByPhase(s, 'profiling', 2, 5); + expect(out.series.kvCacheUsage.map((p) => p.t)).toEqual([0, 1, 2]); // 2,3,4 -> 0,1,2 + expect(out.series.kvCacheUsage.map((p) => p.value)).toEqual([20, 30, 40]); // values preserved + expect(out.durationS).toBe(3); // 5 - 2 + }); + + it('slices queueDepth, promptTokensBySource, and kvCacheUsageByEngine; preserves queue fields', () => { + const s = makeSeries([0, 1, 2, 3]); + const out = sliceServerSeriesByPhase(s, 'profiling', 2, 4); + expect(out.series.queueDepth).toEqual([ + { t: 0, running: 2, waiting: 3, total: 5 }, + { t: 1, running: 3, waiting: 4, total: 7 }, + ]); + expect(out.series.promptTokensBySource.src.map((p) => p.t)).toEqual([0, 1]); + expect(out.series.kvCacheUsageByEngine[0]!.points.map((p) => p.t)).toEqual([0, 1]); + expect(out.series.kvCacheUsageByEngine[0]!.engineLabel).toBe('e0'); + }); + + it('does not mutate the input series', () => { + const s = makeSeries([0, 1, 2]); + const before = s.kvCacheUsage.map((p) => p.t); + sliceServerSeriesByPhase(s, 'profiling', 1, 3); + expect(s.kvCacheUsage.map((p) => p.t)).toEqual(before); + }); +}); + +describe('requestsForPhase', () => { + const rs = [ + req({ phase: 'warmup', isl: 1 }), + req({ phase: 'profiling', isl: 2 }), + req({ phase: 'unknown', isl: 3 }), + ]; + + it('profiling selects only profiling rows', () => { + expect(requestsForPhase(rs, 'profiling').map((r) => r.isl)).toEqual([2]); + }); + + 
it('warmup selects everything that is not profiling', () => { + expect(requestsForPhase(rs, 'warmup').map((r) => r.isl)).toEqual([1, 3]); + }); +}); + +describe('sliceTimelineByPhase', () => { + // startNs origin = 1000; warmup request at offset 0..50, profiling at 100..300. + const tl = timeline( + [ + req({ phase: 'warmup', credit: 0, start: 0, ack: 10, end: 50, isl: 1 }), + req({ phase: 'profiling', credit: 90, start: 100, ack: 120, end: 300, isl: 2 }), + ], + 1_000, + ); + // tl.durationS default = 1 from helper; override for window math. + const tlDur: RequestTimeline = { ...tl, durationS: 3 }; + + it('returns the input unchanged for a single-phase timeline', () => { + const single = timeline([req({ phase: 'profiling', start: 5 })]); + expect(sliceTimelineByPhase(single, 'profiling')).toBe(single); + }); + + it('warmup keeps pre-boundary requests, no rebase, startNs unchanged', () => { + const out = sliceTimelineByPhase(tlDur, 'warmup'); + expect(out.requests.map((r) => r.isl)).toEqual([1]); + expect(out.requests[0]!.start).toBe(0); // not rebased + expect(out.startNs).toBe(1_000); + }); + + it('profiling keeps post-boundary requests and rebases offsets + startNs', () => { + const out = sliceTimelineByPhase(tlDur, 'profiling'); + expect(out.requests.map((r) => r.isl)).toEqual([2]); + // boundary offset = 100 → rebased: start 100→0, end 300→200, ack 120→20, credit 90→-10 + expect(out.requests[0]!.start).toBe(0); + expect(out.requests[0]!.end).toBe(200); + expect(out.requests[0]!.ack).toBe(20); + // startNs shifts forward by the boundary offset so absolute time is preserved + expect(out.startNs).toBe(1_100); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/phase-slice.ts b/packages/app/src/components/inference/agentic-point/phase-slice.ts new file mode 100644 index 00000000..e6e17719 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/phase-slice.ts @@ -0,0 +1,188 @@ +/** + * Warmup vs profiling phase slicing for 
the agentic per-point detail page. + * + * Agentic trace-replay runs have two phases: a warmup (cache-warming) pass, then + * the measured profiling window. The server-metric time-series (`chart_series`) + * spans the whole run with no per-point phase label, but the per-request + * `request_timeline` IS phase-tagged. We derive the warmup→profiling boundary + * from the timeline and slice the server series at it. + * + * ⚠️ ORIGIN-GAP INVARIANT: the two payloads share the aiperf clock but have + * DIFFERENT zero origins — `serverMetrics.startNs` is the first server scrape, + * `timeline.startNs` is the first request's credit (observed ~124 s apart in + * real runs). The boundary must therefore be rebased through absolute ns by + * subtracting `serverMetrics.startNs`; a same-axis offset comparison would be + * off by the origin gap. This rebasing lives in `phaseBoundarySec` only. + */ + +import type { RequestRecord, RequestTimeline } from '@/hooks/api/use-request-timeline'; +import type { + QueueDepthPoint, + TimeSeriesPoint, + TraceServerMetrics, +} from '@/hooks/api/use-trace-server-metrics'; + +export type StagePhase = 'warmup' | 'profiling'; + +/** + * The subset of server-metric series the per-point charts render. Both the + * top-level `TraceServerMetrics` and a per-source object (after the detail page + * remaps `promptTps`→`prefillTps`, `generationTps`→`decodeTps`) are assignable. + */ +export interface ServerSeriesLike { + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + prefillTps: TimeSeriesPoint[]; + decodeTps: TimeSeriesPoint[]; + prefixCacheHitsTps: TimeSeriesPoint[]; + hostKvCacheUsage: TimeSeriesPoint[]; + kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; +} + +/** True when the timeline contains at least one non-profiling (warmup) request. 
*/ +export function timelineHasWarmup(timeline: RequestTimeline | null | undefined): boolean { + return Boolean(timeline?.requests.some((r) => r.phase !== 'profiling')); +} + +/** + * Absolute-ns wall-clock instant where the profiling phase begins + * = `timeline.startNs + earliest profiling request's start offset`. + * Returns null unless BOTH a warmup and a profiling request exist (nothing to + * split otherwise). + */ +export function phaseBoundaryNs(timeline: RequestTimeline | null | undefined): number | null { + if (!timeline) return null; + let hasWarmup = false; + let minProfilingStart: number | null = null; + for (const r of timeline.requests) { + if (r.phase === 'profiling') { + if (minProfilingStart === null || r.start < minProfilingStart) minProfilingStart = r.start; + } else { + hasWarmup = true; + } + } + if (!hasWarmup || minProfilingStart === null) return null; + return timeline.startNs + minProfilingStart; +} + +/** + * The profiling-start boundary expressed on the SERVER-METRIC chart's own t-axis + * (seconds from `serverMetrics.startNs`). See the origin-gap invariant at the top + * of the file — the `- serverMetrics.startNs` subtraction is mandatory. + * + * Returns null when there's no warmup/profiling split, or `serverMetrics` is + * absent (→ callers fall back to the full-run series). 
+ */ +export function phaseBoundarySec( + serverMetrics: Pick | null | undefined, + timeline: RequestTimeline | null | undefined, +): number | null { + if (!serverMetrics) return null; + const boundaryNs = phaseBoundaryNs(timeline); + if (boundaryNs === null) return null; + return Math.max(0, (boundaryNs - serverMetrics.startNs) / 1e9); +} + +export interface PhaseSlicedSeries { + series: S; + durationS: number; +} + +/** + * Slice every server-metric series to one phase: + * - warmup: keep points with `t < boundary`, no rebase, `durationS = boundary` + * - profiling: keep points with `t >= boundary`, rebased so `t` starts at 0, + * `durationS = full - boundary` + * + * A point exactly at `t === boundary` belongs to profiling. Null boundary + * (single-phase point, or no server metrics) → identity passthrough with the + * full `durationS`. Pure — returns new objects, never mutates the input. + * + * NOTE: rebasing the profiling slice to start at 0 makes the cumulative charts + * (prompt-token source, unique-input-tokens) read as "since profiling start" + * rather than "since run start" — intended. + */ +export function sliceServerSeriesByPhase( + series: S, + phase: StagePhase, + boundarySec: number | null, + fullDurationS: number, +): PhaseSlicedSeries { + if (boundarySec === null) return { series, durationS: fullDurationS }; + const b = boundarySec; + const keep = phase === 'warmup' ? (t: number) => t < b : (t: number) => t >= b; + const rebase = phase === 'profiling' ? 
(t: number) => t - b : (t: number) => t; + + const sliceTs = (pts: TimeSeriesPoint[]): TimeSeriesPoint[] => + pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) })); + const sliceQd = (pts: QueueDepthPoint[]): QueueDepthPoint[] => + pts.filter((p) => keep(p.t)).map((p) => ({ ...p, t: rebase(p.t) })); + const sliceRecord = ( + rec: Record, + ): Record => { + const out: Record = {}; + for (const [k, v] of Object.entries(rec)) out[k] = sliceTs(v); + return out; + }; + + const slicedFields: ServerSeriesLike = { + kvCacheUsage: sliceTs(series.kvCacheUsage), + prefixCacheHitRate: sliceTs(series.prefixCacheHitRate), + queueDepth: sliceQd(series.queueDepth), + promptTokensBySource: sliceRecord(series.promptTokensBySource), + prefillTps: sliceTs(series.prefillTps), + decodeTps: sliceTs(series.decodeTps), + prefixCacheHitsTps: sliceTs(series.prefixCacheHitsTps), + hostKvCacheUsage: sliceTs(series.hostKvCacheUsage), + kvCacheUsageByEngine: series.kvCacheUsageByEngine.map((e) => ({ + engineLabel: e.engineLabel, + points: sliceTs(e.points), + })), + }; + + const durationS = phase === 'warmup' ? b : Math.max(1, fullDurationS - b); + return { series: { ...series, ...slicedFields } as S, durationS }; +} + +/** Filter request-timeline records to one phase (warmup = anything not profiling). */ +export function requestsForPhase(requests: RequestRecord[], phase: StagePhase): RequestRecord[] { + return phase === 'warmup' + ? requests.filter((r) => r.phase !== 'profiling') + : requests.filter((r) => r.phase === 'profiling'); +} + +/** + * Scope a whole request timeline to one phase: keep only that phase's requests + * and, for profiling, rebase every ns offset (and `startNs`) so the phase starts + * at t=0 — mirroring `sliceServerSeriesByPhase` so the request-derived charts and + * the server charts share a 0-based axis for the same phase. `durationS` becomes + * the phase window. Returns the input unchanged when there's no warmup/profiling + * split (single-phase point). 
Pure — new object, original untouched. + * + * The boundary here is on the REQUEST clock (offset from `timeline.startNs`), so + * we use `phaseBoundaryNs` minus `timeline.startNs` rather than the server-axis + * `phaseBoundarySec` (different origin — see the file header). + */ +export function sliceTimelineByPhase( + timeline: RequestTimeline, + phase: StagePhase, +): RequestTimeline { + const boundaryNs = phaseBoundaryNs(timeline); + if (boundaryNs === null) return timeline; + const boundaryOff = boundaryNs - timeline.startNs; // ns offset on the request clock + const inPhase = (r: RequestRecord) => + phase === 'warmup' ? r.start < boundaryOff : r.start >= boundaryOff; + const shift = phase === 'profiling' ? boundaryOff : 0; + const requests = timeline.requests.filter(inPhase).map((r) => ({ + ...r, + credit: r.credit - shift, + start: r.start - shift, + ack: r.ack === null ? null : r.ack - shift, + end: r.end - shift, + })); + const durationS = + phase === 'warmup' ? boundaryOff / 1e9 : Math.max(1, timeline.durationS - boundaryOff / 1e9); + return { ...timeline, startNs: timeline.startNs + shift, requests, durationS }; +} diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts index d15da878..fe3c1231 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -2,7 +2,13 @@ import { describe, expect, it } from 'vitest'; import type { RequestRecord } from '@/hooks/api/use-request-timeline'; -import { buildRequestTimelineRows, requestIdleStats, splitTimelineCid } from './request-timeline'; +import { + buildRequestTimelineRows, + parseTimelineViewSnapshot, + requestIdleStats, + splitTimelineCid, + type TimelineViewSnapshot, +} from './request-timeline'; const request = (start: number, end: number): RequestRecord => ({ cid: 'conversation', @@ 
-99,3 +105,70 @@ describe('subagent timeline hierarchy', () => { expect(rows[2]!.requests).toHaveLength(1); }); }); + +describe('parseTimelineViewSnapshot', () => { + const full: TimelineViewSnapshot = { + viewStart: 1_000, + viewEnd: 5_000, + rowMode: 'worker', + phaseFilter: 'warmup', + expanded: ['conv::sa:subagent_001_abcd'], + scrollTop: 240, + scrollLeft: 80, + }; + + it('round-trips a full snapshot', () => { + expect(parseTimelineViewSnapshot(JSON.stringify(full))).toEqual(full); + }); + + it('round-trips the profiling phase and rejects the removed "all" value', () => { + expect( + parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'profiling' }))?.phaseFilter, + ).toBe('profiling'); + // 'all' is no longer a valid phase — coerces back to the profiling default. + expect( + parseTimelineViewSnapshot(JSON.stringify({ ...full, phaseFilter: 'all' }))?.phaseFilter, + ).toBe('profiling'); + }); + + it('returns null for absent or unparseable input', () => { + expect(parseTimelineViewSnapshot(null)).toBeNull(); + expect(parseTimelineViewSnapshot('')).toBeNull(); + expect(parseTimelineViewSnapshot('{not json')).toBeNull(); + expect(parseTimelineViewSnapshot('42')).toBeNull(); + }); + + it('preserves a null viewEnd (not zoomed) and rejects non-finite viewEnd', () => { + const restored = parseTimelineViewSnapshot(JSON.stringify({ ...full, viewEnd: null })); + expect(restored?.viewEnd).toBeNull(); + // NaN / Infinity don't survive JSON, but a malformed string value must coerce to null. 
+ expect(parseTimelineViewSnapshot('{"viewEnd":"oops"}')?.viewEnd).toBeNull(); + }); + + it('falls back to defaults for invalid enums and missing numbers', () => { + expect(parseTimelineViewSnapshot('{}')).toEqual({ + viewStart: 0, + viewEnd: null, + rowMode: 'conversation', + phaseFilter: 'profiling', + expanded: [], + scrollTop: 0, + scrollLeft: 0, + }); + const bogus = parseTimelineViewSnapshot( + JSON.stringify({ rowMode: 'nope', phaseFilter: 'nope', viewStart: 'x', scrollTop: null }), + )!; + expect(bogus.rowMode).toBe('conversation'); + expect(bogus.phaseFilter).toBe('profiling'); + expect(bogus.viewStart).toBe(0); + expect(bogus.scrollTop).toBe(0); + }); + + it('drops non-string entries from the expanded list', () => { + expect(parseTimelineViewSnapshot('{"expanded":["a",1,null,"b"]}')!.expanded).toEqual([ + 'a', + 'b', + ]); + expect(parseTimelineViewSnapshot('{"expanded":"nope"}')!.expanded).toEqual([]); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index bdf0a9b9..f3870bb1 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -1,12 +1,14 @@ 'use client'; -import { useCallback, useMemo, useRef, useState } from 'react'; +import { useCallback, useLayoutEffect, useMemo, useRef, useState } from 'react'; import { useRouter } from 'next/navigation'; import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline'; import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; import { track } from '@/lib/analytics'; +import { requestsForPhase, type StagePhase } from './phase-slice'; + /** * The dataset conversation id for a request: the cid with any subagent/forked * suffix (`::sa:…`, `::fa:…`) stripped. 
This is exactly the `conv_id` stored in @@ -79,13 +81,110 @@ const ROW_MODE_OPTIONS: SegmentedToggleOption[] = [ { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' }, ]; -type PhaseFilter = 'all' | 'profiling'; +// Two phases shown separately (no combined view) — matches the per-point detail +// stage toggle. Reuses StagePhase so the filter predicate is shared. +type PhaseFilter = StagePhase; const PHASE_OPTIONS: SegmentedToggleOption[] = [ { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' }, - { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' }, + { value: 'warmup', label: 'Warmup', testId: 'timeline-phase-warmup' }, ]; +/** + * Persisted snapshot of the timeline's view state, used to restore the user's + * zoom / scroll / filter position when they return to the page (e.g. clicking a + * request to open the dataset flamegraph, then hitting the browser back button). + * Stored in sessionStorage keyed by point id; written on click-through and + * consumed once on the next mount. + */ +export interface TimelineViewSnapshot { + /** Zoom-pan window start (ns offset from dataStart). */ + viewStart: number; + /** Zoom-pan window end, or null when not zoomed (full extent). */ + viewEnd: number | null; + rowMode: RowMode; + phaseFilter: PhaseFilter; + /** Keys of expanded multi-stream subagent rows. */ + expanded: string[]; + /** Scroll container offsets (vertical row scroll + horizontal). */ + scrollTop: number; + scrollLeft: number; +} + +const TIMELINE_VIEW_SNAPSHOT_PREFIX = 'agentic-timeline-view:'; +const ROW_MODE_VALUES: readonly RowMode[] = ['conversation', 'worker']; +const PHASE_FILTER_VALUES: readonly PhaseFilter[] = ['warmup', 'profiling']; + +const finiteOr = (value: unknown, fallback: number): number => + typeof value === 'number' && Number.isFinite(value) ? 
value : fallback; + +/** + * Parse a persisted snapshot, coercing/validating each field and falling back + * to defaults so a malformed or stale blob can never break restore. Returns + * null only when the input is absent or not parseable JSON. + */ +export function parseTimelineViewSnapshot(raw: string | null): TimelineViewSnapshot | null { + if (!raw) return null; + let parsed: unknown; + try { + parsed = JSON.parse(raw); + } catch { + return null; + } + if (!parsed || typeof parsed !== 'object') return null; + const record = parsed as Record; + const rowMode = ROW_MODE_VALUES.includes(record.rowMode as RowMode) + ? (record.rowMode as RowMode) + : 'conversation'; + const phaseFilter = PHASE_FILTER_VALUES.includes(record.phaseFilter as PhaseFilter) + ? (record.phaseFilter as PhaseFilter) + : 'profiling'; + const viewEnd = + typeof record.viewEnd === 'number' && Number.isFinite(record.viewEnd) ? record.viewEnd : null; + const expanded = Array.isArray(record.expanded) + ? record.expanded.filter((entry): entry is string => typeof entry === 'string') + : []; + return { + viewStart: finiteOr(record.viewStart, 0), + viewEnd, + rowMode, + phaseFilter, + expanded, + scrollTop: finiteOr(record.scrollTop, 0), + scrollLeft: finiteOr(record.scrollLeft, 0), + }; +} + +function timelineSnapshotKey(pointId: number): string { + return `${TIMELINE_VIEW_SNAPSHOT_PREFIX}${pointId}`; +} + +function saveTimelineViewSnapshot(pointId: number, snapshot: TimelineViewSnapshot): void { + if (typeof window === 'undefined') return; + try { + window.sessionStorage.setItem(timelineSnapshotKey(pointId), JSON.stringify(snapshot)); + } catch { + // sessionStorage can throw (private mode / quota exceeded) — restore is + // best-effort, so a failed write just means no restore next time. + } +} + +/** + * Read AND remove the snapshot (one-shot): we only want to restore once per + * click-through, so a later reload of the same point starts from defaults. 
+ */ +function consumeTimelineViewSnapshot(pointId: number): TimelineViewSnapshot | null { + if (typeof window === 'undefined') return null; + try { + const key = timelineSnapshotKey(pointId); + const raw = window.sessionStorage.getItem(key); + window.sessionStorage.removeItem(key); + return parseTimelineViewSnapshot(raw); + } catch { + return null; + } +} + // The timeline body is capped at this height and scrolls internally, so a run // with many conversations/workers doesn't make the card grow unbounded and push // the rest of the detail page down. Sized to show ~16 rows + the header. @@ -497,19 +596,54 @@ function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) export function RequestTimelineView({ data, datasetSlug, + pointId, }: { data: RequestTimeline; /** Source dataset slug for this run; enables click-to-conversation deep links. */ datasetSlug?: string | null; + /** benchmark_results.id — keys the per-point view-state snapshot for restore. */ + pointId: number; }) { const router = useRouter(); const [rowMode, setRowMode] = useState('conversation'); const [phaseFilter, setPhaseFilter] = useState('profiling'); const [tooltip, setTooltip] = useState(null); + // The scroll container (vertical row scroll + horizontal chart scroll) and a + // ref mirror of the live view state, so click-through can snapshot the exact + // position without rebuilding openConversation on every zoom/pan tick. + const scrollRef = useRef(null); + const liveStateRef = useRef<{ + viewStart: number; + viewEnd: number | null; + rowMode: RowMode; + phaseFilter: PhaseFilter; + expandedSubagents: ReadonlySet; + }>({ + viewStart: 0, + viewEnd: null, + rowMode: 'conversation', + phaseFilter: 'profiling', + expandedSubagents: new Set(), + }); + const openConversation = useCallback( (req: RequestRecord) => { if (!datasetSlug) return; + // Snapshot the current zoom/scroll/filter position so the browser back + // button restores it (see the restore effect below). 
+ if (scrollRef.current) { + const live = liveStateRef.current; + saveTimelineViewSnapshot(pointId, { + viewStart: live.viewStart, + viewEnd: live.viewEnd, + rowMode: live.rowMode, + phaseFilter: live.phaseFilter, + expanded: [...live.expandedSubagents], + scrollTop: scrollRef.current.scrollTop, + scrollLeft: scrollRef.current.scrollLeft, + }); + } const convId = datasetConvId(req.cid); // Carry the turn (and, for subagent requests, the subagent id) so the // flamegraph can scroll to / highlight the exact node this bar maps to. @@ -521,7 +655,7 @@ export function RequestTimelineView({ `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`, ); }, - [datasetSlug, router], + [datasetSlug, router, pointId], ); // Which multi-stream subagents currently have their per-stream rows // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). @@ -549,10 +683,7 @@ export function RequestTimelineView({ // Apply phase filter, then group into rows. With no warmup data the filter // collapses to "profiling" regardless of the (hidden) toggle state. const filtered = useMemo( - () => - phaseFilter === 'all' && hasWarmup - ? data.requests - : data.requests.filter((r) => r.phase === 'profiling'), + () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'), [data.requests, phaseFilter, hasWarmup], ); const rows = useMemo( @@ -595,6 +726,34 @@ export function RequestTimelineView({ const visibleDur = Math.max(vEnd - vStart, 1); const isZoomed = viewEnd !== null; + // Mirror the live view state into a ref so the click-through snapshot reads + // the latest values without rebuilding openConversation on every zoom tick. + liveStateRef.current = { viewStart, viewEnd, rowMode, phaseFilter, expandedSubagents }; + + // Restore the snapshot written on click-through (e.g. open a request in the + // dataset flamegraph, then hit the browser back button). 
Runs once per mount, + // keyed by point id; the snapshot is consumed so a later reload starts fresh. + // Scroll is applied after the restored filters/expansions re-render the rows + // (rAF fires after that synchronous commit, before paint — no visible jump). + useLayoutEffect(() => { + const snapshot = consumeTimelineViewSnapshot(pointId); + if (!snapshot) return; + setRowMode(snapshot.rowMode); + setPhaseFilter(snapshot.phaseFilter); + setExpandedSubagents(new Set(snapshot.expanded)); + setViewStart(snapshot.viewStart); + setViewEnd(snapshot.viewEnd); + const target = { top: snapshot.scrollTop, left: snapshot.scrollLeft }; + requestAnimationFrame(() => { + const el = scrollRef.current; + if (!el) return; + el.scrollTop = target.top; + el.scrollLeft = target.left; + }); + // setState setters are stable; only re-run if the point itself changes. + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [pointId]); + // Layout // Wide enough for a full 36-char conversation id at 10px font, plus the // indent + color stripe + count badge. Subagent rows inherit the same @@ -778,7 +937,11 @@ export function RequestTimelineView({ horizontally inside it, so the card doesn't grow to fit every conversation/worker AND the horizontal scrollbar stays pinned to the window's bottom edge (rather than the bottom of the tall content). */} -
+
{/* Label column — pinned left (sticky) so it stays put during horizontal scroll, while scrolling vertically with the rows. */} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts index a9ece859..9f6adc6a 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts @@ -79,11 +79,11 @@ describe('rollingRequestMetric', () => { expect(result.cumulative.at(-1)?.value).toBeCloseTo(2.85, 8); }); - it('drops warmup, cancelled, missing, and non-positive samples', () => { + it('drops cancelled, missing, and non-positive samples (phase is the caller’s concern)', () => { const result = rollingRequestMetric( [ request(1, 100, 10), - request(2, 200, 20, { phase: 'warmup' }), + request(2, 200, 20, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline request(3, 300, 30, { cancelled: true }), request(4, null, null), request(5, 0, 0), @@ -92,9 +92,10 @@ describe('rollingRequestMetric', () => { 'p90', ); - expect(result.raw).toEqual([{ t: 1, value: 0.1 }]); - expect(result.trend).toEqual([{ t: 1, value: 0.1 }]); - expect(result.cumulative).toEqual([{ t: 1, value: 0.1 }]); + expect(result.raw).toEqual([ + { t: 1, value: 0.1 }, + { t: 2, value: 0.2 }, + ]); }); }); @@ -161,22 +162,23 @@ describe('cumulativeTimeAverage', () => { }); describe('cumulativeCompletedRequests', () => { - it('sorts profiling completions and excludes warmup and cancelled requests', () => { + it('sorts completions and excludes cancelled requests (phase is the caller’s concern)', () => { expect( cumulativeCompletedRequests([ request(4, 100, 10), request(2, 100, 10), - request(1, 100, 10, { phase: 'warmup' }), + request(1, 100, 10, { phase: 'warmup' }), // kept — caller passes a phase-scoped timeline request(3, 100, 10, { cancelled: true }), ]), ).toEqual([ { t: 0, 
value: 0 }, - { t: 2, value: 1 }, - { t: 4, value: 2 }, + { t: 1, value: 1 }, + { t: 2, value: 2 }, + { t: 4, value: 3 }, ]); }); - it('returns no series when there are no successful profiling completions', () => { + it('returns no series when there are no successful completions', () => { expect(cumulativeCompletedRequests([request(1, 100, 10, { cancelled: true })])).toEqual([]); }); }); @@ -199,17 +201,22 @@ describe('averageSequenceLengthInFlight', () => { ]); }); - it('excludes cancelled, warmup, and missing sequence lengths', () => { + it('excludes cancelled and missing sequence lengths (phase is the caller’s concern)', () => { + // Only the null-osl and cancelled rows are dropped; the warmup row is kept + // (the caller passes a phase-scoped timeline), so it produces a step series. expect( averageSequenceLengthInFlight( [ request(1, 100, 10, { osl: null }), request(2, 100, 10, { osl: 20, cancelled: true }), - request(3, 100, 10, { osl: 30, phase: 'warmup' }), + request(3, 100, 10, { osl: 30, phase: 'warmup', start: 0, end: 3_000_000_000 }), ], 'osl', ), - ).toEqual([]); + ).toEqual([ + { t: 0, value: 30 }, + { t: 3, value: 0 }, + ]); }); }); diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index ab744286..088a5e3b 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -23,6 +23,14 @@ interface Series { hideFromHover?: boolean; } +/** A constant horizontal reference line (e.g. a capacity ceiling). */ +export interface ReferenceLine { + value: number; + label: string; + /** Line + label color. Defaults to a muted emerald. 
*/ + color?: string; +} + interface TimeSeriesChartProps { series: Series[]; durationS: number; @@ -31,8 +39,16 @@ interface TimeSeriesChartProps { yAxisLabel?: string; width?: number; height?: number; + /** + * Horizontal reference lines drawn across the plot. Their values are folded + * into the auto y-max so the line stays on-chart even when it exceeds the + * data (e.g. a KV-cache pool ceiling well above the working set). + */ + refLines?: readonly ReferenceLine[]; } +const NO_REF_LINES: readonly ReferenceLine[] = []; + export type RequestMetric = 'interactivity' | 'ttft' | 'e2e'; export type RequestPercentile = 'p75' | 'p90'; export type ThroughputSeriesKey = 'input' | 'decode'; @@ -74,8 +90,11 @@ export function rollingRequestMetric( windowSize = 50, ): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } { const q = percentile === 'p75' ? 0.75 : 0.9; + // Phase is the caller's concern — the agentic detail page passes a + // phase-scoped (warmup or profiling) timeline. Here we only drop cancelled + // requests and samples without a usable latency value. const samples = requests - .filter((request) => request.phase === 'profiling' && !request.cancelled) + .filter((request) => !request.cancelled) .flatMap((request) => { const latencyMs = metric === 'ttft' @@ -244,10 +263,13 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } -/** Cumulative count of successfully completed profiling requests by end time. */ +/** + * Cumulative count of successfully completed (non-cancelled) requests by end + * time. Phase is the caller's concern — pass a phase-scoped timeline. 
+ */ export function cumulativeCompletedRequests(requests: readonly RequestRecord[]): TimeSeriesPoint[] { const completionTimes = requests - .filter((request) => request.phase === 'profiling' && !request.cancelled) + .filter((request) => !request.cancelled) .map((request) => request.end / 1e9) .filter(Number.isFinite) .toSorted((a, b) => a - b); @@ -271,10 +293,10 @@ export function averageSequenceLengthInFlight( events.set(t, current); }; + // Phase is the caller's concern — pass a phase-scoped timeline. for (const request of requests) { const tokens = request[metric]; if ( - request.phase !== 'profiling' || request.cancelled || tokens === null || !Number.isFinite(tokens) || @@ -527,6 +549,7 @@ export function TimeSeriesChart({ yAxisLabel, width = 720, height = 260, + refLines = NO_REF_LINES, }: TimeSeriesChartProps) { const W = width; const H = height; @@ -536,11 +559,15 @@ export function TimeSeriesChart({ const innerW = W - PAD.left - PAD.right; const innerH = H - PAD.top - PAD.bottom; const xMax = Math.max(durationS, 1); - const yMax = yMaxOpt ?? Math.max(1e-9, ...series.flatMap((s) => s.data.map((d) => d.value))); + // Fold reference-line values into the auto max so a ceiling above the data + // (e.g. KV-cache pool >> working set) still renders inside the plot. + const refMax = refLines.length > 0 ? Math.max(...refLines.map((r) => r.value)) : 0; + const yMax = + yMaxOpt ?? 
Math.max(1e-9, refMax, ...series.flatMap((s) => s.data.map((d) => d.value))); const xScale = (t: number) => PAD.left + (t / xMax) * innerW; const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH; return { innerW, innerH, xMax, yMax, xScale, yScale }; - }, [series, durationS, yMaxOpt, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]); + }, [series, durationS, yMaxOpt, refLines, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]); const { innerW, innerH, xMax, yMax, xScale, yScale } = layout; @@ -640,6 +667,38 @@ export function TimeSeriesChart({ ); })} + {/* Horizontal reference lines (e.g. KV-cache pool ceiling). Drawn on top + of the data lines, with a label pinned to the right edge. */} + {refLines.map((ref, i) => { + if (!Number.isFinite(ref.value) || ref.value < 0 || ref.value > yMax) return null; + const y = yScale(ref.value); + const color = ref.color ?? '#16a34a'; + return ( + + + + {ref.label} + + + ); + })} + {/* X-axis */} ` artifact, + * but the ingest path historically failed to link it to agentic rows (the + * `bmk_agentic_` → `server_logs_` key mismatch, now fixed in + * ingest-ci-run). As a result the agentic server log text was never stored, so + * `kv_cache_pool_tokens` cannot be derived from the DB — we must re-fetch the + * artifacts from GitHub. + * + * For each agentic workflow run this: + * 1. lists the run's artifacts and keeps only `server_logs_*` + `bmk_agentic_*` + * (dedup by logical name, mirroring ingest's runner-suffix collapse), + * 2. downloads + unzips just those (small — skips the multi-MB trace dirs), + * 3. maps each `bmk_agentic_` JSON → config → benchmark_results rows via + * the same mapBenchmarkRow/config-cache logic ingest uses, + * 4. calls insertServerLog(), which stores+links the log AND derives + * `kv_cache_pool_tokens` into benchmark_results.metrics. + * + * Idempotent: insertServerLog only links rows whose server_log_id is null. 
+ * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-agentic-server-logs + * [--limit N] only process the first N workflow runs + * [--yes] skip the confirmation prompt + */ + +import { execSync } from 'node:child_process'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { insertServerLog } from './etl/benchmark-ingest.js'; +import { mapBenchmarkRow } from './etl/benchmark-mapper.js'; +import { createConfigCache } from './etl/config-cache.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { createSkipTracker } from './etl/skip-tracker.js'; + +const REPO = 'SemiAnalysisAI/InferenceX'; +// Strip the trailing `__` token so `server_logs_` and +// `bmk_agentic_` collapse to the same logical key (matches ingest). +const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/u; + +function parseFlags(): { limit: number | null } { + let limit: number | null = null; + for (let i = 2; i < process.argv.length; i++) { + if (process.argv[i] === '--limit') { + const next = process.argv[++i]; + if (!next || Number.isNaN(Number(next))) { + console.error('--limit requires a numeric argument'); + process.exit(1); + } + limit = Number(next); + } + } + return { limit }; +} + +const flags = parseFlags(); +const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1, onnotice: () => {} }); + +interface ArtifactMeta { + name: string; + archive_download_url: string; + created_at: string; +} + +/** List the run's artifacts, dedup by logical name keeping the most recent. 
*/ +function listArtifacts(githubRunId: string): Map { + const json = execSync( + `gh api "repos/${REPO}/actions/runs/${githubRunId}/artifacts" --paginate --jq '.artifacts[]'`, + { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024 }, + ); + const byLogical = new Map(); + for (const line of json.trim().split('\n')) { + if (!line) continue; + let a: ArtifactMeta; + try { + a = JSON.parse(line) as ArtifactMeta; + } catch { + continue; + } + if (!a.name.startsWith('server_logs_') && !a.name.startsWith('bmk_agentic_')) continue; + const key = a.name.replace(RUNNER_SUFFIX_RE, ''); + const existing = byLogical.get(key); + if (!existing || a.created_at > existing.created_at) byLogical.set(key, a); + } + return byLogical; +} + +function download(artifact: ArtifactMeta, destRoot: string): string { + const zipPath = path.join(destRoot, 'a.zip'); + execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { + stdio: ['pipe', 'pipe', 'inherit'], + }); + const destDir = path.join(destRoot, artifact.name); + fs.mkdirSync(destDir, { recursive: true }); + execSync(`unzip -oq "${zipPath}" -d "${destDir}"`, { stdio: 'inherit' }); + fs.unlinkSync(zipPath); + return destDir; +} + +/** Logical key shared by a server_logs_/bmk_agentic_ artifact pair. */ +function logicalKey(name: string): string { + return name + .replace(/^server_logs_/u, '') + .replace(/^bmk_agentic_/u, '') + .replace(RUNNER_SUFFIX_RE, ''); +} + +/** + * Read up to `maxBytes` of a (possibly huge) server log as UTF-8, stripping NUL + * bytes. vLLM's "GPU KV cache size" startup lines are near the top, so a head + * read is enough to derive the KV pool — and it caps storage for the rare + * multi-hundred-MB logs that exceed V8's ~512 MB string limit. 
+ */ +const stripNul = (s: string): string => s.replaceAll(String.fromCodePoint(0), ''); + +function readServerLogCapped(p: string, maxBytes = 64 * 1024 * 1024): string { + if (fs.statSync(p).size <= maxBytes) return stripNul(fs.readFileSync(p, 'utf8')); + const fd = fs.openSync(p, 'r'); + try { + const buf = Buffer.allocUnsafe(maxBytes); + const n = fs.readSync(fd, buf, 0, maxBytes, 0); + return stripNul(buf.subarray(0, n).toString('utf8')); + } finally { + fs.closeSync(fd); + } +} + +function findJsonFiles(dir: string): string[] { + const out: string[] = []; + const walk = (d: string) => { + for (const e of fs.readdirSync(d, { withFileTypes: true })) { + const p = path.join(d, e.name); + if (e.isDirectory()) walk(p); + else if (e.name.endsWith('.json')) out.push(p); + } + }; + walk(dir); + return out; +} + +async function main(): Promise { + console.log('=== backfill-agentic-server-logs ==='); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Agentic workflow runs that still have unlinked server logs. + const runs = await sql<{ github_run_id: string; workflow_run_id: number }[]>` + select distinct wr.github_run_id::text as github_run_id, wr.id as workflow_run_id + from benchmark_results br + join workflow_runs wr on wr.id = br.workflow_run_id + where br.benchmark_type = 'agentic_traces' + and br.server_log_id is null + order by wr.id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (runs.length === 0) { + console.log('\n Nothing to do — all agentic rows already have a server log.'); + return; + } + console.log(`\n ${runs.length} agentic workflow run(s) to process.`); + if (!hasYesFlag()) { + const ok = await confirm('\nProceed? 
(y/N) '); + if (!ok) { + console.log('Aborted.'); + return; + } + } + + const cache = createConfigCache(sql); + await cache.preloadConfigs(); + const tracker = createSkipTracker(); + + let linkedRows = 0; + let runsOk = 0; + let runsFailed = 0; + const t0 = Date.now(); + + for (const { github_run_id: githubRunId, workflow_run_id: wrId } of runs) { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `kvpool-${githubRunId}-`)); + try { + const artifacts = listArtifacts(githubRunId); + // server log path by logical key + const serverLogByKey = new Map(); + const bmkDirs: string[] = []; + for (const art of artifacts.values()) { + const dir = download(art, tmp); + if (art.name.startsWith('server_logs_')) { + const logPath = path.join(dir, 'server.log'); + if (fs.existsSync(logPath)) serverLogByKey.set(logicalKey(art.name), logPath); + } else { + bmkDirs.push(dir); + } + } + + let runLinked = 0; + for (const bmkDir of bmkDirs) { + const key = logicalKey(path.basename(bmkDir)); + const logPath = serverLogByKey.get(key); + if (!logPath) continue; + for (const file of findJsonFiles(bmkDir)) { + let raw: unknown; + try { + raw = JSON.parse(fs.readFileSync(file, 'utf8')); + } catch { + continue; + } + const rows = Array.isArray(raw) ? 
raw : [raw]; + for (const row of rows) { + if (!row || typeof row !== 'object') continue; + const mapped = mapBenchmarkRow(row as Record, tracker); + if (!mapped || mapped.benchmarkType !== 'agentic_traces') continue; + const configId = await cache.getOrCreateConfig(mapped.config); + const ids = await sql<{ id: number }[]>` + select id from benchmark_results + where workflow_run_id = ${wrId} + and config_id = ${configId} + and conc = ${mapped.conc} + and benchmark_type = 'agentic_traces' + and server_log_id is null + `; + if (ids.length === 0) continue; + const serverLog = readServerLogCapped(logPath); + await insertServerLog( + sql, + ids.map((r) => r.id), + serverLog, + ); + runLinked += ids.length; + } + } + } + linkedRows += runLinked; + runsOk++; + const elapsed = Math.round((Date.now() - t0) / 1000); + console.log( + ` ✓ run ${githubRunId}: ${serverLogByKey.size} log(s), linked ${runLinked} row(s) ` + + `(${runsOk}/${runs.length}, ${elapsed}s total)`, + ); + } catch (error) { + runsFailed++; + console.error( + ` ✗ run ${githubRunId}: ${error instanceof Error ? (error.stack ?? error.message) : String(error)}`, + ); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log( + `\n=== complete: ${linkedRows} row(s) linked across ${runsOk} run(s) ` + + `(${runsFailed} failed) in ${totalSec}s ===`, + ); + if (runsFailed > 0) process.exitCode = 1; +} + +main() + .catch((error) => { + console.error('backfill-agentic-server-logs failed:', error); + process.exitCode = 1; + }) + .finally(() => sql.end()); diff --git a/packages/db/src/backfill-kv-pool.ts b/packages/db/src/backfill-kv-pool.ts new file mode 100644 index 00000000..6cf40a33 --- /dev/null +++ b/packages/db/src/backfill-kv-pool.ts @@ -0,0 +1,137 @@ +/** + * Backfill `benchmark_results.metrics->kv_cache_pool_tokens` from the captured + * server logs. 
The value is parsed from vLLM's authoritative + * "GPU KV cache size: N tokens" startup line(s), summed across data-parallel + * engine cores (see {@link kvCachePoolTokensFromServerLog}). + * + * The ingest path now derives this inline in `insertServerLog`, but existing + * rows need this one-time pass. Idempotent: re-running only touches rows that + * still lack the value (unless --force). + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-kv-pool + * [--limit N] only process the first N candidate server logs + * [--force] recompute even when the value is already set + * [--yes] skip the confirmation prompt + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { createAdminSql } from './etl/db-utils.js'; +import { kvCachePoolTokensFromServerLog } from './etl/server-log-metrics.js'; + +interface CliFlags { + limit: number | null; + force: boolean; +} + +function parseFlags(): CliFlags { + let limit: number | null = null; + let force = false; + for (let i = 2; i < process.argv.length; i++) { + const arg = process.argv[i]!; + if (arg === '--force') force = true; + else if (arg === '--limit') { + const next = process.argv[++i]; + if (!next || Number.isNaN(Number(next))) { + console.error('--limit requires a numeric argument'); + process.exit(1); + } + limit = Number(next); + } + } + return { limit, force }; +} + +const flags = parseFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-kv-pool ==='); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // One server log can be linked to several benchmark_results (multiple + // concurrency points share a server). Group by log id so we parse each log + // once and fan the value out to all its rows. + const candidates = flags.force + ? 
await sql<{ server_log_id: number }[]>` + select distinct server_log_id + from benchmark_results + where server_log_id is not null + order by server_log_id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ server_log_id: number }[]>` + select distinct server_log_id + from benchmark_results + where server_log_id is not null + and metrics->>'kv_cache_pool_tokens' is null + order by server_log_id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + console.log(`\n ${candidates.length} candidate server log(s).`); + if (!hasYesFlag()) { + const ok = await confirm('\nProceed? (y/N) '); + if (!ok) { + console.log('Aborted.'); + return; + } + } + + let updated = 0; + let logsWithValue = 0; + let logsNoValue = 0; + let failed = 0; + const t0 = Date.now(); + for (const { server_log_id: logId } of candidates) { + try { + const [row] = await sql<{ server_log: string | null }[]>` + select server_log from server_logs where id = ${logId} + `; + const tokens = kvCachePoolTokensFromServerLog(row?.server_log ?? null); + if (tokens === null) { + logsNoValue++; + continue; // non-vLLM or no startup line — leave unset + } + logsWithValue++; + const targets = flags.force + ? sql`server_log_id = ${logId}` + : sql`server_log_id = ${logId} and metrics->>'kv_cache_pool_tokens' is null`; + const result = await sql` + update benchmark_results + set metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${tokens}::bigint)) + where ${targets} + `; + updated += result.count; + console.log(` ✓ log=${logId}: ${tokens.toLocaleString()} tok → ${result.count} row(s)`); + } catch (error) { + failed++; + console.error(` ✗ log=${logId}: ${error instanceof Error ? 
error.message : String(error)}`); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log( + `\n=== backfill complete: ${updated} row(s) updated from ${logsWithValue} log(s) ` + + `(${logsNoValue} log(s) had no KV-pool line, ${failed} failed) in ${totalSec}s ===`, + ); + if (failed > 0) process.exitCode = 1; +} + +main() + .catch((error) => { + console.error('backfill-kv-pool failed:', error); + process.exitCode = 1; + }) + .finally(() => sql.end()); diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index 343d7fb7..a405789d 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -4,6 +4,7 @@ import type postgres from 'postgres'; import type { BenchmarkParams } from './benchmark-mapper'; +import { kvCachePoolTokensFromServerLog } from './server-log-metrics'; type Sql = ReturnType; @@ -106,9 +107,18 @@ export async function insertServerLog( insert into server_logs (server_log) values (${serverLog}) returning id `; + // Derive the KV-cache pool size (tokens) from the log's authoritative + // "GPU KV cache size: N tokens" line(s) and stash it on the result's metrics + // JSON, mirroring how trace-replay-ingest derives cache-hit rates. The + // scraped vllm:cache_config_info metric can't reconstruct this for MLA models. + const kvCachePoolTokens = kvCachePoolTokensFromServerLog(serverLog); await sql` update benchmark_results - set server_log_id = ${logId} + set server_log_id = ${logId}${ + kvCachePoolTokens === null + ? 
sql`` + : sql`, metrics = jsonb_set(metrics, '{kv_cache_pool_tokens}', to_jsonb(${kvCachePoolTokens}::bigint))` + } where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; } diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts index 7d292207..3f088cd6 100644 --- a/packages/db/src/etl/compute-chart-series.test.ts +++ b/packages/db/src/etl/compute-chart-series.test.ts @@ -138,6 +138,49 @@ describe('computeChartSeries', () => { ]); }); + it('merges warmup_metrics before profiling into one continuous series (v11)', async () => { + // warmup scrapes at t=0,1s; profiling scrapes at t=10,11s (own start_ns). + const blob = gzipSync( + Buffer.from( + JSON.stringify({ + warmup_metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1e9, avg: 0.2 }, + { start_ns: 1e9, end_ns: 2e9, avg: 0.3 }, + ], + }, + ], + }, + }, + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 10e9, end_ns: 11e9, avg: 0.8 }, + { start_ns: 11e9, end_ns: 12e9, avg: 0.9 }, + ], + }, + ], + }, + }, + }), + ), + ); + const series = await computeChartSeries(blob); + // Origin is the earliest (warmup) start_ns, so warmup sits at low t and + // profiling follows on the same axis — the frontend slices at the boundary. 
+ expect(series?.kvCacheUsage).toEqual([ + { t: 0, value: 0.2 }, + { t: 1, value: 0.3 }, + { t: 10, value: 0.8 }, + { t: 11, value: 0.9 }, + ]); + }); + it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => { const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 })); expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]); diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 394a5826..c87df26b 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -63,8 +63,18 @@ import { * v10: only emit per-source series for disaggregated configs with a recognized * orchestrator adapter. Non-disaggregated and unsupported configs retain the * existing aggregate-only behavior. + * + * v12: also consume the `warmup_metrics` block from the server-metrics blob and + * merge its scrapes into the same series as the profiling `metrics` block. + * Warmup and profiling timeslices carry their own absolute `start_ns` and never + * overlap in time, so the merged series is continuous (warmup at lower t, + * profiling after). This lets the agentic detail page slice `chart_series` into + * warmup vs profiling at the request-derived boundary; older blobs without a + * warmup block are unaffected. (v11 was a short-lived, since-reverted attempt to + * carry kvCachePoolTokens in chart_series; that value now lives in + * benchmark_results.metrics, derived from the server log — unrelated to this.) */ -export const CHART_SERIES_VERSION = 10; +export const CHART_SERIES_VERSION = 12; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -193,14 +203,37 @@ const CHART_METRIC_KEYS = new Set([ * subtrees the chart needs. Avoids Node's 512 MB max-string-length cap that * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows. 
*/ -async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> { +/** + * Merge a warmup phase metric map into the profiling one by concatenating each + * metric's `series`. The two phases' timeslices carry their own absolute + * `start_ns` and never overlap in time, so `buildSeriesFromMetrics` (which keys + * by `start_ns`) yields one continuous series — warmup scrapes at lower t, + * profiling after. No-ops when either side is empty (older blobs have no warmup). + */ +function mergePhaseMetrics(profiling: MetricsMap, warmup: MetricsMap): MetricsMap { + if (Object.keys(warmup).length === 0) return profiling; + if (Object.keys(profiling).length === 0) return warmup; + const out: MetricsMap = {}; + for (const name of new Set([...Object.keys(profiling), ...Object.keys(warmup)])) { + out[name] = { + series: [...(profiling[name]?.series ?? []), ...(warmup[name]?.series ?? [])], + }; + } + return out; +} + +/** Stream-collect one top-level phase block (`metrics` or `warmup_metrics`). */ +async function streamCollectPhase( + buffer: Buffer, + filter: 'metrics' | 'warmup_metrics', +): Promise<MetricsMap> { /* eslint-disable @typescript-eslint/no-explicit-any */ const collected: MetricsMap = {}; const pipeline = chain([ Readable.from(buffer), createGunzip(), parser(), - pick({ filter: 'metrics' }), + pick({ filter }), streamObject(), ]); await new Promise((resolve, reject) => { @@ -215,15 +248,28 @@ async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> { return collected; } +/** Stream-parse fallback: collect both phase blocks and merge (see v12). */ +async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> { + const [profiling, warmup] = await Promise.all([ + streamCollectPhase(buffer, 'metrics'), + streamCollectPhase(buffer, 'warmup_metrics'), + ]); + return mergePhaseMetrics(profiling, warmup); +} + /** * Parse the gzipped server_metrics blob into the metric map. 
Tries the * synchronous fast path first; falls back to stream-parse on - * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. + * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. Merges the warmup block + * into the profiling one (v12) so the series span both phases. */ async function parseMetrics(buffer: Buffer): Promise<MetricsMap> { try { - const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap }; - return obj.metrics ?? {}; + const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { + metrics?: MetricsMap; + warmup_metrics?: MetricsMap; + }; + return mergePhaseMetrics(obj.metrics ?? {}, obj.warmup_metrics ?? {}); } catch (error) { const code = error && (error as NodeJS.ErrnoException).code; const msg = error instanceof Error ? error.message : String(error); diff --git a/packages/db/src/etl/server-log-metrics.test.ts b/packages/db/src/etl/server-log-metrics.test.ts new file mode 100644 index 00000000..9e0fa852 --- /dev/null +++ b/packages/db/src/etl/server-log-metrics.test.ts @@ -0,0 +1,43 @@ +import { describe, expect, it } from 'vitest'; + +import { kvCachePoolTokensFromServerLog } from './server-log-metrics'; + +describe('kvCachePoolTokensFromServerLog', () => { + it('returns null for empty / missing logs', () => { + expect(kvCachePoolTokensFromServerLog(null)).toBeNull(); + expect(kvCachePoolTokensFromServerLog('')).toBeNull(); + expect(kvCachePoolTokensFromServerLog('no kv cache line here')).toBeNull(); + }); + + it('reads a single-engine (ep1) pool size', () => { + const log = ` +(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1744] GPU KV cache size: 11,294,463 tokens +(EngineCore pid=1950943) INFO 06-30 18:28:46 [kv_cache_utils.py:1745] Maximum concurrency for 1,048,576 tokens per request: 10.77x +`; + expect(kvCachePoolTokensFromServerLog(log)).toBe(11_294_463); + }); + + it('sums across data-parallel engine cores (ep8)', () => { + const lines = Array.from( + { length: 8 }, + (_, i) => + 
`(EngineCore_DP${i} pid=${2337827 + i}) INFO [kv_cache_utils.py:1744] GPU KV cache size: 11,577,333 tokens`, + ).join('\n'); + expect(kvCachePoolTokensFromServerLog(lines)).toBe(11_577_333 * 8); + }); + + it('dedups reprinted lines for the same engine core', () => { + const log = ` +(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens +(EngineCore_DP0 pid=1) GPU KV cache size: 5,000,000 tokens +(EngineCore_DP1 pid=2) GPU KV cache size: 5,000,000 tokens +`; + // DP0 counted once + DP1 once = 10M, not 15M. + expect(kvCachePoolTokensFromServerLog(log)).toBe(10_000_000); + }); + + it('falls back to bare lines when no engine-core prefix is present', () => { + const log = `INFO GPU KV cache size: 1,234,567 tokens`; + expect(kvCachePoolTokensFromServerLog(log)).toBe(1_234_567); + }); +}); diff --git a/packages/db/src/etl/server-log-metrics.ts b/packages/db/src/etl/server-log-metrics.ts new file mode 100644 index 00000000..b8b26dd1 --- /dev/null +++ b/packages/db/src/etl/server-log-metrics.ts @@ -0,0 +1,65 @@ +/** + * Derive server-side scalars from the captured vLLM server log + * (`server_logs.server_log`). These come from startup log lines rather than the + * scraped Prometheus `/metrics`, because for MLA / sparse-attention models the + * `vllm:cache_config_info` labels (num_gpu_blocks × block_size) do NOT + * reconstruct the real KV-cache token capacity — they undercount by a + * non-constant factor. vLLM's own `GPU KV cache size: N tokens` line is the + * authoritative number. + */ + +/** + * Total KV-cache pool size in tokens. + * + * vLLM prints one `GPU KV cache size: N tokens` line per engine core (one per + * data-parallel rank; tensor-parallel is already aggregated into that single + * per-engine number). 
We sum across distinct engine cores so the result is the + * deployment-wide total: + * + * (EngineCore pid=…) GPU KV cache size: 11,294,463 tokens → ep1 total + * (EngineCore_DP0 pid=…) GPU KV cache size: 11,577,333 tokens ┐ + * (EngineCore_DP1 pid=…) GPU KV cache size: 11,577,333 tokens ┘ → ×8 = total + * + * Returns null when the log has no such line (non-vLLM frameworks, or a log + * that didn't capture engine startup). + */ +export function kvCachePoolTokensFromServerLog(serverLog: string | null): number | null { + if (!serverLog) return null; + + // Scan line-by-line. We deliberately avoid a global regex over the whole blob + // with a lazy `[^\n]*?` bridge between the engine tag and the size: some logs + // contain multi-megabyte single lines (progress bars, tracebacks) that make + // such a regex recurse and blow the stack. A per-line substring pre-filter + // means the (cheap) regexes only ever run on the short KV-size lines. + // + // Each engine core prints one line; the tag (e.g. `EngineCore_DP3`) is stable + // across a run while the pid is not, so key on the tag to dedup reprints and + // sum across data-parallel ranks. + const tagRe = /\((?<tag>EngineCore(?:_DP\d+)?)\s+pid=\d+\)/u; + const sizeRe = /GPU KV cache size:\s*(?<tokens>[\d,]+)\s*tokens/u; + const perEngine = new Map<string, number>(); + let bareTotal = 0; + let bareFound = false; + for (const line of serverLog.split('\n')) { + if (!line.includes('GPU KV cache size')) continue; + const sizeMatch = sizeRe.exec(line); + if (!sizeMatch) continue; + const tokens = Number(sizeMatch.groups!.tokens!.replaceAll(',', '')); + if (!Number.isFinite(tokens) || tokens <= 0) continue; + const tagMatch = tagRe.exec(line); + if (tagMatch) { + perEngine.set(tagMatch.groups!.tag!, tokens); + } else { + // Fallback for logs without the engine-core prefix: count each occurrence + // (one per engine when there are no reprints). Best-effort only. 
+ bareTotal += tokens; + bareFound = true; + } + } + if (perEngine.size > 0) { + let total = 0; + for (const v of perEngine.values()) total += v; + return total; + } + return bareFound ? bareTotal : null; +} diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 2a5f15f0..8ec1fb9e 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -467,8 +467,15 @@ async function main(): Promise { const parentDir = path.basename(path.dirname(file)); if (parentDir.startsWith('bmk_') && insertedIds.length > 0) { + // Single-turn artifacts are `bmk_` paired with + // `server_logs_`. Agentic artifacts are `bmk_agentic_` + // but the server log is still `server_logs_` (no `agentic_` + // prefix), so fall back to the fully-stripped suffix — otherwise + // agentic rows never get their server log (and KV-pool size) linked. const configKey = parentDir.replace(/^bmk_/u, ''); - const logPath = serverLogPaths.get(configKey); + const logPath = + serverLogPaths.get(configKey) ?? 
+ serverLogPaths.get(stripBmkAndAgenticPrefix(parentDir)); if (logPath) { try { const serverLog = fs.readFileSync(logPath, 'utf8').replaceAll('\u0000', ''); diff --git a/packages/db/src/queries/trace-server-metrics.test.ts b/packages/db/src/queries/trace-server-metrics.test.ts index 61d21d35..f045dfda 100644 --- a/packages/db/src/queries/trace-server-metrics.test.ts +++ b/packages/db/src/queries/trace-server-metrics.test.ts @@ -48,6 +48,7 @@ function metaRow(overrides: Record = {}) { run_url: null, server_gpu_cache_hit_rate: null, server_cpu_cache_hit_rate: null, + kv_cache_pool_tokens: null, ...overrides, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 61cacaae..d24d0879 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -81,6 +81,11 @@ export interface TraceServerMetrics { * the cluster-average `kvCacheUsage` line covers that case alone. */ kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[]; + /** + * Total KV-cache pool size in tokens, parsed from the server log's "GPU KV cache size: N tokens" + * lines and summed across engine cores. Null for non-vLLM frameworks or older rows. + */ + kvCachePoolTokens: number | null; /** Orchestrator-normalized metrics grouped by endpoint/worker. */ metricSources: MetricSourceSeries[]; } @@ -89,6 +94,8 @@ interface RawMetaRow extends PointMeta { trace_replay_id: number | null; has_blob: boolean; chart_series: ChartSeries | null; + /** Derived at server-log ingest from "GPU KV cache size: N tokens" lines. 
*/ + kv_cache_pool_tokens: string | null; } interface RawBlobRow { @@ -118,9 +125,14 @@ function buildMeta(row: RawMetaRow): PointMeta { }; } -function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { +function merge( + meta: PointMeta, + series: ChartSeries, + kvCachePoolTokens: number | null, +): TraceServerMetrics { return { meta, + kvCachePoolTokens, startNs: series.startNs, endNs: series.endNs, durationS: series.durationS, @@ -155,7 +167,8 @@ export async function getTraceServerMetrics( br.date::text, case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url, (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate, - (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate + (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate, + (br.metrics ->> 'kv_cache_pool_tokens')::numeric as kv_cache_pool_tokens from benchmark_results br join configs c on c.id = br.config_id join workflow_runs wr on wr.id = br.workflow_run_id @@ -166,10 +179,12 @@ export async function getTraceServerMetrics( if (!row) return null; if (!row.has_blob || row.trace_replay_id === null) return null; const meta = buildMeta(row); + const kvCachePoolTokens = + row.kv_cache_pool_tokens === null ? null : Number(row.kv_cache_pool_tokens); // Fast path: pre-computed chart_series at the current version. 
if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) { - return merge(meta, row.chart_series); + return merge(meta, row.chart_series, kvCachePoolTokens); } // Slow path only: fetch the large raw blob after establishing that the @@ -192,5 +207,5 @@ export async function getTraceServerMetrics( disagg: row.disagg, }); if (!series) return null; - return merge(meta, series); + return merge(meta, series, kvCachePoolTokens); } From af6bc11987e41c7cf5ca9298231fe02e90b5d9ce Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Jul 2026 16:16:05 -0500 Subject: [PATCH 103/111] fix(agentic): stable conversation row order + color across timeline phase toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switching the request timeline between Warmup and Profiling re-derived both the row order (sorted by first-start) and the color palette (assigned in iteration order) from the phase-filtered subset, so a conversation jumped rows and swapped color on every toggle. Compute a stable per-group index (conversation cid or worker id) from the full, unfiltered request set — keyed by earliest start across all phases — and drive both ordering and color from it. buildRequestTimelineRows takes it as an optional 4th arg (falls back to the legacy self-contained behavior for unit tests). 
Co-Authored-By: Claude Opus 4.8 --- .../agentic-point/request-timeline.test.ts | 61 +++++++++++++++++ .../agentic-point/request-timeline.tsx | 68 ++++++++++++++++--- 2 files changed, 119 insertions(+), 10 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts index fe3c1231..6fcf1c57 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -4,6 +4,7 @@ import type { RequestRecord } from '@/hooks/api/use-request-timeline'; import { buildRequestTimelineRows, + computeStableRowIndex, parseTimelineViewSnapshot, requestIdleStats, splitTimelineCid, @@ -106,6 +107,66 @@ describe('subagent timeline hierarchy', () => { }); }); +describe('stable row order + color across phase filters', () => { + // Same conversations appear in both warmup and profiling. Their global + // first-start order is A (0) < B (10) < C (only profiling, 50). The bug: + // filtering to a phase re-sorted + re-colored by the visible subset, so a + // conversation jumped rows and swapped color when toggling phases. 
+ const rec = ( + cid: string, + phase: RequestRecord['phase'], + start: number, + end: number, + ): RequestRecord => ({ ...request(start, end), cid, phase }); + const full: RequestRecord[] = [ + rec('A', 'warmup', 0, 5), + rec('A', 'profiling', 100, 110), + rec('B', 'warmup', 10, 15), + rec('B', 'profiling', 120, 130), + rec('C', 'profiling', 50, 60), // profiling-only; earliest profiling start + ]; + + it('keeps each conversation in the same position and color when the phase changes', () => { + const index = computeStableRowIndex(full, 'conversation'); + const warmupRows = buildRequestTimelineRows( + full.filter((r) => r.phase === 'warmup'), + 'conversation', + new Set(), + index, + ).filter((r) => r.kind === 'parent'); + const profilingRows = buildRequestTimelineRows( + full.filter((r) => r.phase === 'profiling'), + 'conversation', + new Set(), + index, + ).filter((r) => r.kind === 'parent'); + + // Position: A before B in both phases (C only shows in profiling, and sorts + // after A/B by its global index — NOT first by its earlier profiling start). + expect(warmupRows.map((r) => r.label)).toEqual(['A', 'B']); + expect(profilingRows.map((r) => r.label)).toEqual(['A', 'B', 'C']); + + // Color: identical per conversation across phases, distinct between them. + const warmupColors = Object.fromEntries(warmupRows.map((r) => [r.label, r.color])); + const profilingColors = Object.fromEntries(profilingRows.map((r) => [r.label, r.color])); + expect(warmupColors.A).toBe(profilingColors.A); + expect(warmupColors.B).toBe(profilingColors.B); + expect(warmupColors.A).not.toBe(warmupColors.B); + }); + + it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => { + // Sanity: the legacy self-contained path (no index arg) orders by the + // subset's own first-start, which is exactly why the shared index is needed. 
+ const profilingOnly = buildRequestTimelineRows( + full.filter((r) => r.phase === 'profiling'), + 'conversation', + new Set(), + ).filter((r) => r.kind === 'parent'); + // C (start 50) sorts first here, ahead of A (100) and B (120). + expect(profilingOnly.map((r) => r.label)).toEqual(['C', 'A', 'B']); + }); +}); + describe('parseTimelineViewSnapshot', () => { const full: TimelineViewSnapshot = { viewStart: 1_000, diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index f3870bb1..db1ac93f 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -275,6 +275,36 @@ export function splitTimelineCid(cid: string): { return { parent, subagentBase: raw, stream: null, aux: null }; } +/** + * Stable order/color index for the top-level row groups (conversations in + * conversation mode, workers in worker mode), keyed by group id and computed + * over the FULL (unfiltered) request set. Both the row ordering and the color + * palette are driven by this index, so a conversation/worker keeps the same + * position and color when the phase filter changes the visible subset — without + * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset + * is showing, making rows jump and swap colors. + * + * Order key is the group's earliest request start across all phases; ties break + * on the group id for determinism. + */ +export function computeStableRowIndex( + requests: readonly RequestRecord[], + mode: RowMode, +): Map { + const firstStart = new Map(); + for (const r of requests) { + const key = mode === 'conversation' ? 
splitTimelineCid(r.cid).parent : r.wid; + const cur = firstStart.get(key); + if (cur === undefined || r.start < cur) firstStart.set(key, r.start); + } + const keys = [...firstStart.keys()].toSorted( + (a, b) => firstStart.get(a)! - firstStart.get(b)! || (a < b ? -1 : a > b ? 1 : 0), + ); + const index = new Map(); + keys.forEach((key, i) => index.set(key, i)); + return index; +} + /** * Group requests into rows. In conversation mode, output order is: * parent_conv @@ -289,12 +319,23 @@ export function splitTimelineCid(cid: string): { * stream children. Bars on a collapsed subagent are the UNION of all its * streams' requests — overlapping bars visually communicate the * stream-level parallelism without expanding. + * + * `stableRowIndex` (optional) pins the top-level order + color per group so they + * survive phase-filter changes; when omitted it's derived from `requests` (the + * legacy self-contained behavior, used by unit tests). */ export function buildRequestTimelineRows( requests: RequestRecord[], mode: RowMode, expandedSubagents: ReadonlySet, + stableRowIndex?: ReadonlyMap, ): RequestTimelineRow[] { + const index = stableRowIndex ?? computeStableRowIndex(requests, mode); + const colorFor = (key: string) => + ROW_COLORS[ + (((index.get(key) ?? 0) % ROW_COLORS.length) + ROW_COLORS.length) % ROW_COLORS.length + ]!; + const orderOf = (key: string) => index.get(key) ?? Number.POSITIVE_INFINITY; if (mode !== 'conversation') { // Worker mode: flat rows, sorted by first activity. 
const groups = new Map(); @@ -307,20 +348,20 @@ export function buildRequestTimelineRows( list.push(r); } const rows: RequestTimelineRow[] = []; - let i = 0; for (const [key, list] of groups) { list.sort((a, b) => a.start - b.start); rows.push({ key, label: shortenWid(key), - color: ROW_COLORS[i % ROW_COLORS.length]!, + color: colorFor(key), requests: list, depth: 0, kind: 'worker', }); - i++; } - rows.sort((a, b) => a.requests[0]!.start - b.requests[0]!.start); + rows.sort( + (a, b) => orderOf(a.key) - orderOf(b.key) || a.requests[0]!.start - b.requests[0]!.start, + ); return rows; } @@ -370,12 +411,12 @@ export function buildRequestTimelineRows( if (r.start < tree.firstStart) tree.firstStart = r.start; } - const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart); + const sortedTrees = [...trees.values()].toSorted( + (a, b) => orderOf(a.parentCid) - orderOf(b.parentCid) || a.firstStart - b.firstStart, + ); const rows: RequestTimelineRow[] = []; - let colorIdx = 0; for (const tree of sortedTrees) { - const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!; - colorIdx++; + const color = colorFor(tree.parentCid); // Parent row (use a placeholder key if the parent itself wasn't replayed). tree.parentReqs.sort((a, b) => a.start - b.start); rows.push({ @@ -686,9 +727,16 @@ export function RequestTimelineView({ () => requestsForPhase(data.requests, hasWarmup ? phaseFilter : 'profiling'), [data.requests, phaseFilter, hasWarmup], ); + // Stable order/color per conversation (or worker), computed over the FULL + // request set — NOT the phase-filtered subset — so a row keeps its position + // and color when the user toggles between warmup and profiling. 
+ const stableRowIndex = useMemo( + () => computeStableRowIndex(data.requests, rowMode), + [data.requests, rowMode], + ); const rows = useMemo( - () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents), - [filtered, rowMode, expandedSubagents], + () => buildRequestTimelineRows(filtered, rowMode, expandedSubagents, stableRowIndex), + [filtered, rowMode, expandedSubagents, stableRowIndex], ); const idleStats = useMemo(() => requestIdleStats(filtered), [filtered]); From 25ac7f910bc5933b16798108ff016de38d844bcf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Jul 2026 22:51:46 -0500 Subject: [PATCH 104/111] feat(agentic): open-in-new-tab for request timeline bars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Render each linkable request bar as a real SVG <a href> (the dataset conversation deep link) instead of a <g> with an onClick, so the browser's native "open in new tab" works — right-click → Open Link in New Tab, plus ⌘/Ctrl-click and middle-click. Plain left-click still does the in-app SPA navigation + view-state snapshot; modified/non-primary clicks fall through to the browser, and native link-drag is suppressed so it doesn't fight the pan gesture. Extract the URL construction into an exported conversationHref() helper shared by the click handler and the href. Non-linkable points (no source dataset) keep the plain <g>.
Co-Authored-By: Claude Opus 4.8 --- .../agentic-point/request-timeline.test.ts | 19 +++++ .../agentic-point/request-timeline.tsx | 74 ++++++++++++++----- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts index 6fcf1c57..3a3ebcc5 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -5,6 +5,7 @@ import type { RequestRecord } from '@/hooks/api/use-request-timeline'; import { buildRequestTimelineRows, computeStableRowIndex, + conversationHref, parseTimelineViewSnapshot, requestIdleStats, splitTimelineCid, @@ -107,6 +108,24 @@ describe('subagent timeline hierarchy', () => { }); }); +describe('conversationHref', () => { + it('builds a turn-carrying dataset link for a main-conversation request', () => { + expect( + conversationHref('cc-traces-weka-062126', { ...request(0, 10), cid: 'abc123', ti: 4 }), + ).toBe('/datasets/cc-traces-weka-062126/conversations/abc123?turn=4'); + }); + + it('carries the subagent id and strips the ::sa suffix from the conv id', () => { + expect( + conversationHref('slug', { + ...request(0, 10), + cid: 'abc123::sa:subagent_001_bf1c5c16:s2', + ti: 7, + }), + ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16'); + }); +}); + describe('stable row order + color across phase filters', () => { // Same conversations appear in both warmup and profiling. Their global // first-start order is A (0) < B (10) < C (only profiling, 50). 
The bug: diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index db1ac93f..3e0edd9e 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -34,6 +34,21 @@ export function subagentIdOf(cid: string): string | null { return colon === -1 ? raw : raw.slice(0, colon); } +/** + * Deep-link URL for the dataset conversation a request maps to. Carries the turn + * (and, for subagent requests, the subagent id) so the flamegraph can scroll to + * / highlight the exact node. Used both for SPA navigation on click and as the + * real `href` on the request bar so the browser's native "open in new tab" + * (right-click, ⌘/Ctrl-click, middle-click) works. + */ +export function conversationHref(datasetSlug: string, req: RequestRecord): string { + const convId = datasetConvId(req.cid); + const params = new URLSearchParams({ turn: String(req.ti) }); + const sa = subagentIdOf(req.cid); + if (sa) params.set('sa', sa); + return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`; +} + export interface RequestIdleStats { /** Total time between the first start and last end with no request running. */ idleNs: number; @@ -685,16 +700,8 @@ export function RequestTimelineView({ scrollLeft: scrollRef.current.scrollLeft, }); } - const convId = datasetConvId(req.cid); - // Carry the turn (and, for subagent requests, the subagent id) so the - // flamegraph can scroll to / highlight the exact node this bar maps to. 
- const params = new URLSearchParams({ turn: String(req.ti) }); - const sa = subagentIdOf(req.cid); - if (sa) params.set('sa', sa); track('agentic_timeline_to_dataset', { slug: datasetSlug }); - router.push( - `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`, - ); + router.push(conversationHref(datasetSlug, req)); }, [datasetSlug, router, pointId], ); @@ -1156,14 +1163,12 @@ export function RequestTimelineView({ const runW = Math.max(xEnd - xStart, 1); const queueW = Math.max(xStart - xCredit, 0); const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; - return ( - setTooltip({ x: e.clientX, y: e.clientY, row, req })} - onMouseLeave={() => setTooltip(null)} - onClick={datasetSlug ? () => openConversation(req) : undefined} - style={datasetSlug ? { cursor: 'pointer' } : undefined} - > + const barKey = `${req.cid}-${req.ti}-${req.start}`; + const showTooltip = (e: React.MouseEvent) => + setTooltip({ x: e.clientX, y: e.clientY, row, req }); + const hideTooltip = () => setTooltip(null); + const barChildren = ( + <> {/* Queue lead-in (faint) — only drawn when noticeable. */} {queueW >= 1 && ( )} - + + ); + // No source dataset → not linkable; plain group. + if (!datasetSlug) { + return ( + + {barChildren} + + ); + } + // Linkable: render a real SVG anchor with the conversation + // href so the browser's native "open in new tab" works + // (right-click menu, ⌘/Ctrl-click, middle-click). Plain + // left-click stays an in-app navigation; modified or + // non-primary clicks fall through to the browser. Suppress + // the native link drag so it doesn't fight the pan gesture. 
+ return ( + { + if (e.metaKey || e.ctrlKey || e.shiftKey || e.altKey || e.button !== 0) { + return; + } + e.preventDefault(); + openConversation(req); + }} + onDragStart={(e) => e.preventDefault()} + style={{ cursor: 'pointer' }} + > + {barChildren} + ); }); })} From 7558faa58930a386af8dfb83d27afb460ba6e564 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Jul 2026 22:56:28 -0500 Subject: [PATCH 105/111] fix(agentic): show 1-based turn number in request timeline tooltip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timeline tooltip showed the raw 0-based turn_index while the dataset flamegraph labels turns 1-based ("Turn 1" for the first turn), so the same request read as a different turn in the two views. Display req.ti + 1 to align them. The deep-link `turn` param stays 0-based — the flamegraph matches it against a 0-based turn ordinal for scroll/highlight, so click-through targeting is unchanged. Co-Authored-By: Claude Opus 4.8 --- .../components/inference/agentic-point/request-timeline.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 3e0edd9e..834b7a83 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -592,7 +592,10 @@ function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean })
{row.label} - · turn {req.ti} + {/* Display 1-based to match the dataset flamegraph's "Turn N" labels. + The deep-link `turn` param stays 0-based (req.ti) — the flamegraph + matches it against a 0-based turn ordinal for highlighting. */} + · turn {req.ti + 1} {req.cancelled && · cancelled}
From ad85bed7236f44c4a48d71b3fffb3147734efaf3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Jul 2026 23:28:47 -0500 Subject: [PATCH 106/111] feat(agentic): link timeline requests by raw source Signed-off-by: Cam Quilici --- .../components/datasets/conversation-view.tsx | 5 +++- .../components/datasets/trace-flamegraph.tsx | 17 +++++++++-- .../agentic-point/request-timeline.test.ts | 13 ++++++++ .../agentic-point/request-timeline.tsx | 26 +++++++++++++--- .../app/src/hooks/api/use-request-timeline.ts | 6 ++++ .../src/etl/compute-request-timeline.test.ts | 30 +++++++++++++++++++ .../db/src/etl/compute-request-timeline.ts | 14 ++++++++- packages/db/src/etl/weka-structure.test.ts | 14 +++++++++ packages/db/src/etl/weka-structure.ts | 5 +++- 9 files changed, 120 insertions(+), 10 deletions(-) diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index ce10241a..0be8e58a 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -11,13 +11,15 @@ import { compact } from './format'; export function ConversationView({ slug, convId }: { slug: string; convId: string }) { const { data, isLoading, isError } = useDatasetConversation(slug, convId); - // Deep-link target from a request-timeline click: ?turn=[&sa=]. + // Deep-link target from a request-timeline click: ?raw= or ?turn=[&sa=]. // useSearchParams (not a one-shot window.location read) so the params are // present on the very first client-side navigation, not just after a reload. const params = useSearchParams(); const turnRaw = params.get('turn'); + const sourceRaw = params.get('raw'); const highlight = { turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null, + raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? 
Number(sourceRaw) : null, agent: params.get('sa'), }; @@ -95,6 +97,7 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 158c03c3..f82f0b5f 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -169,20 +169,31 @@ interface TooltipState { export function TraceFlamegraph({ structure, highlightTurn, + highlightRawIndex, highlightAgentId, }: { structure: ConversationStructure; /** Turn index to scroll to / highlight (from a request-timeline deep link). */ highlightTurn?: number | null; + /** Raw Weka top-level request index to scroll to / highlight. */ + highlightRawIndex?: number | null; /** Subagent id when the highlighted turn is inside a subagent group. */ highlightAgentId?: string | null; }) { const nodes = structure.nodes; // Resolve the deep-link target to a row key (+ the group that must be open to - // show it). Main turns match by their main-turn ordinal; subagent turns match - // the group by agentId, then the ti-th child. + // show it). Raw Weka indexes are exact source coordinates and take precedence; + // otherwise main turns match by main-turn ordinal and subagent turns match the + // group by agentId, then the ti-th child. 
const target = useMemo(() => { + if (typeof highlightRawIndex === 'number' && highlightRawIndex >= 0) { + const i = nodes.findIndex( + (node) => node.kind === 'turn' && node.rawIndex === highlightRawIndex, + ); + if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null }; + return null; + } if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null; if (highlightAgentId) { const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId); @@ -199,7 +210,7 @@ export function TraceFlamegraph({ } } return null; - }, [nodes, highlightTurn, highlightAgentId]); + }, [nodes, highlightTurn, highlightRawIndex, highlightAgentId]); // Subagent groups collapsed by default — except the deep-link target's group. const [expanded, setExpanded] = useState>(() => diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts index 3a3ebcc5..bebb63a9 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -124,6 +124,19 @@ describe('conversationHref', () => { }), ).toBe('/datasets/slug/conversations/abc123?turn=7&sa=subagent_001_bf1c5c16'); }); + + it('uses raw source provenance for flattened-agent dataset links', () => { + expect( + conversationHref('slug', { + ...request(0, 10), + cid: '02bc0afb13f7a2d9efa86c28511261d85c0e::fa:003', + ti: 3, + srcTrace: '02bc0afb13f7a2d9efa86c28511261d85c0e', + srcOuter: 204, + srcKind: 'weka_flat', + }), + ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204'); + }); }); describe('stable row order + color across phase filters', () => { diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 834b7a83..592f5c37 100644 --- 
a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -42,8 +42,11 @@ export function subagentIdOf(cid: string): string | null { * (right-click, ⌘/Ctrl-click, middle-click) works. */ export function conversationHref(datasetSlug: string, req: RequestRecord): string { - const convId = datasetConvId(req.cid); + const convId = req.srcTrace ?? datasetConvId(req.cid); const params = new URLSearchParams({ turn: String(req.ti) }); + if (typeof req.srcOuter === 'number' && Number.isInteger(req.srcOuter) && req.srcOuter >= 0) { + params.set('raw', String(req.srcOuter)); + } const sa = subagentIdOf(req.cid); if (sa) params.set('sa', sa); return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`; @@ -299,21 +302,36 @@ export function splitTimelineCid(cid: string): { * it, filtering to warmup vs profiling re-sorts and re-colors by whatever subset * is showing, making rows jump and swap colors. * - * Order key is the group's earliest request start across all phases; ties break - * on the group id for determinism. + * Groups that span BOTH phases sort first. The shared set is by definition + * present in either phase's view, so this leading block renders identically in + * both — a conversation that carries over from warmup into profiling stays on + * the exact same row when the toggle flips. Phase-exclusive groups follow, and + * only they reflow between views. Within each block the order key is the + * group's earliest request start across all phases; ties break on the group id + * for determinism. */ export function computeStableRowIndex( requests: readonly RequestRecord[], mode: RowMode, ): Map { const firstStart = new Map(); + // Which phases each group appears in. Mirrors requestsForPhase's split: + // 'profiling' is exact, anything else counts as warmup. 
+ const inProfiling = new Set(); + const inWarmup = new Set(); for (const r of requests) { const key = mode === 'conversation' ? splitTimelineCid(r.cid).parent : r.wid; const cur = firstStart.get(key); if (cur === undefined || r.start < cur) firstStart.set(key, r.start); + if (r.phase === 'profiling') inProfiling.add(key); + else inWarmup.add(key); } + const spansBoth = (key: string) => inProfiling.has(key) && inWarmup.has(key); const keys = [...firstStart.keys()].toSorted( - (a, b) => firstStart.get(a)! - firstStart.get(b)! || (a < b ? -1 : a > b ? 1 : 0), + (a, b) => + Number(spansBoth(b)) - Number(spansBoth(a)) || + firstStart.get(a)! - firstStart.get(b)! || + (a < b ? -1 : a > b ? 1 : 0), ); const index = new Map(); keys.forEach((key, i) => index.set(key, i)); diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts index 094d2230..306d1416 100644 --- a/packages/app/src/hooks/api/use-request-timeline.ts +++ b/packages/app/src/hooks/api/use-request-timeline.ts @@ -5,6 +5,12 @@ export interface RequestRecord { cid: string; /** Zero-based turn index within the conversation. */ ti: number; + /** Source trace id from the original raw dataset, when provided by AIPerf. */ + srcTrace?: string; + /** Original raw top-level request index within srcTrace. */ + srcOuter?: number; + /** Loader-specific source kind, e.g. weka_main or weka_flat. */ + srcKind?: string; /** Worker id (concurrency slot that handled this request). */ wid: string; /** Sub-agent depth (0 = top-level). 
*/ diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts index 61e69fe8..409dc091 100644 --- a/packages/db/src/etl/compute-request-timeline.test.ts +++ b/packages/db/src/etl/compute-request-timeline.test.ts @@ -7,6 +7,9 @@ import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-requ interface SyntheticRequest { cid: string; ti: number; + srcTrace?: string; + srcOuter?: number; + srcKind?: string; wid?: string; ad?: number; phase?: string; @@ -28,6 +31,9 @@ function makeBlob(requests: SyntheticRequest[]) { metadata: { conversation_id: r.cid, turn_index: r.ti, + ...(r.srcTrace === undefined ? {} : { source_trace_id: r.srcTrace }), + ...(r.srcOuter === undefined ? {} : { source_outer_idx: r.srcOuter }), + ...(r.srcKind === undefined ? {} : { source_kind: r.srcKind }), worker_id: r.wid ?? 'worker_default', agent_depth: r.ad ?? 0, benchmark_phase: r.phase ?? 'profiling', @@ -119,6 +125,30 @@ describe('computeRequestTimeline', () => { expect(r.phase).toBe('profiling'); }); + it('preserves raw source provenance fields when present', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'trace::fa:003', + ti: 3, + srcTrace: 'trace', + srcOuter: 204, + srcKind: 'weka_flat', + credit: 0, + start: 10, + end: 100, + }, + ]), + ); + expect(tl?.requests[0]).toMatchObject({ + cid: 'trace::fa:003', + ti: 3, + srcTrace: 'trace', + srcOuter: 204, + srcKind: 'weka_flat', + }); + }); + it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => { const tl = computeRequestTimeline( makeBlob([ diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts index 707e8c54..85f782fc 100644 --- a/packages/db/src/etl/compute-request-timeline.ts +++ b/packages/db/src/etl/compute-request-timeline.ts @@ -14,13 +14,19 @@ import { gunzipSync } from 'node:zlib'; /** Bump when the extraction algorithm changes — backfill recomputes anything 
older. */ -export const REQUEST_TIMELINE_VERSION = 3; +export const REQUEST_TIMELINE_VERSION = 4; export interface RequestRecord { /** Conversation id (groups turns of one agent session). */ cid: string; /** Zero-based turn index within the conversation. */ ti: number; + /** Source trace id from the original raw dataset, when distinct from replay cid. */ + srcTrace?: string; + /** Original raw top-level request index within srcTrace. */ + srcOuter?: number; + /** Loader-specific source kind, e.g. weka_main or weka_flat. */ + srcKind?: string; /** Worker id (concurrency slot that handled this request). */ wid: string; /** Sub-agent depth (0 = top-level). */ @@ -60,6 +66,9 @@ export interface RequestTimeline { interface RawMetadata { conversation_id?: string; turn_index?: number; + source_trace_id?: string; + source_outer_idx?: number; + source_kind?: string; worker_id?: string; agent_depth?: number; benchmark_phase?: string; @@ -164,6 +173,9 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n requests.push({ cid: m.conversation_id ?? 'unknown', ti: typeof m.turn_index === 'number' ? m.turn_index : 0, + srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined, + srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined, + srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined, wid: m.worker_id ?? 'unknown', ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0, phase: m.benchmark_phase ?? 
'unknown', diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index 97e8759d..dec2254c 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -61,6 +61,20 @@ describe('buildConversationStructure', () => { }); }); + it('stamps top-level turns with their raw Weka request index', () => { + const structure = buildConversationStructure({ + id: 'raw-index', + requests: [ + { type: 'n', in: 1, out: 1 }, + { type: 'subagent', requests: [{ type: 'n', in: 1, out: 1 }] }, + { type: 'n', in: 1, out: 1 }, + ], + }); + + expect((structure.nodes[0] as TurnNode).rawIndex).toBe(0); + expect((structure.nodes[2] as TurnNode).rawIndex).toBe(2); + }); + it('clamps cached to the effective input on a partial last block', () => { const conv: RawWekaConversation = { id: 'c2', diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index f6cea1c1..bbdb8791 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -48,6 +48,8 @@ export interface RawWekaConversation { export interface TurnNode { kind: 'turn'; turnIndex: number; + /** Zero-based index in the raw Weka requests array, when this row maps to one. */ + rawIndex?: number; /** Seconds from the start of the conversation. */ startS?: number; /** End of the original request interval (`startS + api_time`). */ @@ -218,7 +220,7 @@ export function buildConversationStructure( let numSubagentGroups = 0; let turnIndex = 0; - for (const entry of conv.requests ?? []) { + for (const [idx, entry] of (conv.requests ?? 
[]).entries()) { if (isSubagent(entry)) { const { startS, endS } = subagentTimeRange(entry); const childSeen = new Set(seen); // snapshot at spawn; not merged back @@ -272,6 +274,7 @@ export function buildConversationStructure( nodes.push({ kind: 'turn', turnIndex: turnIndex++, + rawIndex: idx, startS, endS: requestEndS(startS, entry.api_time), model: entry.model, From c3c3d4044c3b590454ffccdce4583119ccb88690 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Jul 2026 23:33:36 -0500 Subject: [PATCH 107/111] test(agentic): absolute row alignment for phase-spanning timeline conversations Covers the shared-first ordering in computeStableRowIndex (landed in ad85bed): conversations present in both warmup and profiling must occupy the same absolute row in both phase views, with phase-exclusive conversations filling in below. Guards the case where earlier-starting warmup-only conversations would otherwise push the shared block down in one view only. Co-Authored-By: Claude Fable 5 --- .../agentic-point/request-timeline.test.ts | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts index bebb63a9..779b79f3 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -186,6 +186,37 @@ describe('stable row order + color across phase filters', () => { expect(warmupColors.A).not.toBe(warmupColors.B); }); + it('phase-spanning conversations occupy the same ABSOLUTE row in both phase views', () => { + // Warmup-only conversations start earliest — under a plain global-start + // ordering they'd sit above the shared ones in the warmup view but be + // absent from the profiling view, sliding every shared row up when the + // toggle flips. 
Spanning conversations must sort first so the leading block + // is identical in both views and a carried-over conversation never moves. + const data: RequestRecord[] = [ + rec('W1', 'warmup', 0, 2), + rec('W2', 'warmup', 3, 4), + rec('A', 'warmup', 5, 8), + rec('A', 'profiling', 100, 110), + rec('B', 'warmup', 10, 15), + rec('B', 'profiling', 120, 130), + rec('P', 'profiling', 50, 60), + ]; + const index = computeStableRowIndex(data, 'conversation'); + const parentLabels = (phase: RequestRecord['phase']) => + buildRequestTimelineRows( + data.filter((r) => r.phase === phase), + 'conversation', + new Set(), + index, + ) + .filter((r) => r.kind === 'parent') + .map((r) => r.label); + // Shared block [A, B] leads both views at rows 0 and 1; phase-unique + // conversations fill in below. + expect(parentLabels('warmup')).toEqual(['A', 'B', 'W1', 'W2']); + expect(parentLabels('profiling')).toEqual(['A', 'B', 'P']); + }); + it('without a shared index, the same subset re-sorts by its own start times (regression guard)', () => { // Sanity: the legacy self-contained path (no index arg) orders by the // subset's own first-start, which is exactly why the shared index is needed. 
From 9a31af3880d560177879be6e527baa71091d1948 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Jul 2026 23:50:05 -0500 Subject: [PATCH 108/111] feat(agentic): link subagent requests by source Signed-off-by: Cam Quilici --- .../components/datasets/conversation-view.tsx | 3 +++ .../components/datasets/trace-flamegraph.tsx | 23 +++++++++++++++---- .../agentic-point/request-timeline.test.ts | 16 +++++++++++++ .../agentic-point/request-timeline.tsx | 18 +++++++++++---- .../app/src/hooks/api/use-request-timeline.ts | 2 ++ .../src/etl/compute-request-timeline.test.ts | 4 ++++ .../db/src/etl/compute-request-timeline.ts | 6 ++++- packages/db/src/etl/weka-structure.test.ts | 5 ++++ packages/db/src/etl/weka-structure.ts | 9 +++++++- 9 files changed, 75 insertions(+), 11 deletions(-) diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index 0be8e58a..359ca381 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -17,9 +17,11 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin const params = useSearchParams(); const turnRaw = params.get('turn'); const sourceRaw = params.get('raw'); + const sourceInner = params.get('inner'); const highlight = { turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null, raw: sourceRaw !== null && /^\d+$/u.test(sourceRaw) ? Number(sourceRaw) : null, + inner: sourceInner !== null && /^\d+$/u.test(sourceInner) ? 
Number(sourceInner) : null, agent: params.get('sa'), }; @@ -98,6 +100,7 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin structure={data.structure} highlightTurn={highlight.turn} highlightRawIndex={highlight.raw} + highlightInnerIndex={highlight.inner} highlightAgentId={highlight.agent} /> diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index f82f0b5f..a3366342 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -170,6 +170,7 @@ export function TraceFlamegraph({ structure, highlightTurn, highlightRawIndex, + highlightInnerIndex, highlightAgentId, }: { structure: ConversationStructure; @@ -177,17 +178,31 @@ export function TraceFlamegraph({ highlightTurn?: number | null; /** Raw Weka top-level request index to scroll to / highlight. */ highlightRawIndex?: number | null; + /** Raw Weka nested request index under highlightRawIndex, for subagent children. */ + highlightInnerIndex?: number | null; /** Subagent id when the highlighted turn is inside a subagent group. */ highlightAgentId?: string | null; }) { const nodes = structure.nodes; // Resolve the deep-link target to a row key (+ the group that must be open to - // show it). Raw Weka indexes are exact source coordinates and take precedence; - // otherwise main turns match by main-turn ordinal and subagent turns match the - // group by agentId, then the ti-th child. + // show it). Raw Weka source coordinates are exact and take precedence: + // raw= -> top-level Weka request + // raw=&inner= -> subagent child inside that top-level marker + // Otherwise main turns match by main-turn ordinal and subagent turns match + // the group by agentId, then the ti-th child. 
const target = useMemo(() => { if (typeof highlightRawIndex === 'number' && highlightRawIndex >= 0) { + if (typeof highlightInnerIndex === 'number' && highlightInnerIndex >= 0) { + const gi = nodes.findIndex( + (node) => node.kind === 'subagent' && node.rawIndex === highlightRawIndex, + ); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + const ci = group.children.findIndex((child) => child.innerIndex === highlightInnerIndex); + if (ci === -1) return null; + return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi }; + } const i = nodes.findIndex( (node) => node.kind === 'turn' && node.rawIndex === highlightRawIndex, ); @@ -210,7 +225,7 @@ export function TraceFlamegraph({ } } return null; - }, [nodes, highlightTurn, highlightRawIndex, highlightAgentId]); + }, [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId]); // Subagent groups collapsed by default — except the deep-link target's group. const [expanded, setExpanded] = useState>(() => diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts index 779b79f3..17d6d1bc 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.test.ts +++ b/packages/app/src/components/inference/agentic-point/request-timeline.test.ts @@ -137,6 +137,22 @@ describe('conversationHref', () => { }), ).toBe('/datasets/slug/conversations/02bc0afb13f7a2d9efa86c28511261d85c0e?turn=3&raw=204'); }); + + it('uses raw nested source provenance for subagent child links', () => { + expect( + conversationHref('slug', { + ...request(0, 10), + cid: '117ebe75819d050f308a0a81647893abd02d::sa:subagent_010_32ee2daa', + ti: 16, + srcTrace: '117ebe75819d050f308a0a81647893abd02d', + srcOuter: 39, + srcInner: 16, + srcKind: 'weka_subagent', + }), + ).toBe( + '/datasets/slug/conversations/117ebe75819d050f308a0a81647893abd02d?turn=16&raw=39&inner=16', + ); + }); }); describe('stable row 
order + color across phase filters', () => { diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 592f5c37..9afad5e6 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -46,12 +46,23 @@ export function conversationHref(datasetSlug: string, req: RequestRecord): strin const params = new URLSearchParams({ turn: String(req.ti) }); if (typeof req.srcOuter === 'number' && Number.isInteger(req.srcOuter) && req.srcOuter >= 0) { params.set('raw', String(req.srcOuter)); + if (typeof req.srcInner === 'number' && Number.isInteger(req.srcInner) && req.srcInner >= 0) { + params.set('inner', String(req.srcInner)); + } } const sa = subagentIdOf(req.cid); - if (sa) params.set('sa', sa); + if (sa && !params.has('inner')) params.set('sa', sa); return `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`; } +function requestSourceLabel(req: RequestRecord): string { + if (typeof req.srcOuter === 'number') { + if (typeof req.srcInner === 'number') return `raw ${req.srcOuter} / child ${req.srcInner}`; + return `raw ${req.srcOuter}`; + } + return `replay turn ${req.ti + 1}`; +} + export interface RequestIdleStats { /** Total time between the first start and last end with no request running. */ idleNs: number; @@ -610,10 +621,7 @@ function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean })
{row.label} - {/* Display 1-based to match the dataset flamegraph's "Turn N" labels. - The deep-link `turn` param stays 0-based (req.ti) — the flamegraph - matches it against a 0-based turn ordinal for highlighting. */} - · turn {req.ti + 1} + · {requestSourceLabel(req)} {req.cancelled && · cancelled}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts index 306d1416..d2143c2b 100644 --- a/packages/app/src/hooks/api/use-request-timeline.ts +++ b/packages/app/src/hooks/api/use-request-timeline.ts @@ -9,6 +9,8 @@ export interface RequestRecord { srcTrace?: string; /** Original raw top-level request index within srcTrace. */ srcOuter?: number; + /** Original nested request index within srcOuter, for subagent children. */ + srcInner?: number; /** Loader-specific source kind, e.g. weka_main or weka_flat. */ srcKind?: string; /** Worker id (concurrency slot that handled this request). */ diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts index 409dc091..1ad9e63b 100644 --- a/packages/db/src/etl/compute-request-timeline.test.ts +++ b/packages/db/src/etl/compute-request-timeline.test.ts @@ -9,6 +9,7 @@ interface SyntheticRequest { ti: number; srcTrace?: string; srcOuter?: number; + srcInner?: number; srcKind?: string; wid?: string; ad?: number; @@ -33,6 +34,7 @@ function makeBlob(requests: SyntheticRequest[]) { turn_index: r.ti, ...(r.srcTrace === undefined ? {} : { source_trace_id: r.srcTrace }), ...(r.srcOuter === undefined ? {} : { source_outer_idx: r.srcOuter }), + ...(r.srcInner === undefined ? {} : { source_inner_idx: r.srcInner }), ...(r.srcKind === undefined ? {} : { source_kind: r.srcKind }), worker_id: r.wid ?? 'worker_default', agent_depth: r.ad ?? 
0, @@ -133,6 +135,7 @@ describe('computeRequestTimeline', () => { ti: 3, srcTrace: 'trace', srcOuter: 204, + srcInner: 16, srcKind: 'weka_flat', credit: 0, start: 10, @@ -145,6 +148,7 @@ describe('computeRequestTimeline', () => { ti: 3, srcTrace: 'trace', srcOuter: 204, + srcInner: 16, srcKind: 'weka_flat', }); }); diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts index 85f782fc..2cbe5174 100644 --- a/packages/db/src/etl/compute-request-timeline.ts +++ b/packages/db/src/etl/compute-request-timeline.ts @@ -14,7 +14,7 @@ import { gunzipSync } from 'node:zlib'; /** Bump when the extraction algorithm changes — backfill recomputes anything older. */ -export const REQUEST_TIMELINE_VERSION = 4; +export const REQUEST_TIMELINE_VERSION = 5; export interface RequestRecord { /** Conversation id (groups turns of one agent session). */ @@ -25,6 +25,8 @@ export interface RequestRecord { srcTrace?: string; /** Original raw top-level request index within srcTrace. */ srcOuter?: number; + /** Original nested request index within srcOuter, for subagent children. */ + srcInner?: number; /** Loader-specific source kind, e.g. weka_main or weka_flat. */ srcKind?: string; /** Worker id (concurrency slot that handled this request). */ @@ -68,6 +70,7 @@ interface RawMetadata { turn_index?: number; source_trace_id?: string; source_outer_idx?: number; + source_inner_idx?: number; source_kind?: string; worker_id?: string; agent_depth?: number; @@ -175,6 +178,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n ti: typeof m.turn_index === 'number' ? m.turn_index : 0, srcTrace: typeof m.source_trace_id === 'string' ? m.source_trace_id : undefined, srcOuter: typeof m.source_outer_idx === 'number' ? m.source_outer_idx : undefined, + srcInner: typeof m.source_inner_idx === 'number' ? m.source_inner_idx : undefined, srcKind: typeof m.source_kind === 'string' ? m.source_kind : undefined, wid: m.worker_id ?? 
'unknown', ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0, diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index dec2254c..5900d151 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -131,6 +131,7 @@ describe('buildConversationStructure', () => { expect(sub.kind).toBe('subagent'); expect(sub.label).toBe('Explore'); expect(sub.agentId).toBe('a1'); + expect(sub.rawIndex).toBe(1); expect(sub.durationMs).toBe(1234); expect(sub.startS).toBe(12.5); expect(sub.endS).toBeCloseTo(13.734, 6); @@ -141,6 +142,10 @@ describe('buildConversationStructure', () => { [12.5, 12.5], [13.1, 13.1], ]); + expect(sub.children.map((child) => [child.rawIndex, child.innerIndex])).toEqual([ + [1, 0], + [1, 1], + ]); expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child expect(sub.in).toBe(256); diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index bbdb8791..edc192ea 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -50,6 +50,8 @@ export interface TurnNode { turnIndex: number; /** Zero-based index in the raw Weka requests array, when this row maps to one. */ rawIndex?: number; + /** Zero-based index within a raw nested request array, when this row maps to one. */ + innerIndex?: number; /** Seconds from the start of the conversation. */ startS?: number; /** End of the original request interval (`startS + api_time`). */ @@ -67,6 +69,8 @@ export interface SubagentNode { kind: 'subagent'; label: string; agentId?: string; + /** Zero-based index of the raw top-level subagent marker. */ + rawIndex?: number; /** Seconds from the start of the conversation. */ startS?: number; /** Seconds from the start of the conversation. 
*/ @@ -229,13 +233,15 @@ export function buildConversationStructure( let gout = 0; let gcached = 0; let guncached = 0; - for (const inner of entry.requests ?? []) { + for (const [innerIdx, inner] of (entry.requests ?? []).entries()) { const split = splitInput(inner, childSeen, blockSize); const out = Math.max(0, Math.round(inner.out ?? 0)); const childStartS = subagentRequestStartS(entry, inner); children.push({ kind: 'turn', turnIndex: turnIndex++, + rawIndex: idx, + innerIndex: innerIdx, startS: childStartS, endS: requestEndS(childStartS, inner.api_time), model: inner.model, @@ -253,6 +259,7 @@ export function buildConversationStructure( kind: 'subagent', label: subagentLabel(entry), agentId: entry.agent_id, + rawIndex: idx, startS, endS, durationMs: entry.duration_ms, From 173836e6ad25d7256cb1b2143798331a80687993 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 00:13:31 -0500 Subject: [PATCH 109/111] feat(ingest): support v3 agentic agg schema (nested request/server metrics) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agentic bmk rows (2026-07-02+) restructure the flat metric keys into nested request_metrics / server_metrics containers, add p50 percentiles, ship intvty already slow-tail inverted, scope the hw id (cluster:b300-nv), and replace offload_mode with kv_offloading ('none'|'dram'|…) + kv_offload_backend. - flattenAgenticAggRow(): map the nested containers back onto the canonical flat metric schema before the rest of the mapper runs, so v1/v2/v3 rows all produce one consistent metrics shape. p50 stats are stored as median_* to match the existing naming; the derive-from-itl intvty invariant is kept (it now agrees with the artifact's pre-inverted values). - hwToGpuKey(): strip the v3 scope prefix (cluster:) — without this every v3 row would be skipped as unmapped hardware. 
- offload: kv_offloading descriptor reduces to the binary on/off used for row identity; the tier + backend strings are preserved as metrics for the UI. - METRIC_KEYS: register gpu_kv_cache_usage_pct and server_external_cache_hit_rate (both previously warned as auto-captured). - Deliberately not mapped yet: prefix hit/query counts, cpu KV detail, prompt_by_source split, sources[] — noted inline for when a view needs them. No DB schema changes — everything lands in the existing metrics JSONB and offload_mode column. Verified against the real artifacts from GH run 28553943579 (71 flat metrics on the conc16 row; kvdram-mooncake row maps to offload on with backend preserved). Co-Authored-By: Claude Fable 5 --- packages/constants/src/metric-keys.ts | 5 + packages/db/src/etl/benchmark-mapper.test.ts | 247 +++++++++++++++++++ packages/db/src/etl/benchmark-mapper.ts | 164 +++++++++++- packages/db/src/etl/normalizers.test.ts | 5 + packages/db/src/etl/normalizers.ts | 8 +- 5 files changed, 419 insertions(+), 10 deletions(-) diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts index 0acf3fbf..914eed4b 100644 --- a/packages/constants/src/metric-keys.ts +++ b/packages/constants/src/metric-keys.ts @@ -106,7 +106,12 @@ export const METRIC_KEYS = new Set([ // server prefix-cache observability (agentic aiperf) 'server_gpu_cache_hit_rate', 'server_cpu_cache_hit_rate', + 'server_external_cache_hit_rate', 'theoretical_cache_hit_rate', + // server KV-cache occupancy — mean GPU KV-cache usage fraction (0-1) over the + // profiling window (agentic aiperf; flat in v2 artifacts, mapped from + // server_metrics.kv_cache.gpu_usage_pct in v3) + 'gpu_kv_cache_usage_pct', // measured power / energy (emitted by runner's aggregate_power.py) // avg_power_w: mean per-GPU draw (W) during the load window // joules_per_output_token: energy / total_output_tokens. 
CLUSTER-WIDE on diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts index 5fe9ffde..cde2f74b 100644 --- a/packages/db/src/etl/benchmark-mapper.test.ts +++ b/packages/db/src/etl/benchmark-mapper.test.ts @@ -614,3 +614,250 @@ describe('mapBenchmarkRow — agentic interactivity normalization', () => { expect(result!.metrics.p90_intvty).toBe(999); }); }); + +/** + * Minimal v3 agentic row (2026-07-02+): nested request_metrics/server_metrics, + * p50 percentiles, pre-inverted intvty, kv_offloading descriptors. Mirrors the + * real artifact from GH run 28553943579 (trimmed). + */ +function makeV3AgenticRow(overrides: Record = {}): Record { + return { + infmax_model_prefix: 'dsv4', + hw: 'cluster:b300-nv', + framework: 'vllm', + precision: 'fp4', + spec_decoding: 'none', + disagg: false, + scenario_type: 'agentic-coding', + is_multinode: false, + tp: 4, + ep: 1, + dp_attention: 'false', + conc: 16, + image: 'vllm/vllm-openai:v0.23.0', + kv_offloading: 'none', + kv_offload_backend: '', + num_requests_total: 1648, + num_requests_successful: 1648, + dataset: { + source_type: 'public_dataset', + hf_dataset_name: 'semianalysisai/cc-traces-weka-062126', + }, + request_metrics: { + qps: { + window_seconds: 1, + samples: 7209, + mean: 0.22846, + p50: 0, + p75: 0, + p90: 1, + p95: 1, + std: 0.60707, + }, + latency: { + ttft: { + mean: 12.90033, + p50: 1.49712, + p75: 12.09501, + p90: 56.22194, + p95: 68.03156, + std: 22.68353, + }, + e2el: { + mean: 81.05644, + p50: 26.18817, + p75: 84.93601, + p90: 199.85996, + p95: 360.31579, + std: 149.59205, + }, + itl: { + mean: 0.07548, + p50: 0.03677, + p75: 0.10253, + p90: 0.16652, + p95: 0.22255, + std: 0.08327, + }, + tpot: { + mean: 0.07548, + p50: 0.03677, + p75: 0.10253, + p90: 0.16652, + p95: 0.22255, + std: 0.08327, + }, + // already slow-tail inverted upstream (pXX_intvty = 1/pXX_itl) + intvty: { + mean: 13.2482, + p50: 27.19411, + p75: 9.75304, + p90: 6.00526, + p95: 4.49335, + 
std: 24.77636, + }, + }, + tokens: { + input: { + mean: 157676.054, + p50: 96047, + p75: 197684.25, + p90: 404935.9, + p95: 547502.85, + std: 152480.17653, + }, + output_actual: { + mean: 849.06735, + p50: 290.5, + p75: 783.5, + p90: 2231.8, + p95: 3915.45, + std: 1568.90823, + }, + output_expected: { + mean: 1432.32728, + p50: 571.5, + p75: 1820, + p90: 3927, + p95: 5312.9, + std: 2067.19215, + }, + }, + throughput: { + input: { tokens_per_second: 35980.14001 }, + output: { tokens_per_second: 193.7489 }, + total: { tokens_per_second: 36173.88892 }, + duration_seconds: 7222.04352, + per_gpu: { + total_tput_tps: 9043.47223, + output_tput_tps: 48.43723, + input_tput_tps: 8995.035, + }, + }, + cache: { theoretical_cache_hit_rate: 0.97509 }, + }, + server_metrics: { + present: true, + adapter: 'vllm', + metric_count: 49, + cache: { + gpu_cache_hit_rate: 0.78539, + cpu_cache_hit_rate: 0, + external_cache_hit_rate: 0, + overall_cache_hit_rate: 0.78539, + prefix_cache_hits: 205576960, + prefix_cache_queries: 261750519, + frontend_cache_hit_rate: null, + }, + kv_cache: { gpu_usage_pct: 0.82134, cpu_usage_pct: null, cpu_used_tokens: null }, + tokens: { + prompt_total: 261750519, + generation_total: 1422696, + requests_completed: 1648, + prompt_by_source: { + gpu_cache_hit: 205576960, + cpu_or_external_cache_hit: 0, + computed: 56173559, + }, + }, + sources: [{ id: 'combined|http://localhost:8888/metrics|engine=0', role: 'combined' }], + }, + ...overrides, + }; +} + +describe('mapBenchmarkRow — v3 agentic nested agg schema', () => { + it('maps identity/routing and flattens the nested containers', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV3AgenticRow(), tracker); + + expect(result).not.toBeNull(); + expect(result!.benchmarkType).toBe('agentic_traces'); + expect(result!.config.hardware).toBe('b300'); + expect(result!.conc).toBe(16); + expect(result!.isl).toBeNull(); + expect(result!.osl).toBeNull(); + + const m = result!.metrics; + // 
latency distributions, p50 stored under the canonical median_* name + expect(m.median_ttft).toBeCloseTo(1.49712, 6); + expect(m.p90_ttft).toBeCloseTo(56.22194, 6); + expect(m.std_e2el).toBeCloseTo(149.59205, 6); + expect(m.p95_itl).toBeCloseTo(0.22255, 6); + expect(m.mean_tpot).toBeCloseTo(0.07548, 6); + // qps + token distributions + expect(m.median_qps).toBe(0); + expect(m.p90_input_tokens).toBeCloseTo(404935.9, 3); + expect(m.median_output_tokens_actual).toBeCloseTo(290.5, 3); + expect(m.p95_output_tokens_expected).toBeCloseTo(5312.9, 3); + // throughput scalars under the v2 flat names + expect(m.tput_per_gpu).toBeCloseTo(9043.47223, 3); + expect(m.output_tput_per_gpu).toBeCloseTo(48.43723, 3); + expect(m.input_tput_per_gpu).toBeCloseTo(8995.035, 3); + expect(m.total_tput_tps).toBeCloseTo(36173.88892, 3); + expect(m.duration_seconds).toBeCloseTo(7222.04352, 3); + // cache / kv / totals + expect(m.theoretical_cache_hit_rate).toBeCloseTo(0.97509, 6); + expect(m.server_gpu_cache_hit_rate).toBeCloseTo(0.78539, 6); + expect(m.server_external_cache_hit_rate).toBe(0); + expect(m.gpu_kv_cache_usage_pct).toBeCloseTo(0.82134, 6); + expect(m.total_prompt_tokens).toBe(261750519); + expect(m.total_generation_tokens).toBe(1422696); + expect(m.total_requests_completed).toBe(1648); + // nested containers must not leak into metrics + expect(m).not.toHaveProperty('request_metrics'); + expect(m).not.toHaveProperty('server_metrics'); + }); + + it('re-derives *_intvty from *_itl (matching the pre-inverted artifact values)', () => { + const tracker = createSkipTracker(); + const m = mapBenchmarkRow(makeV3AgenticRow(), tracker)!.metrics; + // The artifact already ships slow-tail intvty; the derive invariant keeps + // one definition and must agree with it (up to the artifact's rounding). 
+ expect(m.median_intvty).toBeCloseTo(1 / 0.03677, 6); + expect(m.p90_intvty).toBeCloseTo(1 / 0.16652, 6); + expect(m.median_intvty).toBeCloseTo(27.19411, 2); + expect(m.p90_intvty).toBeCloseTo(6.00526, 2); + // std is never inverted — passes through from the artifact + expect(m.std_intvty).toBeCloseTo(24.77636, 6); + }); + + it("maps kv_offloading 'none' to offload off and skips the empty backend", () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV3AgenticRow(), tracker); + expect(result!.offloadMode).toBe('off'); + expect(result!.metrics).not.toHaveProperty('kv_offload_backend'); + }); + + it("maps kv_offloading 'dram' + backend to offload on with the backend preserved", () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV3AgenticRow({ kv_offloading: 'dram', kv_offload_backend: 'mooncake', conc: 32 }), + tracker, + ); + expect(result!.offloadMode).toBe('on'); + expect((result!.metrics as Record).kv_offloading).toBe('dram'); + expect((result!.metrics as Record).kv_offload_backend).toBe('mooncake'); + }); + + it('still applies the failed-run guard to v3 rows', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV3AgenticRow({ num_requests_successful: 0, num_requests_total: 100 }), + tracker, + ); + expect(result).toBeNull(); + expect(tracker.skips.failedRun).toBe(1); + }); + + it('leaves v2 flat agentic rows byte-identical (no flattening applied)', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeAgenticRow({ p90_itl: 0.1, mean_ttft: 1.5, offload_mode: 'on' }), + tracker, + ); + expect(result!.metrics.mean_ttft).toBe(1.5); + expect(result!.metrics.p90_intvty).toBeCloseTo(10, 6); + expect(result!.offloadMode).toBe('on'); + }); +}); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 5ec3343c..59309945 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ 
b/packages/db/src/etl/benchmark-mapper.ts @@ -63,6 +63,14 @@ const NON_METRIC_KEYS = new Set([ 'offload_mode', 'num_requests_total', 'num_requests_successful', + // v3 agentic KV-offload descriptors ('none'|'dram'|… + backend name). Mapped + // to offloadMode / stringified metrics explicitly in mapBenchmarkRow. + 'kv_offloading', + 'kv_offload_backend', + // v3 agentic nested containers — flattened by flattenAgenticAggRow before + // the auto-capture loop runs; the raw objects themselves are not metrics. + 'request_metrics', + 'server_metrics', // Public-dataset provenance emitted by aiperf. The ingest runner uses this // object to populate run_datasets; it is not a benchmark metric. 'dataset', @@ -79,6 +87,136 @@ const NON_METRIC_KEYS = new Set([ */ export type BenchmarkType = 'single_turn' | 'agentic_traces'; +// --------------------------------------------------------------------------- +// v3 agentic agg schema (2026-07-02+): nested containers → canonical flat keys +// --------------------------------------------------------------------------- + +/** + * Distribution stat names accepted from v3 nested stat blocks, with the rename + * applied when flattening. `p50` is stored as `median_*` to match the + * established METRIC_KEYS naming (fixed-seq runs and the frontend both use + * `median_*`; no `p50_*` key exists anywhere downstream). + */ +const V3_STAT_KEYS: Record = { + mean: 'mean', + p50: 'median', + median: 'median', + p75: 'p75', + p90: 'p90', + p95: 'p95', + p99: 'p99', + 'p99.9': 'p99.9', + std: 'std', +}; + +/** v3 `request_metrics.latency` sub-blocks → flat metric suffix (same name). */ +const V3_LATENCY_METRICS = ['ttft', 'e2el', 'itl', 'tpot', 'intvty'] as const; + +/** v3 `request_metrics.tokens` sub-blocks → flat metric suffix. 
*/ +const V3_TOKEN_METRICS: Record = { + input: 'input_tokens', + output_actual: 'output_tokens_actual', + output_expected: 'output_tokens_expected', +}; + +/** + * Scalar paths in the v3 nested containers → canonical flat metric key. Keys + * reuse the flat v2-agentic names wherever one existed so already-ingested runs + * and the frontend see one consistent schema; genuinely new information gets a + * new key (registered in METRIC_KEYS). + */ +const V3_SCALAR_PATHS: [string[], string][] = [ + // client-side throughput + [['request_metrics', 'throughput', 'input', 'tokens_per_second'], 'input_tput_tps'], + [['request_metrics', 'throughput', 'output', 'tokens_per_second'], 'output_tput_tps'], + [['request_metrics', 'throughput', 'total', 'tokens_per_second'], 'total_tput_tps'], + [['request_metrics', 'throughput', 'duration_seconds'], 'duration_seconds'], + [['request_metrics', 'throughput', 'per_gpu', 'total_tput_tps'], 'tput_per_gpu'], + [['request_metrics', 'throughput', 'per_gpu', 'output_tput_tps'], 'output_tput_per_gpu'], + [['request_metrics', 'throughput', 'per_gpu', 'input_tput_tps'], 'input_tput_per_gpu'], + [['request_metrics', 'cache', 'theoretical_cache_hit_rate'], 'theoretical_cache_hit_rate'], + // server-side prefix-cache observability (same fields v2 emitted flat) + [['server_metrics', 'cache', 'gpu_cache_hit_rate'], 'server_gpu_cache_hit_rate'], + [['server_metrics', 'cache', 'cpu_cache_hit_rate'], 'server_cpu_cache_hit_rate'], + [['server_metrics', 'cache', 'external_cache_hit_rate'], 'server_external_cache_hit_rate'], + // KV-cache occupancy (gpu key predates v3 as a flat auto-captured field) + [['server_metrics', 'kv_cache', 'gpu_usage_pct'], 'gpu_kv_cache_usage_pct'], + // server token totals + [['server_metrics', 'tokens', 'prompt_total'], 'total_prompt_tokens'], + [['server_metrics', 'tokens', 'generation_total'], 'total_generation_tokens'], + [['server_metrics', 'tokens', 'requests_completed'], 'total_requests_completed'], + // Deliberately 
NOT mapped (yet): cache.overall/prefix_cache_hits/queries, + // kv_cache.cpu_*, tokens.prompt_by_source, sources[] — new v3 detail we don't + // consume anywhere; add here + METRIC_KEYS when a view needs them. +]; + +/** Reduce an offload descriptor ('none'|'dram'|…) to the binary on/off. */ +function descriptorToOnOff(v: unknown): string | null { + return typeof v === 'string' && v.length > 0 ? (v === 'none' ? 'off' : 'on') : null; +} + +/** Walk a nested object path; returns undefined on any non-object hop. */ +function atPath(obj: Record, path: string[]): unknown { + let cur: unknown = obj; + for (const seg of path) { + if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined; + cur = (cur as Record)[seg]; + } + return cur; +} + +/** Flatten one v3 stat block ({mean, p50, …}) into `out` as `{stat}_{suffix}`. */ +function flattenStatBlock(block: unknown, suffix: string, out: Record): void { + if (!block || typeof block !== 'object' || Array.isArray(block)) return; + for (const [stat, canonical] of Object.entries(V3_STAT_KEYS)) { + const n = parseNum((block as Record)[stat]); + if (n !== undefined) out[`${canonical}_${suffix}`] = n; + } +} + +/** + * Flatten a v3 agentic agg row (nested `request_metrics` / `server_metrics` + * containers, 2026-07-02+) into the canonical flat metric schema that v1/v2 + * artifacts emitted directly and that the DB / API / frontend consume. + * + * Returns the row unchanged when `request_metrics` is absent (v1/v2 rows pass + * through untouched). Otherwise returns a copy with the flattened metrics + * merged in; the nested containers stay on the row (they're in NON_METRIC_KEYS + * so the auto-capture loop ignores them). + * + * Notes on the v3 source data: + * - `p50` percentiles are new (v2 had no median for agentic); stored as + * `median_*` to match the frontend's naming. + * - `latency.intvty` arrives already slow-tail inverted (pXX_intvty = + * 1/pXX_itl). 
It's flattened here for completeness, but mapBenchmarkRow's + * derive-from-itl invariant still overwrites it, keeping one definition + * across all harness versions. + */ +export function flattenAgenticAggRow(row: Record): Record { + const rm = row.request_metrics; + if (!rm || typeof rm !== 'object' || Array.isArray(rm)) return row; + + const flat: Record = {}; + + // latency distributions + for (const metric of V3_LATENCY_METRICS) { + flattenStatBlock(atPath(row, ['request_metrics', 'latency', metric]), metric, flat); + } + // qps distribution (window_seconds / samples are intentionally not stats) + flattenStatBlock(atPath(row, ['request_metrics', 'qps']), 'qps', flat); + // per-request token-count distributions + for (const [src, suffix] of Object.entries(V3_TOKEN_METRICS)) { + flattenStatBlock(atPath(row, ['request_metrics', 'tokens', src]), suffix, flat); + } + // scalars + for (const [path, key] of V3_SCALAR_PATHS) { + const n = parseNum(atPath(row, path)); + if (n !== undefined) flat[key] = n; + } + + return { ...row, ...flat }; +} + /** * METRIC_KEYS from constants is the canonical set of known metric keys. * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured @@ -148,6 +286,11 @@ export function mapBenchmarkRow( tracker: SkipTracker, islOslFallback?: { isl: number; osl: number } | null, ): BenchmarkParams | null { + // v3 agentic rows nest their metrics; flatten to the canonical flat schema + // first so the rest of the mapper (auto-capture, intvty invariant, guards) + // is version-agnostic. No-op for v1/v2 rows. + row = flattenAgenticAggRow(row); + const modelKey = resolveModelKey(row); if (!modelKey) { tracker.skips.unmappedModel++; @@ -192,16 +335,15 @@ export function mapBenchmarkRow( return null; } - // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` - // ('none' → 'off'; any other non-empty value → 'on'). 
+ // Agentic offload signal: prefer `offload_mode` ('on'|'off'), then the v3 + // `kv_offloading` descriptor ('none'|'dram'|…), then legacy `offloading`. + // Descriptors reduce to the binary on/off used for row identity ('none' → + // 'off', anything else → 'on') so v3 offload points keep colliding-key parity + // with their v2 predecessors instead of forking a third offload_mode value. const offloadModeRaw = typeof row.offload_mode === 'string' && row.offload_mode.length > 0 ? row.offload_mode - : typeof row.offloading === 'string' && row.offloading.length > 0 - ? row.offloading === 'none' - ? 'off' - : 'on' - : 'off'; + : (descriptorToOnOff(row.kv_offloading) ?? descriptorToOnOff(row.offloading) ?? 'off'); const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg); const isMultinode = parseBool(row.is_multinode); @@ -265,8 +407,16 @@ export function mapBenchmarkRow( // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`) // — preserve as a stringified metric so the frontend can expose it in tooltips. + // v3 rows additionally carry the offload tier + backend ('dram'/'mooncake'); + // keep them so the UI can say *what kind* of offload, not just on/off. if (isAgentic) { (metrics as Record).offload_mode = offloadModeRaw; + if (typeof row.kv_offloading === 'string' && row.kv_offloading.length > 0) { + (metrics as Record).kv_offloading = row.kv_offloading; + } + if (typeof row.kv_offload_backend === 'string' && row.kv_offload_backend.length > 0) { + (metrics as Record).kv_offload_backend = row.kv_offload_backend; + } } // Slow-tail interactivity invariant. 
Agentic artifacts ship `*_intvty`, but the diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts index e569143a..82aaf67c 100644 --- a/packages/db/src/etl/normalizers.test.ts +++ b/packages/db/src/etl/normalizers.test.ts @@ -25,6 +25,11 @@ describe('hwToGpuKey', () => { expect(hwToGpuKey('mi300x-amd')).toBe('mi300x'); }); + it('strips a v3 scope prefix (cluster:…)', () => { + expect(hwToGpuKey('cluster:b300-nv')).toBe('b300'); + expect(hwToGpuKey('cluster:h200')).toBe('h200'); + }); + it('strips -amds suffix', () => { expect(hwToGpuKey('mi355x-amds')).toBe('mi355x'); }); diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index c5ff69dc..844e1751 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -22,9 +22,11 @@ export { GPU_KEYS }; * stripped base is not in `GPU_KEYS`. */ export function hwToGpuKey(hw: string): string | null { - // Take the first segment before `-` as the canonical key. Subsumes all the - // prior explicit suffix strips (-nv, -amds, -dgxc-slurm, -p1, -cw, …). - const base = hw.toLowerCase().split('-')[0]; + // v3 agentic artifacts scope the hw id (`cluster:b300-nv`) — drop everything + // up to the last `:` first. Then take the first segment before `-` as the + // canonical key; that subsumes all the prior explicit suffix strips + // (-nv, -amds, -dgxc-slurm, -p1, -cw, …). + const base = hw.toLowerCase().split(':').pop()!.split('-')[0]; return GPU_KEYS.has(base) ? 
base : null; } From 3a06f891158eee16851d9fe86484d6fb8161f681 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 00:43:29 -0500 Subject: [PATCH 110/111] fix(ingest): find server.log in the v3 harness's nested results/ layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit feat-agentx-v1.0 moved the log inside server_logs_* artifacts from the root (server.log) to results/server.log; the discovery loop only checked the root, so run 28553943579's first ingest pass attached 0/20 server logs. Check both locations. Also harden the ingest agent doc: changelog entries are mandatory for every ingest (derive from the run name when no text is given, never block asking), and fix a doc typo that pointed the cache purge at port 3000 — the port the same doc forbids touching. Co-Authored-By: Claude Fable 5 --- .claude/agents/ingest.md | 15 ++++++++++----- packages/db/src/ingest-ci-run.ts | 9 +++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md index 59045378..10e37d6c 100644 --- a/.claude/agents/ingest.md +++ b/.claude/agents/ingest.md @@ -130,7 +130,12 @@ The `spec_method` column has a lowercase check constraint — always lowercase. Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`. -## Adding a perf changelog entry +## Adding a perf changelog entry — MANDATORY for every ingest + +**You ALWAYS MUST add a changelog entry for every run you ingest. This is not optional.** Every standard ingest, delete+reingest, and partial ingest gets exactly one changelog entry. Never finish an ingest without one. 
+ +- If the user gave changelog text, use it verbatim (substitute `` with the run's hardware SKU when the text contains that placeholder). +- If the user did NOT specify text, DO NOT skip the changelog — derive a sensible description from the run name (see convention below) and add it anyway, then tell the user what you used so they can adjust. Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `---` (matches what the user actually sees in the filter chain). @@ -147,7 +152,7 @@ Description convention from prior entries: ` Ingest # (`); otherwise derive one from the run name and add it anyway. Never skip this step. +6. **Purge both caches** (localhost 3002 + preview — never port 3000). +7. **Report** the row count, date, hardware, run id, and the changelog id (always present). ## Related: ingesting agentic _datasets_ (not benchmark runs) diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 8ec1fb9e..8bea3378 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -384,8 +384,13 @@ async function main(): Promise { if (fs.existsSync(artifactsDir)) { for (const d of fs.readdirSync(artifactsDir)) { if (!d.startsWith('server_logs_')) continue; - const logPath = path.join(artifactsDir, d, 'server.log'); - if (!fs.existsSync(logPath)) continue; + // feat-agentx-v1.0 harness nests the log under `results/server.log`; + // older runs keep it at the artifact root. Check both. 
+ const logPath = [ + path.join(artifactsDir, d, 'server.log'), + path.join(artifactsDir, d, 'results', 'server.log'), + ].find((p) => fs.existsSync(p)); + if (!logPath) continue; const configKey = d.replace(/^server_logs_/u, ''); serverLogPaths.set(configKey, logPath); } From f67b349d96aa0e3c53cbe6577954bdd32053f251 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Jul 2026 09:44:53 -0500 Subject: [PATCH 111/111] fix(datasets): resolve raw= deep links for pre-rawIndex conversation structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Timeline request bars link to the dataset flamegraph with raw weka coordinates (raw=/inner=), but stored conversation structures ingested before rawIndex/innerIndex existed have no such fields, so the lookup never matched and the deep link silently did nothing. Since buildConversationStructure emits exactly one node per raw entry, array position is definitionally the raw index — resolve via (node.rawIndex ?? arrayIndex) in an extracted, unit-tested resolveDeepLinkTarget. Out-of-range coords still return null rather than guessing; positional turn=/sa= links are unchanged. Verified with Playwright against real data: point 425111 (new mapping) turn-1, later-turn, and subagent-child links highlight the exact row; old positional-link runs (424976) unregressed. 
Co-Authored-By: Claude Fable 5 --- .../datasets/trace-flamegraph.test.ts | 102 ++++++++++++++- .../components/datasets/trace-flamegraph.tsx | 118 ++++++++++++------ 2 files changed, 179 insertions(+), 41 deletions(-) diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts index 2ead726b..0cbf92f3 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.test.ts +++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts @@ -1,6 +1,16 @@ import { describe, expect, it } from 'vitest'; -import { findRequestOverlapGroups, formatElapsedTime } from './trace-flamegraph'; +import type { + StructureNode, + SubagentNode, + TurnNode, +} from '@semianalysisai/inferencex-db/etl/weka-structure'; + +import { + findRequestOverlapGroups, + formatElapsedTime, + resolveDeepLinkTarget, +} from './trace-flamegraph'; describe('formatElapsedTime', () => { it('formats elapsed seconds below and above one hour', () => { @@ -53,3 +63,93 @@ describe('findRequestOverlapGroups', () => { expect(groups).toMatchObject([{ requestKeys: ['A', 'B', 'C'], startS: 3, endS: 7 }]); }); }); + +const turn = (turnIndex: number, extra: Partial<TurnNode> = {}): TurnNode => ({ + kind: 'turn', + turnIndex, + in: 100, + out: 10, + cached: 0, + uncached: 100, + ...extra, +}); +const subagent = (children: TurnNode[], extra: Partial<SubagentNode> = {}): SubagentNode => ({ + kind: 'subagent', + label: 'Subagent', + in: 100, + out: 10, + cached: 0, + uncached: 100, + children, + ...extra, +}); + +describe('resolveDeepLinkTarget', () => { + // Node layout mirroring a real Weka conversation: raw entries + // 0: turn, 1: subagent (2 children), 2: turn + const withRawIndexes: StructureNode[] = [ + turn(0, { rawIndex: 0 }), + subagent([turn(1, { rawIndex: 1, innerIndex: 0 }), turn(2, { rawIndex: 1, innerIndex: 1 })], { + agentId: 'subagent_001_abcd1234', + rawIndex: 1, + }), + turn(3, { rawIndex: 2 }), + ]; + // The same conversation as stored by
the pre-rawIndex ingest (fields absent). + const legacy: StructureNode[] = [ + turn(0), + subagent([turn(1), turn(2)], { agentId: 'subagent_001_abcd1234' }), + turn(3), + ]; + + it('resolves raw source coordinates against explicit rawIndex fields', () => { + expect(resolveDeepLinkTarget(withRawIndexes, { raw: 2 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(withRawIndexes, { raw: 1, inner: 1 })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + }); + + it('falls back to node array position for structures ingested before rawIndex existed', () => { + // One node per raw entry means position === raw index, so the deep link + // must still resolve exactly (regression: it previously returned null and + // the flamegraph neither scrolled nor highlighted anything). + expect(resolveDeepLinkTarget(legacy, { raw: 2, turn: 1 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(legacy, { raw: 0, turn: 0 })).toEqual({ + rowKey: 't-0', + expandGroup: null, + }); + }); + + it('resolves subagent children positionally when innerIndex is absent', () => { + expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 1, turn: 1 })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + }); + + it('returns null for out-of-range raw coordinates instead of guessing', () => { + expect(resolveDeepLinkTarget(legacy, { raw: 9 })).toBeNull(); + expect(resolveDeepLinkTarget(legacy, { raw: 1, inner: 5 })).toBeNull(); + // raw pointing at a subagent marker without inner does not match a turn. 
+ expect(resolveDeepLinkTarget(legacy, { raw: 1 })).toBeNull(); + }); + + it('keeps the positional turn/agent fallback for links without raw coordinates', () => { + expect(resolveDeepLinkTarget(legacy, { turn: 1 })).toEqual({ + rowKey: 't-2', + expandGroup: null, + }); + expect(resolveDeepLinkTarget(legacy, { turn: 1, agent: 'subagent_001_abcd1234' })).toEqual({ + rowKey: 'g-1-c-1', + expandGroup: 1, + }); + expect(resolveDeepLinkTarget(legacy, {})).toBeNull(); + }); +}); diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index a3366342..d57567e5 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -160,6 +160,73 @@ interface TooltipState { row: VisibleRow; } +export interface DeepLinkHighlight { + turn?: number | null; + raw?: number | null; + inner?: number | null; + agent?: string | null; +} + +export interface DeepLinkTarget { + rowKey: string; + expandGroup: number | null; +} + +/** + * Resolve a request-timeline deep link to a flamegraph row key (+ the subagent + * group that must be expanded to show it). Raw Weka source coordinates are + * exact and take precedence: + * raw= -> top-level Weka request + * raw=&inner= -> subagent child inside that top-level marker + * Otherwise main turns match by main-turn ordinal and subagent turns match the + * group by agentId, then the ti-th child. + * + * `buildConversationStructure` emits exactly one node per raw Weka entry (and + * one child per nested entry), so a node's array position IS its raw index. + * Structures ingested before rawIndex/innerIndex were stored omit the explicit + * fields — fall back to the array position so deep links keep resolving against + * those older rows instead of silently doing nothing. 
+ */ +export function resolveDeepLinkTarget( + nodes: readonly StructureNode[], + highlight: DeepLinkHighlight, +): DeepLinkTarget | null { + const { turn, raw, inner, agent } = highlight; + if (typeof raw === 'number' && raw >= 0) { + if (typeof inner === 'number' && inner >= 0) { + const gi = nodes.findIndex( + (node, i) => node.kind === 'subagent' && (node.rawIndex ?? i) === raw, + ); + if (gi === -1) return null; + const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>; + const ci = group.children.findIndex((child, i) => (child.innerIndex ?? i) === inner); + if (ci === -1) return null; + return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi }; + } + const i = nodes.findIndex( + (node, idx) => node.kind === 'turn' && (node.rawIndex ?? idx) === raw, + ); + if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null }; + return null; + } + if (typeof turn !== 'number' || turn < 0) return null; + if (agent) { + const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === agent); + if (gi === -1) return null; + const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>; + if (turn >= group.children.length) return null; + return { rowKey: `g-${gi}-c-${turn}`, expandGroup: gi }; + } + let ordinal = 0; + for (let i = 0; i < nodes.length; i++) { + if (nodes[i].kind === 'turn') { + if (ordinal === turn) return { rowKey: `t-${i}`, expandGroup: null }; + ordinal += 1; + } + } + return null; +} + /** * Per-conversation flamegraph driven by the precomputed `structure` JSONB. * One row per turn; subagent groups render a collapsible header with indented @@ -186,46 +253,17 @@ export function TraceFlamegraph({ const nodes = structure.nodes; // Resolve the deep-link target to a row key (+ the group that must be open to - // show it). Raw Weka source coordinates are exact and take precedence: - // raw= -> top-level Weka request - // raw=&inner= -> subagent child inside that top-level marker - // Otherwise main turns match by main-turn ordinal and subagent turns match - // the group by agentId, then the ti-th child.
- const target = useMemo(() => { - if (typeof highlightRawIndex === 'number' && highlightRawIndex >= 0) { - if (typeof highlightInnerIndex === 'number' && highlightInnerIndex >= 0) { - const gi = nodes.findIndex( - (node) => node.kind === 'subagent' && node.rawIndex === highlightRawIndex, - ); - if (gi === -1) return null; - const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>; - const ci = group.children.findIndex((child) => child.innerIndex === highlightInnerIndex); - if (ci === -1) return null; - return { rowKey: `g-${gi}-c-${ci}`, expandGroup: gi }; - } - const i = nodes.findIndex( - (node) => node.kind === 'turn' && node.rawIndex === highlightRawIndex, - ); - if (i !== -1) return { rowKey: `t-${i}`, expandGroup: null }; - return null; - } - if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null; - if (highlightAgentId) { - const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId); - if (gi === -1) return null; - const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>; - if (highlightTurn >= group.children.length) return null; - return { rowKey: `g-${gi}-c-${highlightTurn}`, expandGroup: gi }; - } - let ordinal = 0; - for (let i = 0; i < nodes.length; i++) { - if (nodes[i].kind === 'turn') { - if (ordinal === highlightTurn) return { rowKey: `t-${i}`, expandGroup: null }; - ordinal += 1; - } - } - return null; - }, [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId]); + // show it). See resolveDeepLinkTarget for the matching rules. + const target = useMemo( + () => + resolveDeepLinkTarget(nodes, { + turn: highlightTurn, + raw: highlightRawIndex, + inner: highlightInnerIndex, + agent: highlightAgentId, + }), + [nodes, highlightTurn, highlightRawIndex, highlightInnerIndex, highlightAgentId], + ); // Subagent groups collapsed by default — except the deep-link target's group. const [expanded, setExpanded] = useState>(() =>