hw-native-sys · ChaoWao · Jun 22, 2026 · Jun 8, 2026
diff --git a/conftest.py b/conftest.py
@@ -187,6 +187,14 @@ def pytest_addoption(parser):
         default=False,
         help="Enable per-scope peak collection and emit <output_prefix>/scope_stats.jsonl (per-scope ring-fill peaks).",
     )
+    parser.addoption(
+        "--enable-swimlane-overhead",
+        action="store_true",
+        default=False,
+        help="Add the 8 Overhead Analysis counter tracks (per-engine "
+        "idle/ready/overhead + system all/has overhead) to the swimlane JSON. "
+        "Requires --enable-l2-swimlane + deps.json (re-run with --enable-dep-gen if absent).",
+    )
     parser.addoption(
         "--pto-isa-commit",
         action="store",

diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md
@@ -34,12 +34,15 @@ available.
   [`dep_gen`](dep_gen.md) at post-process time; see
   [§3.5](#35-dependency-arrows-from-dep_gen).
 - **AICPU scheduler phases** — per-iteration breakdown into
-  `complete` / `dispatch`. Idle iterations no longer emit a record
-  on a2a3; the host tooling reconstructs idle spans from the gap
-  between consecutive work records on the same thread. Legacy
-  captures (and a5) may still carry `scan` / `idle` records — both
-  are silently dropped by the parser (idle is double-painted
-  by the gap reconstruction; `scan` was never emitted in a2a3).
+  `complete` / `dispatch` / `release` / `resolve` (Complete sub-bar
+  for the consumer-release walk) / `early_dispatch` (speculative
+  pre-staging). Idle iterations no longer emit a record on a2a3;
+  the host tooling reconstructs idle spans from the gap between
+  consecutive work records on the same thread. Legacy captures may
+  carry `scan` / `poll` / `idle` / `fanout` / `prestage` — current
+  a2a3 builds no longer emit them (PR #1079's Scan/Poll debug
+  overlay was removed; Fanout was renamed Resolve and now also
+  filters out <1 µs walks; Prestage was renamed EarlyDispatch).
 - **Orchestrator submit envelope** — one record per `submit_task()`
   / `alloc_tensors()` call covering the whole submit's
   `[start, end]` window (`orch_submit` phase). Per-sub-step
@@ -208,7 +211,7 @@ Phase records (per scheduler thread, level >= 3 for
 | Field | Meaning |
 | ----- | ------- |
 | `start_time_us` / `end_time_us` | Phase start / end timestamps in microseconds (reader-side cycle→µs conversion) |
-| `phase` | Lowercase phase name. Scheduler: `complete` / `dispatch` (`scan` / `idle` may appear in legacy captures and a5; both are dropped by the parser). Orchestrator: `orch_submit` — one record per `submit_task()` / `alloc_tensors()` call spanning its full `[start, end]` window. Legacy per-sub-step strings (`orch_sync` / `orch_alloc` / `orch_params` / `orch_lookup` / `orch_insert` / `orch_fanin`) may appear in old captures. |
+| `phase` | Lowercase phase name. Scheduler: `complete` / `dispatch` / `release` / `resolve` / `early_dispatch`. Legacy (still parsed for old captures): `scan` / `poll` / `fanout` / `prestage` / `idle` — current a2a3 builds no longer emit Scan/Poll (PR #1079 debug overlay removed) or Prestage (renamed to EarlyDispatch); Fanout was renamed to Resolve. Orchestrator: `orch_submit` — one record per `submit_task()` / `alloc_tensors()` call spanning its full `[start, end]` window. Legacy per-sub-step strings (`orch_sync` / `orch_alloc` / `orch_params` / `orch_lookup` / `orch_insert` / `orch_fanin`) may appear in old captures. |
 | `loop_iter` (scheduler) / `submit_idx` (orchestrator) | Iteration / submit-call counter for the producing thread |
 | `tasks_processed` (scheduler) / `task_id` (orchestrator) | Phase-specific union field |
 | `pop_hit` / `pop_miss` (dispatch only) | Ready-queue pop deltas since the previous dispatch emit |
@@ -557,7 +560,7 @@ bounded only by how fast the host drains — not by the per-core buffer
 sum.
 
 **Measured impact.** Hardware bench on a2a3 paged_attention_unroll
-Case1 with swimlane=4: rotation design delivers sched −4 µs / orch −19 µs
+Case1 with swimlane=4: rotation design delivers sched -4 µs / orch -19 µs
 vs the upstream/main baseline, comparable to the no-rotation predecessor
 (which had this PR's earlier commit; the rotation adds about 3 µs
 sched overhead per session as price for unbounded session length).

diff --git a/docs/investigations/2026-06-aicore-cold-start-warmup.md b/docs/investigations/2026-06-aicore-cold-start-warmup.md
@@ -45,7 +45,7 @@ cores per task, each task busy-waits 50 µs in `get_sys_cnt()`. Run on
 a2a3 onboard via `task-submit --device auto --device-num 1 --run "...
 --enable-l2-swimlane 2"`. Compared three runs, all with the
 batched-publish dispatch optimization (PR #989) and the eager swimlane
-head resolve (also in #989). Head-OH per task = `start_time_us −
+head resolve (also in #989). Head-OH per task = `start_time_us -
 dispatch_time_us`, joined per `(core_id, reg_task_id)` from the level-2
 `l2_swimlane_records.json`.
 
@@ -59,16 +59,16 @@ Per-task head-OH (us) across 72 cores:
 | Eager head resolve, no warmup | 0.36 | 2.61 | 5.62 | 0.35 | 0.39 | 0.23 |
 | Eager + warmup | **0.32** | **2.29** | 5.64 | 0.36 | 0.36 | 0.31 |
 
-Warmup vs eager-only: t0 avg −0.32 µs (12% reduction), t0 min −0.04 µs
+Warmup vs eager-only: t0 avg -0.32 µs (12% reduction), t0 min -0.04 µs
 (into noise), **t0 max unchanged** (within run-to-run variance).
 Steady-state t1–t3 unchanged. t0 max for the slowest cores is essentially
 the same as without warmup.
 
 Per-core breakdown for the warmup run reveals the cost is **bimodal**:
 
 ```text
-Δ = t0 − avg(t1..t3) over 72 cores
-min = −0.03 µs  (warmup fully effective; t0 ≈ steady state)
+Δ = t0 - avg(t1..t3) over 72 cores
+min = -0.03 µs  (warmup fully effective; t0 ≈ steady state)
 p25 = 0.53 µs
 p50 = 1.77 µs
 p75 = 3.39 µs
@@ -139,10 +139,88 @@ property of the dispatch path that future profiling work should expect:
   the first task of a run should not assume software warmup will fix
   it.
 
+## Follow-up — receive_time DFX field + what propagation/local_setup actually mean
+
+PR #1004 adds an AICore-side `receive_time` capture marking the moment
+AICPU's full "task is ready to execute" signal landed on the core. With
+it the swimlane reports two new per-task fields:
+
+- `local_setup_us` = start_time - receive_time (AICore-side critical-path
+  prep between "task genuinely ready" and "kernel starts").
+- `propagation_us` = receive_time - dispatch_ts (AICPU→AICore-ready
+  delivery, including any scheduling-side waits).
+
+Where receive_time is stamped depends on the dispatch path:
+
+- **Common path (`not_ready == 0`)**: stamped immediately after
+  `read_reg(DATA_MAIN_BASE)` returns the new task_id — that read IS the
+  ready signal. `local_setup_us` ≈ dcci + ack cost on AICore.
+- **Speculative early-dispatch (`not_ready == 1`, PR #1079)**: the
+  initial task_id only stages the work; the real ready signal is the
+  doorbell on `DATA_MAIN_BASE` high32 that AICPU rings once the
+  dependencies resolve. receive_time is re-stamped at the moment the
+  doorbell match exits the gate-wait spin. The per-task dcci ran during
+  the spin (overlapped with the dependency wait, off the critical path),
+  so `local_setup_us` ≈ ack-only cost; `propagation_us` absorbs both the
+  original NoC delivery AND any speculation overshoot.
+
+The cleanest reading **only applies to IDLE-phase dispatches** (where
+AICPU writes the new task_id to a core whose RUNNING slot is empty —
+core was actually idle and polling). In that case:
+
+- `propagation_us` ≈ AICPU→AICore SoC delivery + AICore's next poll iter
+  picking up the new value. Hardware-bound at the ns scale per the
+  cold-start measurement above.
+- `local_setup_us` ≈ dcci + ack cost on AICore. Empirically 0–0.02 µs on
+  warm cache, 0.5–1 µs on cold first task.
+
+For **PENDING-phase dispatches** (AICPU pre-loads next task_id while
+RUNNING slot is still busy), `dispatch_ts` is captured at pre-load time,
+which can be tens of µs before the prior task FINs. AICore picks up the
+pending task immediately after prior FIN. So:
+
+- `propagation_us` here is **not** AICPU→AICore delay. It is mostly
+  "remaining exec time of the prior task at the moment of pre-load",
+  i.e. a measurement artifact of dual-buffer scheduling, not a physical
+  delivery delay.
+- `local_setup_us` interpretation still holds (dcci + ack on AICore for
+  the common path, ack-only for the speculative path).
+
+How to know which kind of dispatch a record represents: compare
+`dispatch_ts` against the prior task's `end_time` on the same core
+(both available in the swimlane records). `dispatch_ts ≥ prior.end` →
+IDLE-phase, propagation reflects real NoC delivery. `dispatch_ts <
+prior.end` → PENDING-phase, propagation is a pre-load offset.
+
+Level-3 capture on `vector_example` (5 tasks, all IDLE-dispatched):
+
+| Task | head_OH | propagation | local_setup |
+| ---- | ------: | ----------: | ----------: |
+| Cold first on core 5 | 2.12 µs | 1.20 µs | 0.92 µs |
+| Warm on core 5 | 0.16 µs | 0.16 µs | 0.00 µs |
+| Cold first on core 6 | 1.66 µs | 0.80 µs | 0.86 µs |
+| Warm on core 5 | 0.34 µs | 0.34 µs | 0.00 µs |
+| Warm on core 5 | 0.32 µs | 0.32 µs | 0.00 µs |
+
+Cold tasks pay both halves; warm tasks have local_setup at cycle-zero
+(dcci on already-invalidated cache is free) and propagation dominates.
+**This pattern is specific to the IDLE-phase regime.** Mid-workload
+traces (qwen3 decode_layer) include both IDLE and PENDING dispatches
+interleaved, and aggregating `propagation_us` without separating them
+yields confusing distributions — the "long tail" in such aggregates is
+usually PENDING pre-load offset, not slow NoC.
+
+Future warmup proposals targeting `local_setup_us` can now be evaluated
+directly against captured data. Proposals targeting `propagation_us`
+should first filter to IDLE-dispatched records to avoid attacking a
+measurement artifact.
+
 ## References
 
 - PR #989: dispatch path batched publish (where the eager swimlane head
   resolve lives).
+- PR #1004: AICore receive_time DFX field (the measurement that
+  confirmed the split above).
 - PR #988: `spmd_serial_chain_mix` example used for measurement.
 - Issue #545 comment #2: SPMD dispatch stagger.
 - `.claude/rules/...` DFX priority guidance.
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
@@ -679,12 +679,18 @@ def _build_output_prefix(case_label: str) -> Path:
 def _run_swimlane_converter(
     input_path: Path | None = None,
     func_names_path: Path | None = None,
+    enable_overhead: bool = False,
 ) -> None:
     """Invoke the bundled swimlane converter as a subprocess.
 
     When ``input_path`` is given, the converter derives its output filename from
     the input's timestamp (see ``swimlane_converter._resolve_output_path``).
     Without it, the converter auto-selects the latest ``l2_swimlane_records_*.json``.
+
+    ``enable_overhead`` forwards the converter's ``--overhead`` flag — adds the
+    8 Overhead Analysis counter tracks (per-engine idle/ready/overhead + system
+    all/has overhead) under the AICPU Scheduler process. Needs deps.json; the
+    converter silently no-ops if deps is absent.
     """
     import logging  # noqa: PLC0415
     import subprocess  # noqa: PLC0415
@@ -695,6 +701,8 @@ def _run_swimlane_converter(
         cmd.append(str(input_path))
     if func_names_path is not None:
         cmd += ["--func-names", str(func_names_path)]
+    if enable_overhead:
+        cmd.append("--overhead")
     try:
         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
         if result.stdout:
@@ -716,6 +724,7 @@ def _convert_case_swimlane(
     case_label: str,
     output_prefix: Path,
     callable_spec: dict | None = None,
+    enable_overhead: bool = False,
 ) -> None:
     """Post-case: invoke the swimlane converter on the perf file the runtime
     just wrote into ``<output_prefix>/l2_swimlane_records.json``. No diff/rename
@@ -736,7 +745,7 @@ def _convert_case_swimlane(
         safe_label = _sanitize_for_filename(case_label)
         func_names_path = _dump_name_map(mapping, output_prefix / f"name_map_{safe_label}.json")
 
-    _run_swimlane_converter(input_path=perf_file, func_names_path=func_names_path)
+    _run_swimlane_converter(input_path=perf_file, func_names_path=func_names_path, enable_overhead=enable_overhead)
 
 
 def _run_deps_viewer(
@@ -837,6 +846,7 @@ def run_class_cases(  # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI
     enable_dep_gen,
     enable_scope_stats,
     enable_device_log_timing=False,
+    enable_swimlane_overhead=False,
 ):
     """Execute a pre-filtered list of cases for one class (layers 5-6).
 
@@ -887,7 +897,12 @@ def run_class_cases(  # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI
             )
         finally:
             if enable_l2_swimlane:
-                _convert_case_swimlane(case_label, prefix, callable_spec=callable_spec)
+                _convert_case_swimlane(
+                    case_label,
+                    prefix,
+                    callable_spec=callable_spec,
+                    enable_overhead=enable_swimlane_overhead,
+                )
             if enable_dep_gen:
                 _graph_case_dep_gen(case_label, prefix, callable_spec=callable_spec)
             if enable_scope_stats:
@@ -1354,6 +1369,7 @@ def test_run(self, st_platform, st_worker, request):
         # device-log timing is cheap (PTO2_PROFILING markers, one block/round)
         # so unlike the heavy diagnostics it is NOT disabled when --rounds > 1.
         enable_device_log_timing = request.config.getoption("--enable-device-log-timing", default=False)
+        enable_swimlane_overhead = request.config.getoption("--enable-swimlane-overhead", default=False)
         if rounds > 1:
             if enable_l2_swimlane:
                 logger.warning("Profiling disabled: --rounds > 1")
@@ -1409,6 +1425,7 @@ def test_run(self, st_platform, st_worker, request):
             enable_dep_gen=enable_dep_gen,
             enable_scope_stats=enable_scope_stats,
             enable_device_log_timing=enable_device_log_timing,
+            enable_swimlane_overhead=enable_swimlane_overhead,
         )
 
     # ------------------------------------------------------------------
@@ -1507,6 +1524,15 @@ def run_module(module_name):  # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch
             help="Enable per-scope peak collection and emit <output_prefix>/scope_stats/scope_stats.jsonl "
             "(per-scope ring-fill peaks).",
         )
+        parser.add_argument(
+            "--enable-swimlane-overhead",
+            action="store_true",
+            default=False,
+            help="Add the 8 Overhead Analysis counter tracks (per-engine "
+            "idle/ready/overhead + system all/has overhead) to the swimlane "
+            "JSON. Requires --enable-l2-swimlane + deps.json (re-run with "
+            "--enable-dep-gen if absent).",
+        )
         parser.add_argument(
             "--runtime",
             default=None,
@@ -1722,6 +1748,7 @@ def run_module(module_name):  # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch
                                 enable_dep_gen=args.enable_dep_gen,
                                 enable_scope_stats=args.enable_scope_stats,
                                 enable_device_log_timing=args.enable_device_log_timing,
+                                enable_swimlane_overhead=args.enable_swimlane_overhead,
                             )
                             print("PASSED")
                         except Exception as e:  # noqa: BLE001
@@ -1767,6 +1794,8 @@ def _dispatch_test_phases_standalone(module_name, selected_by_cls, args):  # noq
         common.append("--enable-scope-stats")
     if args.enable_device_log_timing:
         common.append("--enable-device-log-timing")
+    if args.enable_swimlane_overhead:
+        common.append("--enable-swimlane-overhead")
 
     # ----- L3 phase: one subprocess per class (not per case).
     # The child's _create_standalone_worker allocates max(cls.CASES.device_count)