From 22fce06dea0ab0d416ce15e1c46ec560cba94b5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 28 May 2026 14:39:17 -0300 Subject: [PATCH 1/3] docs: document missing metrics and fix gean repo link The metrics doc had drifted from the implementation. Add the 11 metrics that were implemented but undocumented and reconcile the doc with the pinned leanMetrics spec: - New Block Production Metrics section (block_aggregated_payloads, block_building_{time,payload_aggregation_time}_seconds, block_building_{success,failures}_total) - lean_node_sync_status (Fork-Choice), lean_gossip_mesh_peers (Network) - spec's lean_justified_slot / lean_finalized_slot listed as unsupported (ethlambda only emits the latest_* variants) - Custom (non-leanMetrics) Storage and Attestation Aggregate Coverage subsections for lean_table_bytes and the coverage gauges Also flag the reserved per-subnet coverage breakdown with a TODO in coverage.rs, and fix the stale gean repository URL in introduction.md (devlongs/gean -> geanlabs/gean). --- crates/blockchain/src/coverage.rs | 4 ++++ docs/introduction.md | 2 +- docs/metrics.md | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/crates/blockchain/src/coverage.rs b/crates/blockchain/src/coverage.rs index ebb6f5a7..22a559d6 100644 --- a/crates/blockchain/src/coverage.rs +++ b/crates/blockchain/src/coverage.rs @@ -56,6 +56,10 @@ fn cov_add(seen: &mut [bool], has_subnet: &mut [bool], bits: &AggregationBits) { } fn cov_record(section: &str, seen: &[bool], has_subnet: &[bool]) { + // TODO: emit a per-subnet breakdown (subnet=subnet_N) alongside the + // subnet=combined total. `has_subnet` already tracks which subnets are + // covered, but we only report the aggregate count here; the per-subnet + // label is reserved in the metric definition and not yet populated. 
metrics::set_attestation_aggregate_coverage_validators( section, "combined", diff --git a/docs/introduction.md b/docs/introduction.md index 4b54ffc6..06e5bc7c 100644 --- a/docs/introduction.md +++ b/docs/introduction.md @@ -45,5 +45,5 @@ For comparison and cross-client testing: - [ream](https://github.com/ReamLabs/ream) (Rust) - [qlean](https://github.com/qdrvm/qlean-mini) (C++) - [grandine](https://github.com/grandinetech/lean/tree/main/lean_client) (Rust) -- [gean](https://github.com/devlongs/gean) (Go) +- [gean](https://github.com/geanlabs/gean) (Go) - [Lantern](https://github.com/Pier-Two/lantern) (C) diff --git a/docs/metrics.md b/docs/metrics.md index 68aacd98..53dd1601 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -30,6 +30,16 @@ The exposed metrics follow [the leanMetrics specification](https://github.com/le | `lean_pq_sig_aggregated_signatures_building_time_seconds` | Histogram | Time taken to build an aggregated attestation signature | On aggregated signature production | | 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 2, 4 | ✅ | | `lean_pq_sig_aggregated_signatures_verification_time_seconds` | Histogram | Time taken to verify an aggregated attestation signature | On aggregated signature verification | | 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 2, 4 | ✅ | +## Block Production Metrics + +| Name | Type | Usage | Sample collection event | Labels | Buckets | Supported | +|--------|-------|-------|-------------------------|--------|---------|-----------| +| `lean_block_aggregated_payloads` | Histogram | Number of `aggregated_payloads` in a block | On block production | | 1, 2, 4, 8, 16, 32, 64, 128 | ✅ | +| `lean_block_building_payload_aggregation_time_seconds` | Histogram | Time taken to build `aggregated_payloads` during block building | On block production | | 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4 | ✅ | +| `lean_block_building_time_seconds` | Histogram | Time taken to build a block | On block production | | 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1 | ✅ | +| 
`lean_block_building_success_total` | Counter | Successful block builds | On block production | | | ✅ | +| `lean_block_building_failures_total` | Counter | Failed block builds (exception in build_block) | On block production failure | | | ✅ | + ## Fork-Choice Metrics | Name | Type | Usage | Sample collection event | Labels | Buckets | Supported | @@ -48,6 +58,7 @@ The exposed metrics follow [the leanMetrics specification](https://github.com/le | `lean_latest_new_aggregated_payloads` | Gauge | Number of new aggregated payload items | On `latest_new_aggregated_payloads` update | | | ✅ | | `lean_latest_known_aggregated_payloads` | Gauge | Number of known aggregated payload items | On `latest_known_aggregated_payloads` update | | | ✅ | | `lean_committee_signatures_aggregation_time_seconds` | Histogram | Time taken to aggregate committee signatures | On committee signatures aggregation | | 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1 | ✅ | +| `lean_node_sync_status` | Gauge | Node sync status | On node sync status change | status=idle,syncing,synced | | ✅ | ## State Transition Metrics @@ -55,6 +66,8 @@ The exposed metrics follow [the leanMetrics specification](https://github.com/le |--------|-------|-------|-------------------------|--------|---------|-----------| | `lean_latest_justified_slot` | Gauge | Latest justified slot | On state transition | | | ✅ | | `lean_latest_finalized_slot` | Gauge | Latest finalized slot | On state transition | | | ✅ | +| `lean_justified_slot` | Gauge | Current justified slot | On state transition | | | ❌ | +| `lean_finalized_slot` | Gauge | Current finalized slot | On state transition | | | ❌ | | `lean_finalizations_total` | Counter | Total number of finalization attempts | On finalization attempt | result=success,error | | ✅ | |`lean_state_transition_time_seconds`| Histogram | Time to process state transition | On state transition | | 0.25, 0.5, 0.75, 1, 1.25, 1.5, 2, 2.5, 3, 4 | ✅ | |`lean_state_transition_slots_processed_total`| 
Counter | Total number of processed slots | On state transition process slots | | | ✅ | @@ -78,6 +91,7 @@ The exposed metrics follow [the leanMetrics specification](https://github.com/le |`lean_attestation_committee_count`| Gauge | Number of attestation committees | On node start | | ✅ | |`lean_attestation_committee_subnet`| Gauge | Node's attestation committee subnet | On node start | | ✅ | |`lean_connected_peers`| Gauge | Number of connected peers | On scrape | client=ethlambda,grandine,lantern,lighthouse,qlean,ream,zeam | ✅(*) | +|`lean_gossip_mesh_peers`| Gauge | Number of peers in the gossipsub mesh | On scrape | client=`<client>_<index>`,unknown (e.g. zeam_0) | ✅(*) | |`lean_peer_connection_events_total`| Counter | Total number of peer connection events | On peer connection | direction=inbound,outbound
result=success,timeout,error | ✅ | |`lean_peer_disconnection_events_total`| Counter | Total number of peer disconnection events | On peer disconnection | direction=inbound,outbound
reason=timeout,remote_close,local_close,error | ✅ | @@ -101,6 +115,22 @@ The metrics below are not part of the [leanMetrics specification](https://github | `lean_reqresp_request_size_bytes` | Histogram | Bytes size of a req/resp request (raw SSZ or snappy on-wire) | On req/resp request send/receive | protocol=status,blocks_by_root
compression=raw,snappy | 64, 128, 256, 512, 1024, 4096, 16384, 65536 | | `lean_reqresp_response_chunk_size_bytes` | Histogram | Bytes size of a single req/resp response chunk (raw SSZ or snappy on-wire) | On req/resp response chunk send/receive | protocol=status,blocks_by_root
compression=raw,snappy | 128, 1024, 10000, 100000, 500000, 1000000, 5000000, 10000000 | +### Storage + +| Name | Type | Usage | Sample collection event | Labels | +|------|------|-------|-------------------------|--------| +| `lean_table_bytes` | Gauge | Estimated byte size of a storage table (key + value bytes) | On each slot (one update per table) | table=`` | + +### Attestation Aggregate Coverage + +Per-slot observability into how many validators/subnets are covered by the attestations the node has aggregated, broken down by pipeline section. The slot is the X-axis (these are sampled once per slot). + +| Name | Type | Usage | Sample collection event | Labels | +|------|------|-------|-------------------------|--------| +| `lean_attestation_aggregate_coverage_validators` | Gauge | Validator coverage in attestation aggregate reports | Each slot | section=timely,late,block,combined,agg_start_new,proposal_combined
subnet=combined (per-subnet breakdown reserved, not yet populated) | +| `lean_attestation_aggregate_coverage_subnets` | Gauge | Number of covered subnets in attestation aggregate reports | Each slot | section=timely,late,block,combined,agg_start_new,proposal_combined | +| `lean_attestation_aggregate_coverage_diff_validators` | Gauge | Validators in the symmetric difference between block-included aggregates and locally-aggregated timely aggregates for the same slot | Each slot | direction=block_only,timely_only | + --- ✅(*) **Partial support**: These metrics are implemented but not collected "on scrape" as the spec requires. They are updated on specific events (e.g., on tick, on block processing) rather than being computed fresh on each Prometheus scrape. From b567480e3bcd0726941188a72b2b07f2075bcd14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 28 May 2026 14:59:41 -0300 Subject: [PATCH 2/3] docs: correct collection events and fix committee histogram buckets Address automated review feedback (Codex/Claude/Kimi): - block_building_failures_total: broaden description; it also counts block-root signing and local process_block failures, not just build - table_bytes: updated after each processed block (not every slot); it retains its previous value on empty slots - attestation aggregate coverage: emission is gated by source data, so document the per-section timing instead of claiming "each slot" - committee_signatures_aggregation_time_seconds: fix pre-existing bucket drift (doc listed 0.005..1; impl is 0.05,0.1,0.25,0.5,0.75,1,2,3,4) --- docs/metrics.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 53dd1601..e248ee05 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -38,7 +38,7 @@ The exposed metrics follow [the leanMetrics specification](https://github.com/le | 
`lean_block_building_payload_aggregation_time_seconds` | Histogram | Time taken to build `aggregated_payloads` during block building | On block production | | 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4 | ✅ | | `lean_block_building_time_seconds` | Histogram | Time taken to build a block | On block production | | 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1 | ✅ | | `lean_block_building_success_total` | Counter | Successful block builds | On block production | | | ✅ | -| `lean_block_building_failures_total` | Counter | Failed block builds (exception in build_block) | On block production failure | | | ✅ | +| `lean_block_building_failures_total` | Counter | Failed block builds (error building the block, signing the block root, or processing it locally) | On block production failure | | | ✅ | ## Fork-Choice Metrics @@ -57,7 +57,7 @@ The exposed metrics follow [the leanMetrics specification](https://github.com/le | `lean_gossip_signatures` | Gauge | Number of gossip signatures in fork-choice store | On gossip signatures update | | | ✅ | | `lean_latest_new_aggregated_payloads` | Gauge | Number of new aggregated payload items | On `latest_new_aggregated_payloads` update | | | ✅ | | `lean_latest_known_aggregated_payloads` | Gauge | Number of known aggregated payload items | On `latest_known_aggregated_payloads` update | | | ✅ | -| `lean_committee_signatures_aggregation_time_seconds` | Histogram | Time taken to aggregate committee signatures | On committee signatures aggregation | | 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1 | ✅ | +| `lean_committee_signatures_aggregation_time_seconds` | Histogram | Time taken to aggregate committee signatures | On committee signatures aggregation | | 0.05, 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4 | ✅ | | `lean_node_sync_status` | Gauge | Node sync status | On node sync status change | status=idle,syncing,synced | | ✅ | ## State Transition Metrics @@ -119,17 +119,21 @@ The metrics below are not part of the [leanMetrics specification](https://github | Name 
| Type | Usage | Sample collection event | Labels | |------|------|-------|-------------------------|--------| -| `lean_table_bytes` | Gauge | Estimated byte size of a storage table (key + value bytes) | On each slot (one update per table) | table=`` | +| `lean_table_bytes` | Gauge | Estimated byte size of a storage table (key + value bytes) | After each processed block (one update per table); retains its previous value on empty slots | table=`<table_name>` | ### Attestation Aggregate Coverage -Per-slot observability into how many validators/subnets are covered by the attestations the node has aggregated, broken down by pipeline section. The slot is the X-axis (these are sampled once per slot). +Observability into how many validators/subnets are covered by the attestations the node has aggregated, broken down by pipeline section (the `section` label). The slot is the X-axis. These are sampled roughly once per slot, but emission is gated by the section's source data, so a gauge can retain its previous value: + +- `timely`, `late`, `block`, `combined` and the `diff_validators` directions are emitted on block import, and **only when the canonical head block carries that round's votes** (otherwise the round is skipped and prior values are kept). +- `agg_start_new` is emitted at interval 2, right before fork-choice aggregation runs. +- `proposal_combined` is emitted only when this node proposes a block. | Name | Type | Usage | Sample collection event | Labels | |------|------|-------|-------------------------|--------| -| `lean_attestation_aggregate_coverage_validators` | Gauge | Validator coverage in attestation aggregate reports | Each slot | section=timely,late,block,combined,agg_start_new,proposal_combined
subnet=combined (per-subnet breakdown reserved, not yet populated) | -| `lean_attestation_aggregate_coverage_subnets` | Gauge | Number of covered subnets in attestation aggregate reports | Each slot | section=timely,late,block,combined,agg_start_new,proposal_combined | -| `lean_attestation_aggregate_coverage_diff_validators` | Gauge | Validators in the symmetric difference between block-included aggregates and locally-aggregated timely aggregates for the same slot | Each slot | direction=block_only,timely_only | +| `lean_attestation_aggregate_coverage_validators` | Gauge | Validator coverage in attestation aggregate reports | Per round, per section (see note above) | section=timely,late,block,combined,agg_start_new,proposal_combined
subnet=combined (per-subnet breakdown reserved, not yet populated) | +| `lean_attestation_aggregate_coverage_subnets` | Gauge | Number of covered subnets in attestation aggregate reports | Per round, per section (see note above) | section=timely,late,block,combined,agg_start_new,proposal_combined | +| `lean_attestation_aggregate_coverage_diff_validators` | Gauge | Validators in the symmetric difference between block-included aggregates and locally-aggregated timely aggregates for the same slot | On block import, when the head carries the round's votes (see note above) | direction=block_only,timely_only | --- From 2b1dd33d54315961e462204c8406fd610c259402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 28 May 2026 15:08:38 -0300 Subject: [PATCH 3/3] chore: link per-subnet coverage TODO to tracking issue #398 --- crates/blockchain/src/coverage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/blockchain/src/coverage.rs b/crates/blockchain/src/coverage.rs index 22a559d6..2019e41b 100644 --- a/crates/blockchain/src/coverage.rs +++ b/crates/blockchain/src/coverage.rs @@ -56,7 +56,7 @@ fn cov_add(seen: &mut [bool], has_subnet: &mut [bool], bits: &AggregationBits) { } fn cov_record(section: &str, seen: &[bool], has_subnet: &[bool]) { - // TODO: emit a per-subnet breakdown (subnet=subnet_N) alongside the + // TODO(#398): emit a per-subnet breakdown (subnet=subnet_N) alongside the // subnet=combined total. `has_subnet` already tracks which subnets are // covered, but we only report the aggregate count here; the per-subnet // label is reserved in the metric definition and not yet populated.