From d7c546844cfaf58ab63739a181fbf73c924fb4d5 Mon Sep 17 00:00:00 2001 From: Paul Hauner Date: Mon, 12 Aug 2019 17:44:47 +1000 Subject: [PATCH] Fix broken beacon chain metrics, add slot clock metrics --- beacon_node/beacon_chain/src/lib.rs | 1 + beacon_node/beacon_chain/src/metrics.rs | 28 +----------------- beacon_node/rest_api/Cargo.toml | 1 + beacon_node/rest_api/src/metrics.rs | 17 +++++++++++ eth2/utils/slot_clock/Cargo.toml | 2 ++ eth2/utils/slot_clock/src/lib.rs | 10 ++++++- eth2/utils/slot_clock/src/metrics.rs | 29 +++++++++++++++++++ .../slot_clock/src/system_time_slot_clock.rs | 4 +++ .../slot_clock/src/testing_slot_clock.rs | 4 +++ 9 files changed, 68 insertions(+), 28 deletions(-) create mode 100644 eth2/utils/slot_clock/src/metrics.rs diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs index 1262bc537..cc7725dd8 100644 --- a/beacon_node/beacon_chain/src/lib.rs +++ b/beacon_node/beacon_chain/src/lib.rs @@ -1,3 +1,4 @@ +#![recursion_limit = "128"] // For lazy-static #[macro_use] extern crate lazy_static; diff --git a/beacon_node/beacon_chain/src/metrics.rs b/beacon_node/beacon_chain/src/metrics.rs index 00a3e5eb2..a4b36cd37 100644 --- a/beacon_node/beacon_chain/src/metrics.rs +++ b/beacon_node/beacon_chain/src/metrics.rs @@ -1,6 +1,6 @@ use crate::{BeaconChain, BeaconChainTypes}; pub use lighthouse_metrics::*; -use types::{BeaconState, Epoch, EthSpec, Hash256, Slot}; +use types::{BeaconState, Epoch, Hash256, Slot}; lazy_static! { /* @@ -140,17 +140,6 @@ lazy_static! { */ pub static ref PERSIST_CHAIN: Result = try_create_histogram("beacon_persist_chain", "Time taken to update the canonical head"); -} - -// Lazy-static is split so we don't reach the crate-level recursion limit. -lazy_static! { - /* - * Slot Clock - */ - pub static ref PRESENT_SLOT: Result = - try_create_int_gauge("beacon_present_slot", "The present slot, according to system time"); - pub static ref PRESENT_EPOCH: Result = - try_create_int_gauge("beacon_present_epoch", "The present epoch, according to system time"); /* * Chain Head @@ -194,21 +183,6 @@ lazy_static! { /// Scrape the `beacon_chain` for metrics that are not constantly updated (e.g., the present slot, /// head state info, etc) and update the Prometheus `DEFAULT_REGISTRY`. pub fn scrape_for_metrics(beacon_chain: &BeaconChain) { - set_gauge_by_slot( - &PRESENT_SLOT, - beacon_chain - .read_slot_clock() - .unwrap_or_else(|| Slot::new(0)), - ); - - set_gauge_by_epoch( - &PRESENT_EPOCH, - beacon_chain - .read_slot_clock() - .map(|s| s.epoch(T::EthSpec::slots_per_epoch())) - .unwrap_or_else(|| Epoch::new(0)), - ); - scrape_head_state::( &beacon_chain.head().beacon_state, beacon_chain.head().beacon_state_root, diff --git a/beacon_node/rest_api/Cargo.toml b/beacon_node/rest_api/Cargo.toml index 100e680de..c7026014c 100644 --- a/beacon_node/rest_api/Cargo.toml +++ b/beacon_node/rest_api/Cargo.toml @@ -26,3 +26,4 @@ tokio = "0.1.17" url = "2.0" lazy_static = "1.3.0" lighthouse_metrics = { path = "../../eth2/utils/lighthouse_metrics" } +slot_clock = { path = "../../eth2/utils/slot_clock" } diff --git a/beacon_node/rest_api/src/metrics.rs b/beacon_node/rest_api/src/metrics.rs index b0f1c1b98..f0ccef5f8 100644 --- a/beacon_node/rest_api/src/metrics.rs +++ b/beacon_node/rest_api/src/metrics.rs @@ -39,6 +39,23 @@ pub fn get_prometheus(req: Request) -> ApiR .get::() .ok_or_else(|| ApiError::ServerError("DBPath extension missing".to_string()))?; + // There are two categories of metrics: + // + // - Dynamically updated: things like histograms and event counters that are updated on the + // fly. + // - Statically updated: things which are only updated at the time of the scrape (used where we + // can avoid cluttering up code with metrics calls). + // + // The `prometheus` crate has a `DEFAULT_REGISTRY` global singleton (via `lazy_static`) which + // keeps the state of all the metrics. Dynamically updated things will already be up-to-date in + // the registry (because they update themselves) however statically updated things need to be + // "scraped". + // + // We proceed by, first updating all the static metrics using `scrape_for_metrics(..)`. Then, + // using `prometheus::gather(..)` to collect the global `DEFAULT_REGISTRY` metrics into a + // string that can be returned via HTTP. + + slot_clock::scrape_for_metrics::(&beacon_chain.slot_clock); store::scrape_for_metrics(&db_path); beacon_chain::scrape_for_metrics(&beacon_chain); diff --git a/eth2/utils/slot_clock/Cargo.toml b/eth2/utils/slot_clock/Cargo.toml index 31a435725..c4b9df5ed 100644 --- a/eth2/utils/slot_clock/Cargo.toml +++ b/eth2/utils/slot_clock/Cargo.toml @@ -6,3 +6,5 @@ edition = "2018" [dependencies] types = { path = "../../types" } +lazy_static = "1.3.0" +lighthouse_metrics = { path = "../lighthouse_metrics" } diff --git a/eth2/utils/slot_clock/src/lib.rs b/eth2/utils/slot_clock/src/lib.rs index 7b86684fa..871743c9e 100644 --- a/eth2/utils/slot_clock/src/lib.rs +++ b/eth2/utils/slot_clock/src/lib.rs @@ -1,9 +1,15 @@ +#[macro_use] +extern crate lazy_static; + +mod metrics; mod system_time_slot_clock; mod testing_slot_clock; +use std::time::Duration; + pub use crate::system_time_slot_clock::{Error as SystemTimeSlotClockError, SystemTimeSlotClock}; pub use crate::testing_slot_clock::{Error as TestingSlotClockError, TestingSlotClock}; -use std::time::Duration; +pub use metrics::scrape_for_metrics; pub use types::Slot; pub trait SlotClock: Send + Sync + Sized { @@ -17,4 +23,6 @@ pub trait SlotClock: Send + Sync + Sized { fn present_slot(&self) -> Result, Self::Error>; fn duration_to_next_slot(&self) -> Result, Self::Error>; + + fn slot_duration_millis(&self) -> u64; } diff --git a/eth2/utils/slot_clock/src/metrics.rs b/eth2/utils/slot_clock/src/metrics.rs new file mode 100644 index 000000000..a9153a10c --- /dev/null +++ b/eth2/utils/slot_clock/src/metrics.rs @@ -0,0 +1,29 @@ +use crate::SlotClock; +pub use lighthouse_metrics::*; +use types::{EthSpec, Slot}; + +lazy_static! { + pub static ref PRESENT_SLOT: Result = + try_create_int_gauge("slotclock_present_slot", "The present wall-clock slot"); + pub static ref PRESENT_EPOCH: Result = + try_create_int_gauge("slotclock_present_epoch", "The present wall-clock epoch"); + pub static ref MILLISECONDS_PER_SLOT: Result = try_create_int_gauge( + "slotclock_slot_time_milliseconds", + "The duration in milliseconds between each slot" + ); +} + +/// Update the global metrics `DEFAULT_REGISTRY` with info from the slot clock. +pub fn scrape_for_metrics(clock: &U) { + let present_slot = match clock.present_slot() { + Ok(Some(slot)) => slot, + _ => Slot::new(0), + }; + + set_gauge(&PRESENT_SLOT, present_slot.as_u64() as i64); + set_gauge( + &PRESENT_EPOCH, + present_slot.epoch(T::slots_per_epoch()).as_u64() as i64, + ); + set_gauge(&MILLISECONDS_PER_SLOT, clock.slot_duration_millis() as i64); +} diff --git a/eth2/utils/slot_clock/src/system_time_slot_clock.rs b/eth2/utils/slot_clock/src/system_time_slot_clock.rs index 7c184b02b..c493a8be8 100644 --- a/eth2/utils/slot_clock/src/system_time_slot_clock.rs +++ b/eth2/utils/slot_clock/src/system_time_slot_clock.rs @@ -52,6 +52,10 @@ impl SlotClock for SystemTimeSlotClock { fn duration_to_next_slot(&self) -> Result, Error> { duration_to_next_slot(self.genesis_seconds, self.slot_duration_seconds) } + + fn slot_duration_millis(&self) -> u64 { + self.slot_duration_seconds * 1000 + } } impl From for Error { diff --git a/eth2/utils/slot_clock/src/testing_slot_clock.rs b/eth2/utils/slot_clock/src/testing_slot_clock.rs index ab00d2baa..f741d3b87 100644 --- a/eth2/utils/slot_clock/src/testing_slot_clock.rs +++ b/eth2/utils/slot_clock/src/testing_slot_clock.rs @@ -40,6 +40,10 @@ impl SlotClock for TestingSlotClock { fn duration_to_next_slot(&self) -> Result, Error> { Ok(Some(Duration::from_secs(1))) } + + fn slot_duration_millis(&self) -> u64 { + 0 + } } #[cfg(test)]