From 618001cfcf83871066063f29b1b0bd4f88c81dda Mon Sep 17 00:00:00 2001 From: Scott Opell Date: Wed, 14 Jan 2026 21:39:54 +0000 Subject: [PATCH] Fix CPU metric spike on first observation The CPU samplers (both cgroup v2 and procfs) were initializing prev stats to zeros, causing the first delta calculation to be (cumulative_since_container_start - 0) which produces an enormous spike in total_cpu_usage_millicores. Fix by making prev an Option<Stats>. On first poll, we record baseline stats but skip metric emission. Subsequent polls compute proper deltas. Changes: - cgroup/v2/cpu: Make prev Option<Stats>, skip first poll emission - procfs/stat: Make prev Option<Stats>, skip first poll emission Ports commits 5d51345 and 706bc95 from sopell/expose-observer-public to the extracted lading-observer crate. Co-Authored-By: Claude Sonnet 4.5 --- lading_observer/src/linux/cgroup/v2/cpu.rs | 50 +++++++++++++--------- lading_observer/src/linux/procfs/stat.rs | 19 +++++--- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/lading_observer/src/linux/cgroup/v2/cpu.rs b/lading_observer/src/linux/cgroup/v2/cpu.rs index fb0e98115..f3ded61bf 100644 --- a/lading_observer/src/linux/cgroup/v2/cpu.rs +++ b/lading_observer/src/linux/cgroup/v2/cpu.rs @@ -23,21 +23,17 @@ struct Stats { last_instant: Instant, } -#[derive(Debug)] +#[derive(Debug, Default)] pub(crate) struct Sampler { - prev: Stats, + /// Previous stats for delta calculation. None on first poll. + prev: Option<Stats>, } impl Sampler { + /// Create a new CPU Sampler + #[must_use] pub(crate) fn new() -> Self { - Self { - prev: Stats { - usage_usec: 0, - user_usec: 0, - system_usec: 0, - last_instant: Instant::now(), - }, - } + Self { prev: None } } // Read cgroup CPU data and calculate a percentage of usage. 
@@ -80,17 +76,29 @@ impl Sampler { } let now = Instant::now(); - let delta_time = now.duration_since(self.prev.last_instant).as_micros(); - let delta_usage = usage_usec.saturating_sub(self.prev.usage_usec); - let delta_user = user_usec.saturating_sub(self.prev.user_usec); - let delta_system = system_usec.saturating_sub(self.prev.system_usec); - - // Update previous stats and if there's a time delta calculate the CPU - // usage. - self.prev.usage_usec = usage_usec; - self.prev.user_usec = user_usec; - self.prev.system_usec = system_usec; - self.prev.last_instant = now; + let current = Stats { + usage_usec, + user_usec, + system_usec, + last_instant: now, + }; + + // On first poll, just record baseline stats without emitting metrics. + // This avoids a spike where delta = (cumulative_since_container_start - 0). + let Some(ref prev) = self.prev else { + self.prev = Some(current); + return Ok(()); + }; + + let delta_time = now.duration_since(prev.last_instant).as_micros(); + let delta_usage = usage_usec.saturating_sub(prev.usage_usec); + let delta_user = user_usec.saturating_sub(prev.user_usec); + let delta_system = system_usec.saturating_sub(prev.system_usec); + + // Update previous stats for next poll + self.prev = Some(current); + + // Emit metrics if there's a time delta if delta_time > 0 { let delta_time = delta_time as f64; diff --git a/lading_observer/src/linux/procfs/stat.rs b/lading_observer/src/linux/procfs/stat.rs index f06913c5a..75ba9f4a7 100644 --- a/lading_observer/src/linux/procfs/stat.rs +++ b/lading_observer/src/linux/procfs/stat.rs @@ -38,14 +38,15 @@ struct CpuUtilization { #[derive(Debug)] pub(crate) struct Sampler { ticks_per_second: f64, - prev: Stats, + /// Previous stats for delta calculation. None on first poll. 
+ prev: Option<Stats>, } impl Sampler { pub(crate) fn new() -> Self { Self { ticks_per_second: unsafe { nix::libc::sysconf(nix::libc::_SC_CLK_TCK) } as f64, - prev: Stats::default(), + prev: None, } } @@ -78,16 +79,20 @@ impl Sampler { let (cur_pid, utime_ticks, stime_ticks) = parse(&stat_contents)?; assert!(cur_pid == pid); - // Get or initialize the previous stats. Note that the first time this is - // initialized we intentionally set last_instance to now to avoid scheduling - // shenanigans. let cur_stats = Stats { user_ticks: utime_ticks, system_ticks: stime_ticks, uptime_ticks: (uptime_secs * self.ticks_per_second).round() as u64, }; - if let Some(util) = compute_cpu_usage(self.prev, cur_stats, allowed_cores) { + // On first poll, just record baseline stats without emitting metrics. + // This avoids a spike where delta = (cumulative_since_process_start - 0). + let Some(ref prev) = self.prev else { + self.prev = Some(cur_stats); + return Ok(()); + }; + + if let Some(util) = compute_cpu_usage(*prev, cur_stats, allowed_cores) { // NOTE these metric names are paired with names in cgroup/v2/cpu.rs and // must remain consistent. If you change these, change those. gauge!("stat.total_cpu_percentage", labels).set(util.total_cpu_percentage); @@ -103,7 +108,7 @@ impl Sampler { gauge!("stat.cpu_limit_millicores", labels).set(limit_millicores); } - self.prev = cur_stats; + self.prev = Some(cur_stats); Ok(()) }