pd/metrics/sleep_worker.rs
1//! A sleep worker.
2//!
3//! ### Overview
4//!
5//! This submodule defines a metric, and an accompanying worker task, for use in measuring
6//! scheduler latency in the tokio runtime. This worker will repeatedly sleep for one second, and
7//! then observe the amount of time it *actually* spent waiting to be woken up. This is useful for
8//! detecting when the asynchronous runtime is being disrupted by blocking I/O, or other expensive
9//! non-coöperative computation.
10//!
11//! Use [`register_metrics()`] to register the [`SLEEP_DRIFT`] metric with an exporter, and spawn
12//! the worker onto a runtime by calling [`run()`].
13
14use {
15 super::*,
16 std::time::{Duration, Instant},
17 tokio::time::sleep,
18};
19
/// Name of the metric that accumulates observed timer drift, in microseconds.
pub const SLEEP_DRIFT: &str = "pd_async_sleep_drift_microseconds";

/// The expected sleep duration expressed in microseconds, for comparison against the observed
/// duration.
const ONE_SECOND_US: u128 = ONE_SECOND.as_micros();
/// How long the worker asks the runtime to sleep on each iteration.
const ONE_SECOND: Duration = Duration::from_secs(1);
24
25pub fn register_metrics() {
26 describe_counter!(
27 SLEEP_DRIFT,
28 Unit::Microseconds,
29 "Tracks drift in the async runtime's timer, in microseconds."
30 );
31}
32
33/// Run the sleep worker.
34///
35/// This function will never return.
36pub async fn run() -> std::convert::Infallible {
37 let counter = counter!(SLEEP_DRIFT);
38
39 loop {
40 // Ask the async runtime to pause this task for one second, and then observe the amount of
41 // microseconds we were actually suspended.
42 let start = Instant::now();
43 sleep(ONE_SECOND).await;
44 let end = Instant::now();
45 let actual = end.duration_since(start).as_micros();
46
47 // Find the difference between the observed sleep duration and our expected duration.
48 let drift: u64 = actual
49 .saturating_sub(ONE_SECOND_US)
50 .try_into()
51 .unwrap_or_else(|error| {
52 // In the unlikely event that the number of microseconds we waited can't fit into
53 // a u64, round down to u64::MAX. This is lossy, but will still indicate that
54 // there is a severe issue with the runtime.
55 tracing::error!(?error, %actual, "failed to convert timer drift into a u64");
56 u64::MAX
57 });
58
59 // If there was scheduler drift, increment the counter.
60 match drift {
61 0 => continue,
62 n => counter.increment(n),
63 }
64 }
65}