pd/metrics/
sleep_worker.rs

1//! A sleep worker.
2//!
3//! ### Overview
4//!
5//! This submodule defines a metric, and an accompanying worker task, for use in measuring
6//! scheduler latency in the tokio runtime. This worker will repeatedly sleep for one second, and
7//! then observe the amount of time it *actually* spent waiting to be woken up. This is useful for
8//! detecting when the asynchronous runtime is being disrupted by blocking I/O, or other expensive
9//! non-coöperative computation.
10//!
11//! Use [`register_metrics()`] to register the [`SLEEP_DRIFT`] metric with an exporter, and spawn
12//! the worker onto a runtime by calling [`run()`].
13
14use {
15    super::*,
16    std::time::{Duration, Instant},
17    tokio::time::sleep,
18};
19
/// The name of the counter metric that accumulates observed sleep drift, in microseconds.
pub const SLEEP_DRIFT: &str = "pd_async_sleep_drift_microseconds";

/// The duration the worker asks the runtime to sleep for on each iteration.
const ONE_SECOND: Duration = Duration::from_secs(1);
/// [`ONE_SECOND`] expressed in microseconds, as the `u128` returned by [`Duration::as_micros()`].
const ONE_SECOND_US: u128 = ONE_SECOND.as_micros();
24
/// Describe the [`SLEEP_DRIFT`] counter.
///
/// This attaches a unit ([`Unit::Microseconds`]) and a human-readable description to the
/// metric via `describe_counter!`. It does not spawn the worker; see [`run()`] for that.
pub fn register_metrics() {
    describe_counter!(
        SLEEP_DRIFT,
        Unit::Microseconds,
        "Tracks drift in the async runtime's timer, in microseconds."
    );
}
32
33/// Run the sleep worker.
34///
35/// This function will never return.
36pub async fn run() -> std::convert::Infallible {
37    let counter = counter!(SLEEP_DRIFT);
38
39    loop {
40        // Ask the async runtime to pause this task for one second, and then observe the amount of
41        // microseconds we were actually suspended.
42        let start = Instant::now();
43        sleep(ONE_SECOND).await;
44        let end = Instant::now();
45        let actual = end.duration_since(start).as_micros();
46
47        // Find the difference between the observed sleep duration and our expected duration.
48        let drift: u64 = actual
49            .saturating_sub(ONE_SECOND_US)
50            .try_into()
51            .unwrap_or_else(|error| {
52                // In the unlikely event that the number of microseconds we waited can't fit into
53                // a u64, round down to u64::MAX. This is lossy, but will still indicate that
54                // there is a severe issue with the runtime.
55                tracing::error!(?error, %actual, "failed to convert timer drift into a u64");
56                u64::MAX
57            });
58
59        // If there was scheduler drift, increment the counter.
60        match drift {
61            0 => continue,
62            n => counter.increment(n),
63        }
64    }
65}