pd/metrics/
sleep_worker.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
//! A sleep worker.
//!
//! ### Overview
//!
//! This submodule defines a metric, and an accompanying worker task, for use in measuring
//! scheduler latency in the tokio runtime. This worker will repeatedly sleep for one second, and
//! then observe the amount of time it *actually* spent waiting to be woken up. This is useful for
//! detecting when the asynchronous runtime is being disrupted by blocking I/O, or other expensive
//! non-coöperative computation.
//!
//! Use [`register_metrics()`] to register the [`SLEEP_DRIFT`] metric with an exporter, and spawn
//! the worker onto a runtime by calling [`run()`].

use {
    super::*,
    std::time::{Duration, Instant},
    tokio::time::sleep,
};

/// Name of the counter metric that accumulates the sleep worker's observed timer drift.
pub const SLEEP_DRIFT: &str = "pd_async_sleep_drift_microseconds";

/// The duration the worker asks the runtime to sleep for on every iteration.
const ONE_SECOND: Duration = Duration::from_secs(1);
/// [`ONE_SECOND`] expressed in microseconds, for comparison against the measured sleep time.
const ONE_SECOND_US: u128 = ONE_SECOND.as_micros();

/// Describe the [`SLEEP_DRIFT`] counter.
///
/// Attaches a unit (microseconds) and a human-readable description to the metric via
/// `describe_counter!`.
pub fn register_metrics() {
    // Help text associated with the metric.
    const DESCRIPTION: &str = "Tracks drift in the async runtime's timer, in microseconds.";

    describe_counter!(SLEEP_DRIFT, Unit::Microseconds, DESCRIPTION);
}

/// Drive the sleep worker.
///
/// Each iteration sleeps for [`ONE_SECOND`], measures how long the task was really suspended,
/// and adds any overshoot (in microseconds) to the [`SLEEP_DRIFT`] counter.
///
/// The loop has no exit, so this future never resolves; the [`std::convert::Infallible`] return
/// type encodes that.
pub async fn run() -> std::convert::Infallible {
    let drift_counter = counter!(SLEEP_DRIFT);

    loop {
        // Request a one second pause from the runtime and time how long it really took, in
        // microseconds, before this task was resumed.
        let started = Instant::now();
        sleep(ONE_SECOND).await;
        let actual = started.elapsed().as_micros();

        // Anything beyond the requested second is drift; waking early counts as zero.
        let overshoot = actual.saturating_sub(ONE_SECOND_US);

        // The counter takes a u64. Should the overshoot somehow exceed that range, log it and
        // clamp to u64::MAX: lossy, but still a clear signal that the runtime is in trouble.
        let narrowed: Result<u64, _> = overshoot.try_into();
        let drift = match narrowed {
            Ok(micros) => micros,
            Err(error) => {
                tracing::error!(?error, %actual, "failed to convert timer drift into a u64");
                u64::MAX
            }
        };

        // Only touch the counter when there was drift to report.
        if drift != 0 {
            drift_counter.increment(drift);
        }
    }
}