Skip to main content

fedimint_server/
metrics.rs

1#![allow(clippy::disallowed_types)]
2// Prometheus registration macros use `HashMap` internally.
3
4pub(crate) mod jsonrpsee;
5
6use std::sync::LazyLock;
7use std::time::Duration;
8
9use fedimint_core::backup::ClientBackupKeyPrefix;
10use fedimint_core::db::{Database, IDatabaseTransactionOpsCoreTyped};
11use fedimint_core::task::{TaskGroup, sleep};
12use fedimint_metrics::prometheus::{
13    HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, register_histogram_vec_with_registry,
14    register_int_gauge_vec_with_registry, register_int_gauge_with_registry,
15};
16use fedimint_metrics::{
17    Histogram, REGISTRY, histogram_opts, opts, register_histogram_with_registry,
18    register_int_counter_vec_with_registry,
19};
20use futures::StreamExt as _;
21use tokio::sync::OnceCell;
22
23use crate::consensus::api::backup_statistics_static;
24
/// Pause between two consecutive recomputations of the backup statistics
/// gauges performed by the task spawned in `initialize_gauge_metrics`.
const BACKUP_STATS_REFRESH_INTERVAL: Duration = Duration::from_mins(1);
26
/// Histogram bucket boundaries for per-transaction element counts.
///
/// Used in this file by the histograms counting processed transaction inputs
/// and outputs. The boundaries follow a 1-2-5 progression from 1 to 5000.
pub static TX_ELEMS_BUCKETS: LazyLock<Vec<f64>> = LazyLock::new(|| {
    const BOUNDARIES: [f64; 12] = [
        1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0,
    ];
    BOUNDARIES.to_vec()
});
32pub(crate) static CONSENSUS_TX_PROCESSED_INPUTS: LazyLock<Histogram> = LazyLock::new(|| {
33    register_histogram_with_registry!(
34        histogram_opts!(
35            "consensus_tx_processed_inputs",
36            "Number of inputs processed in a transaction",
37            TX_ELEMS_BUCKETS.clone()
38        ),
39        REGISTRY
40    )
41    .unwrap()
42});
43pub(crate) static CONSENSUS_TX_PROCESSED_OUTPUTS: LazyLock<Histogram> = LazyLock::new(|| {
44    register_histogram_with_registry!(
45        histogram_opts!(
46            "consensus_tx_processed_outputs",
47            "Number of outputs processed in a transaction",
48            TX_ELEMS_BUCKETS.clone()
49        ),
50        REGISTRY
51    )
52    .unwrap()
53});
54pub(crate) static CONSENSUS_ITEMS_PROCESSED_TOTAL: LazyLock<IntCounterVec> = LazyLock::new(|| {
55    register_int_counter_vec_with_registry!(
56        opts!(
57            "consensus_items_processed_total",
58            "Number of consensus items processed in the consensus",
59        ),
60        &["peer_id"],
61        REGISTRY
62    )
63    .unwrap()
64});
65pub(crate) static CONSENSUS_ITEM_PROCESSING_DURATION_SECONDS: LazyLock<HistogramVec> =
66    LazyLock::new(|| {
67        register_histogram_vec_with_registry!(
68            histogram_opts!(
69                "consensus_item_processing_duration_seconds",
70                "Duration of processing a consensus item",
71            ),
72            &["peer_id"],
73            REGISTRY
74        )
75        .unwrap()
76    });
77pub(crate) static CONSENSUS_ITEM_PROCESSING_MODULE_AUDIT_DURATION_SECONDS: LazyLock<HistogramVec> =
78    LazyLock::new(|| {
79        register_histogram_vec_with_registry!(
80            histogram_opts!(
81                "consensus_item_processing_module_audit_duration_seconds",
82                "Duration of processing a consensus item",
83            ),
84            &["module_id", "module_kind"],
85            REGISTRY
86        )
87        .unwrap()
88    });
89
90pub(crate) static CONSENSUS_ORDERING_LATENCY_SECONDS: LazyLock<Histogram> = LazyLock::new(|| {
91    register_histogram_with_registry!(
92        histogram_opts!(
93            "consensus_ordering_latency_seconds",
94            "Duration of ordering a batch of consensus items",
95        ),
96        REGISTRY
97    )
98    .unwrap()
99});
100
101pub(crate) static IROH_API_CONNECTIONS_ACTIVE: LazyLock<IntGauge> = LazyLock::new(|| {
102    register_int_gauge_with_registry!(
103        opts!(
104            "iroh_api_connections_active",
105            "Number of currently active iroh API connections",
106        ),
107        REGISTRY
108    )
109    .unwrap()
110});
111
112pub(crate) static IROH_API_CONNECTION_DURATION_SECONDS: LazyLock<Histogram> = LazyLock::new(|| {
113    register_histogram_with_registry!(
114        histogram_opts!(
115            "iroh_api_connection_duration_seconds",
116            "Duration of iroh API connections",
117        ),
118        REGISTRY
119    )
120    .unwrap()
121});
122
123pub(crate) static IROH_API_REQUEST_DURATION_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
124    register_histogram_vec_with_registry!(
125        histogram_opts!(
126            "iroh_api_request_duration_seconds",
127            "Duration of processing an iroh API request",
128        ),
129        &["method"],
130        REGISTRY
131    )
132    .unwrap()
133});
134
135pub(crate) static JSONRPC_API_REQUEST_DURATION_SECONDS: LazyLock<HistogramVec> =
136    LazyLock::new(|| {
137        register_histogram_vec_with_registry!(
138            histogram_opts!(
139                "jsonrpc_api_request_duration_seconds",
140                "Duration of processing an rpc request",
141            ),
142            &["method"],
143            REGISTRY
144        )
145        .unwrap()
146    });
147pub(crate) static JSONRPC_API_REQUEST_RESPONSE_CODE: LazyLock<IntCounterVec> =
148    LazyLock::new(|| {
149        register_int_counter_vec_with_registry!(
150            opts!(
151                "jsonrpc_api_request_response_code_total",
152                "Count of response counts and types",
153            ),
154            &["method", "code", "type"],
155            REGISTRY
156        )
157        .unwrap()
158    });
159pub(crate) static CONSENSUS_SESSION_COUNT: LazyLock<IntGauge> = LazyLock::new(|| {
160    register_int_gauge_with_registry!(
161        opts!(
162            "consensus_session_count",
163            "Fedimint consensus session count",
164        ),
165        REGISTRY
166    )
167    .unwrap()
168});
169pub(crate) static CONSENSUS_PEER_CONTRIBUTION_SESSION_IDX: LazyLock<IntGaugeVec> =
170    LazyLock::new(|| {
171        register_int_gauge_vec_with_registry!(
172            opts!(
173                "consensus_peer_contribution_session_idx",
174                "Latest contribution session idx by peer_id",
175            ),
176            &["self_id", "peer_id"],
177            REGISTRY
178        )
179        .unwrap()
180    });
181pub(crate) static BACKUP_WRITE_SIZE_BYTES: LazyLock<Histogram> = LazyLock::new(|| {
182    register_histogram_with_registry!(
183        histogram_opts!(
184            "backup_write_size_bytes",
185            "Size of every backup being written",
186            vec![
187                1.0, 10., 100., 1_000., 5_000., 10_000., 50_000., 100_000., 1_000_000.
188            ]
189        ),
190        REGISTRY
191    )
192    .unwrap()
193});
194pub(crate) static STORED_BACKUPS_COUNT: LazyLock<IntGauge> = LazyLock::new(|| {
195    register_int_gauge_with_registry!(
196        opts!("stored_backups_count", "Total amount of backups stored",),
197        REGISTRY
198    )
199    .unwrap()
200});
201
202pub(crate) static BACKUP_COUNTS: LazyLock<IntGaugeVec> = LazyLock::new(|| {
203    register_int_gauge_vec_with_registry!(
204        opts!(
205            "backup_counts",
206            "Backups refreshed at least once in a given timeframe",
207        ),
208        &["timeframe"],
209        REGISTRY
210    )
211    .unwrap()
212});
213
214pub(crate) static TOTAL_BACKUP_SIZE: LazyLock<IntGauge> = LazyLock::new(|| {
215    register_int_gauge_with_registry!(
216        opts!("total_backup_size", "Total size og backups in the DB",),
217        REGISTRY
218    )
219    .unwrap()
220});
221
/// Lock for spawning exactly one task for updating backup related gauges that
/// are computed fresh from DB regularly instead of being updated incrementally.
///
/// `initialize_gauge_metrics` spawns the refresh task from inside this cell's
/// initializer, so once the cell is set, later calls spawn nothing.
static BACKUP_COUNTS_UPDATE_TASK: OnceCell<()> = OnceCell::const_new();
225
226pub(crate) static PEER_CONNECT_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
227    register_int_counter_vec_with_registry!(
228        opts!("peer_connect_total", "Number of times peer (re/)connected",),
229        &["self_id", "peer_id", "direction"],
230        REGISTRY
231    )
232    .unwrap()
233});
234pub(crate) static PEER_DISCONNECT_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
235    register_int_counter_vec_with_registry!(
236        opts!(
237            "peer_disconnect_total",
238            "Number of times peer (re/)connected",
239        ),
240        &["self_id", "peer_id"],
241        REGISTRY
242    )
243    .unwrap()
244});
245pub(crate) static PEER_MESSAGES_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
246    register_int_counter_vec_with_registry!(
247        opts!("peer_messages_total", "Messages with the peer",),
248        &["self_id", "peer_id", "direction"],
249        REGISTRY
250    )
251    .unwrap()
252});
253
254/// Initialize gauges or other metrics that need eager initialization on start,
255/// e.g. because they are triggered infrequently.
256pub(crate) async fn initialize_gauge_metrics(tg: &TaskGroup, db: &Database) {
257    STORED_BACKUPS_COUNT.set(
258        db.begin_transaction_nc()
259            .await
260            .find_by_prefix(&ClientBackupKeyPrefix)
261            .await
262            .count()
263            .await as i64,
264    );
265
266    let db_inner = db.clone();
267    BACKUP_COUNTS_UPDATE_TASK
268        .get_or_init(move || async move {
269            tg.spawn_cancellable("prometheus_backup_stats", async move {
270                loop {
271                    let backup_counts =
272                        backup_statistics_static(&mut db_inner.begin_transaction_nc().await).await;
273
274                    BACKUP_COUNTS.with_label_values(&["1d"]).set(
275                        backup_counts
276                            .refreshed_1d
277                            .try_into()
278                            .expect("u64 to i64 overflow"),
279                    );
280                    BACKUP_COUNTS.with_label_values(&["1w"]).set(
281                        backup_counts
282                            .refreshed_1w
283                            .try_into()
284                            .expect("u64 to i64 overflow"),
285                    );
286                    BACKUP_COUNTS.with_label_values(&["1m"]).set(
287                        backup_counts
288                            .refreshed_1m
289                            .try_into()
290                            .expect("u64 to i64 overflow"),
291                    );
292                    BACKUP_COUNTS.with_label_values(&["3m"]).set(
293                        backup_counts
294                            .refreshed_3m
295                            .try_into()
296                            .expect("u64 to i64 overflow"),
297                    );
298                    BACKUP_COUNTS.with_label_values(&["all_time"]).set(
299                        backup_counts
300                            .num_backups
301                            .try_into()
302                            .expect("u64 to i64 overflow"),
303                    );
304
305                    TOTAL_BACKUP_SIZE.set(
306                        backup_counts
307                            .total_size
308                            .try_into()
309                            .expect("u64 to i64 overflow"),
310                    );
311
312                    sleep(BACKUP_STATS_REFRESH_INTERVAL).await;
313                }
314            });
315        })
316        .await;
317}