Skip to content

Commit f060537

Browse files
authored
Add safekeeper reconciler metrics (neondatabase#12062)
Adds two metrics to the storcon that are related to the safekeeper reconciler: * `storage_controller_safkeeper_reconciles_queued` to indicate current queue depth * `storage_controller_safkeeper_reconciles_complete` to indicate the number of completed reconciles. Both metrics operate on a per-safekeeper basis (as reconcilers run on a per-safekeeper basis too). These metrics mirror the `storage_controller_pending_reconciles` and `storage_controller_reconcile_complete` metrics, although those are not scoped on a per-pageserver basis but are global for the entire storage controller. Part of neondatabase#11670
1 parent 8a6fc6f commit f060537

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

storage_controller/src/metrics.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,14 @@ pub(crate) struct StorageControllerMetricGroup {
139139
/// HTTP request status counters for handled requests
140140
pub(crate) storage_controller_reconcile_long_running:
141141
measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
142+
143+
/// Indicator of safekeeper reconciler queue depth, broken down by safekeeper, excluding ongoing reconciles.
144+
pub(crate) storage_controller_safkeeper_reconciles_queued:
145+
measured::GaugeVec<SafekeeperReconcilerLabelGroupSet>,
146+
147+
/// Indicator of completed safekeeper reconciles, broken down by safekeeper.
148+
pub(crate) storage_controller_safkeeper_reconciles_complete:
149+
measured::CounterVec<SafekeeperReconcilerLabelGroupSet>,
142150
}
143151

144152
impl StorageControllerMetrics {
@@ -257,6 +265,17 @@ pub(crate) enum Method {
257265
Other,
258266
}
259267

/// Label set identifying a single safekeeper, used by the safekeeper
/// reconciler metrics (the queued-depth gauge and the completed counter).
///
/// All three labels are dynamic — interned at runtime via
/// `lasso::ThreadedRodeo` — since the set of safekeepers is only known
/// at runtime.
#[derive(measured::LabelGroup, Clone)]
#[label(set = SafekeeperReconcilerLabelGroupSet)]
pub(crate) struct SafekeeperReconcilerLabelGroup<'a> {
    // Availability zone id of the safekeeper.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) sk_az: &'a str,
    // Node id of the safekeeper, rendered as a string.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) sk_node_id: &'a str,
    // Hostname of the safekeeper.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) sk_hostname: &'a str,
}
260279
impl From<hyper::Method> for Method {
261280
fn from(value: hyper::Method) -> Self {
262281
if value == hyper::Method::GET {

storage_controller/src/service/safekeeper_reconciler.rs

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ use utils::{
2020
};
2121

2222
use crate::{
23-
persistence::SafekeeperTimelineOpKind, safekeeper::Safekeeper,
23+
metrics::{METRICS_REGISTRY, SafekeeperReconcilerLabelGroup},
24+
persistence::SafekeeperTimelineOpKind,
25+
safekeeper::Safekeeper,
2426
safekeeper_client::SafekeeperClient,
2527
};
2628

@@ -218,7 +220,26 @@ impl ReconcilerHandle {
/// Enqueue a reconcile request for the safekeeper owned by this handle,
/// bumping the per-safekeeper queue-depth gauge before the send.
fn schedule_reconcile(&self, req: ScheduleRequest) {
    // Allocate a cancellation token slot keyed by (tenant, timeline) so a
    // newer request for the same timeline can supersede this one.
    let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
    let hostname = req.safekeeper.skp.host.clone();
    let sk_az = req.safekeeper.skp.availability_zone_id.clone();
    let sk_node_id = req.safekeeper.get_id().to_string();

    // We don't have direct access to the queue depth here, so increase it blindly by 1.
    // We know that putting into the queue increases the queue depth. The receiver will
    // update with the correct value once it processes the next item. To avoid races where we
    // reduce before we increase, leaving the gauge with a 1 value for a long time, we
    // increase it before putting into the queue.
    // NOTE(review): the metric name carries a typo ("safkeeper"); renaming it
    // would break existing dashboards/alerts, so it is left as-is here.
    let queued_gauge = &METRICS_REGISTRY
        .metrics_group
        .storage_controller_safkeeper_reconciles_queued;
    let label_group = SafekeeperReconcilerLabelGroup {
        sk_az: &sk_az,
        sk_node_id: &sk_node_id,
        sk_hostname: &hostname,
    };
    queued_gauge.inc(label_group.clone());

    if let Err(err) = self.tx.send((req, cancel, token_id)) {
        // Send only fails when the receiver is gone, so nothing will ever
        // drain this queue again; zero the gauge rather than leaving the
        // blind increment above standing.
        queued_gauge.set(label_group, 0);
        tracing::info!("scheduling request onto {hostname} returned error: {err}");
    }
}
@@ -283,6 +304,18 @@ impl SafekeeperReconciler {
283304
continue;
284305
}
285306

307+
let queued_gauge = &METRICS_REGISTRY
308+
.metrics_group
309+
.storage_controller_safkeeper_reconciles_queued;
310+
queued_gauge.set(
311+
SafekeeperReconcilerLabelGroup {
312+
sk_az: &req.safekeeper.skp.availability_zone_id,
313+
sk_node_id: &req.safekeeper.get_id().to_string(),
314+
sk_hostname: &req.safekeeper.skp.host,
315+
},
316+
self.rx.len() as i64,
317+
);
318+
286319
tokio::task::spawn(async move {
287320
let kind = req.kind;
288321
let tenant_id = req.tenant_id;
@@ -511,6 +544,16 @@ impl SafekeeperReconcilerInner {
511544
req.generation,
512545
)
513546
.await;
547+
548+
let complete_counter = &METRICS_REGISTRY
549+
.metrics_group
550+
.storage_controller_safkeeper_reconciles_complete;
551+
complete_counter.inc(SafekeeperReconcilerLabelGroup {
552+
sk_az: &req.safekeeper.skp.availability_zone_id,
553+
sk_node_id: &req.safekeeper.get_id().to_string(),
554+
sk_hostname: &req.safekeeper.skp.host,
555+
});
556+
514557
if let Err(err) = res {
515558
tracing::info!(
516559
"couldn't remove reconciliation request onto {} from persistence: {err:?}",

0 commit comments

Comments
 (0)