Skip to content

Commit fd64b8f

Browse files
authored
Merge pull request kubernetes-sigs#811 from djzager/metrics-doc
📖 document exposed metrics
2 parents 9f8aab6 + daa6f4d commit fd64b8f

File tree

3 files changed

+136
-113
lines changed

3 files changed

+136
-113
lines changed

pkg/metrics/client_go_adapter.go

Lines changed: 48 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -29,78 +29,91 @@ import (
2929
// that client-go registers metrics. We copy the names and formats
3030
// from Kubernetes so that we match the core controllers.
3131

32+
// Metrics subsystem and all of the keys used by the rest client.
33+
const (
34+
RestClientSubsystem = "rest_client"
35+
LatencyKey = "request_latency_seconds"
36+
ResultKey = "requests_total"
37+
)
38+
39+
// Metrics subsystem and all keys used by the reflectors.
40+
const (
41+
ReflectorSubsystem = "reflector"
42+
ListsTotalKey = "lists_total"
43+
ListsDurationKey = "list_duration_seconds"
44+
ItemsPerListKey = "items_per_list"
45+
WatchesTotalKey = "watches_total"
46+
ShortWatchesTotalKey = "short_watches_total"
47+
WatchDurationKey = "watch_duration_seconds"
48+
ItemsPerWatchKey = "items_per_watch"
49+
LastResourceVersionKey = "last_resource_version"
50+
)
51+
3252
var (
3353
// client metrics
34-
35-
requestLatency = prometheus.NewHistogramVec(
36-
prometheus.HistogramOpts{
37-
Name: "rest_client_request_latency_seconds",
38-
Help: "Request latency in seconds. Broken down by verb and URL.",
39-
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
40-
},
41-
[]string{"verb", "url"},
42-
)
43-
44-
requestResult = prometheus.NewCounterVec(
45-
prometheus.CounterOpts{
46-
Name: "rest_client_requests_total",
47-
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
48-
},
49-
[]string{"code", "method", "host"},
50-
)
54+
requestLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
55+
Subsystem: RestClientSubsystem,
56+
Name: LatencyKey,
57+
Help: "Request latency in seconds. Broken down by verb and URL.",
58+
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
59+
}, []string{"verb", "url"})
60+
61+
requestResult = prometheus.NewCounterVec(prometheus.CounterOpts{
62+
Subsystem: RestClientSubsystem,
63+
Name: ResultKey,
64+
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
65+
}, []string{"code", "method", "host"})
5166

5267
// reflector metrics
5368

5469
// TODO(directxman12): update these to be histograms once the metrics overhaul KEP
5570
// PRs start landing.
5671

57-
reflectorSubsystem = "reflector"
58-
5972
listsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
60-
Subsystem: reflectorSubsystem,
61-
Name: "lists_total",
73+
Subsystem: ReflectorSubsystem,
74+
Name: ListsTotalKey,
6275
Help: "Total number of API lists done by the reflectors",
6376
}, []string{"name"})
6477

6578
listsDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
66-
Subsystem: reflectorSubsystem,
67-
Name: "list_duration_seconds",
79+
Subsystem: ReflectorSubsystem,
80+
Name: ListsDurationKey,
6881
Help: "How long an API list takes to return and decode for the reflectors",
6982
}, []string{"name"})
7083

7184
itemsPerList = prometheus.NewSummaryVec(prometheus.SummaryOpts{
72-
Subsystem: reflectorSubsystem,
73-
Name: "items_per_list",
85+
Subsystem: ReflectorSubsystem,
86+
Name: ItemsPerListKey,
7487
Help: "How many items an API list returns to the reflectors",
7588
}, []string{"name"})
7689

7790
watchesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
78-
Subsystem: reflectorSubsystem,
79-
Name: "watches_total",
91+
Subsystem: ReflectorSubsystem,
92+
Name: WatchesTotalKey,
8093
Help: "Total number of API watches done by the reflectors",
8194
}, []string{"name"})
8295

8396
shortWatchesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
84-
Subsystem: reflectorSubsystem,
85-
Name: "short_watches_total",
97+
Subsystem: ReflectorSubsystem,
98+
Name: ShortWatchesTotalKey,
8699
Help: "Total number of short API watches done by the reflectors",
87100
}, []string{"name"})
88101

89102
watchDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
90-
Subsystem: reflectorSubsystem,
91-
Name: "watch_duration_seconds",
103+
Subsystem: ReflectorSubsystem,
104+
Name: WatchDurationKey,
92105
Help: "How long an API watch takes to return and decode for the reflectors",
93106
}, []string{"name"})
94107

95108
itemsPerWatch = prometheus.NewSummaryVec(prometheus.SummaryOpts{
96-
Subsystem: reflectorSubsystem,
97-
Name: "items_per_watch",
109+
Subsystem: ReflectorSubsystem,
110+
Name: ItemsPerWatchKey,
98111
Help: "How many items an API watch returns to the reflectors",
99112
}, []string{"name"})
100113

101114
lastResourceVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
102-
Subsystem: reflectorSubsystem,
103-
Name: "last_resource_version",
115+
Subsystem: ReflectorSubsystem,
116+
Name: LastResourceVersionKey,
104117
Help: "Last resource version seen for the reflectors",
105118
}, []string{"name"})
106119
)

pkg/metrics/listener.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,12 @@ package metrics
1919
import (
2020
"fmt"
2121
"net"
22+
23+
logf "sigs.k8s.io/controller-runtime/pkg/internal/log"
2224
)
2325

26+
var log = logf.RuntimeLog.WithName("metrics")
27+
2428
// DefaultBindAddress sets the default bind address for the metrics listener
2529
// The metrics listener is on by default.
2630
var DefaultBindAddress = ":8080"

pkg/metrics/workqueue.go

Lines changed: 84 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -19,106 +19,112 @@ package metrics
1919
import (
2020
"github.com/prometheus/client_golang/prometheus"
2121
"k8s.io/client-go/util/workqueue"
22-
logf "sigs.k8s.io/controller-runtime/pkg/internal/log"
2322
)
2423

25-
var log = logf.RuntimeLog.WithName("metrics")
26-
2724
// This file is copied and adapted from k8s.io/kubernetes/pkg/util/workqueue/prometheus
2825
// which registers metrics to the default prometheus Registry. We require very
2926
// similar functionality, but must register metrics to a different Registry.
3027

28+
// Metrics subsystem and all keys used by the workqueue.
29+
const (
30+
WorkQueueSubsystem = "workqueue"
31+
DepthKey = "depth"
32+
AddsKey = "adds_total"
33+
QueueLatencyKey = "queue_duration_seconds"
34+
WorkDurationKey = "work_duration_seconds"
35+
UnfinishedWorkKey = "unfinished_work_seconds"
36+
LongestRunningProcessorKey = "longest_running_processor_seconds"
37+
RetriesKey = "retries_total"
38+
)
39+
40+
var (
41+
depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
42+
Subsystem: WorkQueueSubsystem,
43+
Name: DepthKey,
44+
Help: "Current depth of workqueue",
45+
}, []string{"name"})
46+
47+
adds = prometheus.NewCounterVec(prometheus.CounterOpts{
48+
Subsystem: WorkQueueSubsystem,
49+
Name: AddsKey,
50+
Help: "Total number of adds handled by workqueue",
51+
}, []string{"name"})
52+
53+
latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
54+
Subsystem: WorkQueueSubsystem,
55+
Name: QueueLatencyKey,
56+
Help: "How long in seconds an item stays in workqueue before being requested",
57+
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
58+
}, []string{"name"})
59+
60+
workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
61+
Subsystem: WorkQueueSubsystem,
62+
Name: WorkDurationKey,
63+
Help: "How long in seconds processing an item from workqueue takes.",
64+
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
65+
}, []string{"name"})
66+
67+
unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{
68+
Subsystem: WorkQueueSubsystem,
69+
Name: UnfinishedWorkKey,
70+
Help: "How many seconds of work has been done that " +
71+
"is in progress and hasn't been observed by work_duration. Large " +
72+
"values indicate stuck threads. One can deduce the number of stuck " +
73+
"threads by observing the rate at which this increases.",
74+
}, []string{"name"})
75+
76+
longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{
77+
Subsystem: WorkQueueSubsystem,
78+
Name: LongestRunningProcessorKey,
79+
Help: "How many seconds has the longest running " +
80+
"processor for workqueue been running.",
81+
}, []string{"name"})
82+
83+
retries = prometheus.NewCounterVec(prometheus.CounterOpts{
84+
Subsystem: WorkQueueSubsystem,
85+
Name: RetriesKey,
86+
Help: "Total number of retries handled by workqueue",
87+
}, []string{"name"})
88+
)
89+
3190
func init() {
32-
workqueue.SetProvider(workqueueMetricsProvider{})
33-
}
91+
Registry.MustRegister(depth)
92+
Registry.MustRegister(adds)
93+
Registry.MustRegister(latency)
94+
Registry.MustRegister(workDuration)
95+
Registry.MustRegister(unfinished)
96+
Registry.MustRegister(longestRunningProcessor)
97+
Registry.MustRegister(retries)
3498

35-
func registerWorkqueueMetric(c prometheus.Collector, name, queue string) {
36-
if err := Registry.Register(c); err != nil {
37-
log.Error(err, "failed to register metric", "name", name, "queue", queue)
38-
}
99+
workqueue.SetProvider(workqueueMetricsProvider{})
39100
}
40101

41102
type workqueueMetricsProvider struct{}
42103

43-
func (workqueueMetricsProvider) NewDepthMetric(queue string) workqueue.GaugeMetric {
44-
const name = "workqueue_depth"
45-
m := prometheus.NewGauge(prometheus.GaugeOpts{
46-
Name: name,
47-
Help: "Current depth of workqueue",
48-
ConstLabels: prometheus.Labels{"name": queue},
49-
})
50-
registerWorkqueueMetric(m, name, queue)
51-
return m
104+
func (workqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
105+
return depth.WithLabelValues(name)
52106
}
53107

54-
func (workqueueMetricsProvider) NewAddsMetric(queue string) workqueue.CounterMetric {
55-
const name = "workqueue_adds_total"
56-
m := prometheus.NewCounter(prometheus.CounterOpts{
57-
Name: name,
58-
Help: "Total number of adds handled by workqueue",
59-
ConstLabels: prometheus.Labels{"name": queue},
60-
})
61-
registerWorkqueueMetric(m, name, queue)
62-
return m
108+
func (workqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric {
109+
return adds.WithLabelValues(name)
63110
}
64111

65-
func (workqueueMetricsProvider) NewLatencyMetric(queue string) workqueue.HistogramMetric {
66-
const name = "workqueue_queue_duration_seconds"
67-
m := prometheus.NewHistogram(prometheus.HistogramOpts{
68-
Name: name,
69-
Help: "How long in seconds an item stays in workqueue before being requested.",
70-
ConstLabels: prometheus.Labels{"name": queue},
71-
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
72-
})
73-
registerWorkqueueMetric(m, name, queue)
74-
return m
112+
func (workqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
113+
return latency.WithLabelValues(name)
75114
}
76115

77-
func (workqueueMetricsProvider) NewWorkDurationMetric(queue string) workqueue.HistogramMetric {
78-
const name = "workqueue_work_duration_seconds"
79-
m := prometheus.NewHistogram(prometheus.HistogramOpts{
80-
Name: name,
81-
Help: "How long in seconds processing an item from workqueue takes.",
82-
ConstLabels: prometheus.Labels{"name": queue},
83-
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
84-
})
85-
registerWorkqueueMetric(m, name, queue)
86-
return m
116+
func (workqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
117+
return workDuration.WithLabelValues(name)
87118
}
88119

89-
func (workqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(queue string) workqueue.SettableGaugeMetric {
90-
const name = "workqueue_unfinished_work_seconds"
91-
m := prometheus.NewGauge(prometheus.GaugeOpts{
92-
Name: name,
93-
Help: "How many seconds of work has done that " +
94-
"is in progress and hasn't been observed by work_duration. Large " +
95-
"values indicate stuck threads. One can deduce the number of stuck " +
96-
"threads by observing the rate at which this increases.",
97-
ConstLabels: prometheus.Labels{"name": queue},
98-
})
99-
registerWorkqueueMetric(m, name, queue)
100-
return m
120+
func (workqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
121+
return unfinished.WithLabelValues(name)
101122
}
102123

103-
func (workqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(queue string) workqueue.SettableGaugeMetric {
104-
const name = "workqueue_longest_running_processor_seconds"
105-
m := prometheus.NewGauge(prometheus.GaugeOpts{
106-
Name: name,
107-
Help: "How many seconds has the longest running " +
108-
"processor for workqueue been running.",
109-
ConstLabels: prometheus.Labels{"name": queue},
110-
})
111-
registerWorkqueueMetric(m, name, queue)
112-
return m
124+
func (workqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
125+
return longestRunningProcessor.WithLabelValues(name)
113126
}
114127

115-
func (workqueueMetricsProvider) NewRetriesMetric(queue string) workqueue.CounterMetric {
116-
const name = "workqueue_retries_total"
117-
m := prometheus.NewCounter(prometheus.CounterOpts{
118-
Name: name,
119-
Help: "Total number of retries handled by workqueue",
120-
ConstLabels: prometheus.Labels{"name": queue},
121-
})
122-
registerWorkqueueMetric(m, name, queue)
123-
return m
128+
func (workqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
129+
return retries.WithLabelValues(name)
124130
}

0 commit comments

Comments
 (0)