Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 100 additions & 97 deletions config/system-stats-monitor.json
Original file line number Diff line number Diff line change
@@ -1,99 +1,102 @@
{
"cpu": {
"metricsConfigs": {
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
},
"cpu/load_1m": {
"displayName": "cpu/load_1m"
},
"cpu/load_5m": {
"displayName": "cpu/load_5m"
},
"cpu/load_15m": {
"displayName": "cpu/load_15m"
},
"system/processes_total": {
"displayName": "system/processes_total"
},
"system/procs_running": {
"displayName": "system/procs_running"
},
"system/procs_blocked": {
"displayName": "system/procs_blocked"
},
"system/interrupts_total": {
"displayName": "system/interrupts_total"
}
}
},
"disk": {
"metricsConfigs": {
"disk/io_time": {
"displayName": "disk/io_time"
},
"disk/weighted_io": {
"displayName": "disk/weighted_io"
},
"disk/avg_queue_len": {
"displayName": "disk/avg_queue_len"
},
"disk/operation_count": {
"displayName": "disk/operation_count"
},
"disk/merged_operation_count": {
"displayName": "disk/merged_operation_count"
},
"disk/operation_bytes_count": {
"displayName": "disk/operation_bytes_count"
},
"disk/operation_time": {
"displayName": "disk/operation_time"
},
"disk/bytes_used": {
"displayName": "disk/bytes_used"
}
},
"includeRootBlk": true,
"includeAllAttachedBlk": true,
"lsblkTimeout": "5s"
},
"host": {
"metricsConfigs": {
"host/uptime": {
"displayName": "host/uptime"
}
}
},
"memory": {
"metricsConfigs": {
"memory/bytes_used": {
"displayName": "memory/bytes_used"
},
"memory/anonymous_used": {
"displayName": "memory/anonymous_used"
},
"memory/page_cache_used": {
"displayName": "memory/page_cache_used"
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
},
"memory/dirty_used": {
"displayName": "memory/dirty_used"
}
}
},
"osFeature": {
"metricsConfigs": {
"system/os_feature": {
"displayName": "system/os_feature"
}
},
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json"
},
"invokeInterval": "60s"
"cpu": {
"metricsConfigs": {
"cpu/load_15m": {
"displayName": "cpu/load_15m"
},
"cpu/load_1m": {
"displayName": "cpu/load_1m"
},
"cpu/load_5m": {
"displayName": "cpu/load_5m"
},
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
},
"system/cpu_stat": {
"displayName": "system/cpu_stat"
},
"system/interrupts_total": {
"displayName": "system/interrupts_total"
},
"system/processes_total": {
"displayName": "system/processes_total"
},
"system/procs_blocked": {
"displayName": "system/procs_blocked"
},
"system/procs_running": {
"displayName": "system/procs_running"
}
}
},
"disk": {
"includeAllAttachedBlk": true,
"includeRootBlk": true,
"lsblkTimeout": "5s",
"metricsConfigs": {
"disk/avg_queue_len": {
"displayName": "disk/avg_queue_len"
},
"disk/bytes_used": {
"displayName": "disk/bytes_used"
},
"disk/io_time": {
"displayName": "disk/io_time"
},
"disk/merged_operation_count": {
"displayName": "disk/merged_operation_count"
},
"disk/operation_bytes_count": {
"displayName": "disk/operation_bytes_count"
},
"disk/operation_count": {
"displayName": "disk/operation_count"
},
"disk/operation_time": {
"displayName": "disk/operation_time"
},
"disk/weighted_io": {
"displayName": "disk/weighted_io"
}
}
},
"host": {
"metricsConfigs": {
"host/uptime": {
"displayName": "host/uptime"
}
}
},
"invokeInterval": "60s",
"memory": {
"metricsConfigs": {
"memory/anonymous_used": {
"displayName": "memory/anonymous_used"
},
"memory/bytes_used": {
"displayName": "memory/bytes_used"
},
"memory/dirty_used": {
"displayName": "memory/dirty_used"
},
"memory/page_cache_used": {
"displayName": "memory/page_cache_used"
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
}
}
},
"osFeature": {
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json",
"metricsConfigs": {
"system/os_feature": {
"displayName": "system/os_feature"
}
}
}
}
1 change: 1 addition & 0 deletions pkg/exporters/stackdriver/stackdriver_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running",
metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked",
metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total",
metrics.SystemCPUStat: "kubernetes.io/internal/node/guest/system/cpu_stat",
metrics.NetDevRxBytes: "kubernetes.io/internal/node/guest/net/rx_bytes",
metrics.NetDevRxPackets: "kubernetes.io/internal/node/guest/net/rx_packets",
metrics.NetDevRxErrors: "kubernetes.io/internal/node/guest/net/rx_errors",
Expand Down
1 change: 1 addition & 0 deletions pkg/systemstatsmonitor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Below metrics are collected from `cpu` component:
* `system/procs_running`: Number of processes currently running.
* `system/procs_blocked`: Number of processes currently blocked.
* `system/interrupts_total`: Total number of interrupts serviced (cumulative).
* `system/cpu_stat`: Cumulative time each CPU spent in various stages. Collected from `/proc/stat`. Has labels for `cpu` and `stage`.

[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html

Expand Down
57 changes: 42 additions & 15 deletions pkg/systemstatsmonitor/cpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package systemstatsmonitor

import (
"fmt"

"github.com/golang/glog"
"github.com/prometheus/procfs"
"github.com/shirou/gopsutil/cpu"
Expand Down Expand Up @@ -50,6 +52,7 @@ type cpuCollector struct {
mSystemProcsRunning *metrics.Int64Metric
mSystemProcsBlocked *metrics.Int64Metric
mSystemInterruptsTotal *metrics.Int64Metric
mSystemCPUStat *metrics.Float64Metric // per-CPU time from /proc/stat

config *ssmtypes.CPUStatsConfig

Expand All @@ -63,13 +66,13 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
if err != nil {
glog.Fatalf("Failed to retrieve kernel version: %v", err)
}
cc.tags["kernel_version"] = kernelVersion
cc.tags[kernelVersionLabel] = kernelVersion

osVersion, err := util.GetOSVersion()
if err != nil {
glog.Fatalf("Failed to retrieve OS version: %v", err)
}
cc.tags["os_version"] = osVersion
cc.tags[osVersionLabel] = osVersion

cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
metrics.CPURunnableTaskCountID,
Expand Down Expand Up @@ -170,6 +173,17 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err)
}

cc.mSystemCPUStat, err = metrics.NewFloat64Metric(
metrics.SystemCPUStat,
cpuConfig.MetricsConfigs[string(metrics.SystemCPUStat)].DisplayName,
"Cumulative time each cpu spent in various stages.",
"ns",
metrics.Sum,
[]string{osVersionLabel, kernelVersionLabel, cpuLabel, stageLabel})
if err != nil {
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemCPUStat, err)
}

cc.lastUsageTime = make(map[string]float64)

return &cc
Expand Down Expand Up @@ -238,19 +252,6 @@ func (cc *cpuCollector) recordUsage() {
}

func (cc *cpuCollector) recordSystemStats() {
if cc.mSystemProcessesTotal == nil {
return
}
if cc.mSystemProcsRunning == nil {
return
}
if cc.mSystemProcsBlocked == nil {
return
}
if cc.mSystemInterruptsTotal == nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will the cpu_collector break if the config is not applied?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No - this is a nil check for the metric (as opposed to the config). The metric will only be initialized if the config is provided:

if len(ssm.config.CPUConfig.MetricsConfigs) > 0 {
ssm.cpuCollector = NewCPUCollectorOrDie(&ssm.config.CPUConfig)
}

So this will always evaluate to false.

return
}

// Check the NewFS error on its own: it used to be overwritten by the
// fs.Stat() call below and was therefore never inspected.
fs, err := procfs.NewFS("/proc")
if err != nil {
	glog.Errorf("Failed to open procfs: %v", err)
	return
}
stats, err := fs.Stat()
if err != nil {
Expand All @@ -262,6 +263,32 @@ func (cc *cpuCollector) recordSystemStats() {
cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning))
cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked))
cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal))

for i, c := range stats.CPU {
	// Work on a private copy of the collector tags. Maps are reference
	// types, so assigning cc.tags directly would write the cpu and stage
	// labels into the shared map and leak them into every other metric
	// recorded with cc.tags.
	tags := make(map[string]string, len(cc.tags)+2)
	for k, v := range cc.tags {
		tags[k] = v
	}
	tags[cpuLabel] = fmt.Sprintf("cpu%d", i)

	// Record one data point per stage for this CPU; only the stage label
	// changes between the calls.
	tags[stageLabel] = "user"
	cc.mSystemCPUStat.Record(tags, c.User)
	tags[stageLabel] = "nice"
	cc.mSystemCPUStat.Record(tags, c.Nice)
	tags[stageLabel] = "system"
	cc.mSystemCPUStat.Record(tags, c.System)
	tags[stageLabel] = "idle"
	cc.mSystemCPUStat.Record(tags, c.Idle)
	tags[stageLabel] = "iowait"
	cc.mSystemCPUStat.Record(tags, c.Iowait)
	tags[stageLabel] = "iRQ"
	cc.mSystemCPUStat.Record(tags, c.IRQ)
	tags[stageLabel] = "softIRQ"
	cc.mSystemCPUStat.Record(tags, c.SoftIRQ)
	tags[stageLabel] = "steal"
	cc.mSystemCPUStat.Record(tags, c.Steal)
	tags[stageLabel] = "guest"
	cc.mSystemCPUStat.Record(tags, c.Guest)
	tags[stageLabel] = "guestNice"
	cc.mSystemCPUStat.Record(tags, c.GuestNice)
}
}

func (cc *cpuCollector) collect() {
Expand Down
6 changes: 6 additions & 0 deletions pkg/systemstatsmonitor/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,9 @@ const kernelVersionLabel = "kernel_version"

// interfaceNameLabel labels the network interface name
const interfaceNameLabel = "interface_name"

// cpuLabel labels the CPU (e.g. "cpu0")
const cpuLabel = "cpu"

// stageLabel labels the kernel-reported stage (e.g. "user", "idle") in which the CPU time was spent
const stageLabel = "stage"
1 change: 1 addition & 0 deletions pkg/util/metrics/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const (
SystemProcsRunning MetricID = "system/procs_running"
SystemProcsBlocked MetricID = "system/procs_blocked"
SystemInterruptsTotal MetricID = "system/interrupts_total"
SystemCPUStat MetricID = "system/cpu_stat"
NetDevRxBytes MetricID = "net/rx_bytes"
NetDevRxPackets MetricID = "net/rx_packets"
NetDevRxErrors MetricID = "net/rx_errors"
Expand Down