diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json index 60649ff28..12771a0f2 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -637,8 +637,8 @@ "overrides": [] }, "gridPos": { - "h": 12, - "w": 20, + "h": 13, + "w": 9, "x": 0, "y": 17 }, @@ -674,6 +674,95 @@ ], "title": "Disk Temperatures", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The data written to the disk in the last 24h period divided by the physical capacity of the disk", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 10, + "x": 9, + "y": 17 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000 / 
nvme_physical_size_bytes{instance=~\"$node\"}", + "legendFormat": "{{instance}} - {{device}}", + "range": true, + "refId": "A" + } + ], + "title": "DWPD", + "type": "timeseries" } ], "refresh": false, diff --git a/etc/kayobe/kolla/config/prometheus/smart.rules b/etc/kayobe/kolla/config/prometheus/smart.rules index aea36bdf8..853d9268a 100644 --- a/etc/kayobe/kolla/config/prometheus/smart.rules +++ b/etc/kayobe/kolla/config/prometheus/smart.rules @@ -13,4 +13,20 @@ groups: summary: "SMART monitor reports bad disk on (instance {{ $labels.instance }})" description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}" -{% endraw %} \ No newline at end of file + - alert: DWPDTooHigh + expr: (delta(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > 1 + labels: + severity: alert + annotations: + summary: "High 30-Day Average DWPD for {{ $labels.instance }}" + description: "The 30-Day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD" + + - alert: DWPDTooHighWarning + expr: (delta(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > 1 + labels: + severity: warning + annotations: + summary: "High 7-Day Average DWPD for {{ $labels.instance }}" + description: "The 7-day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD" + +{% endraw %} diff --git a/releasenotes/notes/dwpd-6b9fb0c8d6d3a570.yaml b/releasenotes/notes/dwpd-6b9fb0c8d6d3a570.yaml new file mode 100644 index 000000000..62d918519 --- /dev/null +++ b/releasenotes/notes/dwpd-6b9fb0c8d6d3a570.yaml @@ -0,0 +1,10 @@ +--- +features: + - | + Adds a panel in the Hardware Overview dashboard to show DWPD (Drive writes + per day) for NVMEs. This is calculated by dividing the total bytes written + in the past 24 hours by the drive capacity. This is currently only + supported on NVMEs. 
+ - | + Adds a warning alert that fires when the 7-day average exceeds 1 DWPD, and a + critical alert that fires when the 30-day average exceeds 1 DWPD.