Skip to content

Commit 84604df

Browse files
committed
04_Enterprise_Observability_at_Scale
1 parent 80a8c1d commit 84604df

File tree

2 files changed

+584
-0
lines changed

2 files changed

+584
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
# Centralized Monitoring Aggregation
2+
3+
## Overview
4+
This guide provides detailed instructions for implementing centralized monitoring aggregation in multi-region Kubernetes deployments, ensuring comprehensive observability across all regions.
5+
6+
## Prerequisites
7+
- Multi-region Kubernetes clusters
8+
- kubectl configured
9+
- Helm installed
10+
- Basic understanding of monitoring concepts
11+
- Access to create and modify resources
12+
13+
## Prometheus Federation Setup
14+
15+
### 1. Install Prometheus
16+
```bash
17+
# Add Helm repository
18+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
19+
helm repo update
20+
21+
# Install Prometheus
22+
helm install prometheus prometheus-community/kube-prometheus-stack \
23+
--namespace monitoring --create-namespace \
24+
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
25+
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \
26+
--set prometheus.prometheusSpec.ruleSelectorNilUsesHelmValues=false \
27+
--set grafana.enabled=true
28+
```
29+
30+
### 2. Configure Federation
31+
```yaml
32+
# prometheus-federation.yaml
33+
apiVersion: v1
34+
kind: ConfigMap
35+
metadata:
36+
name: prometheus-federation
37+
namespace: monitoring
38+
data:
39+
prometheus.yml: |
40+
global:
41+
scrape_interval: 15s
42+
evaluation_interval: 15s
43+
scrape_configs:
44+
- job_name: 'federate'
45+
honor_labels: true
46+
metrics_path: '/federate'
47+
params:
48+
'match[]':
49+
- '{job="prometheus"}'
50+
- '{job="node-exporter"}'
51+
- '{job="kube-state-metrics"}'
52+
static_configs:
53+
- targets:
54+
- 'prometheus-region-1.monitoring.svc.cluster.local:9090'
55+
- 'prometheus-region-2.monitoring.svc.cluster.local:9090'
56+
- 'prometheus-region-3.monitoring.svc.cluster.local:9090'
57+
```
58+
59+
## Thanos Setup
60+
61+
### 1. Install Thanos
62+
```bash
63+
# Add Helm repository
64+
helm repo add bitnami https://charts.bitnami.com/bitnami
65+
helm repo update
66+
67+
# Install Thanos
68+
helm install thanos bitnami/thanos \
69+
--namespace monitoring \
70+
--set-string objstoreConfig="type: S3
71+
config:
72+
  bucket: thanos-data
73+
  endpoint: s3.amazonaws.com
74+
  access_key: ${AWS_ACCESS_KEY_ID}
75+
  secret_key: ${AWS_SECRET_ACCESS_KEY}
76+
"
77+
```
78+
79+
### 2. Configure Thanos Components
80+
```yaml
81+
# thanos-config.yaml
82+
apiVersion: apps/v1
83+
kind: Deployment
84+
metadata:
85+
name: thanos-query
86+
namespace: monitoring
87+
spec:
88+
replicas: 2
89+
selector:
90+
matchLabels:
91+
app: thanos-query
92+
template:
93+
metadata:
94+
labels:
95+
app: thanos-query
96+
spec:
97+
containers:
98+
- name: thanos-query
99+
image: thanosio/thanos:v0.28.0
100+
args:
101+
- query
102+
- --http-address=0.0.0.0:10902
103+
- --grpc-address=0.0.0.0:10901
104+
- --store=thanos-store.monitoring.svc.cluster.local:10901
105+
- --store=thanos-sidecar-region-1.monitoring.svc.cluster.local:10901
106+
- --store=thanos-sidecar-region-2.monitoring.svc.cluster.local:10901
107+
- --store=thanos-sidecar-region-3.monitoring.svc.cluster.local:10901
108+
ports:
109+
- name: http
110+
containerPort: 10902
111+
- name: grpc
112+
containerPort: 10901
113+
```
114+
115+
## Grafana Configuration
116+
117+
### 1. Configure Data Sources
118+
```yaml
119+
# grafana-datasources.yaml
120+
apiVersion: v1
121+
kind: ConfigMap
122+
metadata:
123+
name: grafana-datasources
124+
namespace: monitoring
125+
data:
126+
prometheus.yaml: |
127+
apiVersion: 1
128+
datasources:
129+
- name: Prometheus
130+
type: prometheus
131+
url: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090
132+
access: proxy
133+
isDefault: true
134+
- name: Thanos
135+
type: prometheus
136+
url: http://thanos-query.monitoring.svc.cluster.local:10902
137+
access: proxy
138+
```
139+
140+
### 2. Create Global Dashboard
141+
```yaml
142+
# global-dashboard.yaml
143+
apiVersion: integreatly.org/v1alpha1
144+
kind: GrafanaDashboard
145+
metadata:
146+
name: global-monitoring
147+
namespace: monitoring
148+
spec:
149+
json: |
150+
{
151+
"dashboard": {
152+
"title": "Global Monitoring",
153+
"panels": [
154+
{
155+
"title": "Cluster Status",
156+
"type": "table",
157+
"datasource": "Thanos",
158+
"targets": [
159+
{
160+
"expr": "kube_cluster_status",
161+
"legendFormat": "{{region}}"
162+
}
163+
]
164+
},
165+
{
166+
"title": "Resource Usage",
167+
"type": "graph",
168+
"datasource": "Thanos",
169+
"targets": [
170+
{
171+
"expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (region)",
172+
"legendFormat": "CPU {{region}}"
173+
},
174+
{
175+
"expr": "sum(container_memory_usage_bytes) by (region)",
176+
"legendFormat": "Memory {{region}}"
177+
}
178+
]
179+
},
180+
{
181+
"title": "Request Rate",
182+
"type": "graph",
183+
"datasource": "Thanos",
184+
"targets": [
185+
{
186+
"expr": "sum(rate(http_requests_total[5m])) by (region)",
187+
"legendFormat": "{{region}}"
188+
}
189+
]
190+
}
191+
]
192+
}
193+
}
194+
```
195+
196+
## Alerting Configuration
197+
198+
### 1. Configure Alertmanager
199+
```yaml
200+
# alertmanager-config.yaml
201+
apiVersion: v1
202+
kind: ConfigMap
203+
metadata:
204+
name: alertmanager-config
205+
namespace: monitoring
206+
data:
207+
alertmanager.yml: |
208+
global:
209+
resolve_timeout: 5m
210+
route:
211+
group_by: ['alertname', 'region']
212+
group_wait: 30s
213+
group_interval: 5m
214+
repeat_interval: 4h
215+
receiver: 'slack'
216+
receivers:
217+
- name: 'slack'
218+
slack_configs:
219+
- api_url: '${SLACK_WEBHOOK_URL}'
220+
channel: '#alerts'
221+
send_resolved: true
222+
```
223+
224+
### 2. Configure Alert Rules
225+
```yaml
226+
# alert-rules.yaml
227+
apiVersion: monitoring.coreos.com/v1
228+
kind: PrometheusRule
229+
metadata:
230+
name: global-alerts
231+
namespace: monitoring
232+
spec:
233+
groups:
234+
- name: global
235+
rules:
236+
- alert: HighErrorRate
237+
expr: sum(rate(http_requests_total{status=~"5.."}[5m])) by (region) / sum(rate(http_requests_total[5m])) by (region) > 0.01
238+
for: 5m
239+
labels:
240+
severity: critical
241+
annotations:
242+
summary: High error rate in {{ $labels.region }}
243+
description: Error rate is above 1% in {{ $labels.region }}
244+
- alert: HighLatency
245+
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, region)) > 1
246+
for: 5m
247+
labels:
248+
severity: warning
249+
annotations:
250+
summary: High latency in {{ $labels.region }}
251+
description: 95th percentile latency is above 1s in {{ $labels.region }}
252+
```
253+
254+
## Best Practices
255+
256+
### 1. Monitoring Configuration
257+
- Set appropriate scrape intervals
258+
- Configure retention policies
259+
- Monitor resource usage
260+
- Regular review
261+
- Document configurations
262+
263+
### 2. Data Management
264+
- Implement data retention
265+
- Configure backup strategy
266+
- Monitor storage usage
267+
- Regular cleanup
268+
- Document procedures
269+
270+
### 3. Alert Management
271+
- Set meaningful thresholds
272+
- Configure notification channels
273+
- Regular review
274+
- Document procedures
275+
- Update configurations
276+
277+
## Next Steps
278+
1. Monitor system performance
279+
2. Optimize configurations
280+
3. Implement backup strategy
281+
4. Schedule regular reviews
282+
5. Keep documentation up to date

0 commit comments

Comments
 (0)