Part 6: Incident Management and On-Call Automation
The First Real Incident
Alertmanager Configuration
# infrastructure/prometheus-stack/alertmanager-config.yaml
# Applied as a Secret to the kube-prometheus-stack Helm release
#
# NOTE(review): if this file is rendered by Helm, the Alertmanager template
# expressions ({{ .CommonLabels... }}, {{ template ... }}) will be interpreted
# by Helm first and must be escaped (e.g. {{ "{{ .CommonLabels.alertname }}" }})
# — confirm how this Secret is produced.
---
global:
  resolve_timeout: 5m
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

route:
  receiver: 'null'  # Default: silence alerts not matched by any route
  group_by: ['alertname', 'sloth_slo', 'severity']
  group_wait: 30s       # Wait 30s before sending first notification (group related alerts)
  group_interval: 5m    # Wait 5m before sending updates on already-firing alerts
  repeat_interval: 4h   # Re-notify every 4h if alert is still firing
  routes:
    # Critical SLO burn-rate alerts → PagerDuty
    - matchers:
        - severity = critical
        - alertname =~ ".*Page$"
      receiver: pagerduty-critical
      continue: true  # Also send to Slack
    # Critical alerts → Slack
    - matchers:
        - severity = critical
      receiver: slack-critical
    # Warning alerts → Slack only
    - matchers:
        - severity = warning
      receiver: slack-warning
    # Watchdog — always fires to confirm alerting pipeline is working
    - matchers:
        - alertname = Watchdog
      receiver: 'null'

receivers:
  - name: 'null'
  - name: pagerduty-critical
    pagerduty_configs:
      # NOTE(review): the global pagerduty_url above is the Events API v2
      # endpoint, which Alertmanager pairs with `routing_key`; `service_key`
      # targets the legacy "Prometheus" (v1) integration type. Confirm which
      # integration type the PagerDuty service uses before changing.
      - service_key: '{{ .Values.pagerduty.serviceKey }}'
        severity: critical
        description: '{{ template "pagerduty.default.description" . }}'
        details:
          slo: '{{ .CommonLabels.sloth_slo }}'
          burn_rate: '{{ .CommonLabels.sloth_burn_rate }}'
          runbook: '{{ .CommonAnnotations.runbook }}'
  - name: slack-critical
    slack_configs:
      - api_url: '{{ .Values.slack.webhookURL }}'
        channel: '#incidents'
        color: 'danger'
        title: '🔴 Critical: {{ .CommonLabels.alertname }}'
        text: |
          *SLO:* {{ .CommonLabels.sloth_slo }}
          *Burn Rate:* {{ .CommonLabels.sloth_burn_rate }}×
          *Summary:* {{ .CommonAnnotations.summary }}
          *Runbook:* {{ .CommonAnnotations.runbook }}
        send_resolved: true
  - name: slack-warning
    slack_configs:
      - api_url: '{{ .Values.slack.webhookURL }}'
        channel: '#sre-alerts'
        color: 'warning'
        title: '🟡 Warning: {{ .CommonLabels.alertname }}'
        text: |
          *SLO:* {{ .CommonLabels.sloth_slo }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Runbook:* {{ .CommonAnnotations.runbook }}
        send_resolved: true

inhibit_rules:
  # Suppress warnings when a critical alert for the same SLO is already firing
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal: ['sloth_slo']

Runbooks as Code
2. Identify blocking queries in PostgreSQL
3. Kill blocking query (if safe)
Resolution
Escalation
Post-Incident
Toil Identification and Reduction
Post-Incident Review Template
Previous: Part 5: SLIs, SLOs, and Error Budgets in Practice
Next: Part 7: Capacity Planning, Performance, and Chaos Engineering
Last updated