Part 7: Real-World KQL Patterns and Production Use Cases
From Learning to Practice
Production Monitoring Patterns
Pattern 1: Golden Signals Monitoring
// Golden Signals for Application Service
// Emits one row per signal (Latency, Traffic, Errors, Saturation) with a
// numeric Value, its Unit, and a traffic-light Status string.
let timeRange = 5m;                  // look-back window shared by all four signals
let windowSeconds = timeRange / 1s;  // timespan / timespan yields a real (300.0 for 5m)
let latencyThreshold = 1000; // ms
let errorThreshold = 1.0; // percent
// 1. Latency (Response Time)
// Status is driven by P95; Warning starts at 80% of the threshold.
let latencyMetrics = AppRequests
    | where TimeGenerated > ago(timeRange)
    | summarize
        P50 = percentile(DurationMs, 50),
        P95 = percentile(DurationMs, 95),
        P99 = percentile(DurationMs, 99),
        AvgLatency = avg(DurationMs)
    | extend
        LatencyStatus = case(
            P95 > latencyThreshold, "🔴 Critical",
            P95 > (latencyThreshold * 0.8), "🟡 Warning",
            "🟢 Healthy"
        ),
        Signal = "Latency";
// 2. Traffic (Requests Per Second)
// datetime_diff() only accepts datetime arguments, so the window length is
// derived from the timespan itself (windowSeconds) instead.
let trafficMetrics = AppRequests
    | where TimeGenerated > ago(timeRange)
    | summarize RequestCount = count()
    | extend
        RequestsPerSecond = todouble(RequestCount) / windowSeconds,
        TrafficStatus = "🟢 Healthy",
        Signal = "Traffic";
// 3. Errors (Error Rate)
// Warning starts at 50% of the error threshold.
let errorMetrics = AppRequests
    | where TimeGenerated > ago(timeRange)
    | summarize
        TotalRequests = count(),
        FailedRequests = countif(Success == false)
    // Guard the division: with no requests in the window the rate is reported as 0 rather than NaN.
    | extend ErrorRate = iff(TotalRequests == 0, 0.0, 100.0 * FailedRequests / TotalRequests)
    // Separate extend step so ErrorRate is computed once and reused for the status.
    | extend
        ErrorStatus = case(
            ErrorRate > errorThreshold, "🔴 Critical",
            ErrorRate > (errorThreshold * 0.5), "🟡 Warning",
            "🟢 Healthy"
        ),
        Signal = "Errors";
// 4. Saturation (Resource Utilization)
// Averages each counter over the window, pivots the two counters into columns,
// and reports whichever of CPU / memory is higher.
let saturationMetrics = Perf
    | where TimeGenerated > ago(timeRange)
    | where CounterName in ("% Processor Time", "% Used Memory")
    | summarize AvgValue = avg(CounterValue) by CounterName
    | summarize
        AvgCpu = sumif(AvgValue, CounterName == "% Processor Time"),
        AvgMemory = sumif(AvgValue, CounterName == "% Used Memory")
    | extend MaxUtilization = iff(AvgCpu > AvgMemory, AvgCpu, AvgMemory)
    | extend
        SaturationStatus = case(
            MaxUtilization > 90, "🔴 Critical",
            MaxUtilization > 80, "🟡 Warning",
            "🟢 Healthy"
        ),
        Signal = "Saturation";
// Combine all signals
// Value is cast to real in every leg so union keeps a single Value column
// instead of splitting it by type.
union
    (latencyMetrics | project Signal, Status = LatencyStatus, Value = todouble(P95), Unit = "ms"),
    (trafficMetrics | project Signal, Status = TrafficStatus, Value = todouble(RequestsPerSecond), Unit = "req/s"),
    (errorMetrics | project Signal, Status = ErrorStatus, Value = todouble(ErrorRate), Unit = "%"),
    (saturationMetrics | project Signal, Status = SaturationStatus, Value = todouble(MaxUtilization), Unit = "%")
| project Signal, Value, Unit, Status
Pattern 2: Service Level Objectives (SLO) Tracking
Pattern 3: Anomaly Detection with Baseline
Pattern 4: Dependency Health Matrix
Security Monitoring Patterns
Pattern 5: Failed Authentication Analysis
Pattern 6: Security Event Correlation
Performance Analysis Patterns
Pattern 7: Response Time Percentile Distribution
Pattern 8: Database Query Performance Analysis
Capacity Planning Patterns
Pattern 9: Resource Growth Trend Analysis
Pattern 10: Storage Capacity Forecasting
Troubleshooting Patterns
Pattern 11: Error Spike Investigation
Pattern 12: Deployment Impact Analysis
Advanced Analytics Patterns
Pattern 13: User Journey Analysis
Pattern 14: Correlation Analysis Between Metrics
Key Takeaways
Conclusion
Additional Resources
Last updated