Observability & Monitoring

← Back to System Design 101 | ← Previous: Distributed Systems

Introduction

"You can't fix what you can't see." Observability is the practice of instrumenting systems to understand their internal state from external outputs. This article covers monitoring patterns I use to keep production systems healthy.

The Three Pillars

1. Metrics (Prometheus)

from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time

# Prometheus instruments for the HTTP layer

# Requests handled, partitioned by HTTP method, endpoint and response status
request_count = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=('method', 'endpoint', 'status'),
)

# Request durations (seconds, per the metric name), by method and endpoint
request_duration = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request duration',
    labelnames=('method', 'endpoint'),
)

# Unlabelled gauge for the current number of active users
active_users = Gauge(
    name='active_users_total',
    documentation='Number of active users',
)

# Instrument code
@app.get("/api/users/{user_id}")
async def get_user(user_id: str):
    """Return the user document for ``user_id`` while recording request metrics.

    Each call increments ``request_count`` (status 200 on success, 500 when the
    lookup raises) and always records the elapsed time in ``request_duration``.
    Exceptions from the lookup are counted and then re-raised unchanged.
    """
    # perf_counter() is monotonic, so the measured duration cannot go negative
    # or jump when the wall clock is adjusted (time.time() can).
    start_time = time.perf_counter()

    try:
        # NOTE(review): a missing user (find_one presumably returns None) is
        # still counted and returned as a 200 - confirm that is intended.
        user = db.users.find_one({"id": user_id})
    except Exception:
        request_count.labels(
            method="GET",
            endpoint="/api/users",
            status=500
        ).inc()
        raise
    else:
        # Counted outside the try body so a metrics failure is never
        # misreported as a failed lookup.
        request_count.labels(
            method="GET",
            endpoint="/api/users",
            status=200
        ).inc()
        return user
    finally:
        # Runs on both the success and the error path
        request_duration.labels(
            method="GET",
            endpoint="/api/users"
        ).observe(time.perf_counter() - start_time)

# Serve the collected metrics over HTTP on port 9090
start_http_server(port=9090)

Prometheus query examples:

# Request rate
rate(http_requests_total[5m])

# P95 latency
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

# Error rate
rate(http_requests_total{status=~"5.."}[5m])

2. Logging (ELK Stack)

import json
import logging
from datetime import datetime, timezone

class StructuredLogger:
    """
    Structured (JSON) logging for easy parsing and searching.
    I use this pattern across all services.

    Wraps the standard ``logging`` logger named after the service and emits
    one JSON object per record through a stream handler. Keyword arguments
    given to :meth:`info` / :meth:`error` become top-level fields of that
    object.

    Note: ``logging`` itself raises ``KeyError`` for keyword names that
    collide with built-in ``LogRecord`` attributes (e.g. ``message``,
    ``module``, ``name``).
    """

    def __init__(self, service_name: str):
        self.service_name = service_name
        self.logger = logging.getLogger(service_name)
        self.logger.setLevel(logging.INFO)

        # logging.getLogger() hands back the same logger object for the same
        # name, so attach the JSON handler only once; otherwise each extra
        # StructuredLogger for this service would duplicate every log line.
        already_configured = any(
            isinstance(existing.formatter, self.JSONFormatter)
            for existing in self.logger.handlers
        )
        if not already_configured:
            handler = logging.StreamHandler()
            handler.setFormatter(self.JSONFormatter())
            self.logger.addHandler(handler)

    class JSONFormatter(logging.Formatter):
        """Render a ``LogRecord`` as a single-line JSON object."""

        # Attribute names every LogRecord carries; anything else found on a
        # record was supplied by the caller through ``extra``. Computed from a
        # throwaway record so it tracks the running Python version.
        _STANDARD_ATTRS = frozenset(
            vars(logging.LogRecord("", 0, "", 0, "", (), None))
        ) | {"message", "asctime"}

        def format(self, record):
            """Return ``record`` serialized as a JSON string."""
            log_data = {
                # Use the record's own creation time, as explicit UTC
                'timestamp': datetime.fromtimestamp(
                    record.created, tz=timezone.utc
                ).isoformat(),
                'level': record.levelname,
                'service': record.name,
                'message': record.getMessage(),
                'module': record.module,
                'function': record.funcName,
                'line': record.lineno,
            }

            # Add every caller-supplied extra field (user_id, request_id,
            # duration, ip_address, ...). Core fields above stay authoritative.
            for key, value in vars(record).items():
                if key not in self._STANDARD_ATTRS and key not in log_data:
                    log_data[key] = value

            # Include the traceback when the record carries a real exception
            exc_info = record.exc_info
            if isinstance(exc_info, tuple) and exc_info[0] is not None:
                log_data['exception'] = self.formatException(exc_info)

            # default=str is deliberate: an extra value that is not JSON
            # serializable is stringified rather than losing the log line.
            return json.dumps(log_data, default=str)

    def info(self, message: str, **kwargs):
        """Log info message with extra context.

        Args:
            message: Human-readable log message.
            **kwargs: Context fields added to the JSON output.
        """
        # stacklevel=2 (Python 3.8+) attributes the record to our caller
        # instead of to this wrapper method.
        self.logger.info(message, extra=kwargs, stacklevel=2)

    def error(self, message: str, exc_info=None, **kwargs):
        """Log error with exception info.

        Args:
            message: Human-readable log message.
            exc_info: Passed through to ``logging``; a truthy value attaches
                the exception traceback to the JSON output.
            **kwargs: Context fields added to the JSON output.
        """
        self.logger.error(message, exc_info=exc_info, extra=kwargs, stacklevel=2)

# Usage
logger = StructuredLogger("user-service")

# Context fields travel as keyword arguments alongside the message
login_context = {
    "user_id": "12345",
    "request_id": "req_abc",
    "ip_address": "192.168.1.1",
}
logger.info("User logged in", **login_context)

try:
    risky_operation()
except Exception:
    # exc_info=True asks the logger to capture the active exception
    failure_context = {"user_id": "12345", "operation": "risky_operation"}
    logger.error("Operation failed", exc_info=True, **failure_context)

3. Distributed Tracing (Jaeger)

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

# Setup tracing: install a tracer provider, then export its spans to the
# Jaeger agent at jaeger:6831 through a BatchSpanProcessor
trace.set_tracer_provider(TracerProvider())
jaeger_exporter = JaegerExporter(agent_host_name="jaeger", agent_port=6831)
active_provider = trace.get_tracer_provider()
active_provider.add_span_processor(BatchSpanProcessor(jaeger_exporter))

# Tracer used for the manual spans in this module
tracer = trace.get_tracer(__name__)

# Auto-instrument FastAPI
FastAPIInstrumentor.instrument_app(app)

# Manual instrumentation
@app.get("/api/orders/{order_id}")
async def get_order(order_id: str):
    """Look up an order and its payment status inside nested trace spans.

    Returns the order document; the payment status is only recorded as an
    attribute on the enclosing ``get_order`` span.
    """
    with tracer.start_as_current_span("get_order") as root_span:
        root_span.set_attribute("order_id", order_id)

        # Child span around the database lookup
        with tracer.start_as_current_span("db.query"):
            order_doc = db.orders.find_one({"id": order_id})

        # Child span around the call to the payment service
        with tracer.start_as_current_span("payment.get_status"):
            payment_id = order_doc['payment_id']
            current_status = await payment_service.get_status(payment_id)

        root_span.set_attribute("payment_status", current_status)
        return order_doc

SLOs, SLIs, and SLAs

Service Level Indicators (SLIs)

class SLIMetrics:
    """
    Prometheus instruments backing the Service Level Indicators.

    Holds an availability counter labelled by outcome and a latency histogram.
    """

    def __init__(self):
        # Request outcomes; the ``status`` label is 'success' or 'failure'
        self.availability_counter = Counter(
            name='sli_availability_total',
            documentation='Availability SLI',
            labelnames=('status',),
        )

        # Request latency distribution with explicit bucket bounds
        self.latency_histogram = Histogram(
            name='sli_latency_seconds',
            documentation='Latency SLI',
            buckets=(0.1, 0.5, 1.0, 2.0, 5.0),
        )

    def record_request(self, success: bool, duration: float):
        """Count one request outcome and record how long it took."""
        if success:
            outcome = 'success'
        else:
            outcome = 'failure'

        self.availability_counter.labels(status=outcome).inc()
        self.latency_histogram.observe(duration)

sli_metrics = SLIMetrics()

# Record in request handler
@app.get("/api/endpoint")
async def endpoint():
    """Run ``do_work()`` and record its outcome and duration as SLI samples.

    Returns whatever ``do_work()`` returns. An exception from ``do_work()`` is
    recorded as a failure and re-raised unchanged.
    """
    # Monotonic clock: immune to wall-clock adjustments, unlike time.time()
    start = time.perf_counter()
    try:
        result = do_work()
    except Exception:
        # Only real errors count against the SLI; KeyboardInterrupt and
        # SystemExit propagate without being recorded as request failures.
        sli_metrics.record_request(success=False, duration=time.perf_counter() - start)
        raise

    # Recorded outside the try so a metrics error cannot make the same request
    # count as both a success and a failure.
    sli_metrics.record_request(success=True, duration=time.perf_counter() - start)
    return result

Service Level Objectives (SLOs)

# SLO definitions
# Each entry names an SLI, the target it must meet, and the window over which
# the target is evaluated.
slos:
  # Availability: target is a success ratio (0.999 == 99.9% of requests)
  - name: "API Availability"
    description: "99.9% of requests succeed"
    sli: "availability"
    target: 0.999
    window: "30d"
  
  # Latency: target is a duration threshold for the 95th percentile
  - name: "API Latency"
    description: "95% of requests complete within 500ms"
    sli: "latency_p95"
    target: 0.5  # seconds
    window: "30d"

Alerting

# Prometheus alert rules
"""
groups:
- name: api_alerts
  rules:
  # High error rate: 5xx responses per second, per series
  - alert: HighErrorRate
    expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanize }} requests/sec"
  
  # High latency: P95 above 1 second
  - alert: HighLatency
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High API latency"
      description: "P95 latency is {{ $value }}s"
  
  # Low availability: share of requests that did not end in a server error.
  # Success is "anything but 5xx" so it is the exact complement of the error
  # definition used by HighErrorRate; matching only status="200" would count
  # every other non-error response (201, 204, 3xx, 4xx) as downtime.
  - alert: LowAvailability
    expr: |
      (
        sum(rate(http_requests_total{status!~"5.."}[5m]))
        /
        sum(rate(http_requests_total[5m]))
      ) < 0.999
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "SLO violation: Low availability"
      description: "Success ratio is {{ $value | humanizePercentage }} (target 99.9%)"
"""

# Python alerting integration
from slack_sdk import WebClient

class AlertManager:
    """Send alerts to multiple channels.

    Every alert is posted to the ``#alerts`` Slack channel. Alerts whose
    severity is ``"critical"`` additionally go through :meth:`_page_oncall`.
    """

    # Slack attachment color per severity; unknown severities use "warning"
    _SEVERITY_COLORS = {"critical": "danger", "warning": "warning", "info": "good"}

    # Slack rejects a section block that carries more than 10 fields
    _MAX_FIELDS_PER_SECTION = 10

    def __init__(self, slack_token: str, pagerduty_key: str):
        """Create the Slack client and remember the PagerDuty key."""
        self.slack = WebClient(token=slack_token)
        self.pagerduty_key = pagerduty_key

    def send_alert(
        self,
        severity: str,
        title: str,
        description: str,
        metrics: dict = None
    ):
        """Send alert to appropriate channel based on severity.

        Args:
            severity: ``"critical"``, ``"warning"`` or ``"info"``; only
                ``"critical"`` pages the on-call engineer.
            title: Short headline, rendered in bold in Slack.
            description: Body text shown under the title.
            metrics: Optional mapping of metric name to value, rendered as
                Slack fields.

        Raises:
            Whatever the paging or Slack call raises. The Slack notification
            is attempted even when paging fails.
        """
        try:
            if severity == "critical":
                self._page_oncall(title, description)
        finally:
            # One failing channel must not silence the other
            self._send_slack(severity, title, description, metrics)

    @classmethod
    def _build_blocks(cls, title, description, metrics):
        """Build the Slack blocks: a header section plus metric field sections."""
        blocks = [
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": f"*{title}*\n{description}"}
            }
        ]

        if metrics:
            fields = [
                {"type": "mrkdwn", "text": f"*{k}:*\n{v}"}
                for k, v in metrics.items()
            ]
            # Split into several sections so a large metrics dict stays
            # within Slack's per-section field limit.
            step = cls._MAX_FIELDS_PER_SECTION
            for offset in range(0, len(fields), step):
                blocks.append({
                    "type": "section",
                    "fields": fields[offset:offset + step]
                })

        return blocks

    def _send_slack(self, severity, title, description, metrics):
        """Send Slack notification."""
        self.slack.chat_postMessage(
            channel="#alerts",
            attachments=[{
                "color": self._SEVERITY_COLORS.get(severity, "warning"),
                "blocks": self._build_blocks(title, description, metrics)
            }]
        )

    def _page_oncall(self, title, description):
        """Page on-call engineer via PagerDuty.

        Placeholder: the integration is not implemented, so no page is sent.
        """
        # PagerDuty integration
        pass

Dashboards

# Grafana dashboard as code (JSON)
def _graph_panel(title, *exprs):
    """Build one graph panel whose targets are the given PromQL expressions."""
    return {
        "title": title,
        "targets": [{"expr": expr} for expr in exprs],
        "type": "graph",
    }


# PromQL template for a latency quantile over the request-duration histogram
_LATENCY_QUANTILE_EXPR = (
    "histogram_quantile({q}, rate(http_request_duration_seconds_bucket[5m]))"
)

dashboard_json = {
    "dashboard": {
        "title": "API Monitoring",
        "panels": [
            _graph_panel("Request Rate", "rate(http_requests_total[5m])"),
            _graph_panel(
                "Error Rate",
                'rate(http_requests_total{status=~"5.."}[5m])',
            ),
            _graph_panel(
                "Latency (P50, P95, P99)",
                *(
                    _LATENCY_QUANTILE_EXPR.format(q=quantile)
                    for quantile in ("0.50", "0.95", "0.99")
                ),
            ),
        ],
    },
}

Health Checks

from fastapi import status
from fastapi.responses import JSONResponse

@app.get("/health")
async def health_check():
    """Comprehensive health check.

    Probes the database, Redis and an external API and reports each result
    under ``checks``. A database or Redis failure marks the service
    ``unhealthy`` and yields HTTP 503; the external API result is
    informational only and never changes the overall status.

    Returns:
        JSONResponse with the health document and status code 200 or 503.
    """
    health = {
        "status": "healthy",
        # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated)
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "checks": {}
    }

    # Critical dependencies: a failure of either makes the service unhealthy.
    # NOTE(review): these pings look synchronous; if so they block the event
    # loop while they run - confirm the client types.
    critical_probes = (
        ("database", lambda: db.command('ping')),
        ("redis", lambda: redis_client.ping()),
    )
    for name, probe in critical_probes:
        try:
            probe()
            health["checks"][name] = "healthy"
        except Exception as e:
            health["status"] = "unhealthy"
            # NOTE(review): the raw exception text is returned to the caller;
            # confirm this endpoint is not publicly reachable.
            health["checks"][name] = f"unhealthy: {str(e)}"

    # Check external dependencies
    try:
        # httpx.get() is synchronous and its result cannot be awaited, so the
        # previous `await httpx.get(...)` always ended up in the except branch.
        # Use the async client so the request is genuinely awaited.
        async with httpx.AsyncClient(timeout=2) as client:
            response = await client.get("https://api.external.com/health")
        health["checks"]["external_api"] = "healthy" if response.status_code == 200 else "degraded"
    except Exception:
        health["checks"]["external_api"] = "unavailable"

    status_code = status.HTTP_200_OK if health["status"] == "healthy" else status.HTTP_503_SERVICE_UNAVAILABLE

    return JSONResponse(content=health, status_code=status_code)

Lessons Learned

What worked:

Structured logging from day one
Distributed tracing for microservices
SLO-based alerting (not threshold-based)
Comprehensive dashboards
Regular review of metrics and alerts

What didn't work:

Too many alerts (alert fatigue)
Logging everything (log bloat)
No log aggregation
Missing distributed tracing
Not documenting runbooks

What's Next

With observability in place, let's explore security best practices:

Security Best Practices: authentication, authorization, and encryption →

Navigation:

← Previous: Distributed Systems | Next: Security Best Practices →

Last updated 1 month ago

On this page:

- Introduction
- The Three Pillars
  - 1. Metrics (Prometheus)
  - 2. Logging (ELK Stack)
  - 3. Distributed Tracing (Jaeger)
- SLOs, SLIs, and SLAs
  - Service Level Indicators (SLIs)
  - Service Level Objectives (SLOs)
- Alerting
- Dashboards
- Health Checks
- Lessons Learned
- What's Next