Monitoring & Observability

Why Monitoring Matters

Deployed a model? Congratulations! Now the real work begins.

Unlike traditional software, ML models can fail silently:

  • Code runs fine, but predictions are garbage

  • Input data distribution changes (drift)

  • Model accuracy degrades over time

  • Edge cases appear that weren't in training data

You won't know unless you monitor.

What to Monitor

1. Model Performance Metrics

Track the metrics you care about:

# metrics_collector.py (Python 3.12)
import functools
import time

from prometheus_client import Counter, Histogram, Gauge

# Request metrics
prediction_requests = Counter(
    'model_prediction_requests_total',
    'Total prediction requests',
    ['model_name', 'model_version']
)

prediction_errors = Counter(
    'model_prediction_errors_total',
    'Total prediction errors',
    ['model_name', 'error_type']
)

# Latency
prediction_latency = Histogram(
    'model_prediction_latency_seconds',
    'Prediction latency in seconds',
    ['model_name']
)

# Model metrics
model_accuracy = Gauge(
    'model_accuracy',
    'Current model accuracy',
    ['model_name', 'model_version']
)

def track_prediction(model_name: str, model_version: str):
    """Decorator to track predictions."""
    def decorator(func):
        @functools.wraps(func)  # preserve the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            prediction_requests.labels(
                model_name=model_name,
                model_version=model_version
            ).inc()
            
            start_time = time.time()
            try:
                result = func(*args, **kwargs)
                latency = time.time() - start_time
                prediction_latency.labels(model_name=model_name).observe(latency)
                return result
            except Exception as e:
                prediction_errors.labels(
                    model_name=model_name,
                    error_type=type(e).__name__
                ).inc()
                raise
        
        return wrapper
    return decorator

# Usage in inference service
@track_prediction('iris-classifier', 'v1.0.0')
def predict(instances):
    return model.predict(instances)

2. Data Drift Detection

Monitor whether the live input distribution has drifted away from the data the model was trained on:
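
A lightweight starting point is a two-sample statistical test per feature between a reference window (training data) and a window of recent requests. The sketch below uses SciPy's Kolmogorov-Smirnov test; the 0.05 significance threshold and the array layout (rows = samples, columns = features) are assumptions for illustration.

# drift_detector.py: per-feature drift check (sketch, not a full drift framework)
import numpy as np
from scipy import stats

def detect_drift(reference: np.ndarray, current: np.ndarray, alpha: float = 0.05) -> dict:
    """Compare each feature's recent distribution against the training reference.

    Both arrays are 2-D: rows are samples, columns are features.
    Returns {feature_index: {"p_value": ..., "drift": bool}}.
    """
    results = {}
    for i in range(reference.shape[1]):
        # Two-sample KS test: a small p-value means the two distributions differ.
        _statistic, p_value = stats.ks_2samp(reference[:, i], current[:, i])
        results[i] = {"p_value": float(p_value), "drift": p_value < alpha}
    return results

# Example usage:
# reference = np.load("training_features.npy")   # saved at training time
# current = np.vstack(recent_request_features)    # buffered from live traffic
# report = detect_drift(reference, current)

In practice you would run this check on a schedule and export the result as a Prometheus gauge (for example, one gauge per feature) so drift shows up on the same dashboards as your other metrics.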

3. Prediction Distribution

Track the distribution of your model's outputs; a sudden shift in predicted classes or confidence scores is often the first visible symptom of an upstream problem:
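
With the Prometheus client already used above, a labelled counter per predicted class (plus a histogram of confidence scores for probabilistic models) is usually enough to spot shifts. The metric names, bucket edges, and class labels below are illustrative.

# prediction_distribution.py: sketch for tracking the output distribution
from prometheus_client import Counter, Histogram

predicted_class_total = Counter(
    'model_predicted_class_total',
    'Predictions per class',
    ['model_name', 'predicted_class']
)

prediction_confidence = Histogram(
    'model_prediction_confidence',
    'Distribution of prediction confidence scores',
    ['model_name'],
    buckets=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
)

def record_prediction(model_name: str, predicted_class: str, confidence: float) -> None:
    """Record one prediction so output-distribution shifts are visible in Grafana."""
    predicted_class_total.labels(model_name=model_name, predicted_class=predicted_class).inc()
    prediction_confidence.labels(model_name=model_name).observe(confidence)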

Setting Up Prometheus & Grafana

Install Prometheus
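
For local experiments, one option is the official Docker image with a minimal scrape config. The job name and the target below are assumptions that match the metrics server shown in the next subsection; host.docker.internal works with Docker Desktop, so on Linux substitute the host's IP or run everything in one Compose network.

# prometheus.yml: minimal scrape config (sketch)
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'iris-classifier'
    static_configs:
      - targets: ['host.docker.internal:8000']

Run it locally with:

docker run -p 9090:9090 -v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml prom/prometheus

For a Kubernetes cluster, the prometheus-community Helm charts (for example kube-prometheus-stack) install Prometheus and Grafana together.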

Expose Metrics from Your Model
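
Prometheus pulls metrics over HTTP, so your service needs to expose a /metrics endpoint. The simplest approach with prometheus_client is start_http_server; the port 8000 is an assumption and must match the scrape config above.

# serve_metrics.py: expose the metrics defined in metrics_collector.py (sketch)
import time
from prometheus_client import start_http_server

if __name__ == "__main__":
    # Serves every registered metric at http://localhost:8000/metrics
    start_http_server(8000)
    while True:
        time.sleep(60)  # placeholder loop; in practice the metrics server runs inside your inference service

If your inference service is already an ASGI app (FastAPI, Starlette), prometheus_client also provides make_asgi_app(), which you can mount at /metrics instead of running a separate server.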

Create Grafana Dashboards

Access Grafana:
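
If you are running locally, one option is the official Grafana Docker image; the port and credentials below are the image defaults, not project-specific settings.

docker run -d -p 3000:3000 grafana/grafana

Open http://localhost:3000, log in with the default admin / admin credentials, and add your Prometheus instance (for example http://localhost:9090) as a data source before building dashboards.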

Dashboard panels:

  • Prediction Rate: rate(model_prediction_requests_total[5m])

  • Error Rate: rate(model_prediction_errors_total[5m])

  • P95 Latency: histogram_quantile(0.95, rate(model_prediction_latency_seconds_bucket[5m]))

  • Model Accuracy: model_accuracy

Logging Best Practices

Structured Logging
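
Emit one JSON object per log line so your log aggregator can filter by model, version, or latency without regex parsing. Here is a minimal stdlib-only sketch; the field names (model_name, latency_ms, prediction) are illustrative, and libraries such as structlog offer the same idea with less boilerplate.

# structured_logging.py: stdlib JSON logging (sketch)
import json
import logging

class JsonFormatter(logging.Formatter):
    """Render each log record as a single JSON object per line."""
    CONTEXT_FIELDS = ("model_name", "model_version", "latency_ms", "prediction")

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "timestamp": self.formatTime(record),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        # Pick up structured context passed via the `extra=` argument.
        for key in self.CONTEXT_FIELDS:
            if hasattr(record, key):
                payload[key] = getattr(record, key)
        return json.dumps(payload)

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logger = logging.getLogger("inference")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Pass context as structured fields instead of formatting it into the message string.
logger.info("prediction served", extra={"model_name": "iris-classifier",
                                        "latency_ms": 12.4, "prediction": "setosa"})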

Alerting

Define Alert Rules
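
Prometheus evaluates alerting rules continuously and hands firing alerts to Alertmanager. The rules below assume the metric names from the collector above; the thresholds and durations are illustrative and should be tuned to your traffic.

# alerts.yml: example alerting rules (sketch)
groups:
  - name: model-alerts
    rules:
      - alert: HighPredictionErrorRate
        expr: |
          sum by (model_name) (rate(model_prediction_errors_total[5m]))
            / sum by (model_name) (rate(model_prediction_requests_total[5m])) > 0.05
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5% for {{ $labels.model_name }}"

      - alert: ModelAccuracyDegraded
        expr: model_accuracy < 0.90
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Accuracy of {{ $labels.model_name }} dropped below 0.90"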

Notification Channels

Route alerts to Slack, email, or PagerDuty through Alertmanager, for example:
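
The sketch below shows a Slack receiver; the webhook URL and channel name are placeholders you would replace with your own, and email or PagerDuty receivers follow the same pattern.

# alertmanager.yml: Slack routing (sketch with placeholder values)
route:
  receiver: slack-ml-alerts
  group_by: ['alertname', 'model_name']

receivers:
  - name: slack-ml-alerts
    slack_configs:
      - api_url: https://hooks.slack.com/services/XXX/YYY/ZZZ   # placeholder webhook
        channel: '#ml-alerts'
        send_resolved: true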

Automated Retraining Triggers

Monitor live performance and trigger retraining automatically when it drops below an acceptable threshold:
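
A simple pattern is a scheduled job that reads the model_accuracy gauge from Prometheus' HTTP API and kicks off the training pipeline when it falls below a threshold. The Prometheus URL, the 0.90 threshold, and the trigger_retraining() hook below are illustrative assumptions; wire the hook to your own orchestrator (an Airflow DAG run, a Kubeflow pipeline, a CI job, and so on).

# retraining_trigger.py: threshold-based retraining trigger (sketch)
import requests

PROMETHEUS_URL = "http://localhost:9090"   # assumption: local Prometheus
ACCURACY_THRESHOLD = 0.90                  # illustrative threshold

def current_accuracy(model_name: str) -> float | None:
    """Read the latest model_accuracy value from Prometheus' query API."""
    response = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query",
        params={"query": f'model_accuracy{{model_name="{model_name}"}}'},
        timeout=10,
    )
    response.raise_for_status()
    result = response.json()["data"]["result"]
    return float(result[0]["value"][1]) if result else None

def trigger_retraining(model_name: str) -> None:
    """Placeholder: call your orchestrator here (Airflow, Kubeflow, CI pipeline, ...)."""
    print(f"Retraining triggered for {model_name}")

if __name__ == "__main__":
    accuracy = current_accuracy("iris-classifier")
    if accuracy is not None and accuracy < ACCURACY_THRESHOLD:
        trigger_retraining("iris-classifier")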

Key Takeaways

  1. Monitor model performance, not just system metrics

  2. Detect data drift before it impacts predictions

  3. Use structured logging for debuggability

  4. Set up alerts for degraded performance

  5. Automate retraining when needed

Next Steps

With monitoring in place, let's automate the entire workflow. In CI/CD for ML, we'll build pipelines that test, validate, and deploy models automatically.

