# Monitoring & Observability

## Why Monitoring Matters

## What to Monitor

### 1. Model Performance Metrics
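The snippet below instruments an inference function with `prometheus_client`: counters for requests and errors, a histogram for latency, and a gauge you can set from offline evaluation jobs.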
```python
# metrics_collector.py (Python 3.12)
import functools
import time

from prometheus_client import Counter, Gauge, Histogram

# Request metrics
prediction_requests = Counter(
    'model_prediction_requests_total',
    'Total prediction requests',
    ['model_name', 'model_version'],
)

prediction_errors = Counter(
    'model_prediction_errors_total',
    'Total prediction errors',
    ['model_name', 'error_type'],
)

# Latency
prediction_latency = Histogram(
    'model_prediction_latency_seconds',
    'Prediction latency in seconds',
    ['model_name'],
)

# Model metrics (set from offline evaluation jobs)
model_accuracy = Gauge(
    'model_accuracy',
    'Current model accuracy',
    ['model_name', 'model_version'],
)

def track_prediction(model_name: str, model_version: str):
    """Decorator that counts requests, records latency, and classifies errors."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            prediction_requests.labels(
                model_name=model_name,
                model_version=model_version,
            ).inc()
            start_time = time.time()
            try:
                result = func(*args, **kwargs)
                latency = time.time() - start_time
                prediction_latency.labels(model_name=model_name).observe(latency)
                return result
            except Exception as e:
                # Label errors by exception class so dashboards can break them down
                prediction_errors.labels(
                    model_name=model_name,
                    error_type=type(e).__name__,
                ).inc()
                raise
        return wrapper
    return decorator

# Usage in inference service
@track_prediction('iris-classifier', 'v1.0.0')
def predict(instances):
    return model.predict(instances)
```

### 2. Data Drift Detection
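Input drift means the live feature distribution no longer matches what the model was trained on. One lightweight approach is a two-sample Kolmogorov-Smirnov test per feature against a retained training sample. The sketch below is illustrative: the gauge name, windowing, and 0.05 threshold are assumptions, not part of the stack above.

```python
# drift_detector.py: a minimal per-feature drift check (illustrative).
# Assumes you keep a reference sample of each feature from training time.
import numpy as np
from prometheus_client import Gauge
from scipy.stats import ks_2samp

feature_drift_pvalue = Gauge(
    'model_feature_drift_pvalue',
    'KS-test p-value: live window vs. training reference',
    ['model_name', 'feature'],
)

def check_drift(model_name: str, feature: str,
                reference: np.ndarray, live_window: np.ndarray,
                alpha: float = 0.05) -> bool:
    """Return True if the live window likely drifted from the reference."""
    statistic, p_value = ks_2samp(reference, live_window)
    feature_drift_pvalue.labels(model_name=model_name, feature=feature).set(p_value)
    return p_value < alpha  # small p-value: distributions likely differ
```

Run this on a schedule (e.g., hourly over the last N requests) rather than per request; the KS test needs a reasonably sized window to be meaningful.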
### 3. Prediction Distribution
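Even with stable inputs, a shift in what the model outputs (class mix, confidence) is often the first visible symptom of a problem. A sketch that exports per-class counts and top-class confidence; the metric names and bucket edges are illustrative:

```python
# prediction_distribution.py: illustrative metric names and buckets.
from prometheus_client import Counter, Histogram

predicted_class_total = Counter(
    'model_predicted_class_total',
    'Predictions per class, to spot shifts in the output mix',
    ['model_name', 'class_label'],
)

prediction_confidence = Histogram(
    'model_prediction_confidence',
    'Top-class probability per prediction',
    ['model_name'],
    buckets=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99],
)

def record_prediction(model_name: str, label: str, top_probability: float) -> None:
    """Call once per served prediction."""
    predicted_class_total.labels(model_name=model_name, class_label=label).inc()
    prediction_confidence.labels(model_name=model_name).observe(top_probability)
```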
## Setting Up Prometheus & Grafana

### Install Prometheus
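A minimal way to get started is Prometheus scraping your model service directly. The job name, port, and interval below are placeholder choices, not requirements:

```yaml
# prometheus.yml: minimal scrape config (illustrative values)
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'model-server'
    static_configs:
      - targets: ['localhost:8000']   # host:port where /metrics is exposed
```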
### Expose Metrics from Your Model
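`prometheus_client` can serve the metrics endpoint itself. A minimal sketch that pairs with the scrape config above (port 8000 is the same assumption):

```python
# serve_metrics.py: expose /metrics with prometheus_client's built-in server.
import time

from prometheus_client import start_http_server

if __name__ == '__main__':
    start_http_server(8000)  # serves /metrics on :8000 in a background thread
    # Start your inference server here; this stub just keeps the process alive.
    while True:
        time.sleep(60)
```

If you serve predictions from a web framework instead, mount `prometheus_client`'s ASGI/WSGI app on a `/metrics` route rather than opening a second port.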
### Create Grafana Dashboards
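Dashboards usually chart request rate, error ratio, and latency percentiles. Example PromQL against the metrics defined earlier (the 5-minute windows are a common default, not a rule):

```promql
# Requests per second, per model
sum by (model_name) (rate(model_prediction_requests_total[5m]))

# Error ratio
  sum by (model_name) (rate(model_prediction_errors_total[5m]))
/ sum by (model_name) (rate(model_prediction_requests_total[5m]))

# p95 latency from the histogram buckets
histogram_quantile(0.95,
  sum by (le, model_name) (rate(model_prediction_latency_seconds_bucket[5m])))
```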
## Logging Best Practices

### Structured Logging
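Emit one JSON object per log line so aggregators can index fields instead of parsing free text. A stdlib-only sketch; the field set and logger name are illustrative:

```python
# json_logging.py: structured logs with only the standard library.
import json
import logging
import time

class JsonFormatter(logging.Formatter):
    """Render each record as a single JSON line."""
    def format(self, record: logging.LogRecord) -> str:
        payload = {
            'ts': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(record.created)),
            'level': record.levelname,
            'logger': record.name,
            'message': record.getMessage(),
        }
        # Pick up structured context passed via `extra=`
        for key in ('model_name', 'model_version', 'request_id'):
            if hasattr(record, key):
                payload[key] = getattr(record, key)
        return json.dumps(payload)

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logger = logging.getLogger('inference')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info('prediction served', extra={
    'model_name': 'iris-classifier',
    'model_version': 'v1.0.0',
    'request_id': 'abc-123',
})
```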
## Alerting

### Define Alert Rules
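Prometheus alerting rules can fire directly on the metrics above. The thresholds and `for:` durations below are starting points to tune, not recommendations:

```yaml
# alert_rules.yml: thresholds are illustrative.
groups:
  - name: model-alerts
    rules:
      - alert: HighPredictionErrorRate
        expr: |
            sum by (model_name) (rate(model_prediction_errors_total[5m]))
          / sum by (model_name) (rate(model_prediction_requests_total[5m])) > 0.05
        for: 10m
        labels:
          severity: page
        annotations:
          summary: 'Error rate above 5% for {{ $labels.model_name }}'

      - alert: HighPredictionLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum by (le, model_name) (rate(model_prediction_latency_seconds_bucket[5m]))
          ) > 0.5
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: 'p95 latency above 500ms for {{ $labels.model_name }}'
```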
### Notification Channels
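Alertmanager routes fired alerts to channels like Slack, PagerDuty, or email. A sketch of a Slack receiver; the webhook URL and channel name are placeholders:

```yaml
# alertmanager.yml: Slack receiver sketch (placeholder webhook).
route:
  receiver: ml-oncall

receivers:
  - name: ml-oncall
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/XXX/YYY/ZZZ'
        channel: '#ml-alerts'
        title: '{{ .CommonAnnotations.summary }}'
```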
## Automated Retraining Triggers
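Monitoring becomes self-healing when degradation triggers a pipeline instead of (only) a page. A sketch that polls Prometheus's HTTP query API; the thresholds are illustrative, and `trigger_retraining_pipeline` is a hypothetical stand-in for your orchestrator (Airflow, a CI job, etc.):

```python
# retrain_trigger.py: poll metrics, decide whether to retrain (sketch).
import requests

PROMETHEUS_URL = 'http://localhost:9090'  # assumed local Prometheus
ACCURACY_FLOOR = 0.90                     # illustrative threshold
DRIFT_ALPHA = 0.05

def query_scalar(promql: str) -> float | None:
    """Instant query; return the first sample's value, or None if empty."""
    resp = requests.get(f'{PROMETHEUS_URL}/api/v1/query',
                        params={'query': promql}, timeout=10)
    resp.raise_for_status()
    result = resp.json()['data']['result']
    return float(result[0]['value'][1]) if result else None

def should_retrain(model_name: str) -> bool:
    accuracy = query_scalar(f'model_accuracy{{model_name="{model_name}"}}')
    drift_p = query_scalar(
        f'min(model_feature_drift_pvalue{{model_name="{model_name}"}})')
    degraded = accuracy is not None and accuracy < ACCURACY_FLOOR
    drifted = drift_p is not None and drift_p < DRIFT_ALPHA
    return degraded or drifted

if should_retrain('iris-classifier'):
    trigger_retraining_pipeline('iris-classifier')  # hypothetical orchestrator call
```

Run this as a scheduled job, and gate deployment of the retrained model behind the same evaluation checks you use for manual releases.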
## Key Takeaways

## Next Steps