Part 8: AI Engineering in Production
Production Is Where the Real Work Starts
Observability for LLM Systems
Token Usage and Cost
# src/ai_engineer/observability/metrics.py
import time
import logging
from dataclasses import dataclass, field
from collections import defaultdict
# Module-level logger; handlers/formatters are expected to be configured by
# the hosting application (this module only emits records via logger.info).
logger = logging.getLogger(__name__)
@dataclass
class LLMCallMetrics:
    """Metrics for a single LLM call.

    One instance is produced per request and handed to
    ``MetricsCollector.record`` for aggregation, cost estimation, and
    structured logging.
    """
    model: str  # model identifier (e.g. "gpt-4o"); keys the pricing lookup
    input_tokens: int  # prompt-side token count for this call
    output_tokens: int  # completion-side token count for this call
    latency_ms: float  # end-to-end call latency, in milliseconds
    timestamp: float = field(default_factory=time.time)  # creation time, epoch seconds
    prompt_version: str = ""  # optional tag identifying the prompt/template version
    endpoint: str = ""  # optional label for the serving endpoint/route
    success: bool = True  # False when the call failed
    error: str | None = None  # error description when success is False, else None
class MetricsCollector:
    """Collect and aggregate metrics for LLM calls.

    Calls are kept in an in-memory list and costs are bucketed per hour.
    NOTE(review): ``_calls`` grows without bound — a long-lived service
    should periodically prune or persist it.
    """

    # USD per 1M tokens as (input_rate, output_rate); unknown models fall
    # back to _DEFAULT_RATES. Hoisted to class level so the table is not
    # rebuilt on every cost estimate.
    _PRICING: dict[str, tuple[float, float]] = {
        "gpt-4o": (2.50, 10.00),
        "gpt-4o-mini": (0.15, 0.60),
    }
    _DEFAULT_RATES: tuple[float, float] = (5.0, 15.0)

    def __init__(self) -> None:
        self._calls: list[LLMCallMetrics] = []
        # Cost per local-time hour, keyed "YYYY-MM-DD-HH".
        self._hourly_costs: dict[str, float] = defaultdict(float)

    def record(self, metrics: LLMCallMetrics) -> None:
        """Store one call, accumulate its hourly cost, and emit a log record."""
        self._calls.append(metrics)
        # Track hourly costs (local time — switch to gmtime for UTC buckets).
        hour_key = time.strftime("%Y-%m-%d-%H", time.localtime(metrics.timestamp))
        cost = self._estimate_cost(metrics)
        self._hourly_costs[hour_key] += cost
        # Structured fields go through `extra` so log aggregation can index them.
        logger.info(
            "llm_call",
            extra={
                "model": metrics.model,
                "input_tokens": metrics.input_tokens,
                "output_tokens": metrics.output_tokens,
                "latency_ms": metrics.latency_ms,
                "cost_usd": cost,
                "prompt_version": metrics.prompt_version,
                "endpoint": metrics.endpoint,
                "success": metrics.success,
            },
        )

    def _estimate_cost(self, m: LLMCallMetrics) -> float:
        """Estimate cost in USD from token counts and per-model rates."""
        input_rate, output_rate = self._PRICING.get(m.model, self._DEFAULT_RATES)
        return (m.input_tokens * input_rate + m.output_tokens * output_rate) / 1_000_000

    def summary(self, last_n_hours: int = 24) -> dict:
        """Summarize calls recorded in the trailing ``last_n_hours`` window.

        Always returns the same key set, so callers may index any field
        (e.g. ``summary()["success_rate"]``) without guarding for the
        empty window.
        """
        cutoff = time.time() - (last_n_hours * 3600)
        recent = [c for c in self._calls if c.timestamp >= cutoff]
        if not recent:
            # Fix: previously only total_calls / total_cost_usd were returned
            # here, so reading any other key raised KeyError on a quiet window.
            return {
                "total_calls": 0,
                "success_rate": 0.0,
                "total_cost_usd": 0.0,
                "avg_latency_ms": 0.0,
                "p95_latency_ms": 0.0,
                "total_input_tokens": 0,
                "total_output_tokens": 0,
            }
        total_cost = sum(self._estimate_cost(c) for c in recent)
        success_count = sum(1 for c in recent if c.success)
        latencies = sorted(c.latency_ms for c in recent)
        # Nearest-rank p95; int(0.95 * n) is always a valid index since it is < n.
        p95 = latencies[int(len(latencies) * 0.95)]
        return {
            "total_calls": len(recent),
            "success_rate": success_count / len(recent),
            "total_cost_usd": round(total_cost, 4),
            "avg_latency_ms": round(sum(latencies) / len(latencies), 1),
            "p95_latency_ms": round(p95, 1),
            "total_input_tokens": sum(c.input_tokens for c in recent),
            "total_output_tokens": sum(c.output_tokens for c in recent),
        }

# Wrapping LLM Calls with Metrics
What My Dashboard Shows
Guardrails and Content Filtering
Input Guardrails
Output Guardrails
Applying Guardrails in the Request Pipeline
Caching Strategies
Embedding Cache
Response Cache for Repeated Queries
Cache Impact
| Metric | Before Cache | After Cache |
| --- | --- | --- |
When to Use a Framework vs Building From Scratch
Frameworks I've Used
My Decision Framework
Deployment
Dockerfile
docker-compose.yml for Production
Health Check Endpoint
What I'd Do Differently
The Complete Picture
Last updated