Built my first chatbot API on a Friday. Worked perfectly in testing - clean code, proper endpoints, fast responses.
Monday morning: Production meltdown. API timing out under load. Users complaining. My manager asking questions.
The problem? Synchronous Claude API calls blocking FastAPI's event loop. Every request waited for Claude's response (2-5 seconds) while holding the connection. With 50 concurrent users, everything ground to a halt.
Fixed it with async/await: Response time dropped from 4 seconds to 400ms. Handled 10x more concurrent users.
Let me show you how to build it right from the start.
FastAPI Basics
Why FastAPI?
I've built APIs with Flask, Django, Express. FastAPI is best for LLM apps because:
1. Native async support - Perfect for I/O-bound LLM calls
2. Automatic validation - Pydantic models prevent bad requests
3. Auto-generated docs - Interactive API documentation
4. High performance - On par with Node.js and Go
5. Type hints - Better IDE support and fewer bugs
Problem: The default `anthropic` client is synchronous, so calling it inside an async endpoint blocks the event loop. Solution: run the blocking call in a thread pool with `asyncio.to_thread()` (the SDK also ships an `AsyncAnthropic` client with native async support, if you prefer).
Proper Async Integration
Now the event loop doesn't block! Can handle many concurrent requests.
Request Validation with Pydantic
Pydantic validates data automatically. Huge time-saver.
Advanced Request Models
FastAPI automatically validates - bad requests get 422 status with detailed errors.
Test validation:
Response:
Complete Production API
Here's my production-ready FastAPI + Claude application:
This is production-grade:
✅ Async/await for performance
✅ Request validation
✅ Error handling with retries
✅ Logging
✅ CORS support
✅ Auto-generated documentation
Testing the API
Using curl
Using Python Requests
Interactive Documentation
FastAPI auto-generates docs! Visit:
Swagger UI: http://localhost:8000/docs
ReDoc: http://localhost:8000/redoc
You can test all endpoints directly in the browser.
Rate Limiting
Protect your API from abuse.
Install dependencies:
I use this in production - prevents one user from burning through API quota.
Environment Configuration
Proper configuration management:
.env file:
Best Practices
From my production deployments:
1. Always use async/await:
2. Validate all inputs:
3. Implement retry logic:
4. Log everything:
5. Use environment variables:
Common Mistakes
Mistakes I made:
1. Blocking calls in async functions ❌
2. No request validation ❌
3. Poor error handling ❌
4. Hardcoded configuration ❌
5. No logging ❌
What's Next?
You now have a production-ready FastAPI application with Claude integration. In Part 3, we'll add streaming responses for real-time user experience and explore advanced Claude features.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import anthropic
import os
from dotenv import load_dotenv

# Pull ANTHROPIC_API_KEY (and any other settings) from a local .env file.
load_dotenv()

app = FastAPI(title="Claude API Wrapper")

# Initialize Claude client
# NOTE: this is the synchronous client — each .messages.create() call
# blocks the calling thread until Claude responds.
claude_client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)
class ChatRequest(BaseModel):
    """Request body for POST /chat."""
    message: str            # user prompt sent to Claude
    max_tokens: int = 1024  # cap on the number of tokens generated
class ChatResponse(BaseModel):
    """Response body for POST /chat."""
    response: str       # Claude's reply text
    input_tokens: int   # tokens consumed by the prompt
    output_tokens: int  # tokens generated in the reply
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""Send message to Claude."""
try:
# Call Claude API
message = claude_client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=request.max_tokens,
messages=[
{"role": "user", "content": request.message}
]
)
return ChatResponse(
response=message.content[0].text,
input_tokens=message.usage.input_tokens,
output_tokens=message.usage.output_tokens
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
# Send a single-message chat request to the local server.
curl -X POST "http://localhost:8000/chat" \
  -H "Content-Type: application/json" \
  -d '{"message": "What is FastAPI?"}'
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import anthropic
import os
import asyncio
from dotenv import load_dotenv

# Load ANTHROPIC_API_KEY from .env before creating the client.
load_dotenv()

app = FastAPI()

# Initialize client (synchronous; calls are off-loaded to a thread pool
# by call_claude_async so the event loop never blocks).
claude_client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)
class ChatRequest(BaseModel):
    """Request body for POST /chat."""
    message: str              # user prompt
    max_tokens: int = 1024    # cap on the number of tokens generated
    temperature: float = 1.0  # sampling temperature (0.0-1.0)
class ChatResponse(BaseModel):
    """Response body for POST /chat."""
    response: str       # Claude's reply text
    input_tokens: int   # tokens consumed by the prompt
    output_tokens: int  # tokens generated in the reply
async def call_claude_async(message: str, max_tokens: int, temperature: float):
    """Async wrapper around the blocking Claude API call.

    The anthropic client is synchronous, so the request is executed in a
    worker thread via asyncio.to_thread(); the event loop stays free to
    serve other requests while Claude responds.

    Args:
        message: User prompt to send.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature (0.0-1.0).

    Returns:
        The anthropic Message object returned by messages.create().
    """
    def _call():
        # BUG FIX: the original read `claude_ client` (stray space inside
        # the identifier) — a syntax error. It must reference the
        # module-level `claude_client`.
        return claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[{"role": "user", "content": message}]
        )

    # Run in thread pool so the event loop is not blocked.
    return await asyncio.to_thread(_call)
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""Chat with Claude (async)."""
try:
message = await call_claude_async(
request.message,
request.max_tokens,
request.temperature
)
return ChatResponse(
response=message.content[0].text,
input_tokens=message.usage.input_tokens,
output_tokens=message.usage.output_tokens
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
from pydantic import BaseModel, Field, validator
from typing import Optional, List
from enum import Enum
class ClaudeModel(str, Enum):
    """Available Claude models (API model identifier strings)."""
    OPUS = "claude-3-opus-20240229"
    SONNET = "claude-3-5-sonnet-20241022"
    HAIKU = "claude-3-haiku-20240307"
class Message(BaseModel):
    """A single chat turn."""
    # Only the two conversational roles the Messages API accepts.
    role: str = Field(..., pattern="^(user|assistant)$")
    # Non-empty and bounded to guard against oversized payloads.
    content: str = Field(..., min_length=1, max_length=10000)
class ChatRequest(BaseModel):
    """Chat request with validation."""
    messages: List[Message] = Field(..., min_items=1, max_items=50)
    model: ClaudeModel = ClaudeModel.SONNET
    max_tokens: int = Field(default=1024, ge=1, le=4096)
    temperature: float = Field(default=1.0, ge=0.0, le=1.0)
    system_prompt: Optional[str] = Field(None, max_length=5000)

    @validator('messages')
    def validate_message_alternation(cls, messages):
        """Ensure the conversation starts with a user message.

        NOTE(review): despite the name, this does NOT enforce full
        user/assistant alternation — only the first message's role is
        checked. Extend or rename if strict alternation is required.
        """
        if messages[0].role != "user":
            raise ValueError("First message must be from user")
        return messages
class ChatResponse(BaseModel):
    """Chat response."""
    response: str  # Claude's reply text
    model: str     # model identifier that produced the reply
    usage: dict = Field(..., description="Token usage stats")
# This will fail - temperature too high (the schema allows at most 1.0),
# so FastAPI rejects it with a 422 before the handler runs.
curl -X POST "http://localhost:8000/chat" \
  -H "Content-Type: application/json" \
  -d '{"message": "Hi", "temperature": 2.0}'
{
"detail": [
{
"loc": ["body", "temperature"],
"msg": "ensure this value is less than or equal to 1.0",
"type": "value_error.number.not_le"
}
]
}
"""
Claude API FastAPI Wrapper
Production-ready implementation with proper error handling.
"""
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, validator
from typing import List, Optional
import anthropic
from anthropic import APIError, RateLimitError, APIConnectionError
import os
import asyncio
import logging
from dotenv import load_dotenv
from enum import Enum
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load environment (.env supplies ANTHROPIC_API_KEY)
load_dotenv()

# Initialize FastAPI
app = FastAPI(
    title="Claude API Wrapper",
    description="Production-ready Claude API integration",
    version="1.0.0"
)

# CORS middleware
# NOTE(review): allow_origins=["*"] together with allow_credentials=True
# is wide open — restrict origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize Claude client (synchronous; calls run in a thread pool)
claude_client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)
# Models
class ClaudeModel(str, Enum):
    """Supported Claude models (API model identifier strings)."""
    OPUS = "claude-3-opus-20240229"
    SONNET = "claude-3-5-sonnet-20241022"
    HAIKU = "claude-3-haiku-20240307"
class Message(BaseModel):
    """A single chat turn (role restricted to user/assistant)."""
    role: str = Field(..., pattern="^(user|assistant)$")
    content: str = Field(..., min_length=1, max_length=100000)
class ChatRequest(BaseModel):
    """Validated request body for POST /chat."""
    messages: List[Message] = Field(..., min_items=1, max_items=100)
    model: ClaudeModel = ClaudeModel.SONNET
    max_tokens: int = Field(default=2048, ge=1, le=4096)
    temperature: float = Field(default=1.0, ge=0.0, le=1.0)
    system_prompt: Optional[str] = None

    @validator('messages')
    def validate_first_message(cls, messages):
        """Reject conversations that do not open with a user turn."""
        if messages[0].role != "user":
            raise ValueError("First message must be from user")
        return messages
class Usage(BaseModel):
    """Token accounting for a single completion."""
    input_tokens: int   # tokens consumed by the prompt
    output_tokens: int  # tokens generated in the reply
    total_tokens: int   # input_tokens + output_tokens
class ChatResponse(BaseModel):
    """Response body for POST /chat."""
    response: str       # Claude's reply text
    model: str          # model identifier that produced the reply
    usage: Usage        # token accounting
    finish_reason: str  # Claude's stop_reason, or "end_turn" if absent
# Helper functions
async def call_claude_with_retry(
    messages: List[dict],
    model: str,
    max_tokens: int,
    temperature: float,
    system_prompt: Optional[str],
    max_retries: int = 3
):
    """Call Claude API with retry logic.

    Transient failures (rate limits, connection errors) are retried with
    exponential backoff (1s, 2s, 4s, ...). Other API errors and unexpected
    exceptions fail immediately. All failures are translated into
    HTTPExceptions so the endpoint can let them propagate directly.

    Args:
        messages: Conversation turns as plain dicts ({"role", "content"}).
        model: Claude model identifier string.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature (0.0-1.0).
        system_prompt: Optional system prompt; omitted from the call if falsy.
        max_retries: Total attempts before giving up on transient errors.

    Returns:
        The anthropic Message object from messages.create().

    Raises:
        HTTPException: 429 after persistent rate limiting, 503 after
            persistent connection failures, 500 for any other error.
    """
    def _call():
        # Build kwargs dynamically so "system" is only sent when provided.
        params = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": messages
        }
        if system_prompt:
            params["system"] = system_prompt
        return claude_client.messages.create(**params)

    for attempt in range(max_retries):
        try:
            # Off-load the blocking SDK call to a worker thread.
            response = await asyncio.to_thread(_call)
            return response
        # Order matters: RateLimitError/APIConnectionError are handled
        # (and retried) before the broader APIError catch below.
        except RateLimitError:
            if attempt == max_retries - 1:
                raise HTTPException(
                    status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                    detail="Rate limit exceeded. Please try again later."
                )
            wait_time = 2 ** attempt  # exponential backoff
            logger.warning(f"Rate limited. Retrying in {wait_time}s...")
            await asyncio.sleep(wait_time)
        except APIConnectionError:
            if attempt == max_retries - 1:
                raise HTTPException(
                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                    detail="Could not connect to Claude API"
                )
            wait_time = 2 ** attempt  # exponential backoff
            logger.warning(f"Connection error. Retrying in {wait_time}s...")
            await asyncio.sleep(wait_time)
        except APIError as e:
            # Non-transient API error: fail immediately, no retry.
            logger.error(f"Claude API error: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Claude API error: {str(e)}"
            )
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Internal error: {str(e)}"
            )
# Endpoints
@app.get("/")
async def root():
"""Root endpoint."""
return {
"service": "Claude API Wrapper",
"version": "1.0.0",
"status": "healthy"
}
@app.get("/health")
async def health():
"""Health check."""
return {"status": "healthy"}
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""
Chat with Claude.
- **messages**: List of conversation messages
- **model**: Claude model to use
- **max_tokens**: Maximum response length
- **temperature**: Response randomness (0-1)
- **system_prompt**: Optional system prompt
"""
# Convert Pydantic models to dicts
messages = [msg.dict() for msg in request.messages]
# Log request
logger.info(
f"Chat request - Model: {request.model}, "
f"Messages: {len(messages)}, "
f"Max tokens: {request.max_tokens}"
)
# Call Claude
response = await call_claude_with_retry(
messages=messages,
model=request.model.value,
max_tokens=request.max_tokens,
temperature=request.temperature,
system_prompt=request.system_prompt
)
# Extract response
assistant_message = response.content[0].text
# Log usage
logger.info(
f"Response - Input tokens: {response.usage.input_tokens}, "
f"Output tokens: {response.usage.output_tokens}"
)
return ChatResponse(
response=assistant_message,
model=request.model.value,
usage=Usage(
input_tokens=response.usage.input_tokens,
output_tokens=response.usage.output_tokens,
total_tokens=response.usage.input_tokens + response.usage.output_tokens
),
finish_reason=response.stop_reason or "end_turn"
)
@app.post("/simple-chat")
async def simple_chat(message: str, max_tokens: int = 1024):
"""Simple chat endpoint (for testing)."""
try:
def _call():
return claude_client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=max_tokens,
messages=[{"role": "user", "content": message}]
)
response = await asyncio.to_thread(_call)
return {
"response": response.content[0].text,
"tokens": {
"input": response.usage.input_tokens,
"output": response.usage.output_tokens
}
}
except Exception as e:
logger.error(f"Error: {e}")
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
# Simple chat (query-parameter endpoint, handy for quick smoke tests)
curl -X POST "http://localhost:8000/simple-chat?message=Hello&max_tokens=100"

# Full chat (validated JSON body)
curl -X POST "http://localhost:8000/chat" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is FastAPI?"}
    ],
    "model": "claude-3-5-sonnet-20241022",
    "max_tokens": 500,
    "temperature": 0.7
  }'