Part 6: Building AI-Powered APIs with FastAPI
From Script to Service
Designing AI Endpoints
# src/ai_engineer/main.py
import time
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException

from ai_engineer.config import settings
from ai_engineer.db.engine import init_db, close_db
from ai_engineer.llm.factory import create_llm_provider
from ai_engineer.models import QuestionRequest, AnswerResponse
from ai_engineer.prompts.templates import PromptBuilder, RAGPromptInput
from ai_engineer.retrieval.search import semantic_search
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Manage app startup and shutdown.

    Everything before ``yield`` runs once before the app serves requests;
    everything after runs once on shutdown. Order matters: the DB is
    initialized first and closed last.
    """
    await init_db()
    # One shared provider instance for all requests, stored on app.state.
    app.state.llm = create_llm_provider()
    yield
    await close_db()
# Application instance; the lifespan handler wires in startup/shutdown hooks.
app = FastAPI(title="AI Engineer Service", lifespan=lifespan)
@app.post("/ask", response_model=AnswerResponse)
async def ask_question(request: QuestionRequest) -> AnswerResponse:
"""Answer a question using RAG: retrieve context, then generate."""
import time
start = time.monotonic()
# Step 1: Retrieve relevant context
chunks = await semantic_search(
query=request.question,
top_k=5,
min_similarity=0.3,
)
if not chunks:
return AnswerResponse(
answer="I don't have information about this in my knowledge base.",
sources=[],
model=settings.llm_model,
tokens_used=0,
latency_ms=0,
)
# Step 2: Build prompt
prompt_input = RAGPromptInput(
question=request.question,
context_chunks=chunks,
)
messages = PromptBuilder.build_rag_prompt(prompt_input)
# Step 3: Generate answer
llm = app.state.llm
answer = await llm.generate(
messages[-1]["content"],
max_tokens=request.max_tokens,
temperature=0.1,
)
elapsed_ms = (time.monotonic() - start) * 1000
# Step 4: Build response
sources = [
{"title": c["title"], "content_preview": c["content"][:200], "similarity_score": c["similarity"]}
for c in chunks
]
return AnswerResponse(
answer=answer,
sources=sources,
model=settings.llm_model,
tokens_used=0, # Will track properly in Part 8
latency_ms=round(elapsed_ms, 1),
)The Request/Response Contract
Streaming Responses with SSE
Consuming the Stream (Client Side)
Async Patterns for Concurrent LLM Calls
Handling Multiple Independent Requests
Rate Limiting and Cost Control
Cost Tracking
Retry Logic
Putting It All Together
Key Takeaways
Previous: Part 5: Prompt Engineering for Production Systems
Next: Part 7: Evaluating and Testing AI Systems
Last updated