Resilience Patterns

Introduction

In distributed systems, failures are inevitable. Networks partition, services crash, dependencies become slow. From building production systems, I've learned that the question isn't whether failures will happen, but how your system responds when they do.

This article covers practical resilience patterns: circuit breakers, retries, timeouts, bulkheads, and fallback strategies.

Why Resilience Matters

| Failure Type | Impact | Mitigation |
| --- | --- | --- |
| Service crash | Requests fail | Retry, fallback |
| Slow response | Thread exhaustion | Timeout, circuit breaker |
| Network partition | Requests hang | Timeout, bulkhead |
| Resource exhaustion | Cascading failures | Bulkhead, load shedding |

Circuit Breaker Pattern

Concept

| State | Behavior |
| --- | --- |
| Closed | Normal operation, requests pass through |
| Open | Fail fast, don't attempt requests |
| Half-Open | Allow limited requests to test recovery |

Implementation

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Callable, Any
import asyncio


class CircuitState(Enum):
    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"


@dataclass
class CircuitBreakerConfig:
    failure_threshold: int = 5
    success_threshold: int = 3
    timeout: timedelta = timedelta(seconds=30)
    half_open_max_calls: int = 3


@dataclass
class CircuitStats:
    failures: int = 0
    successes: int = 0
    last_failure_time: datetime | None = None
    half_open_calls: int = 0


class CircuitBreaker:
    """Circuit breaker implementation."""
    
    def __init__(self, name: str, config: CircuitBreakerConfig = None):
        self.name = name
        self.config = config or CircuitBreakerConfig()
        self.state = CircuitState.CLOSED
        self.stats = CircuitStats()
        self._lock = asyncio.Lock()
    
    async def call(self, func: Callable, *args, **kwargs) -> Any:
        """Execute function with circuit breaker protection."""
        async with self._lock:
            if not await self._can_execute():
                raise CircuitOpenError(f"Circuit {self.name} is open")
        
        try:
            result = await func(*args, **kwargs)
            await self._on_success()
            return result
        except Exception as e:
            await self._on_failure()
            raise
    
    async def _can_execute(self) -> bool:
        """Check if request should be allowed."""
        if self.state == CircuitState.CLOSED:
            return True
        
        if self.state == CircuitState.OPEN:
            # Check if timeout has elapsed
            if self._should_attempt_reset():
                self.state = CircuitState.HALF_OPEN
                self.stats.half_open_calls = 0
                return True
            return False
        
        # Half-open: allow limited calls
        if self.stats.half_open_calls < self.config.half_open_max_calls:
            self.stats.half_open_calls += 1
            return True
        return False
    
    def _should_attempt_reset(self) -> bool:
        """Check if circuit should attempt reset."""
        if not self.stats.last_failure_time:
            return True
        
        elapsed = datetime.utcnow() - self.stats.last_failure_time
        return elapsed >= self.config.timeout
    
    async def _on_success(self) -> None:
        """Handle successful call."""
        async with self._lock:
            if self.state == CircuitState.HALF_OPEN:
                self.stats.successes += 1
                if self.stats.successes >= self.config.success_threshold:
                    self._reset()
            elif self.state == CircuitState.CLOSED:
                self.stats.failures = 0
    
    async def _on_failure(self) -> None:
        """Handle failed call."""
        async with self._lock:
            self.stats.failures += 1
            self.stats.last_failure_time = datetime.utcnow()
            
            if self.state == CircuitState.HALF_OPEN:
                self._trip()
            elif self.stats.failures >= self.config.failure_threshold:
                self._trip()
    
    def _trip(self) -> None:
        """Open the circuit."""
        self.state = CircuitState.OPEN
        self.stats.successes = 0
    
    def _reset(self) -> None:
        """Close the circuit."""
        self.state = CircuitState.CLOSED
        self.stats = CircuitStats()
    
    @property
    def is_open(self) -> bool:
        return self.state == CircuitState.OPEN


class CircuitOpenError(Exception):
    pass


# Usage with decorator
def circuit_breaker(name: str, config: CircuitBreakerConfig = None):
    """Decorator factory that routes an async function through a circuit breaker.

    One ``CircuitBreaker`` is created per decoration and shared by every call
    to the decorated function. It is exposed as ``wrapper.circuit`` so callers
    can inspect its state.

    Args:
        name: Name given to the breaker (used in its error messages).
        config: Optional breaker configuration; defaults apply when omitted.
    """
    import functools  # local import: the file's import block does not provide it

    cb = CircuitBreaker(name, config)

    def decorator(func):
        # wraps() keeps the decorated function's __name__/__doc__ intact.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await cb.call(func, *args, **kwargs)
        wrapper.circuit = cb
        return wrapper
    return decorator


# Example
class PaymentClient:
    """Example client whose payment call is guarded by a circuit breaker."""

    @circuit_breaker("payment-service")
    async def process_payment(self, order_id: str, amount: float) -> dict:
        """POST the payment and return the decoded JSON reply.

        A non-2xx reply raises via ``raise_for_status`` and therefore counts
        as a failure for the breaker.
        """
        payload = {"order_id": order_id, "amount": amount}
        async with httpx.AsyncClient() as http:
            reply = await http.post("http://payment-service/payments", json=payload)
            reply.raise_for_status()
            return reply.json()

Circuit Breaker with Fallback

class CircuitBreakerWithFallback:
    """Circuit breaker that can answer from a fallback when a call fails."""

    def __init__(
        self,
        name: str,
        config: CircuitBreakerConfig = None,
        fallback: Callable = None,
    ):
        # Default fallback; a per-call fallback takes precedence over it.
        self.fallback = fallback
        self.circuit = CircuitBreaker(name, config)

    async def call(
        self,
        func: Callable,
        *args,
        fallback: Callable = None,
        **kwargs,
    ) -> Any:
        """Run ``func`` through the breaker, falling back on any failure.

        The fallback is awaited with the same positional and keyword
        arguments as ``func``. With no fallback available the original
        exception (including ``CircuitOpenError``) propagates.
        """
        try:
            outcome = await self.circuit.call(func, *args, **kwargs)
        except Exception:
            handler = fallback if fallback else self.fallback
            if not handler:
                raise
            return await handler(*args, **kwargs)
        return outcome


# Usage
class ProductService:
    """Product lookup that serves cached data when the catalog is unavailable."""

    def __init__(self):
        # Last successful response per category, used as the fallback source.
        self.cache = {}
        self.circuit = CircuitBreakerWithFallback(
            "product-catalog",
            fallback=self._get_cached_products,
        )

    async def get_products(self, category: str) -> list:
        """Return products for ``category``, from cache if the fetch fails."""
        return await self.circuit.call(
            self._fetch_products,
            category,
            fallback=self._get_cached_products,
        )

    async def _fetch_products(self, category: str) -> list:
        """Fetch products from the product service and refresh the cache."""
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"http://product-service/products?category={category}"
            )
            # Without this check an error body would be cached as if it were
            # product data, and the breaker would never see 4xx/5xx failures.
            response.raise_for_status()
            products = response.json()
            # Update cache on success
            self.cache[category] = products
            return products

    async def _get_cached_products(self, category: str) -> list:
        """Return the cached products for ``category`` (empty list if none)."""
        return self.cache.get(category, [])

Retry Pattern

Exponential Backoff

from dataclasses import dataclass
import random


@dataclass
class RetryConfig:
    max_attempts: int = 3
    base_delay: float = 1.0
    max_delay: float = 60.0
    exponential_base: float = 2.0
    jitter: bool = True
    retryable_exceptions: tuple = (Exception,)


class RetryPolicy:
    """Retry with exponential backoff."""
    
    def __init__(self, config: RetryConfig = None):
        self.config = config or RetryConfig()
    
    def calculate_delay(self, attempt: int) -> float:
        """Calculate delay for attempt."""
        delay = self.config.base_delay * (
            self.config.exponential_base ** attempt
        )
        delay = min(delay, self.config.max_delay)
        
        if self.config.jitter:
            # Add random jitter (0.5 to 1.5 of delay)
            delay = delay * (0.5 + random.random())
        
        return delay
    
    def should_retry(self, attempt: int, exception: Exception) -> bool:
        """Check if should retry."""
        if attempt >= self.config.max_attempts:
            return False
        
        return isinstance(exception, self.config.retryable_exceptions)
    
    async def execute(self, func: Callable, *args, **kwargs) -> Any:
        """Execute function with retry."""
        last_exception = None
        
        for attempt in range(self.config.max_attempts):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                last_exception = e
                
                if not self.should_retry(attempt + 1, e):
                    break
                
                delay = self.calculate_delay(attempt)
                await asyncio.sleep(delay)
        
        raise last_exception


# Decorator version
def retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    retryable_exceptions: tuple = (Exception,),
):
    """Decorator factory that retries an async function with backoff.

    Args:
        max_attempts: Total attempts, including the first call.
        base_delay: Delay in seconds before the first retry.
        retryable_exceptions: Exception types that trigger a retry; anything
            else propagates immediately.
    """
    import functools  # local import: the file's import block does not provide it

    policy = RetryPolicy(RetryConfig(
        max_attempts=max_attempts,
        base_delay=base_delay,
        retryable_exceptions=retryable_exceptions,
    ))

    def decorator(func):
        # wraps() keeps the decorated function's __name__/__doc__ intact.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await policy.execute(func, *args, **kwargs)
        return wrapper
    return decorator


# Usage
class InventoryClient:
    """Example client that retries stock lookups on request timeouts."""

    @retry(max_attempts=3, base_delay=0.5, retryable_exceptions=(httpx.TimeoutException,))
    async def check_stock(self, product_id: str) -> int:
        """Return the ``quantity`` field reported for ``product_id``."""
        async with httpx.AsyncClient(timeout=5.0) as client:
            response = await client.get(
                f"http://inventory-service/stock/{product_id}"
            )
            # Surface HTTP errors as such instead of failing later with a
            # KeyError/JSON error on an error body.
            response.raise_for_status()
            return response.json()["quantity"]

Retry with Circuit Breaker

class ResilientClient:
    """Circuit breaker wrapped around a retry policy.

    The breaker sees one success or failure per complete retry sequence,
    not one per individual attempt.
    """

    def __init__(
        self,
        circuit_config: CircuitBreakerConfig = None,
        retry_config: RetryConfig = None,
    ):
        self.retry = RetryPolicy(retry_config)
        self.circuit = CircuitBreaker("client", circuit_config)

    async def call(self, func: Callable, *args, **kwargs) -> Any:
        """Run ``func`` with retries, all inside a single breaker call."""
        # The breaker awaits whatever this returns: the retry coroutine.
        return await self.circuit.call(
            lambda: self.retry.execute(func, *args, **kwargs)
        )


# Usage
# Build a client whose breaker opens after 5 failed retry sequences and whose
# retry policy makes up to 3 attempts starting from a 1 second delay.
client = ResilientClient(
    circuit_config=CircuitBreakerConfig(failure_threshold=5),
    retry_config=RetryConfig(max_attempts=3, base_delay=1.0),
)

# NOTE: illustrative only. `await` is valid only inside a coroutine, so run
# this line from within an `async def`. `some_external_api_call`, `param1`
# and `param2` are placeholders that are not defined in this file.
result = await client.call(some_external_api_call, param1, param2)

Timeout Pattern

import asyncio
from contextlib import asynccontextmanager


class TimeoutError(Exception):
    """Raised when an operation exceeds its time budget.

    Defined at module level, so in the code that follows it this name
    shadows the built-in ``TimeoutError``.
    """


@asynccontextmanager
async def timeout(seconds: float):
    """Context manager for timeout."""
    try:
        yield await asyncio.wait_for(asyncio.sleep(0), timeout=seconds)
    except asyncio.TimeoutError:
        raise TimeoutError(f"Operation timed out after {seconds}s")


class TimeoutPolicy:
    """Bound an async call to a fixed number of seconds."""

    def __init__(self, seconds: float):
        self.seconds = seconds

    async def execute(self, func: Callable, *args, **kwargs) -> Any:
        """Await ``func(*args, **kwargs)``, giving up after ``self.seconds``.

        Raises:
            TimeoutError: the call did not finish in time.
        """
        try:
            outcome = await asyncio.wait_for(func(*args, **kwargs), timeout=self.seconds)
        except asyncio.TimeoutError:
            # Translate asyncio's timeout into the TimeoutError name in scope.
            raise TimeoutError(f"Operation timed out after {self.seconds}s")
        return outcome


def with_timeout(seconds: float):
    """Decorator factory that limits an async function to ``seconds``.

    The decorated function raises ``TimeoutError`` when it runs too long.
    """
    import functools  # local import: the file's import block does not provide it

    policy = TimeoutPolicy(seconds)

    def decorator(func):
        # wraps() keeps the decorated function's __name__/__doc__ intact.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await policy.execute(func, *args, **kwargs)
        return wrapper
    return decorator


# Usage
class ExternalService:
    """Example client whose fetch is limited to five seconds overall."""

    @with_timeout(5.0)
    async def fetch_data(self, resource_id: str) -> dict:
        """GET the resource and return its decoded JSON body."""
        url = f"http://external-service/resources/{resource_id}"
        async with httpx.AsyncClient() as http:
            reply = await http.get(url)
            return reply.json()


# Propagating timeouts (deadline propagation)
class ServiceClient:
    """Client helper that enforces a caller-supplied absolute deadline."""

    async def call_with_deadline(
        self,
        func: Callable,
        deadline: datetime,
        *args,
        **kwargs,
    ) -> Any:
        """Await ``func(*args, **kwargs)`` within the time left until ``deadline``.

        Args:
            func: Async callable to invoke.
            deadline: Absolute deadline. A naive value is interpreted as UTC;
                a timezone-aware value is compared in its own timezone.

        Raises:
            TimeoutError: the deadline had already passed (``func`` is not
                called in that case).
            asyncio.TimeoutError: ``func`` did not finish before the deadline.
        """
        if deadline.utcoffset() is None:
            # Naive deadline: compare against naive UTC "now".
            # datetime.utcnow() is deprecated since Python 3.12.
            from datetime import timezone

            now = datetime.now(timezone.utc).replace(tzinfo=None)
        else:
            # Aware deadline: subtracting a naive datetime would raise
            # TypeError, so take "now" in the deadline's own timezone.
            now = datetime.now(deadline.tzinfo)

        remaining = (deadline - now).total_seconds()

        if remaining <= 0:
            raise TimeoutError("Deadline already passed")

        return await asyncio.wait_for(
            func(*args, **kwargs),
            timeout=remaining,
        )

Bulkhead Pattern

Thread Pool Bulkhead

from concurrent.futures import ThreadPoolExecutor
import asyncio


@dataclass
class BulkheadConfig:
    """Limits applied by a ``Bulkhead``.

    Declared as a dataclass so the limits can be set per instance, e.g.
    ``BulkheadConfig(max_concurrent=5, max_queue=20)``. As a plain class the
    constructor accepted no arguments and such calls raised ``TypeError``.
    """

    # Maximum number of calls running at the same time.
    max_concurrent: int = 10
    # Maximum number of calls admitted at once before new ones are rejected.
    max_queue: int = 100
    # Seconds allowed for waiting for a slot plus running the call.
    timeout: float = 30.0


class Bulkhead:
    """Cap the concurrency of calls to one dependency.

    ``queue_count`` tracks every admitted call, whether it is still waiting
    for a semaphore slot or already running.
    """

    def __init__(self, name: str, config: BulkheadConfig = None):
        self.name = name
        self.config = config or BulkheadConfig()
        self.queue_count = 0
        self.semaphore = asyncio.Semaphore(self.config.max_concurrent)

    async def execute(self, func: Callable, *args, **kwargs) -> Any:
        """Await ``func(*args, **kwargs)`` inside the bulkhead.

        Raises:
            BulkheadFullError: ``max_queue`` calls are already admitted.
        """
        limits = self.config
        if self.queue_count >= limits.max_queue:
            raise BulkheadFullError(f"Bulkhead {self.name} queue full")

        self.queue_count += 1
        try:
            # The timeout covers both the wait for a slot and the call itself.
            async with asyncio.timeout(limits.timeout), self.semaphore:
                return await func(*args, **kwargs)
        finally:
            self.queue_count -= 1


class BulkheadFullError(Exception):
    """Raised when a bulkhead rejects a call because it is at capacity."""


# Service with bulkheads for different dependencies
class OrderService:
    """Order workflow with a separate bulkhead for each dependency."""

    def __init__(self):
        # One bulkhead per dependency so a saturated one cannot starve the rest.
        payment_limits = BulkheadConfig(max_concurrent=5, max_queue=20)
        self.payment_bulkhead = Bulkhead("payment", payment_limits)

        inventory_limits = BulkheadConfig(max_concurrent=10, max_queue=50)
        self.inventory_bulkhead = Bulkhead("inventory", inventory_limits)

        notification_limits = BulkheadConfig(max_concurrent=20, max_queue=100)
        self.notification_bulkhead = Bulkhead("notification", notification_limits)

    async def process_order(self, order: Order) -> None:
        """Charge, reserve stock, then send a best-effort confirmation."""
        # Critical steps: any failure here propagates to the caller.
        await self.payment_bulkhead.execute(self._process_payment, order)
        await self.inventory_bulkhead.execute(self._reserve_inventory, order)

        # Non-critical: a full notification bulkhead must not fail the order.
        try:
            await self.notification_bulkhead.execute(
                self._send_confirmation, order
            )
        except BulkheadFullError:
            # Queue for later
            await self._queue_notification(order)

Connection Pool Bulkhead

from contextlib import asynccontextmanager


class ConnectionPoolBulkhead:
    """Connection pool with bulkhead isolation."""
    
    def __init__(
        self,
        name: str,
        min_size: int = 5,
        max_size: int = 20,
        timeout: float = 30.0,
    ):
        self.name = name
        self.min_size = min_size
        self.max_size = max_size
        self.timeout = timeout
        self._pool: list = []
        self._semaphore = asyncio.Semaphore(max_size)
        self._available: asyncio.Queue = asyncio.Queue()
    
    @asynccontextmanager
    async def acquire(self):
        """Acquire connection from pool."""
        async with asyncio.timeout(self.timeout):
            await self._semaphore.acquire()
            try:
                conn = await self._get_connection()
                yield conn
            finally:
                await self._return_connection(conn)
                self._semaphore.release()
    
    async def _get_connection(self):
        try:
            return self._available.get_nowait()
        except asyncio.QueueEmpty:
            return await self._create_connection()
    
    async def _return_connection(self, conn):
        await self._available.put(conn)


# Per-service connection pools
class DatabaseBulkheads:
    """One isolated connection pool per database."""

    def __init__(self):
        # Each pool has its own size limit, so exhausting one cannot block
        # the others.
        self.order_db = ConnectionPoolBulkhead(name="order-db", max_size=10)
        self.user_db = ConnectionPoolBulkhead(name="user-db", max_size=5)
        self.product_db = ConnectionPoolBulkhead(name="product-db", max_size=15)

Fallback Pattern

from typing import TypeVar, Generic

T = TypeVar("T")


class FallbackChain(Generic[T]):
    """Ordered list of async strategies tried until one succeeds."""

    def __init__(self):
        self.strategies: list[Callable[..., T]] = []

    def add(self, strategy: Callable[..., T]) -> "FallbackChain[T]":
        """Append ``strategy`` and return the chain to allow chained calls."""
        self.strategies.append(strategy)
        return self

    async def execute(self, *args, **kwargs) -> T:
        """Return the result of the first strategy that does not raise.

        Raises:
            Exception: the error from the last strategy when all of them fail.
            RuntimeError: when the chain has no strategies.
        """
        failure = None

        for attempt in self.strategies:
            try:
                outcome = await attempt(*args, **kwargs)
            except Exception as exc:
                # Remember the error and move on to the next strategy.
                failure = exc
            else:
                return outcome

        if not failure:
            raise RuntimeError("No strategies available")
        raise failure


# Usage
class ProductCatalog:
    """Product lookup that degrades through progressively weaker sources."""

    def __init__(self):
        # Order matters: live service, then cache, then local file, then an
        # empty default.
        self.fallback_chain = (
            FallbackChain[list]()
            .add(self._from_primary_service)
            .add(self._from_cache)
            .add(self._from_local_storage)
            .add(self._default_products)
        )

    async def get_products(self, category: str) -> list:
        """Return products for ``category`` from the first source that works."""
        return await self.fallback_chain.execute(category)

    async def _from_primary_service(self, category: str) -> list:
        """Fetch from the product service and cache the successful result."""
        url = f"http://product-service/products?category={category}"
        async with httpx.AsyncClient(timeout=5.0) as http:
            reply = await http.get(url)
            reply.raise_for_status()
            items = reply.json()
            # Cache successful response
            await self._cache_products(category, items)
            return items

    async def _from_cache(self, category: str) -> list:
        """Load from the cache; a miss raises so the chain moves on."""
        cached = await self.redis.get(f"products:{category}")
        if not cached:
            raise CacheMissError()
        return json.loads(cached)

    async def _from_local_storage(self, category: str) -> list:
        """Read the bundled fallback file for ``category``."""
        # Read from local file as last resort
        with open(f"fallback_data/{category}.json") as handle:
            return json.load(handle)

    async def _default_products(self, category: str) -> list:
        """Final fallback: an empty product list."""
        return []

Load Shedding

from dataclasses import dataclass
import time


@dataclass
class LoadSheddingConfig:
    max_requests_per_second: int = 100
    priority_threshold: float = 0.8  # Shed low priority at 80% load


class LoadShedder:
    """Shed load when system is overloaded."""
    
    def __init__(self, config: LoadSheddingConfig = None):
        self.config = config or LoadSheddingConfig()
        self.requests_in_window = 0
        self.window_start = time.time()
    
    def should_accept(self, priority: str = "normal") -> bool:
        """Check if request should be accepted."""
        self._update_window()
        
        load = self.requests_in_window / self.config.max_requests_per_second
        
        # Always accept if under threshold
        if load < self.config.priority_threshold:
            self.requests_in_window += 1
            return True
        
        # At high load, only accept high priority
        if priority == "high" and load < 1.0:
            self.requests_in_window += 1
            return True
        
        return False
    
    def _update_window(self):
        """Reset window if second has passed."""
        now = time.time()
        if now - self.window_start >= 1.0:
            self.requests_in_window = 0
            self.window_start = now


# FastAPI middleware for load shedding
from fastapi import Request, HTTPException


class LoadSheddingMiddleware:
    """ASGI middleware that answers 503 when the load shedder rejects a request.

    The priority is read from the ``X-Priority`` request header and defaults
    to ``"normal"``. Non-HTTP scopes are passed through untouched.
    """

    def __init__(self, app, config: LoadSheddingConfig = None):
        self.app = app
        self.shedder = LoadShedder(config)

    async def __call__(self, scope, receive, send):
        if scope["type"] == "http":
            request = Request(scope, receive)
            priority = request.headers.get("X-Priority", "normal")

            if not self.shedder.should_accept(priority):
                # HTTPException is an exception, not an ASGI application, so
                # it cannot be awaited with (scope, receive, send). Send a
                # real response object with the same status, body and header.
                from fastapi.responses import JSONResponse

                response = JSONResponse(
                    status_code=503,
                    content={"detail": "Service overloaded, try again later"},
                    headers={"Retry-After": "1"},
                )
                await response(scope, receive, send)
                return

        await self.app(scope, receive, send)

Combined Resilience Strategy

@dataclass
class ResilienceConfig:
    """Combined settings used by ``ResilientService`` to build its layers."""

    # Overall time limit in seconds for one execute() call, retries included.
    timeout: float = 10.0
    # Passed to RetryConfig.max_attempts.
    retry_attempts: int = 3
    # Passed to RetryConfig.base_delay (seconds).
    retry_delay: float = 1.0
    # Passed to CircuitBreakerConfig.failure_threshold.
    circuit_failure_threshold: int = 5
    # Seconds; converted to a timedelta for CircuitBreakerConfig.timeout.
    circuit_timeout: float = 30.0
    # Passed to BulkheadConfig.max_concurrent.
    bulkhead_max_concurrent: int = 10


class ResilientService:
    """Stack timeout, circuit breaker, retry and bulkhead around one dependency.

    Nesting, from the outside in: timeout -> circuit breaker -> retry ->
    bulkhead -> the call itself.
    """

    def __init__(self, name: str, config: ResilienceConfig = None):
        self.name = name
        self.config = config or ResilienceConfig()
        settings = self.config

        breaker_config = CircuitBreakerConfig(
            failure_threshold=settings.circuit_failure_threshold,
            timeout=timedelta(seconds=settings.circuit_timeout),
        )
        self.circuit = CircuitBreaker(name, breaker_config)

        retry_config = RetryConfig(
            max_attempts=settings.retry_attempts,
            base_delay=settings.retry_delay,
        )
        self.retry = RetryPolicy(retry_config)

        bulkhead_config = BulkheadConfig(
            max_concurrent=settings.bulkhead_max_concurrent,
        )
        self.bulkhead = Bulkhead(name, bulkhead_config)

    async def execute(
        self,
        func: Callable,
        *args,
        fallback: Callable = None,
        **kwargs,
    ) -> Any:
        """Run ``func`` through every layer, using ``fallback`` on failure.

        Any exception from the stack (timeout, open circuit, exhausted
        retries, full bulkhead or the call itself) triggers the fallback,
        which is awaited with the same arguments. Without a fallback the
        exception propagates.
        """
        # Innermost layer: each retry attempt runs inside the bulkhead.
        def attempt_in_bulkhead():
            return self.bulkhead.execute(func, *args, **kwargs)

        # Middle layer: the retry sequence the circuit breaker observes.
        def attempt_with_retry():
            return self.retry.execute(attempt_in_bulkhead)

        try:
            # Outermost layer: one overall timeout around the guarded call.
            return await asyncio.wait_for(
                self.circuit.call(attempt_with_retry),
                timeout=self.config.timeout,
            )
        except Exception:
            if not fallback:
                raise
            return await fallback(*args, **kwargs)


# Usage
class OrderProcessingService:
    """Example order flow with a tuned resilience stack per dependency."""

    def __init__(self):
        payment_settings = ResilienceConfig(
            timeout=5.0,
            retry_attempts=2,
            circuit_failure_threshold=3,
        )
        self.payment_client = ResilientService("payment", payment_settings)

        inventory_settings = ResilienceConfig(
            timeout=3.0,
            retry_attempts=3,
            bulkhead_max_concurrent=20,
        )
        self.inventory_client = ResilientService("inventory", inventory_settings)

    async def process_order(self, order: Order) -> dict:
        """Charge and reserve stock for ``order``, each with its own fallback."""
        outcome = {}

        # Payment with fallback to queue for later
        outcome["payment"] = await self.payment_client.execute(
            self._charge_payment,
            order,
            fallback=self._queue_payment,
        )

        # Inventory with fallback to optimistic processing
        outcome["inventory"] = await self.inventory_client.execute(
            self._reserve_inventory,
            order,
            fallback=self._optimistic_reservation,
        )

        return outcome

Health Checks for Resilience

from enum import Enum


class HealthStatus(Enum):
    """Overall health levels reported by ``ServiceHealth.get_status``."""

    HEALTHY = "healthy"  # no circuit is open
    DEGRADED = "degraded"  # some, but not all, circuits are open
    UNHEALTHY = "unhealthy"  # every circuit is open


class ServiceHealth:
    """Derive an overall health status from a set of circuit breakers."""

    def __init__(self, circuit_breakers: list[CircuitBreaker]):
        self.circuits = circuit_breakers

    def get_status(self) -> dict:
        """Return the overall status plus per-circuit state and failure count.

        Healthy when no circuit is open, degraded when some are open, and
        unhealthy when all of them are.
        """
        open_count = sum(1 for breaker in self.circuits if breaker.is_open)

        if open_count == 0:
            overall = HealthStatus.HEALTHY
        elif open_count < len(self.circuits):
            overall = HealthStatus.DEGRADED
        else:
            overall = HealthStatus.UNHEALTHY

        details = {}
        for breaker in self.circuits:
            details[breaker.name] = {
                "state": breaker.state.value,
                "failures": breaker.stats.failures,
            }

        return {"status": overall.value, "circuits": details}


# FastAPI health endpoint
# Returns the dict produced by service_health.get_status() for GET /health.
# NOTE(review): `app` and `service_health` are not defined in this file;
# presumably a FastAPI application and a ServiceHealth instance. Confirm.
@app.get("/health")
async def health():
    return service_health.get_status()

Key Takeaways

- Circuit breakers prevent cascading failures: fail fast when a service is down.
- Retry with backoff: handle transient failures gracefully.
- Timeouts are essential: don't wait forever for responses.
- Bulkheads isolate failures: one dependency's failure shouldn't affect others.
- Fallbacks provide graceful degradation: always have a plan B.

What's Next?

To manage resilient systems, we need visibility. In Article 10: Observability, we'll cover distributed tracing, centralized logging, metrics collection, and health checks.

This article is part of the Microservice Architecture 101 series.

Previous: Distributed Data Patterns | Next: Observability

Last updated 1 month ago

hashtagIntroduction

hashtagWhy Resilience Matters

hashtagCircuit Breaker Pattern

hashtagConcept

hashtagImplementation

hashtagCircuit Breaker with Fallback

hashtagRetry Pattern

hashtagExponential Backoff

hashtagRetry with Circuit Breaker

hashtagTimeout Pattern

hashtagBulkhead Pattern

hashtagThread Pool Bulkhead

hashtagConnection Pool Bulkhead

hashtagFallback Pattern

hashtagLoad Shedding

hashtagCombined Resilience Strategy

hashtagHealth Checks for Resilience

hashtagKey Takeaways

hashtagWhat's Next?