Python for Data Engineering

← Previous: Data Engineering Fundamentals | Back to Index | Next: Data Ingestion & Sources →

Introduction

Python has become my primary tool for data engineering. While I also use SQL extensively, Python's rich ecosystem, readability, and versatility make it perfect for building data pipelines. This article covers the Python skills I use daily in production.

Why Python 3.12 for Data Engineering?

Python 3.12 improvements I appreciate:

Better error messages (saved me hours of debugging)
Performance improvements (15-50% faster)
Improved type hinting
Better asyncio support

All examples in this guide use Python 3.12+ features.

Essential Libraries

Core Data Libraries

# Python 3.12 - Essential imports for data engineering
import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations
from sqlalchemy import create_engine  # Database connections
import requests  # API calls
import json  # JSON handling
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Protocol
import logging
from pathlib import Path

Pandas - Data Manipulation Workhorse

# Python 3.12 - Pandas patterns I use daily
import pandas as pd
from datetime import datetime

class DataFrameProcessor:
    """
    Real-world pandas operations from my production pipelines.

    The class is a plain namespace: every method is static and works on
    the DataFrame(s) passed in.
    """

    @staticmethod
    def read_with_types(file_path: str) -> pd.DataFrame:
        """
        Read CSV with explicit types (prevents silent errors).

        Args:
            file_path: Location of the CSV file.

        Returns:
            DataFrame with 'user_id' as str, 'amount' as float,
            'quantity' as int, and 'created_at'/'updated_at' parsed as dates.
        """
        # NOTE(review): a plain int dtype cannot hold NaN, so a 'quantity'
        # cell matching one of the na_values is expected to make read_csv
        # raise instead of loading -- confirm that fail-fast is intended.
        return pd.read_csv(
            file_path,
            dtype={
                'user_id': str,
                'amount': float,
                'quantity': int
            },
            parse_dates=['created_at', 'updated_at'],
            na_values=['NULL', 'null', 'N/A', '']
        )

    @staticmethod
    def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
        """
        Standard cleaning operations.

        Steps, in order: drop duplicate ('user_id', 'timestamp') rows keeping
        the last one, fill missing 'country'/'amount', cast 'user_id' to str,
        coerce 'amount' to numeric, and drop rows whose amount is not >= 0.

        Args:
            df: Frame with 'user_id', 'timestamp', 'country' and 'amount'
                columns. It is not modified.

        Returns:
            A new, cleaned DataFrame.
        """
        # drop_duplicates hands back a filtered selection of the input; take
        # an explicit copy so the column assignments below write to an
        # independent frame (no chained-assignment warning, input untouched).
        df = df.drop_duplicates(subset=['user_id', 'timestamp'], keep='last').copy()

        # Handle missing values
        df['country'] = df['country'].fillna('unknown')
        df['amount'] = df['amount'].fillna(0.0)

        # Convert types
        df['user_id'] = df['user_id'].astype(str)
        # Values that cannot be parsed become NaN here (after the fill above),
        # so they fail the >= 0 test below and are dropped as invalid.
        df['amount'] = pd.to_numeric(df['amount'], errors='coerce')

        # Filter invalid records
        df = df[df['amount'] >= 0]

        return df

    @staticmethod
    def aggregate_metrics(df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate aggregated metrics.

        Groups by ('user_id', 'date') and computes sum/mean/count of
        'amount', the sum of 'quantity' and the number of distinct
        'transaction_id' values.

        Returns:
            One row per group, with the group keys restored as columns.
            Because 'amount' has several aggregations, the result columns
            are two-level.
        """
        return df.groupby(['user_id', 'date']).agg({
            'amount': ['sum', 'mean', 'count'],
            'quantity': 'sum',
            'transaction_id': 'nunique'
        }).reset_index()

    @staticmethod
    def merge_dataframes(
        left_df: pd.DataFrame,
        right_df: pd.DataFrame,
        on: str,
        how: str = 'left'
    ) -> pd.DataFrame:
        """
        Safe merge with validation.

        Args:
            left_df: Left-hand frame.
            right_df: Right-hand frame.
            on: Name of the key column present in both frames.
            how: Merge type forwarded to ``pd.merge`` (default 'left').

        Returns:
            The merged frame, without the temporary '_merge' indicator
            column that is used only for logging.
        """
        # Check for duplicates in merge key (duplicates fan out rows)
        if left_df[on].duplicated().any():
            logging.warning(f"Duplicates found in left DataFrame on {on}")

        # Perform merge; indicator=True adds '_merge' telling where each row came from
        result = pd.merge(left_df, right_df, on=on, how=how, indicator=True)

        # Log merge statistics
        merge_stats = result['_merge'].value_counts()
        logging.info(f"Merge statistics: {merge_stats.to_dict()}")

        return result.drop(columns=['_merge'])

# Usage example
# NOTE: the file is loaded with a plain read_csv here, so the explicit
# dtypes from DataFrameProcessor.read_with_types are not applied.
# clean_dataframe needs 'user_id', 'timestamp', 'country' and 'amount';
# aggregate_metrics additionally reads 'date', 'quantity' and 'transaction_id'.
df = pd.read_csv('transactions.csv')
df = DataFrameProcessor.clean_dataframe(df)
metrics = DataFrameProcessor.aggregate_metrics(df)

File Handling Best Practices

# Python 3.12 - Robust file handling
from pathlib import Path
import gzip
import shutil
from typing import Iterator
import logging

class FileHandler:
    """
    Production file handling with error recovery.

    A namespace of static helpers for chunked reads, writes protected by a
    backup copy, and gzip compression.
    """

    @staticmethod
    def read_large_file(file_path: Path, chunk_size: int = 10000) -> Iterator[pd.DataFrame]:
        """
        Read large files in chunks to avoid memory issues.
        I use this for files >1GB.

        Args:
            file_path: CSV file to read.
            chunk_size: Maximum number of rows per yielded DataFrame.

        Yields:
            Consecutive DataFrames of at most ``chunk_size`` rows.

        Raises:
            Exception: Any read/parse error is logged and re-raised.
        """
        try:
            # Passing chunksize makes read_csv return a reader object rather
            # than a DataFrame. Using it as a context manager closes the
            # underlying file even when the caller stops iterating early.
            with pd.read_csv(
                file_path,
                chunksize=chunk_size,
                low_memory=False
            ) as reader:
                for chunk in reader:
                    yield chunk
        except Exception as e:
            logging.error(f"Error reading {file_path}: {e}")
            raise

    @staticmethod
    def write_with_backup(df: pd.DataFrame, file_path: Path) -> None:
        """
        Write file with backup (learned this after data loss incident).

        An existing ``file_path`` is copied to ``<name>.bak`` next to it
        before writing. The backup is deleted after a successful write and
        moved back over ``file_path`` if the write fails.

        Args:
            df: Data to write as CSV (without the index).
            file_path: Destination file.

        Raises:
            Exception: The write error, re-raised after restoring the backup.
        """
        # Derive the backup name from the full file name so files that differ
        # only by extension (data.csv / data.tsv) never share a backup.
        # For '*.csv' targets this is the same '<name>.csv.bak' as before.
        backup_path = file_path.with_name(file_path.name + '.bak')

        # Backup existing file
        if file_path.exists():
            shutil.copy2(file_path, backup_path)

        try:
            # Write new file
            df.to_csv(file_path, index=False)

            # Remove backup on success
            if backup_path.exists():
                backup_path.unlink()

        except Exception as e:
            # Restore backup on failure
            if backup_path.exists():
                shutil.move(str(backup_path), str(file_path))
            logging.error(f"Failed to write {file_path}: {e}")
            raise

    @staticmethod
    def compress_file(input_path: Path, output_path: Optional[Path] = None) -> Path:
        """
        Compress files to save storage (reduces costs in cloud).

        Args:
            input_path: File to compress; it is left in place.
            output_path: Destination; defaults to ``input_path`` with '.gz'
                appended to its existing suffix.

        Returns:
            Path of the gzip file that was written.
        """
        if output_path is None:
            output_path = input_path.with_suffix(input_path.suffix + '.gz')

        # Stream the bytes across so the whole file is never held in memory.
        with open(input_path, 'rb') as f_in, gzip.open(output_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        logging.info(f"Compressed {input_path} -> {output_path}")
        return output_path

# Usage
# NOTE: process_chunk is not defined in this snippet; substitute your own
# per-chunk handler. Each chunk is a DataFrame of at most 10000 rows
# (the default chunk_size of read_large_file).
for chunk in FileHandler.read_large_file(Path('large_file.csv')):
    process_chunk(chunk)

Error Handling & Logging

# Python 3.12 - Production error handling
import logging
from functools import wraps
from typing import Callable, Any
import traceback

# Configure logging
# Root logger at INFO, writing every record both to 'pipeline.log' and to
# the console stream. logging.FileHandler below is the standard-library
# handler, not the FileHandler helper class defined earlier in this file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('pipeline.log'),
        logging.StreamHandler()
    ]
)

# Module-level logger used by the decorators and classes that follow.
logger = logging.getLogger(__name__)

def retry_on_failure(max_attempts: int = 3, delay: float = 5):
    """
    Decorator for retrying failed operations.
    Saved me from many transient network errors.

    Args:
        max_attempts: Total number of calls to make before giving up (>= 1).
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        A decorator. The wrapped function returns whatever the first
        successful call returns.

    Raises:
        ValueError: If ``max_attempts`` is less than 1. Previously such a
            value made the wrapper return None without calling the function.
        Exception: The wrapped function's last exception, once every
            attempt has failed.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    # Imported once here instead of on every failed attempt inside the loop.
    import time

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # Final attempt: report and let the caller see the error.
                    if attempt == max_attempts:
                        logger.error(
                            f"{func.__name__} failed after {max_attempts} attempts: {e}"
                        )
                        raise

                    logger.warning(
                        f"{func.__name__} attempt {attempt} failed: {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
            # Not reachable: the loop runs at least once and every iteration
            # either returns, raises, or continues to the next attempt.
        return wrapper
    return decorator

def log_execution_time(func: Callable) -> Callable:
    """
    Decorator to log function execution time.
    Helps identify slow operations.

    Logs a start message, then either a completion message with the
    elapsed seconds or an error message with the elapsed seconds and the
    traceback. Exceptions from the wrapped function are re-raised unchanged.

    Args:
        func: Function to time.

    Returns:
        The wrapped function, returning whatever ``func`` returns.
    """
    # perf_counter is monotonic, so the measured duration is not affected
    # by system clock adjustments the way datetime.now() differences are.
    import time

    @wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        started = time.perf_counter()
        logger.info(f"Starting {func.__name__}")

        try:
            result = func(*args, **kwargs)
            duration = time.perf_counter() - started
            logger.info(f"Completed {func.__name__} in {duration:.2f}s")
            return result
        except Exception as e:
            duration = time.perf_counter() - started
            logger.error(
                f"{func.__name__} failed after {duration:.2f}s: {e}\n"
                f"Traceback: {traceback.format_exc()}"
            )
            raise

    return wrapper

# Usage
@retry_on_failure(max_attempts=3, delay=5)
@log_execution_time
def fetch_data_from_api(url: str) -> dict:
    """
    GET ``url`` and return its decoded JSON body.

    The timing decorator is innermost, so every individual attempt is
    timed; the retry decorator re-runs the call on any exception, including
    the one raised for an HTTP error status.
    """
    api_response = requests.get(url, timeout=30)
    # Turn 4xx/5xx answers into exceptions instead of parsing an error body.
    api_response.raise_for_status()
    payload = api_response.json()
    return payload

Data Structures for Data Engineering

# Python 3.12 - Using dataclasses for structured data
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional
from enum import Enum

class RecordStatus(Enum):
    """
    Status of data records.

    Each member's value is a lowercase string; DataRecord.to_dict stores
    that string (``status.value``) rather than the member itself.
    """
    NEW = "new"  # default status of a freshly created DataRecord
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

@dataclass
class DataRecord:
    """
    One validated transaction-style record.

    Construction fails with ValueError when the amount is negative or the
    user_id is empty; the amount rule is checked first.
    """
    record_id: str
    user_id: str
    amount: float
    timestamp: datetime
    status: RecordStatus = RecordStatus.NEW
    # default_factory gives every record its own dict instead of a shared one
    metadata: dict = field(default_factory=dict)

    def __post_init__(self):
        """Reject records that break the amount or user_id rules."""
        amount_is_negative = self.amount < 0
        if amount_is_negative:
            raise ValueError(f"Amount cannot be negative: {self.amount}")

        if self.user_id:
            return
        raise ValueError("user_id is required")

    def to_dict(self) -> dict:
        """
        Return a plain-dict form of the record for storage.

        The timestamp is rendered as an ISO-8601 string and the status as
        its string value; metadata is passed through as the same dict object.
        """
        serialized = dict(
            record_id=self.record_id,
            user_id=self.user_id,
            amount=self.amount,
            timestamp=self.timestamp.isoformat(),
            status=self.status.value,
            metadata=self.metadata,
        )
        return serialized

@dataclass
class PipelineMetrics:
    """
    Counters and timestamps describing one pipeline run.
    """
    pipeline_name: str
    start_time: datetime
    end_time: Optional[datetime] = None  # stays None while the run is in progress
    records_processed: int = 0
    records_failed: int = 0
    errors: List[str] = field(default_factory=list)

    @property
    def duration_seconds(self) -> float:
        """
        Seconds from start_time to end_time, or to the current time when
        no end_time has been set.
        """
        finished_at = self.end_time if self.end_time else datetime.now()
        elapsed = finished_at - self.start_time
        return elapsed.total_seconds()

    @property
    def success_rate(self) -> float:
        """
        Percentage of handled records that were processed successfully.

        Returns 0.0 when nothing has been processed or failed yet.
        """
        attempted = self.records_processed + self.records_failed
        return (self.records_processed / attempted) * 100 if attempted != 0 else 0.0

# Usage
# status and metadata are omitted, so the record starts as RecordStatus.NEW
# with an empty metadata dict.
record = DataRecord(
    record_id="txn_123",
    user_id="user_456",
    amount=99.99,
    timestamp=datetime.now()
)

# end_time is left unset, so duration_seconds measures up to the current time.
# NOTE: this rebinds the module-level name `metrics` used in the earlier
# pandas usage example.
metrics = PipelineMetrics(
    pipeline_name="user_events",
    start_time=datetime.now()
)

Database Operations with SQLAlchemy

# Python 3.12 - Database operations
from sqlalchemy import create_engine, text
from sqlalchemy.pool import QueuePool
from contextlib import contextmanager
import pandas as pd

class DatabaseConnection:
    """
    Reusable database connection manager.
    Connection pooling prevents resource exhaustion.
    """

    def __init__(self, connection_string: str):
        """
        Create the pooled engine.

        Args:
            connection_string: SQLAlchemy database URL.
        """
        self.engine = create_engine(
            connection_string,
            poolclass=QueuePool,
            pool_size=5,
            max_overflow=10,
            pool_pre_ping=True  # Verify connections before use
        )

    @contextmanager
    def get_connection(self):
        """
        Context manager for database connections.
        Ensures proper cleanup even if errors occur.

        Commits when the ``with`` body finishes normally; on any exception
        it rolls back, logs the error and re-raises. The connection is
        closed in both cases.

        Yields:
            An open connection obtained from the engine.
        """
        conn = self.engine.connect()
        try:
            yield conn
            conn.commit()
        except Exception as e:
            conn.rollback()
            logger.error(f"Database error: {e}")
            raise
        finally:
            conn.close()

    def execute_query(self, query: str, params: Optional[dict] = None) -> pd.DataFrame:
        """
        Execute SELECT query and return DataFrame.

        Args:
            query: SQL text; use ``:name`` placeholders for parameters.
            params: Values for the placeholders, or None when there are none.

        Returns:
            The query result as a DataFrame.
        """
        with self.get_connection() as conn:
            return pd.read_sql(text(query), conn, params=params)

    def execute_insert(self, table: str, df: pd.DataFrame, if_exists: str = 'append') -> int:
        """
        Insert DataFrame into table.
        Returns number of rows inserted.

        Args:
            table: Target table name.
            df: Rows to write (the index is not written).
            if_exists: Forwarded to ``DataFrame.to_sql`` (default 'append').

        Returns:
            ``len(df)``.
        """
        with self.get_connection() as conn:
            df.to_sql(table, conn, if_exists=if_exists, index=False)
            return len(df)

    def execute_batch(self, query: str, data: List[dict]) -> None:
        """
        Execute batch insert/update.

        Args:
            query: SQL text with ``:name`` placeholders.
            data: One dict of placeholder values per row. An empty list is
                a no-op: no connection is opened and nothing is executed.
        """
        # With zero parameter sets there is nothing to run; executing the
        # statement anyway would send it without values for its placeholders.
        if not data:
            return

        with self.get_connection() as conn:
            conn.execute(text(query), data)

# Usage
# NOTE: credentials are inlined in the URL for illustration only.
db = DatabaseConnection("postgresql://user:pass@localhost:5432/db")

# Read data
# :start_date is a bound parameter filled from `params`, not string-formatted
# into the SQL. This rebinds the module-level name `df`.
df = db.execute_query("""
    SELECT user_id, SUM(amount) as total
    FROM transactions
    WHERE created_at >= :start_date
    GROUP BY user_id
""", params={'start_date': '2026-01-01'})

# Write data
# Uses the default if_exists='append'.
db.execute_insert('processed_transactions', df)

Asynchronous Operations

# Python 3.12 - Async operations for concurrent API calls
import asyncio
import aiohttp
from typing import List, Dict

class AsyncDataFetcher:
    """
    Fetch data from multiple sources concurrently.
    10x faster than sequential requests.
    """

    async def fetch_url(
        self,
        session: aiohttp.ClientSession,
        url: str
    ) -> Dict:
        """
        Fetch single URL.

        Args:
            session: Open client session used for the request.
            url: Address to GET.

        Returns:
            The decoded JSON body on success. On any exception (including
            an HTTP error status) the error is logged and a dict of the form
            ``{"error": <message>, "url": <url>}`` is returned instead, so
            one failure does not abort the other requests.
        """
        try:
            # aiohttp expects a ClientTimeout object; a bare number for
            # `timeout` is deprecated. total=30 keeps the same 30 s limit.
            request_timeout = aiohttp.ClientTimeout(total=30)
            async with session.get(url, timeout=request_timeout) as response:
                response.raise_for_status()
                return await response.json()
        except Exception as e:
            logger.error(f"Failed to fetch {url}: {e}")
            return {"error": str(e), "url": url}

    async def fetch_all(self, urls: List[str]) -> List[Dict]:
        """
        Fetch multiple URLs concurrently.

        Args:
            urls: Addresses to fetch; all requests share one session.

        Returns:
            One result per URL, in the same order as ``urls``. Failed
            requests appear as the error dict produced by ``fetch_url``.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.fetch_url(session, url) for url in urls]
            results = await asyncio.gather(*tasks)
            return results

# Usage
async def main():
    """Fetch three example user endpoints concurrently and report the count."""
    urls = [
        "https://api.example.com/users/1",
        "https://api.example.com/users/2",
        "https://api.example.com/users/3"
    ]
    data_fetcher = AsyncDataFetcher()
    responses = await data_fetcher.fetch_all(urls)
    fetched_count = len(responses)
    print(f"Fetched {fetched_count} URLs")

# Run
# asyncio.run(main())

Testing Data Engineering Code

# Python 3.12 - Testing patterns
import pytest
import pandas as pd
from pandas.testing import assert_frame_equal

class TestDataProcessor:
    """
    Real tests from my pipeline code.
    Caught many bugs before production.
    """

    @pytest.fixture
    def sample_data(self) -> pd.DataFrame:
        """
        Sample data for testing.

        Four rows; the first and last are an exact duplicate
        (user 'u1', 2026-01-01, amount 100).
        """
        return pd.DataFrame({
            'user_id': ['u1', 'u2', 'u3', 'u1'],
            'amount': [100, 200, 150, 100],
            'timestamp': pd.to_datetime(['2026-01-01', '2026-01-01', '2026-01-02', '2026-01-01'])
        })

    def test_remove_duplicates(self, sample_data):
        """Test duplicate removal."""
        result = sample_data.drop_duplicates(subset=['user_id', 'timestamp'])
        assert len(result) == 3

    def test_aggregation(self, sample_data):
        """Test aggregation logic."""
        result = sample_data.groupby('user_id')['amount'].sum()
        # 'u1' appears twice with 100 each
        assert result['u1'] == 200
        assert result['u2'] == 200

    def test_data_validation(self):
        """
        Test validation catches bad data.

        Each rule is exercised on its own so that one check cannot hide
        the absence of the other, followed by the combined case.
        """
        # Negative amount with an otherwise valid record
        with pytest.raises(ValueError):
            DataRecord(
                record_id="test",
                user_id="user_1",
                amount=-100,
                timestamp=datetime.now()
            )

        # Empty user_id with an otherwise valid record
        with pytest.raises(ValueError):
            DataRecord(
                record_id="test",
                user_id="",
                amount=100,
                timestamp=datetime.now()
            )

        # Both problems at once
        with pytest.raises(ValueError):
            DataRecord(
                record_id="test",
                user_id="",
                amount=-100,
                timestamp=datetime.now()
            )

Performance Optimization

# Python 3.12 - Performance optimization techniques
import pandas as pd
from typing import List

class PerformanceOptimizer:
    """
    Techniques I use to optimize slow pipelines.

    A namespace of static helpers. Note that optimize_memory and
    vectorize_operations assign columns on the frame they are given and
    return that same frame.
    """

    @staticmethod
    def optimize_memory(df: pd.DataFrame) -> pd.DataFrame:
        """
        Reduce DataFrame memory usage.
        Reduced memory by 60% in production.

        Float and integer columns are downcast to the smallest type that
        holds their values; object columns whose distinct values make up
        less than half of the rows become categoricals.

        Args:
            df: Frame to shrink; modified in place.

        Returns:
            The same frame. An empty frame is returned with its object
            columns untouched (previously the cardinality ratio divided
            by zero).
        """
        # Downcast numeric types
        for col in df.select_dtypes(include=['float']).columns:
            df[col] = pd.to_numeric(df[col], downcast='float')

        for col in df.select_dtypes(include=['integer']).columns:
            df[col] = pd.to_numeric(df[col], downcast='integer')

        # Convert object to category if low cardinality.
        # Every column has len(df) rows, so the row count is taken once;
        # with zero rows there is no ratio to compute and nothing to gain.
        num_total = len(df)
        if num_total:
            for col in df.select_dtypes(include=['object']).columns:
                num_unique = df[col].nunique()
                if num_unique / num_total < 0.5:
                    df[col] = df[col].astype('category')

        return df

    @staticmethod
    def vectorize_operations(df: pd.DataFrame) -> pd.DataFrame:
        """
        Use vectorized operations instead of loops.
        100x faster than iterating rows.

        Args:
            df: Frame with 'price' and 'quantity' columns; a 'total'
                column is added to it in place.

        Returns:
            The same frame, with 'total' = 'price' * 'quantity'.
        """
        # Bad: Loop
        # for i, row in df.iterrows():
        #     df.at[i, 'total'] = row['price'] * row['quantity']

        # Good: Vectorized
        df['total'] = df['price'] * df['quantity']

        return df

    @staticmethod
    def use_query_for_filtering(df: pd.DataFrame) -> pd.DataFrame:
        """
        Use query() for complex filtering (more readable).

        Args:
            df: Frame with 'amount' and 'status' columns.

        Returns:
            The rows where amount > 100 and status is "active".
        """
        # Instead of: df[(df['amount'] > 100) & (df['status'] == 'active')]
        return df.query('amount > 100 and status == "active"')

Conclusion

Python's rich ecosystem makes it ideal for data engineering. The patterns shown here are from real production code that processes millions of records daily.

Key takeaways:

Use pandas for data manipulation
Implement robust error handling and logging
Leverage async for I/O operations
Test your code thoroughly
Optimize for performance when needed

Navigation:

← Previous: Data Engineering Fundamentals | Next: Data Ingestion & Sources →

Last updated 3 months ago

On this page:

Introduction

Why Python 3.12 for Data Engineering?

Essential Libraries

Core Data Libraries

Pandas - Data Manipulation Workhorse

File Handling Best Practices

Error Handling & Logging

Data Structures for Data Engineering

Database Operations with SQLAlchemy

Asynchronous Operations

Testing Data Engineering Code

Performance Optimization

Conclusion