Data Engineering Best Practices

Introduction

Throughout my data engineering career, I've learned that building pipelines is the easy part; maintaining them in production is the real challenge. This article covers production best practices I wish I'd known earlier: code quality, version control, CI/CD, monitoring, security, and cost optimization.

These aren't theoretical guidelines; they're battle-tested practices from real production systems processing terabytes of data daily.

Code Quality and Structure

Project Structure

I organize data engineering projects with clear separation of concerns:

data-pipeline/
├── README.md
├── requirements.txt
├── pyproject.toml
├── .env.example
├── .gitignore
├── setup.py
├── src/
│   ├── __init__.py
│   ├── config/
│   │   ├── __init__.py
│   │   └── settings.py
│   ├── extractors/
│   │   ├── __init__.py
│   │   ├── api_extractor.py
│   │   └── database_extractor.py
│   ├── transformers/
│   │   ├── __init__.py
│   │   ├── cleaner.py
│   │   └── aggregator.py
│   ├── loaders/
│   │   ├── __init__.py
│   │   ├── database_loader.py
│   │   └── s3_loader.py
│   └── utils/
│       ├── __init__.py
│       ├── logging.py
│       └── validators.py
├── tests/
│   ├── __init__.py
│   ├── test_extractors.py
│   ├── test_transformers.py
│   └── test_loaders.py
├── airflow/
│   └── dags/
│       └── daily_etl_dag.py
└── scripts/
    ├── setup_database.py
    └── run_pipeline.py

Configuration Management

Never hardcode credentials or configurations:
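
A minimal sketch of what settings.py can look like, assuming configuration comes in through environment variables (documented in .env.example); the variable names DATABASE_URL, S3_BUCKET, and LOG_LEVEL are illustrative:

# src/config/settings.py
# Load configuration from environment variables instead of hardcoding it.
import os
from dataclasses import dataclass


@dataclass(frozen=True)
class Settings:
    database_url: str
    s3_bucket: str
    log_level: str = "INFO"


def load_settings() -> Settings:
    """Read settings from the environment; fail fast if a required value is missing."""
    try:
        return Settings(
            database_url=os.environ["DATABASE_URL"],        # required (example name)
            s3_bucket=os.environ["S3_BUCKET"],              # required (example name)
            log_level=os.environ.get("LOG_LEVEL", "INFO"),  # optional, with a default
        )
    except KeyError as exc:
        raise RuntimeError(f"Missing required environment variable: {exc}") from exc

Loading everything in one place means a missing credential fails loudly at startup instead of halfway through a run.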

Logging Best Practices
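
A minimal sketch of a shared logger setup for src/utils/logging.py, built on Python's standard logging module; the format string is an illustrative choice:

# src/utils/logging.py
# Central place to configure logging so every pipeline module logs consistently.
import logging
import sys


def get_logger(name: str, level: str = "INFO") -> logging.Logger:
    """Return a logger that writes timestamped messages to stdout."""
    logger = logging.getLogger(name)
    if logger.handlers:  # avoid adding duplicate handlers on repeated calls
        return logger
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(
        "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    ))
    logger.addHandler(handler)
    logger.setLevel(level)
    return logger


# Usage in a pipeline module:
# logger = get_logger(__name__)
# logger.info("Extracted %d rows from the orders API", row_count)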

Error Handling and Retries
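
Transient failures (flaky APIs, dropped database connections) shouldn't kill a whole run. One common pattern is a retry decorator with exponential backoff; this is a sketch, and the attempt counts and delays are illustrative:

# Retry transient failures with exponential backoff before giving up.
import functools
import logging
import time

logger = logging.getLogger(__name__)


def retry(max_attempts: int = 3, base_delay: float = 2.0, exceptions=(Exception,)):
    """Retry the wrapped function up to max_attempts times, doubling the delay each time."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as exc:
                    if attempt == max_attempts:
                        logger.error("%s failed after %d attempts", func.__name__, attempt)
                        raise
                    delay = base_delay * 2 ** (attempt - 1)
                    logger.warning("%s failed (%s); retrying in %.0fs", func.__name__, exc, delay)
                    time.sleep(delay)
        return wrapper
    return decorator


# @retry(max_attempts=5, exceptions=(ConnectionError, TimeoutError))
# def fetch_orders(api_url):
#     ...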

Version Control with Git

Git Workflow for Data Pipelines

I use feature branches and pull requests:
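
A sketch of that workflow on the command line; the branch and file names are examples:

git checkout main && git pull            # start from an up-to-date main branch
git checkout -b feature/add-orders-extractor
# ... edit code, run tests locally ...
git add src/extractors/orders_extractor.py tests/test_extractors.py
git commit -m "Add orders API extractor with incremental loading"
git push -u origin feature/add-orders-extractor
# Then open a pull request, let CI run, and merge after review.

Small, focused branches keep pull requests reviewable and make it easy to roll back a single change.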

.gitignore for Data Projects
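
A typical starting point for the Python-plus-Airflow layout above; adjust for your own stack:

# Secrets and local configuration
.env

# Python artifacts
__pycache__/
*.pyc
.venv/
*.egg-info/

# Local data and notebook output: never commit raw data
data/
*.csv
*.parquet
.ipynb_checkpoints/

# Airflow local metadata
airflow/logs/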

Meaningful Commit Messages
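
One common convention, shown here with hypothetical messages, is a short imperative summary line plus a body that explains why the change was needed:

Good:

    Add deduplication step to orders transformer

    The upstream API occasionally resends events, which inflated daily
    revenue aggregates. Deduplicate on (order_id, updated_at) before
    aggregating.

Bad:

    fix stuff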

CI/CD for Data Pipelines

GitHub Actions Workflow
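
A minimal workflow sketch that lints and tests on every push and pull request; the action versions and the choice of ruff and pytest are assumptions to adapt to your own stack:

# .github/workflows/ci.yml
name: ci

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -r requirements.txt && pip install ruff pytest
      - name: Lint
        run: ruff check src tests
      - name: Run tests
        run: pytest tests -v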

Testing Data Pipelines
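
A sketch of a unit test for a transformer, assuming a hypothetical clean_orders function in src/transformers/cleaner.py that drops duplicate order IDs and rows with missing amounts:

# tests/test_transformers.py
# Small hand-built input with explicit expectations, so a failing test
# points straight at the broken rule.
import pandas as pd

from src.transformers.cleaner import clean_orders  # hypothetical function


def test_clean_orders_drops_duplicates_and_nulls():
    raw = pd.DataFrame({
        "order_id": [1, 1, 2, 3],
        "amount": [10.0, 10.0, None, 25.0],
    })

    cleaned = clean_orders(raw)

    assert cleaned["order_id"].is_unique        # duplicates removed
    assert cleaned["amount"].notna().all()      # null amounts removed
    assert len(cleaned) == 2                    # only orders 1 and 3 survive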

Monitoring and Observability

Pipeline Monitoring
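
At a minimum, every run should record how long each task took, how many rows it processed, and whether it succeeded. A sketch of a small helper for that; the log field names are illustrative:

# Record basic run metrics so failures and slowdowns are visible over time.
import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)


@contextmanager
def track_task(task_name: str):
    """Log duration and success/failure for one pipeline task."""
    start = time.monotonic()
    try:
        yield
        logger.info("task=%s status=success duration_s=%.1f", task_name, time.monotonic() - start)
    except Exception:
        logger.error("task=%s status=failed duration_s=%.1f", task_name, time.monotonic() - start)
        raise


# Usage:
# with track_task("extract_orders"):
#     rows = extract_orders()
#     logger.info("task=extract_orders rows=%d", len(rows))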

Alerting
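
A sketch of a Slack alert on failure using the requests library, assuming the webhook URL arrives through an environment variable (SLACK_WEBHOOK_URL is an example name):

# Send a Slack message when a pipeline run fails.
import os
import requests


def send_failure_alert(pipeline: str, error: str) -> None:
    """Post a short failure message to a Slack incoming webhook."""
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL")  # example variable name
    if not webhook_url:
        return  # alerting not configured in this environment
    requests.post(
        webhook_url,
        json={"text": f":red_circle: Pipeline `{pipeline}` failed: {error}"},
        timeout=10,
    )


# send_failure_alert("daily_etl", "orders extract returned 0 rows")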

Data Quality and Validation
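
A sketch of lightweight validation run before loading, using plain pandas; the column names and thresholds are examples:

# Fail the pipeline early if the batch looks wrong, instead of loading bad data.
import pandas as pd


def validate_orders(df: pd.DataFrame) -> None:
    """Raise ValueError if the batch violates basic expectations."""
    errors = []
    if df.empty:
        errors.append("batch is empty")
    if df["order_id"].duplicated().any():
        errors.append("duplicate order_id values")
    if df["amount"].isna().mean() > 0.01:      # allow at most 1% missing amounts
        errors.append("too many missing amounts")
    if (df["amount"] < 0).any():
        errors.append("negative amounts present")
    if errors:
        raise ValueError(f"Data validation failed: {', '.join(errors)}")

Failing fast here is cheaper than backfilling a warehouse table after bad data has already landed.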

Cost Optimization

Best Practices

  1. Use spot instances for non-critical batch jobs

  2. Compress data before storing (Parquet with Snappy compression; see the sketch after this list)

  3. Partition data to reduce scan costs

  4. Archive old data to cheaper storage tiers (S3 Glacier)

  5. Right-size resources (don't over-provision)

  6. Schedule batch jobs during off-peak hours

  7. Monitor costs with billing alerts
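
As a concrete example of points 2 and 3, here is a sketch of writing a Snappy-compressed, date-partitioned Parquet dataset with pandas (assuming pyarrow is installed; the paths and partition column are examples):

# Write events as Snappy-compressed Parquet, partitioned by date,
# so queries that filter on event_date scan far less data.
import pandas as pd

df = pd.read_csv("data/events.csv", parse_dates=["event_time"])  # example input
df["event_date"] = df["event_time"].dt.date.astype(str)

df.to_parquet(
    "data/events_parquet/",          # an s3:// path also works if s3fs is installed
    engine="pyarrow",
    compression="snappy",
    partition_cols=["event_date"],
    index=False,
)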

Key Takeaways

  • Code quality: Use type hints, linting, and formatters

  • Configuration: Never hardcode credentials

  • Logging: Comprehensive logging for debugging

  • Version control: Meaningful commits, feature branches

  • CI/CD: Automated testing and deployment

  • Monitoring: Track metrics and set up alerts

  • Testing: Unit tests and integration tests

  • Cost optimization: Compress, partition, archive
