CI/CD for ML

Why ML Needs CI/CD

Traditional software CI/CD tests code. ML CI/CD must test:

Code (training scripts, preprocessing)
Data (quality, schema, distribution)
Models (performance, bias, robustness)
Infrastructure (containers, dependencies)

The goal: Catch issues before they reach production.

ML Testing Pyramid

Level 1: Unit Tests

Test individual functions:

# tests/test_preprocessing.py (Python 3.12)
import pytest
import numpy as np
import pandas as pd
from src.preprocessing import clean_data, engineer_features

def test_clean_data_removes_nulls():
    """clean_data output must contain no nulls and keep only complete rows."""
    raw = pd.DataFrame({
        'feature1': [1, 2, None, 4],
        'feature2': [5, None, 7, 8],
    })

    cleaned = clean_data(raw)

    # Rows 0 and 3 are the only ones with both features present.
    remaining_nulls = cleaned.isnull().sum().sum()
    assert remaining_nulls == 0
    assert len(cleaned) == 2

def test_clean_data_handles_empty_input():
    """Edge case: an empty dataframe goes in, zero rows come out."""
    cleaned = clean_data(pd.DataFrame())
    assert len(cleaned) == 0

def test_engineer_features_creates_expected_columns():
    """engineer_features must add ``total_value`` equal to price * quantity."""
    df = pd.DataFrame({'price': [100, 200, 300], 'quantity': [2, 3, 4]})

    engineered = engineer_features(df)

    assert 'total_value' in engineered.columns
    # Element-wise comparison against the product of the two input columns.
    expected_total = df['price'] * df['quantity']
    matches = engineered['total_value'] == expected_total
    assert matches.all()

def test_engineer_features_handles_zero_values():
    """Test division-by-zero handling for the engineered ``ratio`` column.

    A zero denominator must not leak NaN or +/-inf into the output. pandas
    float division by zero produces ``inf``, which ``isnull()`` does not
    report, so the finiteness check is what catches an unguarded division.
    """
    df = pd.DataFrame({
        'numerator': [10, 20],
        'denominator': [0, 5]
    })

    result = engineer_features(df)
    ratio = result['ratio']

    assert not ratio.isnull().any()
    assert np.isfinite(ratio).all(), "ratio contains non-finite values"

Run tests:

pytest tests/test_preprocessing.py -v

Level 2: Integration Tests

Test components working together:

# tests/test_pipeline.py
import pytest
from src.pipeline import load_data, preprocess, train_model

def test_full_training_pipeline():
    """Run load -> preprocess -> train -> predict on the sample fixture."""
    # Stage 1: the fixture file must yield at least one row.
    frame = load_data('tests/fixtures/sample_data.csv')
    assert len(frame) > 0

    # Stage 2: features and labels must line up and contain no NaN.
    features, labels = preprocess(frame)
    assert features.shape[0] == labels.shape[0]
    assert not np.isnan(features).any()

    # Stage 3: training must hand back a model object.
    fitted = train_model(features, labels)
    assert fitted is not None

    # Stage 4: one prediction per input row.
    first_five = features[:5]
    assert len(fitted.predict(first_five)) == 5

def test_pipeline_with_missing_target():
    """preprocess must raise ValueError when a target value is missing."""
    columns = {
        'feature': [1, 2, 3],
        'target': [1, None, 3],  # middle row has no label
    }
    frame = pd.DataFrame(columns)

    with pytest.raises(ValueError, match="Missing target values"):
        preprocess(frame)

Level 3: Model Validation Tests

Test model quality:

# tests/test_model_quality.py
import pytest
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, precision_score
from src.model import load_model

@pytest.fixture
def trained_model():
    """Provide the model object deserialized from ``models/model.pkl``."""
    model_path = 'models/model.pkl'
    return joblib.load(model_path)

@pytest.fixture
def test_data():
    """Provide the dataset loaded from the test fixture CSV."""
    fixture_csv = 'tests/fixtures/test_data.csv'
    return load_data(fixture_csv)

def test_model_minimum_accuracy(trained_model, test_data):
    """Fail when accuracy on the test fixture drops under the 0.85 floor."""
    features, labels = preprocess(test_data)
    accuracy = accuracy_score(labels, trained_model.predict(features))

    assert accuracy >= 0.85, f"Accuracy {accuracy:.2f} below threshold 0.85"

def test_model_precision_per_class(trained_model, test_data):
    """Every class must reach a precision of at least 0.80."""
    features, labels = preprocess(test_data)
    y_pred = trained_model.predict(features)

    # average=None asks for one precision value per class, not an aggregate.
    per_class = precision_score(labels, y_pred, average=None)

    for i, p in enumerate(per_class):
        assert p >= 0.80, f"Class {i} precision {p:.2f} below threshold"

def test_model_handles_edge_cases(trained_model):
    """The model must return a prediction for extreme inputs without raising."""
    # One single-row, four-column input per fill value:
    # all zeros, large values, negative values.
    fill_values = (0, 1000, -1)
    edge_cases = [np.array([[value] * 4]) for value in fill_values]

    for case in edge_cases:
        try:
            prediction = trained_model.predict(case)
            assert prediction is not None
        except Exception as e:
            pytest.fail(f"Model failed on edge case: {e}")

def test_model_inference_speed(trained_model, test_data):
    """Test inference latency.

    Times a single batched ``predict`` call on up to 100 rows and requires the
    average per-row latency to stay under 10 ms.
    """
    import time

    X_test, _ = preprocess(test_data)
    sample = X_test[:100]
    # An empty fixture would otherwise surface as a confusing ZeroDivisionError.
    assert len(sample) > 0, "No test samples available for latency measurement"

    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock, which can jump (NTP adjustments) and is coarse on some platforms.
    start = time.perf_counter()
    trained_model.predict(sample)
    latency = (time.perf_counter() - start) / len(sample)

    assert latency < 0.010, f"Latency {latency:.3f}s exceeds 10ms threshold"

Level 4: Data Validation Tests

Test data quality:

# tests/test_data_quality.py
import pytest
import pandas as pd
from src.data import load_data, validate_schema

def test_data_schema_matches_expected():
    """Training data must expose exactly the expected columns, in order."""
    frame = load_data('data/training_data.csv')

    actual_columns = list(frame.columns)
    assert actual_columns == ['feature1', 'feature2', 'feature3', 'target']

def test_data_types_are_correct():
    """Check the dtype of each column the pipeline depends on."""
    frame = load_data('data/training_data.csv')

    # Checked in this order; the first mismatch fails the test.
    expected_dtypes = {
        'feature1': np.float64,
        'feature2': np.float64,
        'target': np.int64,
    }
    for column, dtype in expected_dtypes.items():
        assert frame[column].dtype == dtype

def test_data_has_no_duplicates():
    """Training data must not contain fully duplicated rows."""
    frame = load_data('data/training_data.csv')

    # duplicated() marks repeats of an earlier row; summing counts them.
    duplicate_mask = frame.duplicated()
    duplicates = duplicate_mask.sum()

    assert duplicates == 0, f"Found {duplicates} duplicate rows"

def test_data_distribution_within_expected_range():
    """feature1 must stay within [0, 100] with under 1% three-sigma outliers."""
    data = load_data('data/training_data.csv')
    column = data['feature1']

    # Hard bounds on the raw values.
    assert column.min() >= 0
    assert column.max() <= 100

    # Outliers are values strictly outside mean +/- 3 standard deviations.
    center = column.mean()
    spread = column.std()
    lower_bound = center - 3*spread
    upper_bound = center + 3*spread
    outliers = ((column < lower_bound) | (column > upper_bound)).sum()

    assert outliers < len(data) * 0.01, f"Too many outliers: {outliers}"

def test_target_class_balance():
    """The rarest target class must cover at least 10% of the rows."""
    data = load_data('data/training_data.csv')

    # Size of the smallest class, expressed as a share of all rows.
    smallest_class_size = data['target'].value_counts().min()
    min_class_pct = smallest_class_size / len(data)

    assert min_class_pct >= 0.10, f"Imbalanced classes: min class {min_class_pct:.1%}"

GitHub Actions CI/CD Pipeline

Workflow File

# .github/workflows/ml-ci-cd.yml
name: ML CI/CD Pipeline

# Runs on pushes to main and develop, and on pull requests that target main.
on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

# Workflow-level values shared by every job below.
env:
  # Interpreter version passed to actions/setup-python.
  PYTHON_VERSION: '3.12'
  # Image name prefix on GitHub Container Registry, derived from owner/repo.
  MODEL_REGISTRY: ghcr.io/${{ github.repository }}

jobs:
  test:
    # Fast feedback: unit and integration tests with coverage.
    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -r requirements.txt
        pip install -r requirements-dev.txt

    # The test files are listed explicitly. A tests/test_*.py glob would also
    # collect test_model_quality.py, which loads models/model.pkl; that file
    # only exists after the train-and-validate job has run.
    - name: Run unit tests
      run: |
        pytest tests/test_preprocessing.py tests/test_pipeline.py -v --cov=src --cov-report=xml

    - name: Upload coverage
      uses: codecov/codecov-action@v3
  
  validate-data:
    # Checks the training data independently of the code tests.
    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    # pytest is listed in requirements-dev.txt, so both files are required
    # for the data-quality step below.
    - name: Install dependencies
      run: |
        pip install -r requirements.txt
        pip install -r requirements-dev.txt

    - name: Validate data schema
      run: python scripts/validate_data.py

    - name: Check data quality
      run: pytest tests/test_data_quality.py -v
  
  train-and-validate:
    # Trains the model, gates it on the quality tests, then publishes it as
    # a workflow artifact for the container build.
    runs-on: ubuntu-latest
    needs: [test, validate-data]

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    # pytest is listed in requirements-dev.txt and is needed by the
    # model-quality step below.
    - name: Install dependencies
      run: |
        pip install -r requirements.txt
        pip install -r requirements-dev.txt

    - name: Train model
      run: python src/train.py --data data/training_data.csv --output models/model.pkl

    - name: Validate model quality
      run: pytest tests/test_model_quality.py -v

    # v3 of the artifact actions has been retired on github.com; v4 is required.
    - name: Upload model artifact
      uses: actions/upload-artifact@v4
      with:
        name: trained-model
        path: models/model.pkl
  
  build-container:
    # Packages the validated model into a serving image; main branch only.
    runs-on: ubuntu-latest
    needs: train-and-validate
    if: github.ref == 'refs/heads/main'
    # GITHUB_TOKEN needs packages:write to push images to ghcr.io.
    permissions:
      contents: read
      packages: write

    steps:
    - uses: actions/checkout@v3

    # v3 of the artifact actions has been retired on github.com; v4 is required.
    - name: Download model
      uses: actions/download-artifact@v4
      with:
        name: trained-model
        path: models/

    # NOTE: image references must be lowercase. github.repository keeps the
    # repository's original casing, so a mixed-case owner or repo name needs
    # lowercasing before it is used in MODEL_REGISTRY.
    - name: Build Docker image
      run: |
        docker build -t ${{ env.MODEL_REGISTRY }}/model-server:${{ github.sha }} .
        docker tag ${{ env.MODEL_REGISTRY }}/model-server:${{ github.sha }} \
                   ${{ env.MODEL_REGISTRY }}/model-server:latest

    - name: Push to registry
      run: |
        echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
        docker push ${{ env.MODEL_REGISTRY }}/model-server:${{ github.sha }}
        docker push ${{ env.MODEL_REGISTRY }}/model-server:latest
  
  deploy-staging:
    # Rolls the freshly built image out to staging and smoke-tests it.
    runs-on: ubuntu-latest
    needs: build-container
    if: github.ref == 'refs/heads/main'

    steps:
    # Each job starts on a clean runner; without a checkout the smoke test
    # script below would not exist.
    - uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install dependencies
      run: pip install -r requirements.txt

    - name: Deploy to Kubeflow
      run: |
        # Configure kubectl
        echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig.yaml
        export KUBECONFIG=kubeconfig.yaml

        # Update InferenceService
        kubectl apply -f - <<EOF
        apiVersion: serving.kserve.io/v1beta1
        kind: InferenceService
        metadata:
          name: model-staging
          namespace: ml-workspace
        spec:
          predictor:
            containers:
            - name: kserve-container
              image: ${{ env.MODEL_REGISTRY }}/model-server:${{ github.sha }}
        EOF

        # Wait for ready
        kubectl wait --for=condition=Ready \
          inferenceservice/model-staging \
          -n ml-workspace \
          --timeout=300s

    # NOTE(review): this hostname looks cluster-internal; confirm it is
    # reachable from the runner or substitute the external staging URL.
    - name: Run smoke tests
      run: |
        python tests/smoke_test.py --endpoint http://model-staging.ml-workspace
  
  deploy-production:
    # Canary rollout to production, gated by the "production" environment.
    runs-on: ubuntu-latest
    needs: deploy-staging
    if: github.ref == 'refs/heads/main'
    environment: production

    steps:
    - name: Deploy to production
      run: |
        # Each job runs on a fresh runner, so the kubeconfig written by the
        # staging job is not available here and must be recreated.
        echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig.yaml
        export KUBECONFIG=kubeconfig.yaml

        # Canary deployment (20% traffic)
        # In serving.kserve.io/v1beta1 a canary is expressed by pointing the
        # predictor at the new image and setting predictor.canaryTrafficPercent;
        # the remaining traffic stays on the previously rolled-out revision.
        kubectl apply -f - <<EOF
        apiVersion: serving.kserve.io/v1beta1
        kind: InferenceService
        metadata:
          name: model-production
          namespace: ml-workspace
        spec:
          predictor:
            canaryTrafficPercent: 20
            containers:
            - name: kserve-container
              image: ${{ env.MODEL_REGISTRY }}/model-server:${{ github.sha }}
        EOF

Pre-commit Hooks

Catch issues before committing:

# .pre-commit-config.yaml
repos:
  # Generic hygiene checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
        args: ['--maxkb=1000']

  # Code formatting.
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3.12

  # Linting.
  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
        args: ['--max-line-length=100']

  # Import ordering, configured to agree with black.
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        args: ['--profile=black']

  # Fast unit tests on every commit.
  - repo: local
    hooks:
      - id: pytest-unit
        name: pytest-unit
        # pre-commit executes the entry directly, not through a shell, so a
        # glob such as tests/test_*.py would reach pytest unexpanded. The
        # unit-test file is named explicitly, which also keeps the slower
        # model and data tests out of the commit hook.
        entry: pytest tests/test_preprocessing.py -v
        language: system
        pass_filenames: false
        always_run: true

Install:

pip install pre-commit
pre-commit install

Best Practices

1. Test Data Fixtures

Create reusable test data:

# tests/conftest.py
import pytest
import pandas as pd
import numpy as np

@pytest.fixture
def sample_training_data():
    """Create sample training data.

    Returns a 100-row DataFrame with two uniform [0, 1) float features and an
    integer ``target`` drawn from {0, 1, 2}.

    A local ``RandomState`` is used instead of ``np.random.seed`` so the
    fixture does not reset the global NumPy RNG for every other test. Seeded
    with the same value, it yields the same stream as the global legacy
    generator, so the generated data is unchanged.
    """
    rng = np.random.RandomState(42)
    return pd.DataFrame({
        'feature1': rng.rand(100),
        'feature2': rng.rand(100),
        'target': rng.randint(0, 3, 100),
    })

@pytest.fixture
def trained_model(sample_training_data):
    """Fit a small random forest on the sample data and return it."""
    from sklearn.ensemble import RandomForestClassifier

    feature_columns = ['feature1', 'feature2']
    features = sample_training_data[feature_columns]
    labels = sample_training_data['target']

    # Ten trees and a fixed seed keep the fixture quick and repeatable.
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    forest.fit(features, labels)
    return forest

2. Separate Test Requirements

# requirements-dev.txt
pytest>=7.4.0
pytest-cov>=4.1.0
black>=23.3.0
flake8>=6.0.0
pre-commit>=3.3.0

3. Use Environment Variables for Config

# config.py
import os

class Config:
    """Settings resolved from environment variables when the class is defined."""

    # ENVIRONMENT=test pins the data path to the test fixture and takes
    # precedence over any DATA_PATH value.
    DATA_PATH = (
        'tests/fixtures/test_data.csv'
        if os.getenv('ENVIRONMENT') == 'test'
        else os.getenv('DATA_PATH', 'data/training_data.csv')
    )
    MODEL_PATH = os.getenv('MODEL_PATH', 'models/model.pkl')
    # Parsed to float here so callers can compare against it directly.
    MIN_ACCURACY = float(os.getenv('MIN_ACCURACY', '0.85'))

4. Version Everything

# version.py
# Version string for the code in this package.
__version__ = '1.2.3'
# Version label for the model artifact.
# NOTE(review): currently mirrors __version__ with a "v" prefix; confirm
# whether the two are meant to move together.
MODEL_VERSION = 'v1.2.3'
# Version label for the dataset, carried separately from the code version.
DATA_VERSION = 'v2.1.0'

Key Takeaways

Test code, data, and models
Automate testing in CI/CD
Use staging environment before production
Implement canary deployments
Version control everything

Next Steps

We've covered all components. Now let's put it all together in an End-to-End Example that demonstrates a complete MLOps workflow.

Resources:

Previous: Monitoring & Observability | Next: End-to-End Example

Last updated 1 month ago

- Why ML Needs CI/CD
- ML Testing Pyramid
- Level 1: Unit Tests
- Level 2: Integration Tests
- Level 3: Model Validation Tests
- Level 4: Data Validation Tests
- GitHub Actions CI/CD Pipeline
- Workflow File
- Pre-commit Hooks
- Best Practices
- 1. Test Data Fixtures
- 2. Separate Test Requirements
- 3. Use Environment Variables for Config
- 4. Version Everything
- Key Takeaways
- Next Steps