Data Modeling & Warehousing

← Previous: Data Cleaning & Transformation | Back to Index | Next: ETL/ELT Pipelines →

Introduction

Data modeling determines how efficiently you can query your data warehouse. I've redesigned data models that reduced query times from minutes to seconds. This article covers the modeling patterns that work in production.

Star Schema vs Snowflake Schema

Star Schema

The star schema is my go-to pattern for most data warehouses.

        ┌─────────────────┐
        │  Dim_Customer   │
        │  - customer_key │
        │  - name         │
        │  - email        │
        └─────────┬───────┘
                  │
        ┌─────────▼────────────────┐
        │   Fact_Transactions      │
        │   - transaction_id       │
        │   - customer_key (FK)    │
        │   - product_key (FK)     │
        │   - date_key (FK)        │
        │   - amount              │
        │   - quantity            │
        └──────────┬───────────────┘
                   │
         ┌─────────┴─────────┬──────────────┐
         │                   │              │
    ┌────▼────┐       ┌──────▼──────┐  ┌───▼──────┐
    │Dim_     │       │  Dim_Date   │  │Dim_      │
    │Product  │       │  - date_key │  │Location  │
    └─────────┘       └─────────────┘  └──────────┘

Advantages:

Simple joins (fact → dimension)
Fast query performance
Easy to understand

When I use it:

95% of my data warehouses
When query performance is critical
When business users self-serve

# Python 3.12 - Star schema implementation
from sqlalchemy import Column, Integer, String, Float, Date, ForeignKey, DateTime
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime

# Shared declarative base class: every ORM model in this article subclasses it.
Base = declarative_base()

# Dimension Tables
class DimCustomer(Base):
    """Customer dimension.

    One row per customer (``customer_id`` is unique). Fact rows point at
    ``customer_key`` rather than at the source system's identifier.
    """
    __tablename__ = 'dim_customer'
    
    # Surrogate key: auto-incrementing integer referenced by fact_transactions.
    customer_key = Column(Integer, primary_key=True, autoincrement=True)
    # Natural (business) key; required and unique.
    customer_id = Column(String(50), unique=True, nullable=False)
    name = Column(String(255))
    email = Column(String(255))
    # Two characters wide -- presumably an ISO 3166-1 alpha-2 code; TODO confirm.
    country = Column(String(2))
    # Defaults to the (naive) UTC time at insert when no value is supplied.
    created_at = Column(DateTime, default=datetime.utcnow)

class DimProduct(Base):
    """Product dimension.

    One row per product (``product_id`` is unique), keyed by the surrogate
    ``product_key`` that fact_transactions references.
    """
    __tablename__ = 'dim_product'
    
    # Surrogate key: auto-incrementing integer.
    product_key = Column(Integer, primary_key=True, autoincrement=True)
    # Natural (business) key; required and unique.
    product_id = Column(String(50), unique=True, nullable=False)
    name = Column(String(255))
    category = Column(String(100))
    # NOTE(review): stored as a binary float; if this is a monetary value an
    # exact decimal type may be more appropriate -- confirm.
    price = Column(Float)

class DimDate(Base):
    """Date dimension: one row per calendar day (``date`` is unique)."""
    __tablename__ = 'dim_date'
    
    # Integer primary key supplied by the loader (no autoincrement requested);
    # generate_date_dimension in this article fills it with YYYYMMDD integers.
    date_key = Column(Integer, primary_key=True)
    date = Column(Date, unique=True, nullable=False)
    year = Column(Integer)
    quarter = Column(Integer)
    month = Column(Integer)
    day = Column(Integer)
    # generate_date_dimension uses pandas' dayofweek here (Monday=0 ... Sunday=6).
    day_of_week = Column(Integer)
    # Flags are stored as integers; generate_date_dimension writes 0/1.
    is_weekend = Column(Integer)
    is_holiday = Column(Integer)

# Fact Table
class FactTransactions(Base):
    """Transaction fact table.

    Each row holds the foreign keys into the customer, product and date
    dimensions plus the numeric measures of one transaction.
    """
    __tablename__ = 'fact_transactions'
    
    # Identifier of the transaction; used directly as the primary key.
    transaction_id = Column(String(50), primary_key=True)
    # Foreign keys to the dimension surrogate keys (nullable as declared).
    customer_key = Column(Integer, ForeignKey('dim_customer.customer_key'))
    product_key = Column(Integer, ForeignKey('dim_product.product_key'))
    date_key = Column(Integer, ForeignKey('dim_date.date_key'))
    
    # Measures
    # NOTE(review): amount is a binary float here, while the partitioned DDL
    # later in this article declares amount as DECIMAL(10,2) -- confirm which
    # representation is intended.
    amount = Column(Float, nullable=False)
    quantity = Column(Integer, nullable=False)
    # Defaults to 0.0 when not supplied; FactTableBuilder in this article
    # treats it as a fraction (total * (1 - discount)).
    discount = Column(Float, default=0.0)
    
    # Defaults to the (naive) UTC time at insert when no value is supplied.
    created_at = Column(DateTime, default=datetime.utcnow)

Slowly Changing Dimensions (SCD)

SCD Type 1 - Overwrite

# Python 3.12 - SCD Type 1
class SCDType1Handler:
    """
    SCD Type 1: overwrite old values with new ones.

    No history is tracked; after an update only the latest attribute values
    remain in the dimension row.
    """

    @staticmethod
    def update_dimension(engine, table_name: str, key: str, updates: dict):
        """Overwrite the attributes of one dimension record.

        Args:
            engine: SQLAlchemy engine used to run the UPDATE.
            table_name: Target table, optionally schema-qualified
                (``schema.table``).
            key: Name of the column that identifies the row. The value to
                match is taken from ``updates[key]``.
            updates: Mapping of column name to new value. It must contain
                ``key``. The key column itself is only used in the WHERE
                clause and is not rewritten. ``updated_at`` is always stamped
                by this method, so a caller-supplied ``updated_at`` is ignored.

        Raises:
            KeyError: If ``key`` is not present in ``updates``.
            ValueError: If the table name or any column name is not a plain
                SQL identifier. Identifiers are interpolated into the
                statement text, so anything else is rejected.
        """
        import re
        from datetime import timezone

        if key not in updates:
            raise KeyError(key)

        identifier = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

        def _require_identifier(name, what):
            if not isinstance(name, str) or identifier.fullmatch(name) is None:
                raise ValueError(f"Invalid {what}: {name!r}")

        if not isinstance(table_name, str) or not table_name:
            raise ValueError(f"Invalid table name: {table_name!r}")
        for part in table_name.split('.'):
            _require_identifier(part, 'table name')
        for column in updates:
            _require_identifier(column, 'column name')

        # The key column only selects the row; updated_at is stamped below.
        assignments = [
            column for column in updates
            if column not in (key, 'updated_at')
        ]

        # Prefixed bind names keep the key value and timestamp from colliding
        # with a column that happens to be called "key_value" or "updated_at".
        set_parts = [f"{column} = :set_{column}" for column in assignments]
        set_parts.append("updated_at = :stamp_updated_at")
        set_clause = ', '.join(set_parts)

        query = f"""
            UPDATE {table_name}
            SET {set_clause}
            WHERE {key} = :where_key_value
        """

        params = {f"set_{column}": updates[column] for column in assignments}
        params['where_key_value'] = updates[key]
        # Naive UTC timestamp, same value utcnow() produced, without the
        # deprecated call.
        params['stamp_updated_at'] = (
            datetime.now(timezone.utc).replace(tzinfo=None)
        )

        from sqlalchemy import text

        # engine.begin() commits on success and rolls back on error.
        with engine.begin() as conn:
            conn.execute(text(query), params)

SCD Type 2 - Track History

# Python 3.12 - SCD Type 2 (my preferred approach)
from sqlalchemy import Boolean, Column, DateTime, Index, Integer, String

class DimCustomerSCD2(Base):
    """
    Customer dimension with history tracking (SCD Type 2).

    A customer may have several rows: one per version of its attributes.
    ``customer_id`` is therefore not unique here; ``customer_key`` identifies
    a single version.
    """
    __tablename__ = 'dim_customer_scd2'
    
    # Surrogate key: one value per stored version of a customer.
    customer_key = Column(Integer, primary_key=True, autoincrement=True)
    
    # Natural key: shared by all versions of the same customer.
    customer_id = Column(String(50), nullable=False)
    
    # Attributes (can change); SCDType2Handler compares these to detect a change.
    name = Column(String(255))
    email = Column(String(255))
    tier = Column(String(20))
    
    # SCD Type 2 tracking
    # Start of the version's validity; defaults to (naive) UTC time at insert.
    valid_from = Column(DateTime, nullable=False, default=datetime.utcnow)
    valid_to = Column(DateTime)  # NULL = current
    # True for the version currently in effect; defaults to True on insert.
    is_current = Column(Boolean, default=True)
    
    # Supports the "current row for this customer_id" lookup.
    __table_args__ = (
        Index('idx_customer_id_current', 'customer_id', 'is_current'),
    )

class SCDType2Handler:
    """
    Handle SCD Type 2 updates.

    Instead of overwriting a customer row, the current version is expired and
    a new version is inserted, so the full history of changes is preserved.
    """

    def __init__(self, engine):
        """Store the SQLAlchemy engine used to open sessions."""
        self.engine = engine

    def upsert_customer(
        self,
        customer_id: str,
        name: str,
        email: str,
        tier: str
    ):
        """
        Insert or update a customer with history tracking.

        If there is no current version, a first version is inserted. If the
        current version has the same name, email and tier, nothing is written.
        Otherwise the current version is expired and a new one is inserted in
        the same transaction.

        Args:
            customer_id: Natural key of the customer.
            name: Customer name for the new version.
            email: Customer email for the new version.
            tier: Customer tier for the new version.
        """
        from datetime import timezone

        from sqlalchemy.orm import Session

        # One timestamp for both the expiry of the old version and the start
        # of the new one, so the history has no gap between versions.
        # Naive UTC, same value utcnow() produced, without the deprecated call.
        changed_at = datetime.now(timezone.utc).replace(tzinfo=None)

        with Session(self.engine) as session:
            # Get current record
            current = session.query(DimCustomerSCD2).filter(
                DimCustomerSCD2.customer_id == customer_id,
                # "== True" is intentional: it builds a SQL expression.
                DimCustomerSCD2.is_current == True  # noqa: E712
            ).first()

            if current is not None:
                stored = (current.name, current.email, current.tier)
                if stored == (name, email, tier):
                    return  # No change

                # Expire current record
                current.valid_to = changed_at
                current.is_current = False

            # Insert new version
            session.add(
                DimCustomerSCD2(
                    customer_id=customer_id,
                    name=name,
                    email=email,
                    tier=tier,
                    valid_from=changed_at,
                    is_current=True,
                )
            )
            session.commit()

Data Warehouse vs Data Lake

| Aspect | Data Warehouse | Data Lake |
|---|---|---|
| Data | Structured, processed | Raw, all formats |
| Schema | Schema-on-write | Schema-on-read |
| Users | Business analysts | Data scientists, engineers |
| Cost | Higher (storage + compute) | Lower (storage only) |
| Query | Fast (optimized) | Slower (scan files) |
| Use case | BI, reporting | ML, data exploration |

My approach: Use both!

Data Lake: Store raw data (S3, Azure Data Lake)
Data Warehouse: Process and serve analytics (Snowflake, Redshift)

# Python 3.12 - Hybrid architecture
class HybridDataPlatform:
    """
    My typical architecture:
    Raw data → Data Lake → Processing → Data Warehouse

    Raw DataFrames are written to S3 as Parquet, then read back, transformed
    and appended to a warehouse table.
    """

    def __init__(self, s3_bucket: str, warehouse_engine):
        """
        Args:
            s3_bucket: Name of the S3 bucket used as the data lake.
            warehouse_engine: Connection/engine passed to
                ``DataFrame.to_sql`` when loading the warehouse.
        """
        self.s3_bucket = s3_bucket
        self.warehouse = warehouse_engine

    def ingest_raw_to_lake(self, data: pd.DataFrame, key: str):
        """
        Step 1: Store raw data in data lake.

        Args:
            data: Raw records to store.
            key: S3 object key to write to inside ``s3_bucket``.
        """
        import logging
        from io import BytesIO

        import boto3

        # Save as Parquet (compressed, columnar)
        buffer = BytesIO()
        data.to_parquet(buffer, compression='snappy', index=False)
        buffer.seek(0)  # rewind so the upload starts at the first byte

        # Upload to S3
        s3_client = boto3.client('s3')
        s3_client.upload_fileobj(buffer, self.s3_bucket, key)

        logging.info(f"Stored raw data: s3://{self.s3_bucket}/{key}")

    def process_to_warehouse(self, s3_key: str, table_name: str):
        """
        Step 2: Process and load to data warehouse.

        Args:
            s3_key: S3 object key of the Parquet file to read.
            table_name: Warehouse table the transformed rows are appended to.
        """
        import logging
        # BytesIO was previously only imported inside ingest_raw_to_lake, so
        # this method failed with a NameError.
        from io import BytesIO

        import boto3

        # Read from data lake
        s3_client = boto3.client('s3')
        obj = s3_client.get_object(Bucket=self.s3_bucket, Key=s3_key)
        df = pd.read_parquet(BytesIO(obj['Body'].read()))

        # Transform (clean, enrich)
        df = self._transform_data(df)

        # Load to warehouse
        df.to_sql(table_name, self.warehouse, if_exists='append', index=False)

        logging.info(f"Loaded {len(df)} records to {table_name}")

    def _transform_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply business logic.

        Drops rows with a missing ``customer_id`` or ``amount`` and adds a
        ``total`` column (``amount * quantity``). The input frame is not
        modified. No aggregation is performed; output stays at row level.
        """
        # Clean
        cleaned = df.dropna(subset=['customer_id', 'amount'])

        # Enrich (assign returns a new frame)
        return cleaned.assign(total=cleaned['amount'] * cleaned['quantity'])

Dimensional Modeling Best Practices

Date Dimension Generation

# Python 3.12 - Generate date dimension
import pandas as pd
from datetime import datetime, timedelta

def generate_date_dimension(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Build a date dimension with one row per day in [start_date, end_date].

    Each row carries the calendar date, an integer YYYYMMDD ``date_key`` and
    the usual calendar attributes. ``is_holiday`` and ``fiscal_year`` are
    placeholders to customize per organization.
    """
    days = pd.Series(
        pd.date_range(start=start_date, end=end_date, freq='D'),
        name='date',
    )
    parts = days.dt
    weekday = parts.dayofweek

    # Calendar attributes, in the column order of the resulting table
    dim = pd.DataFrame({
        'date': days,
        'date_key': parts.strftime('%Y%m%d').astype(int),
        'year': parts.year,
        'quarter': parts.quarter,
        'month': parts.month,
        'month_name': parts.month_name(),
        'day': parts.day,
        'day_of_week': weekday,
        'day_name': parts.day_name(),
        'week_of_year': parts.isocalendar().week,
        'is_weekend': (weekday >= 5).astype(int),
        'is_month_start': parts.is_month_start.astype(int),
        'is_month_end': parts.is_month_end.astype(int),
    })

    # Business calendar (customize per organization)
    dim['is_holiday'] = 0  # Add logic for holidays
    dim['fiscal_year'] = dim['year']  # Adjust if fiscal != calendar

    return dim

# Usage
# Build one row per day from 2020-01-01 through 2030-12-31 and write it to the
# 'dim_date' table.
# `engine` is not defined in this snippet; it must already exist as a
# SQLAlchemy engine/connection.
# NOTE(review): if_exists='replace' drops an existing 'dim_date' table and
# recreates it from the DataFrame, so constraints declared elsewhere (e.g. on
# the DimDate model) are presumably not carried over -- confirm this is
# intended.
date_dim = generate_date_dimension('2020-01-01', '2030-12-31')
date_dim.to_sql('dim_date', engine, if_exists='replace', index=False)

Fact Table Design

# Python 3.12 - Fact table best practices
class FactTableBuilder:
    """
    Build optimized fact tables.
    Lessons from building dozens of fact tables.
    """

    @staticmethod
    def create_fact_from_transactions(
        transactions_df: pd.DataFrame,
        dim_customer_df: pd.DataFrame,
        dim_product_df: pd.DataFrame,
        dim_date_df: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Transform transactional data to a fact table.

        Natural keys are swapped for the dimensions' surrogate keys, and
        derived measures are added. None of the inputs is modified.

        Args:
            transactions_df: Source rows with ``transaction_id``,
                ``customer_id``, ``product_id``, ``transaction_date``,
                ``amount``, ``quantity`` and ``discount`` columns.
            dim_customer_df: Customer dimension with ``customer_id`` and
                ``customer_key``; one row per ``customer_id``.
            dim_product_df: Product dimension with ``product_id`` and
                ``product_key``; one row per ``product_id``.
            dim_date_df: Date dimension with ``date`` and ``date_key``. The
                ``date`` column may hold datetimes or ``datetime.date``
                values.

        Returns:
            One row per transaction with ``transaction_id``, the three
            dimension keys, ``amount``, ``quantity``, ``discount``,
            ``total_amount`` and ``discounted_amount``. Keys without a match
            in their dimension are left missing (NaN).

        Raises:
            pandas.errors.MergeError: If a dimension has more than one row per
                join key. This would otherwise silently duplicate fact rows,
                e.g. when an SCD Type 2 dimension is passed without filtering
                to current rows.
        """
        fact = transactions_df.copy()

        # Join to get dimension keys; many_to_one guards against fan-out.
        fact = fact.merge(
            dim_customer_df[['customer_id', 'customer_key']],
            on='customer_id',
            how='left',
            validate='many_to_one'
        )

        fact = fact.merge(
            dim_product_df[['product_id', 'product_key']],
            on='product_id',
            how='left',
            validate='many_to_one'
        )

        # Convert date to date_key. Both sides are reduced to plain
        # datetime.date values: joining date objects against a datetime64
        # column (what generate_date_dimension produces) raises ValueError.
        date_lookup = dim_date_df[['date', 'date_key']].copy()
        date_lookup['date'] = pd.to_datetime(date_lookup['date']).dt.date

        fact['date'] = pd.to_datetime(fact['transaction_date']).dt.date
        fact = fact.merge(
            date_lookup,
            on='date',
            how='left',
            validate='many_to_one'
        )

        # Select only keys and measures. The explicit copy makes the result
        # an independent frame before new columns are added.
        fact = fact[[
            'transaction_id',
            'customer_key',
            'product_key',
            'date_key',
            'amount',
            'quantity',
            'discount'
        ]].copy()

        # Calculate derived measures (discount is treated as a fraction)
        fact['total_amount'] = fact['amount'] * fact['quantity']
        fact['discounted_amount'] = fact['total_amount'] * (1 - fact['discount'])

        return fact

Partitioning Strategies

# Python 3.12 - Table partitioning
class PartitioningStrategy:
    """
    Partition large fact tables for performance.
    Reduced query times by 10x in production.
    """

    @staticmethod
    def create_partitioned_table(
        engine,
        table_name: str,
        start_year: int = 2020,
        end_year: int = 2026
    ):
        """
        Create partitioned fact table (PostgreSQL example).

        Creates the parent table, range-partitioned on ``transaction_date``,
        plus one partition per calendar month.

        Args:
            engine: SQLAlchemy engine for the target database.
            table_name: Name of the parent table. It is interpolated into the
                DDL text, so it must come from trusted code, not user input.
            start_year: First year to create monthly partitions for.
            end_year: Last year (inclusive) to create monthly partitions for.

        Raises:
            ValueError: If ``start_year`` is greater than ``end_year``.
        """
        import logging
        from datetime import date

        from sqlalchemy import text

        if start_year > end_year:
            raise ValueError(
                f"start_year ({start_year}) must not exceed end_year ({end_year})"
            )

        # engine.begin() commits on success and rolls back on error. As far as
        # I know PostgreSQL DDL is transactional, so a failure part-way
        # through should leave nothing behind.
        with engine.begin() as conn:
            # Create parent table. The partition column must be part of the
            # primary key, hence the composite key.
            conn.execute(text(f"""
                CREATE TABLE {table_name} (
                    transaction_id VARCHAR(50),
                    transaction_date DATE NOT NULL,
                    customer_key INTEGER,
                    product_key INTEGER,
                    amount DECIMAL(10,2),
                    PRIMARY KEY (transaction_id, transaction_date)
                ) PARTITION BY RANGE (transaction_date)
            """))

            # Create monthly partitions
            for year in range(start_year, end_year + 1):
                for month in range(1, 13):
                    partition_name = f"{table_name}_{year}_{month:02d}"

                    # The upper bound is exclusive: the first day of the next
                    # month, rolling into January of the next year.
                    start_date = date(year, month, 1).isoformat()
                    end_date = date(
                        year + month // 12, month % 12 + 1, 1
                    ).isoformat()

                    conn.execute(text(f"""
                        CREATE TABLE {partition_name}
                        PARTITION OF {table_name}
                        FOR VALUES FROM ('{start_date}') TO ('{end_date}')
                    """))

        logging.info(f"Created partitioned table: {table_name}")

Indexing Strategies

# Python 3.12 - Indexing for performance
class IndexStrategy:
    """
    Create indexes for query performance.

    Table and column names are interpolated into the DDL text, so they must
    come from trusted code, not user input.
    """

    @staticmethod
    def create_dimension_indexes(
        engine,
        table_name: str,
        natural_key: str = 'customer_id'
    ):
        """
        Standard indexes for dimension tables.

        Creates an index on the natural key and a composite
        ``(natural_key, is_current)`` index for SCD Type 2 lookups.

        Args:
            engine: SQLAlchemy engine for the target database.
            table_name: Dimension table to index.
            natural_key: Natural-key column. Defaults to ``customer_id``, the
                column this method previously hard-coded.
        """
        from sqlalchemy import text

        index_specs = [
            # Index on natural key
            ('natural_key', natural_key),
            # Index for SCD Type 2 (current-row lookup)
            ('scd2', f'{natural_key}, is_current'),
        ]

        with engine.begin() as conn:
            for suffix, columns in index_specs:
                conn.execute(text(f"""
                    CREATE INDEX idx_{table_name}_{suffix}
                    ON {table_name} ({columns})
                """))

    @staticmethod
    def create_fact_indexes(engine, table_name: str):
        """
        Indexes for fact tables.

        Creates one single-column index per dimension foreign key, plus a
        composite ``(customer_key, date_key)`` index for per-customer,
        per-date queries.

        Args:
            engine: SQLAlchemy engine for the target database.
            table_name: Fact table to index.
        """
        from sqlalchemy import text

        index_specs = [
            # Single-column indexes on each foreign key
            ('customer', 'customer_key'),
            ('product', 'product_key'),
            ('date', 'date_key'),
            # Composite index for common queries
            ('customer_date', 'customer_key, date_key'),
        ]

        with engine.begin() as conn:
            for suffix, columns in index_specs:
                conn.execute(text(f"""
                    CREATE INDEX idx_{table_name}_{suffix}
                    ON {table_name} ({columns})
                """))

Conclusion

Good data modeling is the foundation of performant data warehouses. Star schemas work for 95% of use cases. SCD Type 2 preserves history. Partitioning and indexing optimize queries.

Key takeaways:

Star schema for most warehouses
Use SCD Type 2 for history tracking
Partition large fact tables by date
Index foreign keys and common query patterns
Combine data lake (raw) + data warehouse (processed)

Navigation:

Previous: Data Cleaning & Transformation | Next: ETL/ELT Pipelines

Last updated 2 months ago

On this page:

- Introduction
- Star Schema vs Snowflake Schema
  - Star Schema
- Slowly Changing Dimensions (SCD)
  - SCD Type 1 - Overwrite
  - SCD Type 2 - Track History
- Data Warehouse vs Data Lake
- Dimensional Modeling Best Practices
  - Date Dimension Generation
  - Fact Table Design
  - Partitioning Strategies
  - Indexing Strategies
- Conclusion