Data Cleaning & Transformation
Introduction
Common Data Quality Issues
Missing Data
# Python 3.12 - Handling missing data
import pandas as pd
import numpy as np
from typing import Optional
class MissingDataHandler:
"""
Strategies for handling missing data.
Choice depends on business context.
"""
@staticmethod
def analyze_missing(df: pd.DataFrame) -> pd.DataFrame:
"""
Analyze missing data patterns.
First step before deciding how to handle.
"""
missing_stats = pd.DataFrame({
'column': df.columns,
'missing_count': df.isnull().sum(),
'missing_pct': (df.isnull().sum() / len(df) * 100).round(2),
'dtype': df.dtypes
})
return missing_stats[missing_stats['missing_count'] > 0].sort_values(
'missing_pct',
ascending=False
)
@staticmethod
def drop_missing(
df: pd.DataFrame,
threshold: float = 0.5
) -> pd.DataFrame:
"""
Drop columns with too many missing values.
Drop rows with any missing critical fields.
"""
# Drop columns with >50% missing
df = df.loc[:, df.isnull().mean() < threshold]
# Drop rows missing critical fields
critical_cols = ['user_id', 'transaction_id', 'amount']
df = df.dropna(subset=critical_cols)
return df
@staticmethod
def fill_missing(df: pd.DataFrame) -> pd.DataFrame:
"""
Fill missing values with appropriate strategies.
"""
df = df.copy()
# Numeric: fill with median (robust to outliers)
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
df[col].fillna(df[col].median(), inplace=True)
# Categorical: fill with mode or 'unknown'
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
mode_value = df[col].mode()[0] if not df[col].mode().empty else 'unknown'
df[col].fillna(mode_value, inplace=True)
# Boolean: fill with False
bool_cols = df.select_dtypes(include=['bool']).columns
for col in bool_cols:
df[col].fillna(False, inplace=True)
return df
@staticmethod
def forward_fill_time_series(
df: pd.DataFrame,
timestamp_col: str,
value_col: str
) -> pd.DataFrame:
"""
Forward fill for time series data.
Use previous value when data is missing.
"""
df = df.sort_values(timestamp_col)
df[value_col] = df[value_col].fillna(method='ffill')
return df
# Usage
handler = MissingDataHandler()
missing_report = handler.analyze_missing(df)
df_cleaned = handler.fill_missing(df)Duplicate Records
Data Type Conversions
Data Validation
Data Normalization
Outlier Detection
Data Transformation Patterns
Complete Cleaning Pipeline
Conclusion
Last updated