SQL for Data Engineering
Introduction
Common Table Expressions (CTEs)
Basic CTEs
-- calculate_user_metrics.sql
-- Calculate user engagement metrics using CTEs
WITH active_users AS (
-- Step 1: Find users active in the last 30 days
SELECT
user_id,
COUNT(*) as activity_count,
MAX(activity_date) as last_activity
FROM user_activities
WHERE activity_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY user_id
),
user_purchases AS (
-- Step 2: Calculate total purchases per user
SELECT
user_id,
COUNT(*) as purchase_count,
SUM(amount) as total_spent
FROM orders
WHERE order_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY user_id
),
user_segments AS (
-- Step 3: Segment users based on activity and purchases
SELECT
au.user_id,
au.activity_count,
COALESCE(up.purchase_count, 0) as purchase_count,
COALESCE(up.total_spent, 0) as total_spent,
CASE
WHEN COALESCE(up.purchase_count, 0) > 5 THEN 'high_value'
WHEN COALESCE(up.purchase_count, 0) > 0 THEN 'medium_value'
ELSE 'low_value'
END as user_segment
FROM active_users au
LEFT JOIN user_purchases up ON au.user_id = up.user_id
)
-- Step 4: Generate final metrics
SELECT
user_segment,
COUNT(*) as user_count,
AVG(activity_count) as avg_activity,
AVG(purchase_count) as avg_purchases,
AVG(total_spent) as avg_spent
FROM user_segments
GROUP BY user_segment
ORDER BY avg_spent DESC;Recursive CTEs
Window Functions
ROW_NUMBER, RANK, and DENSE_RANK
Running Totals and Moving Averages
LAG and LEAD for Time-Series Analysis
FIRST_VALUE and LAST_VALUE
Advanced JOIN Patterns
Self-Joins for Comparisons
LATERAL JOINs (PostgreSQL)
Slowly Changing Dimensions (SCD) in SQL
SCD Type 2 - Historical Tracking
Point-in-Time Lookups
Query Optimization Techniques
Using EXPLAIN ANALYZE
Index Strategy
Avoiding Common Performance Pitfalls
Working with Large Datasets
Partitioning
Incremental Processing
Advanced Aggregations
ROLLUP and CUBE
FILTER Clause
JSON and Semi-Structured Data
Querying JSON (PostgreSQL)
SQL Integration with Python
Best Practices
Key Takeaways
Last updated