Data Ingestion Sources

← Previous: Python for Data Engineering | Back to Index | Next: Data Cleaning & Transformation →

Introduction

Data ingestion is where every data pipeline begins. In my experience, roughly 40% of data engineering work goes into getting data out of source systems reliably. This article covers the ingestion patterns I use in production.

Reading Files

CSV Files

# Python 3.12 - Production CSV reading
import logging
from pathlib import Path
from typing import Optional

import pandas as pd

class CSVIngestion:
    """
    CSV ingestion with error handling.
    Handles files from MB to GB in size.
    """

    # Shared parser options so every read path treats nulls the same way.
    # Pass parse_dates=[...] with explicit column names when date parsing is
    # needed; parse_dates=True only applies to the index, and
    # infer_datetime_format is deprecated in pandas 2.x.
    READ_KWARGS = {
        'na_values': ['NULL', 'null', 'N/A', '#N/A', 'nan'],
        'keep_default_na': True,
    }

    @staticmethod
    def read_csv_robust(
        file_path: Path,
        chunk_size: Optional[int] = None
    ) -> pd.DataFrame:
        """Read CSV with comprehensive error handling."""
        try:
            # For small files
            if chunk_size is None:
                return pd.read_csv(
                    file_path,
                    encoding='utf-8',
                    **CSVIngestion.READ_KWARGS
                )

            # For large files - parse in chunks to limit peak parser memory
            # (the concatenated result still ends up fully in memory)
            chunks = []
            for chunk in pd.read_csv(
                file_path,
                encoding='utf-8',
                chunksize=chunk_size,
                **CSVIngestion.READ_KWARGS
            ):
                chunks.append(chunk)

            return pd.concat(chunks, ignore_index=True)

        except UnicodeDecodeError:
            # Try a more forgiving encoding before giving up
            logging.warning(f"UTF-8 failed, trying latin-1 for {file_path}")
            return pd.read_csv(file_path, encoding='latin-1', **CSVIngestion.READ_KWARGS)

        except Exception as e:
            logging.error(f"Failed to read {file_path}: {e}")
            raise

# Usage
df = CSVIngestion.read_csv_robust(Path('data/transactions.csv'))

Excel Files
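
Excel files usually come from business teams rather than systems, so I treat sheet and column names as untrusted input. Below is a minimal sketch using pandas, assuming the openpyxl engine is installed; the sheet name and path in the usage line are illustrative.

# Python 3.12 - Excel reading (sketch)
from pathlib import Path

import pandas as pd

def read_excel_sheet(file_path: Path, sheet_name: str | int = 0) -> pd.DataFrame:
    """Read a single worksheet and normalise column names for downstream steps."""
    df = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
    # Excel headers often carry stray whitespace and inconsistent casing
    df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]
    return df

# Usage (illustrative sheet and path)
# monthly = read_excel_sheet(Path('data/monthly_report.xlsx'), sheet_name='Summary')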

JSON Files
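
JSON sources range from flat record dumps to deeply nested API exports. The minimal sketch below covers both cases with pandas; the record_path value and file names are illustrative assumptions.

# Python 3.12 - JSON reading (sketch)
import json
from pathlib import Path

import pandas as pd

def read_json_records(file_path: Path, lines: bool = False) -> pd.DataFrame:
    """Flat JSON arrays or newline-delimited JSON straight into a DataFrame."""
    return pd.read_json(file_path, lines=lines)

def read_nested_json(file_path: Path, record_path: str) -> pd.DataFrame:
    """Flatten a nested payload such as {"results": [{"user": {...}}, ...]}."""
    with open(file_path, encoding='utf-8') as f:
        payload = json.load(f)
    # json_normalize expands nested dicts into flat, separator-joined columns
    return pd.json_normalize(payload, record_path=record_path, sep='_')

# Usage (illustrative structure)
# orders = read_nested_json(Path('data/orders.json'), record_path='results')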

Database Connections

PostgreSQL
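
For relational sources I pull data through SQLAlchemy so that pandas, connection pooling, and parameterised queries work together. The sketch below is a minimal version of that pattern; the POSTGRES_URL environment variable, the psycopg2 driver, and the table name are assumptions, and credentials should come from the environment rather than code.

# Python 3.12 - PostgreSQL extraction via SQLAlchemy (sketch)
import os

import pandas as pd
from sqlalchemy import create_engine, text

def read_postgres_query(query: str, params: dict | None = None) -> pd.DataFrame:
    """Run a parameterised query and return the result as a DataFrame."""
    # Hypothetical env var holding e.g. postgresql+psycopg2://user:pass@host:5432/db
    engine = create_engine(os.environ['POSTGRES_URL'], pool_pre_ping=True)
    with engine.connect() as conn:
        return pd.read_sql(text(query), conn, params=params)

# Usage (illustrative table and filter)
# df = read_postgres_query(
#     'SELECT * FROM transactions WHERE created_at >= :since',
#     params={'since': '2024-01-01'},
# )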

API Integration

REST APIs
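
Most REST ingestion problems come down to transient failures and pagination. The sketch below wires urllib3's Retry into a requests session and walks page-numbered results; the endpoint, the pagination parameter names, and the API_TOKEN variable are assumptions about the target API.

# Python 3.12 - REST extraction with retries (sketch)
import os

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_session() -> requests.Session:
    """Session that retries transient HTTP errors with exponential backoff."""
    retry = Retry(
        total=5,
        backoff_factor=1,            # 1s, 2s, 4s, ... between attempts
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

def fetch_paginated(url: str, page_size: int = 100) -> list[dict]:
    """Walk page/per_page style pagination until an empty page comes back."""
    session = build_session()
    headers = {"Authorization": f"Bearer {os.environ.get('API_TOKEN', '')}"}
    records, page = [], 1
    while True:
        resp = session.get(
            url,
            headers=headers,
            params={"page": page, "per_page": page_size},
            timeout=30,
        )
        resp.raise_for_status()
        batch = resp.json()
        if not batch:
            return records
        records.extend(batch)
        page += 1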

GraphQL APIs
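
GraphQL endpoints are just POST requests carrying a query document, so plain requests is enough. A minimal sketch follows; the endpoint URL, the query, and its field names are illustrative.

# Python 3.12 - GraphQL extraction (sketch)
import requests

def run_graphql_query(endpoint: str, query: str, variables: dict | None = None) -> dict:
    """POST a GraphQL query and surface GraphQL-level errors explicitly."""
    resp = requests.post(
        endpoint,
        json={"query": query, "variables": variables or {}},
        timeout=30,
    )
    resp.raise_for_status()
    payload = resp.json()
    # GraphQL reports errors in the body even on HTTP 200
    if "errors" in payload:
        raise RuntimeError(f"GraphQL errors: {payload['errors']}")
    return payload["data"]

# Usage (illustrative schema)
# data = run_graphql_query(
#     "https://api.example.com/graphql",
#     "query($n: Int!) { orders(first: $n) { id total } }",
#     variables={"n": 100},
# )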

Cloud Storage

AWS S3
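
A minimal boto3 sketch, assuming credentials resolve through the usual chain (environment variables, profile, or instance role); the bucket, prefix, and key names are illustrative.

# Python 3.12 - S3 ingestion with boto3 (sketch)
import io

import boto3
import pandas as pd

def read_csv_from_s3(bucket: str, key: str) -> pd.DataFrame:
    """Download an object into memory and parse it as CSV."""
    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(obj["Body"].read()))

def list_keys(bucket: str, prefix: str) -> list[str]:
    """List object keys under a prefix, paginating past the 1000-key page limit."""
    s3 = boto3.client("s3")
    keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys.extend(item["Key"] for item in page.get("Contents", []))
    return keys

# Usage (illustrative bucket and key)
# df = read_csv_from_s3("my-data-lake", "raw/transactions/2024-01-01.csv")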

Streaming Data

Kafka Consumer
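
For streaming sources the main decisions are deserialization and offset management. The sketch below uses the kafka-python client with manual commits so offsets only advance after a record has been safely landed; the brokers, topic, and group id are illustrative, and JSON-encoded message values are an assumption.

# Python 3.12 - Kafka consumer with manual commits (sketch)
import json

from kafka import KafkaConsumer

def consume_events(topic: str, bootstrap_servers: list[str]) -> None:
    """Consume JSON messages and commit offsets only after processing succeeds."""
    consumer = KafkaConsumer(
        topic,
        bootstrap_servers=bootstrap_servers,
        group_id="ingestion-service",          # illustrative group id
        value_deserializer=lambda m: json.loads(m.decode("utf-8")),
        auto_offset_reset="earliest",
        enable_auto_commit=False,              # commit manually after work succeeds
    )
    for message in consumer:
        record = message.value
        # ... write the record to the landing zone here ...
        consumer.commit()

# Usage (illustrative brokers and topic)
# consume_events("transactions", ["broker1:9092", "broker2:9092"])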

Data Ingestion Patterns

Full Load
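
A full load replaces the target with a complete copy of the source on every run; it is simple and idempotent, and fine while the dataset stays small. Below is a minimal sketch via pandas and SQLAlchemy; the WAREHOUSE_URL variable and table name are assumptions.

# Python 3.12 - full load (sketch)
import os

import pandas as pd
from sqlalchemy import create_engine

def full_load(df: pd.DataFrame, table: str) -> None:
    """Drop-and-recreate style load: the target always mirrors the source."""
    engine = create_engine(os.environ['WAREHOUSE_URL'])  # hypothetical env var
    df.to_sql(table, engine, if_exists='replace', index=False, chunksize=10_000)

# Usage (illustrative table)
# full_load(df, 'staging_transactions')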

Incremental Load
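
An incremental load pulls only rows changed since the last run, tracked by a watermark such as an updated_at timestamp. The sketch below keeps the watermark in a small JSON state file; the file location, the POSTGRES_URL variable, and the column names are assumptions.

# Python 3.12 - watermark-based incremental load (sketch)
import json
import os
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine, text

STATE_FILE = Path('state/transactions_watermark.json')   # hypothetical state location

def incremental_load(table: str, watermark_col: str = 'updated_at') -> pd.DataFrame:
    """Fetch rows newer than the stored watermark, then advance the watermark."""
    engine = create_engine(os.environ['POSTGRES_URL'])    # hypothetical env var
    last = (json.loads(STATE_FILE.read_text())['watermark']
            if STATE_FILE.exists() else '1970-01-01')

    query = text(f'SELECT * FROM {table} WHERE {watermark_col} > :wm ORDER BY {watermark_col}')
    with engine.connect() as conn:
        df = pd.read_sql(query, conn, params={'wm': last})

    if not df.empty:
        STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
        STATE_FILE.write_text(json.dumps({'watermark': str(df[watermark_col].max())}))
    return df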

Best Practices

1. Always use connection pooling

2. Implement retry logic

3. Use appropriate formats (a quick comparison sketch follows this list)

  • CSV: Human-readable, slow

  • Parquet: Columnar, fast, compressed (my preference)

  • JSON: Nested data, APIs

4. Monitor data freshness
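
To make the format point concrete, here is a quick round-trip sketch; pyarrow is assumed to be installed, and the paths are illustrative. Parquet lets downstream readers pull only the columns they need, which CSV cannot do.

# Python 3.12 - CSV vs Parquet round trip (sketch)
import pandas as pd

def save_both(df: pd.DataFrame, stem: str) -> None:
    """Write the same frame as CSV and Parquet for a size/speed comparison."""
    df.to_csv(f"{stem}.csv", index=False)
    df.to_parquet(f"{stem}.parquet", engine="pyarrow", compression="snappy")

def load_columns(stem: str, columns: list[str]) -> pd.DataFrame:
    """Parquet reads just the requested columns; CSV always parses every row in full."""
    return pd.read_parquet(f"{stem}.parquet", columns=columns)

# Usage (illustrative paths and columns)
# save_both(df, "data/transactions")
# amounts = load_columns("data/transactions", ["transaction_id", "amount"])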

Conclusion

Data ingestion is the foundation of every pipeline. The patterns shown here handle 90% of real-world scenarios I encounter.

Key takeaways:

  • Use incremental loads for large datasets

  • Implement retry logic for API calls

  • Monitor data freshness

  • Choose the right format (Parquet > CSV)

  • Always handle errors gracefully

