Docker for Data Engineering: Containerization Guide

Why Docker for Data Engineering?

Docker enables consistent, reproducible environments for data applications. Whether running locally or in production, containerization ensures your data pipelines work the same everywhere.

Reproducibility: Same environment across dev, test, and production
Isolation: Dependencies don't conflict between projects
Portability: Run anywhere Docker is installed
Scalability: Easy to scale with Kubernetes

Docker Basics

# Pull an image
docker pull python:3.11-slim

# Run an interactive container (--rm removes the container when it exits,
# so one-off runs do not pile up in `docker ps -a`)
docker run --rm -it python:3.11-slim python

# Run with volume mount
# The host path is quoted so working directories containing spaces still work.
docker run --rm -v "$(pwd)":/app -w /app python:3.11-slim python script.py

# List containers
docker ps        # Running containers
docker ps -a     # All containers

# Stop and remove
docker stop container_id
docker rm container_id

# List and remove images
docker images
docker rmi image_id

# Execute command in running container
docker exec -it container_id bash

# View logs
docker logs container_id
docker logs -f container_id  # Follow logs

Dockerfile for Data Pipelines

# Dockerfile for Python ETL Pipeline
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
# --no-install-recommends keeps optional packages out of the image;
# the apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (for caching): this layer is rebuilt only when
# requirements.txt changes, not on every source edit
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY src/ ./src/
COPY config/ ./config/

# Set environment variables
# PYTHONPATH puts /app on the module search path;
# PYTHONUNBUFFERED makes stdout/stderr unbuffered so log lines appear immediately.
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Create non-root user and hand it ownership of the application directory
RUN useradd -m appuser && chown -R appuser:appuser /app
USER appuser

# Default command
CMD ["python", "src/main.py"]

# requirements.txt
pandas==2.1.0
# apache-airflow 2.7.x requires SQLAlchemy < 2.0, so a 2.x pin here makes
# `pip install -r requirements.txt` fail with a dependency conflict.
sqlalchemy==1.4.49
psycopg2-binary==2.9.9
boto3==1.28.0
apache-airflow==2.7.0

Multi-Stage Builds

# Multi-stage build for smaller images
# Builder stage: full Python image, used only to install dependencies
FROM python:3.11 AS builder

WORKDIR /app
COPY requirements.txt .

# Install dependencies in virtual environment
# Putting the venv first on PATH makes the following `pip` install into it.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir -r requirements.txt

# Final stage: slim runtime image; only the venv and the app code are carried over
FROM python:3.11-slim

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Put /app on the module search path and keep stdout/stderr unbuffered
# so log lines appear immediately in `docker logs`
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

WORKDIR /app
COPY src/ ./src/

# Run as non-root; the user owns /app so the pipeline can write there if needed
RUN useradd -m appuser && chown -R appuser:appuser /app
USER appuser

CMD ["python", "src/main.py"]

Docker Compose for Data Stack

# docker-compose.yml
# Local data stack: PostgreSQL, Airflow (LocalExecutor), a Spark master with
# one worker, and a Jupyter PySpark notebook.
# NOTE: the credentials below are hard-coded for local development only.
version: '3.8'

services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: dataeng
      POSTGRES_PASSWORD: password
      POSTGRES_DB: warehouse
    volumes:
      # Named volume keeps the database across container restarts
      - postgres_data:/var/lib/postgresql/data
      # Mounted into the image's init directory
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
    ports:
      - "5432:5432"
    healthcheck:
      # Airflow waits on this check (see depends_on below)
      test: ["CMD-SHELL", "pg_isready -U dataeng"]
      interval: 5s
      timeout: 5s
      retries: 5

  airflow:
    image: apache/airflow:2.7.0
    environment:
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      # NOTE(review): this targets a database named `airflow`, while
      # POSTGRES_DB above is `warehouse` - presumably init.sql creates
      # the `airflow` database; confirm.
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://dataeng:password@postgres/airflow
    volumes:
      - ./dags:/opt/airflow/dags
      - ./logs:/opt/airflow/logs
    ports:
      - "8080:8080"
    depends_on:
      postgres:
        condition: service_healthy
    # All continuation lines sit at the SAME indentation on purpose: in a
    # folded (>) scalar, more-indented lines are not folded, so their line
    # breaks would survive into the bash script and split
    # `airflow users create` in the middle of its argument list.
    command: >-
      bash -c "airflow db init &&
      airflow users create --username admin --password admin
      --firstname Admin --lastname User --role Admin --email admin@example.com &&
      airflow webserver"

  spark:
    image: bitnami/spark:3.5
    environment:
      SPARK_MODE: master
    ports:
      # Container port 8080 is published on host 8081; host 8080 is used by Airflow
      - "8081:8080"
      - "7077:7077"
    volumes:
      - ./spark-apps:/opt/spark-apps

  spark-worker:
    image: bitnami/spark:3.5
    environment:
      SPARK_MODE: worker
      SPARK_MASTER_URL: spark://spark:7077
    depends_on:
      - spark

  jupyter:
    # NOTE(review): `latest` is a floating tag; consider pinning a specific version
    image: jupyter/pyspark-notebook:latest
    ports:
      - "8888:8888"
    volumes:
      - ./notebooks:/home/jovyan/work
    environment:
      # Quoted so YAML keeps it a string instead of parsing it as a boolean
      JUPYTER_ENABLE_LAB: "yes"

volumes:
  postgres_data:

Kafka Stack with Docker

# docker-compose-kafka.yml
# Single-broker Kafka stack: ZooKeeper, one broker, Schema Registry, Kafka UI.
version: '3.8'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    environment:
      ZOOKEEPER_CLIENT_PORT: "2181"

  kafka:
    image: confluentinc/cp-kafka:7.5.0
    ports:
      - "9092:9092"
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: "1"
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      # Two listener names, both plaintext: kafka:29092 is what the other
      # services in this file connect to; localhost:9092 matches the
      # published port above.
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      # Only one broker is defined here, hence replication factor 1
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: "1"

  schema-registry:
    image: confluentinc/cp-schema-registry:7.5.0
    ports:
      - "8081:8081"
    depends_on:
      - kafka
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: kafka:29092

  kafka-ui:
    image: provectuslabs/kafka-ui:latest
    ports:
      - "8080:8080"
    environment:
      KAFKA_CLUSTERS_0_NAME: local
      KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:29092
      KAFKA_CLUSTERS_0_SCHEMAREGISTRY: http://schema-registry:8081

Building and Pushing Images

# Build image
docker build -t my-etl-pipeline:1.0 .

# Tag for registry
docker tag my-etl-pipeline:1.0 myregistry.com/my-etl-pipeline:1.0

# Push to registry
docker push myregistry.com/my-etl-pipeline:1.0

# Build with build arguments
# Each --build-arg only takes effect if the Dockerfile declares a matching ARG.
docker build \
    --build-arg PYTHON_VERSION=3.11 \
    --build-arg ENV=production \
    -t my-pipeline:prod .

# Build for multiple platforms
# --push uploads straight to the registry named in the tag, so the tag must
# carry the registry prefix; a bare name would be pushed to Docker Hub instead.
docker buildx build \
    --platform linux/amd64,linux/arm64 \
    -t myregistry.com/my-pipeline:multi \
    --push .

Environment Configuration

# .env file
# Connection settings and credentials read by Compose via `env_file`.
# Holds secrets - keep this file out of version control.
# Values are written without quotes.
POSTGRES_HOST=postgres
POSTGRES_PORT=5432
POSTGRES_USER=dataeng
POSTGRES_PASSWORD=secretpassword
POSTGRES_DB=warehouse
# Placeholder AWS credentials - replace with real values locally
AWS_ACCESS_KEY_ID=your_key
AWS_SECRET_ACCESS_KEY=your_secret

# docker-compose.yml with env file
services:
  etl:
    build: .
    # Variables are loaded from .env; entries under `environment`
    # take precedence over values from the env file.
    env_file: .env
    environment:
      LOG_LEVEL: INFO

# Use secrets for sensitive data
# The secret is exposed to the container as a file; the application is given
# the file's path rather than the password value itself.
services:
  etl:
    build: .
    environment:
      DB_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password

# Secret content is read from a local file on the host
secrets:
  db_password:
    file: ./secrets/db_password.txt

Common Commands

# These use the Compose V2 plugin (`docker compose`); the standalone
# `docker-compose` V1 binary is end-of-life.

# Start services
docker compose up -d

# View logs
docker compose logs -f service_name

# Scale services
docker compose up -d --scale spark-worker=3

# Stop services
docker compose down

# Stop and remove volumes (deletes the data stored in named volumes)
docker compose down -v

# Rebuild and restart
docker compose up -d --build

# Execute command in service
docker compose exec postgres psql -U dataeng -d warehouse

# Clean up unused resources
# WARNING: destructive. `system prune -a` removes all stopped containers,
# unused networks and every image not used by a container;
# `volume prune` removes unused volumes and the data in them.
docker system prune -a
docker volume prune

Best Practices

Use slim base images: python:3.11-slim instead of python:3.11
Layer caching: Copy requirements before code for better caching
Run as non-root: Create and use non-root users
Use .dockerignore: Exclude unnecessary files from build context
Health checks: Add health checks for dependent services
Tag versions: Use specific version tags, not :latest

Master Docker with Expert Mentorship

Our Data Engineering program covers Docker and containerization for data applications. Deploy production-ready pipelines with guidance from industry experts.

Explore Data Engineering Program

Docker for Data Engineering