Why Docker for Data Engineering?

Docker enables consistent, reproducible environments for data applications. Whether running locally or in production, containerization ensures your data pipelines work the same everywhere.

  • Reproducibility: Same environment across dev, test, and production
  • Isolation: Dependencies don't conflict between projects
  • Portability: Run anywhere Docker is installed
  • Scalability: Easy to scale with Kubernetes

Docker Basics

# Pull an image
docker pull python:3.11-slim

# Run a container (interactive Python session)
docker run -it python:3.11-slim python

# Run with volume mount
# "$(pwd)" is quoted so the bind mount still works when the current
# directory path contains spaces
docker run -v "$(pwd)":/app -w /app python:3.11-slim python script.py

# List containers
docker ps        # Running containers
docker ps -a     # All containers, including stopped ones

# Stop and remove (a container must be stopped before it can be removed)
docker stop container_id
docker rm container_id

# List and remove images
docker images
docker rmi image_id

# Execute command in running container
docker exec -it container_id bash

# View logs
docker logs container_id
docker logs -f container_id  # Follow logs

Dockerfile for Data Pipelines

# Dockerfile for Python ETL Pipeline
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies needed to compile Python packages against libpq.
# --no-install-recommends keeps optional packages out of the image; libc6-dev
# is listed explicitly because it is only a "recommended" package of gcc and
# the C headers it provides are required to build extensions.
# The apt lists are removed in the same layer so they never add to image size.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libc6-dev \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (for caching): the dependency layer is rebuilt
# only when requirements.txt changes, not on every code change
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY src/ ./src/
COPY config/ ./config/

# Set environment variables
# PYTHONPATH lets modules under /app be imported from any working directory;
# PYTHONUNBUFFERED makes output appear in `docker logs` immediately
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Create non-root user and give it ownership of the application directory
RUN useradd -m appuser && chown -R appuser:appuser /app
USER appuser

# Default command
CMD ["python", "src/main.py"]
# requirements.txt
pandas==2.1.0
# Airflow 2.7 requires SQLAlchemy < 2.0; pinning 2.0.0 next to it makes
# `pip install` fail with an unresolvable dependency conflict
sqlalchemy==1.4.49
psycopg2-binary==2.9.9
boto3==1.28.0
apache-airflow==2.7.0

Multi-Stage Builds

# Multi-stage build for smaller images
# Stage 1 (builder): full Python image, used only to install dependencies
FROM python:3.11 AS builder

WORKDIR /app
COPY requirements.txt .

# Install dependencies in virtual environment
# Putting the venv first on PATH makes the following `pip` install into it
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir -r requirements.txt

# Final stage: slim runtime image that receives only the built venv
FROM python:3.11-slim

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Same runtime settings as the single-stage Dockerfile: make /app importable
# and flush output immediately so it shows up in `docker logs`
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

WORKDIR /app
COPY src/ ./src/

# Run as non-root
RUN useradd -m appuser
USER appuser

CMD ["python", "src/main.py"]

Docker Compose for Data Stack

# docker-compose.yml
version: '3.8'

services:
  # PostgreSQL warehouse database; other services wait on its healthcheck
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: dataeng
      # Demo credential only - do not reuse outside local development
      POSTGRES_PASSWORD: password
      POSTGRES_DB: warehouse
    volumes:
      # Named volume keeps the data across container restarts
      - postgres_data:/var/lib/postgresql/data
      # Scripts in this directory run once, when the data directory is first initialised
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
    ports:
      - "5432:5432"
    healthcheck:
      # -d names the database explicitly: without it pg_isready probes a
      # database named after the user ("dataeng"), which does not exist here
      test: ["CMD-SHELL", "pg_isready -U dataeng -d warehouse"]
      interval: 5s
      timeout: 5s
      retries: 5

  # Airflow webserver backed by the postgres service
  airflow:
    image: apache/airflow:2.7.0
    environment:
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      # NOTE: this points at a database named `airflow`, while the postgres
      # service only sets POSTGRES_DB to `warehouse` - the `airflow` database
      # must be created separately (e.g. in init.sql) or startup will fail.
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://dataeng:password@postgres/airflow
    volumes:
      - ./dags:/opt/airflow/dags
      - ./logs:/opt/airflow/logs
    ports:
      - "8080:8080"
    depends_on:
      postgres:
        # Wait for the postgres healthcheck to pass, not just for the container to start
        condition: service_healthy
    # Every line of the folded scalar is kept at the SAME indentation: in a
    # `>` scalar, more-indented lines keep their line breaks, which would cut
    # the `airflow users create` arguments into separate shell commands.
    # `airflow db migrate` replaces `airflow db init`, deprecated in Airflow 2.7.
    # NOTE: only the webserver is started; DAGs also need a scheduler to run.
    command: >-
      bash -c "airflow db migrate &&
      airflow users create --username admin --password admin
      --firstname Admin --lastname User --role Admin --email admin@example.com &&
      airflow webserver"

  # Spark master node; container port 8080 is published on host port 8081
  # so it does not clash with the Airflow webserver on 8080
  spark:
    image: bitnami/spark:3.5
    ports:
      - "8081:8080"
      - "7077:7077"
    volumes:
      - ./spark-apps:/opt/spark-apps
    environment:
      - SPARK_MODE=master

  # Spark worker that registers with the `spark` master service on port 7077
  spark-worker:
    image: bitnami/spark:3.5
    depends_on:
      - spark
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077

  # Jupyter notebook server with PySpark
  jupyter:
    # Pinned instead of :latest so the environment is reproducible and the
    # PySpark version lines up with the Spark 3.5 cluster images
    image: jupyter/pyspark-notebook:spark-3.5.0
    ports:
      - "8888:8888"
    volumes:
      # Notebooks are stored on the host so they survive container removal
      - ./notebooks:/home/jovyan/work
    environment:
      # Quoted so YAML passes the literal string "yes" rather than a boolean
      JUPYTER_ENABLE_LAB: "yes"

volumes:
  postgres_data:

Kafka Stack with Docker

# docker-compose-kafka.yml
version: '3.8'

services:
  # ZooKeeper instance the Kafka broker connects to on port 2181
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    environment:
      - ZOOKEEPER_CLIENT_PORT=2181

  # Single Kafka broker with two listeners: one for other containers, one for the host
  kafka:
    image: confluentinc/cp-kafka:7.5.0
    depends_on:
      - zookeeper
    ports:
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      # PLAINTEXT is advertised to containers on the compose network (kafka:29092);
      # PLAINTEXT_HOST is advertised to clients on the host (localhost:9092)
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      # State explicitly which of the two listeners brokers use among themselves
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      # With a single broker, internal topics cannot use the default
      # replication factor of 3; set 1 for offsets and for the transaction
      # state log so transactional producers also work
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1

  # Schema Registry, using the broker's internal listener (kafka:29092)
  schema-registry:
    image: confluentinc/cp-schema-registry:7.5.0
    ports:
      - "8081:8081"
    environment:
      - SCHEMA_REGISTRY_HOST_NAME=schema-registry
      - SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS=kafka:29092
    depends_on:
      - kafka

  # Web UI for browsing the local cluster and its schemas
  kafka-ui:
    # Pinned release instead of :latest so the stack is reproducible
    image: provectuslabs/kafka-ui:v0.7.2
    # Start after the services the UI connects to
    depends_on:
      - kafka
      - schema-registry
    ports:
      - "8080:8080"
    environment:
      KAFKA_CLUSTERS_0_NAME: local
      KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:29092
      KAFKA_CLUSTERS_0_SCHEMAREGISTRY: http://schema-registry:8081

Building and Pushing Images

# Build an image from the Dockerfile in the current directory
docker build --tag my-etl-pipeline:1.0 .

# Give the local image a registry-qualified name
docker tag my-etl-pipeline:1.0 myregistry.com/my-etl-pipeline:1.0

# Upload the tagged image to the registry
docker push myregistry.com/my-etl-pipeline:1.0

# Pass build-time arguments into the build
docker build --build-arg PYTHON_VERSION=3.11 --build-arg ENV=production \
    --tag my-pipeline:prod .

# Build for several platforms in one go and push the result
docker buildx build --platform linux/amd64,linux/arm64 \
    --tag my-pipeline:multi --push .

Environment Configuration

# .env file
# Plain KEY=value pairs, one per line.
# PostgreSQL connection settings
POSTGRES_HOST=postgres
POSTGRES_PORT=5432
POSTGRES_USER=dataeng
POSTGRES_PASSWORD=secretpassword
POSTGRES_DB=warehouse
# AWS credentials - placeholder values; replace them and keep this file out of version control
AWS_ACCESS_KEY_ID=your_key
AWS_SECRET_ACCESS_KEY=your_secret

# docker-compose.yml with env file
# Variables are loaded from .env; LOG_LEVEL is set inline on the service
services:
  etl:
    build: .
    env_file: .env
    environment:
      LOG_LEVEL: INFO

# Use secrets for sensitive data
# The service is handed the path of the secret file, not the password itself
services:
  etl:
    build: .
    environment:
      DB_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password

secrets:
  db_password:
    file: ./secrets/db_password.txt

Common Commands

# These use the Compose V2 plugin (`docker compose`); the standalone
# `docker-compose` V1 binary is end-of-life and no longer updated.

# Start services in the background
docker compose up -d

# View logs
docker compose logs -f service_name

# Scale services
docker compose up -d --scale spark-worker=3

# Stop services
docker compose down

# Stop and remove volumes (WARNING: deletes the data stored in named volumes)
docker compose down -v

# Rebuild and restart
docker compose up -d --build

# Execute command in service
docker compose exec postgres psql -U dataeng -d warehouse

# Clean up unused resources
# WARNING: `prune -a` removes every image not used by a container,
# and `volume prune` deletes unused volumes together with their data
docker system prune -a
docker volume prune

Best Practices

  • Use slim base images: python:3.11-slim instead of python:3.11
  • Layer caching: Copy requirements before code for better caching
  • Run as non-root: Create and use non-root users
  • Use .dockerignore: Exclude unnecessary files from build context
  • Health checks: Add health checks to the services that other services depend on, so dependents start only once they are ready
  • Tag versions: Use specific version tags, not :latest

Master Docker with Expert Mentorship

Our Data Engineering program covers Docker and containerization for data applications. Deploy production-ready pipelines with guidance from industry experts.

Explore Data Engineering Program

Related Articles