Why Docker for Data Engineering?
Docker enables consistent, reproducible environments for data applications. Whether running locally or in production, containerization ensures your data pipelines work the same everywhere.
- Reproducibility: Same environment across dev, test, and production
- Isolation: Dependencies don't conflict between projects
- Portability: Run anywhere Docker is installed
- Scalability: Easy to scale with Kubernetes
Docker Basics
# Pull an image
docker pull python:3.11-slim
# Run a container (starts an interactive Python REPL)
docker run -it python:3.11-slim python
# Run with volume mount: the current directory is mounted at /app.
# "$(pwd)" is quoted so paths containing spaces still work, and --rm
# removes the one-off container as soon as the script exits.
docker run --rm -v "$(pwd)":/app -w /app python:3.11-slim python script.py
# List containers
docker ps  # Running containers
docker ps -a  # All containers, including stopped ones
# Stop and remove
docker stop container_id
docker rm container_id
# List and remove images
docker images
docker rmi image_id
# Execute command in running container
docker exec -it container_id bash
# View logs
docker logs container_id
docker logs -f container_id  # Follow logs
Dockerfile for Data Pipelines
# Dockerfile for Python ETL Pipeline
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install system dependencies (C compiler and PostgreSQL client headers).
# --no-install-recommends keeps optional packages out of the image, and the
# apt lists are removed in the same layer so they never reach the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc \
        libpq-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first (for caching): this layer is rebuilt only when
# requirements.txt changes, not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY src/ ./src/
COPY config/ ./config/
# Set environment variables
# PYTHONPATH=/app puts /app on the Python import path.
ENV PYTHONPATH=/app
# PYTHONUNBUFFERED=1 disables output buffering so logs show up immediately.
ENV PYTHONUNBUFFERED=1
# Create non-root user and give it ownership of the application directory
RUN useradd -m appuser && chown -R appuser:appuser /app
USER appuser
# Default command
CMD ["python", "src/main.py"]
# requirements.txt
pandas==2.1.0
# apache-airflow 2.7.0 requires SQLAlchemy <2.0, so a 2.x pin here makes
# `pip install -r requirements.txt` fail with a dependency conflict.
sqlalchemy==1.4.49
psycopg2-binary==2.9.9
boto3==1.28.0
apache-airflow==2.7.0
Multi-Stage Builds
# Multi-stage build for smaller images
# Builder stage: uses the full python image to install dependencies.
FROM python:3.11 AS builder
WORKDIR /app
COPY requirements.txt .
# Install dependencies in virtual environment so the whole dependency tree
# lives under /opt/venv and can be copied to the final stage as one directory.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir -r requirements.txt
# Final stage: slim base image with only the virtual environment and the code.
FROM python:3.11-slim
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
# Put the venv first on PATH so `python` resolves to the venv interpreter.
ENV PATH="/opt/venv/bin:$PATH"
# Same runtime settings as the single-stage Dockerfile: /app on the import
# path and unbuffered output.
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
WORKDIR /app
COPY src/ ./src/
# Run as non-root
RUN useradd -m appuser
USER appuser
CMD ["python", "src/main.py"]
Docker Compose for Data Stack
# docker-compose.yml
# Local data stack: Postgres, Airflow, a Spark master with a worker, and Jupyter.
version: '3.8'
services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: dataeng
      POSTGRES_PASSWORD: password
      POSTGRES_DB: warehouse
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
    ports:
      - "5432:5432"
    # Lets dependents wait until Postgres accepts connections.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U dataeng"]
      interval: 5s
      timeout: 5s
      retries: 5
  airflow:
    image: apache/airflow:2.7.0
    environment:
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      # NOTE(review): this points at a database named `airflow`, while the
      # postgres service only creates `warehouse`. Presumably init.sql
      # creates the `airflow` database; confirm.
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://dataeng:password@postgres/airflow
    volumes:
      - ./dags:/opt/airflow/dags
      - ./logs:/opt/airflow/logs
    ports:
      - "8080:8080"
    depends_on:
      postgres:
        condition: service_healthy
    # `airflow db migrate` replaces `airflow db init`, deprecated in 2.7.
    # The scheduler is started next to the webserver; without it no DAG runs.
    command: >
      bash -c "airflow db migrate &&
      airflow users create --username admin --password admin
      --firstname Admin --lastname User --role Admin --email admin@example.com &&
      (airflow scheduler & airflow webserver)"
  spark:
    image: bitnami/spark:3.5
    environment:
      SPARK_MODE: master
    ports:
      # Host port 8081 because 8080 is already published by airflow.
      - "8081:8080"
      - "7077:7077"
    volumes:
      - ./spark-apps:/opt/spark-apps
  spark-worker:
    image: bitnami/spark:3.5
    environment:
      SPARK_MODE: worker
      SPARK_MASTER_URL: spark://spark:7077
    depends_on:
      - spark
  jupyter:
    # Pinned instead of :latest so the notebook image stays reproducible.
    image: jupyter/pyspark-notebook:spark-3.5.0
    ports:
      - "8888:8888"
    volumes:
      - ./notebooks:/home/jovyan/work
    environment:
      JUPYTER_ENABLE_LAB: "yes"
volumes:
  postgres_data:
Kafka Stack with Docker
# docker-compose-kafka.yml
# Single-broker Kafka stack: ZooKeeper, Kafka, Schema Registry and Kafka UI.
version: '3.8'
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    environment:
      ZOOKEEPER_CLIENT_PORT: "2181"
  kafka:
    image: confluentinc/cp-kafka:7.5.0
    depends_on:
      - zookeeper
    ports:
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: "1"
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      # Two listeners: kafka:29092 for the other containers in this file,
      # localhost:9092 for clients on the host via the published port.
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      # Explicit because both listeners map to the same security protocol.
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      # Single broker, so the offsets topic cannot be replicated more than once.
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: "1"
  schema-registry:
    image: confluentinc/cp-schema-registry:7.5.0
    depends_on:
      - kafka
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: kafka:29092
  kafka-ui:
    # Pinned instead of :latest so the stack stays reproducible.
    image: provectuslabs/kafka-ui:v0.7.2
    # The UI connects to both services below, so start them first.
    depends_on:
      - kafka
      - schema-registry
    ports:
      - "8080:8080"
    environment:
      KAFKA_CLUSTERS_0_NAME: local
      KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:29092
      KAFKA_CLUSTERS_0_SCHEMAREGISTRY: http://schema-registry:8081
Building and Pushing Images
# Build image
docker build -t my-etl-pipeline:1.0 .
# Tag for registry
docker tag my-etl-pipeline:1.0 myregistry.com/my-etl-pipeline:1.0
# Push to registry
docker push myregistry.com/my-etl-pipeline:1.0
# Build with build arguments
# Each --build-arg only takes effect if the Dockerfile declares a matching
# ARG (here: ARG PYTHON_VERSION and ARG ENV).
docker build \
  --build-arg PYTHON_VERSION=3.11 \
  --build-arg ENV=production \
  -t my-pipeline:prod .
# Build for multiple platforms
# --push uploads straight to the registry named in the tag, so the tag must be
# registry-qualified; a bare name would target Docker Hub's library/ namespace.
docker buildx build \
  --platform linux/amd64,linux/arm64 \
  -t myregistry.com/my-pipeline:multi \
  --push .
Environment Configuration
# .env file
# Key=value settings loaded into the container through `env_file`.
# Keep this file out of version control: it holds credentials.
# Database connection settings
POSTGRES_HOST=postgres
POSTGRES_PORT=5432
POSTGRES_USER=dataeng
POSTGRES_PASSWORD=secretpassword
POSTGRES_DB=warehouse
# AWS credentials (placeholders; replace with real values)
AWS_ACCESS_KEY_ID=your_key
AWS_SECRET_ACCESS_KEY=your_secret
# docker-compose.yml with env file
services:
  etl:
    build: .
    # Load every variable defined in .env into the container.
    env_file:
      - .env
    # Variables set directly on the service, alongside those from env_file.
    environment:
      - LOG_LEVEL=INFO
# Use secrets for sensitive data
services:
  etl:
    build: .
    # Grants this service access to the secret declared at the bottom.
    secrets:
      - db_password
    environment:
      # The application gets the path of the secret file, not its value.
      - DB_PASSWORD_FILE=/run/secrets/db_password
# Top-level secret definitions, sourced from local files.
secrets:
  db_password:
    file: ./secrets/db_password.txt
Common Commands
# NOTE: with Compose V2 the same commands are spelled `docker compose ...`.
# Start services in the background (detached)
docker-compose up -d
# View logs of one service, following new output
docker-compose logs -f service_name
# Scale services: run three spark-worker containers
docker-compose up -d --scale spark-worker=3
# Stop services and remove their containers
docker-compose down
# Stop and remove volumes (deletes the data stored in named volumes)
docker-compose down -v
# Rebuild images and restart
docker-compose up -d --build
# Execute command in service: open psql against the warehouse database
docker-compose exec postgres psql -U dataeng -d warehouse
# Clean up unused resources
# WARNING: destructive. `-a` removes every image not used by a container,
# and `docker volume prune` deletes unused volumes together with their data.
docker system prune -a
docker volume prune
Best Practices
- Use slim base images: python:3.11-slim instead of python:3.11
- Layer caching: Copy requirements before code for better caching
- Run as non-root: Create and use non-root users
- Use .dockerignore: Exclude unnecessary files from build context
- Health checks: Add health checks to services that others depend on, and have dependents wait for them with depends_on and condition: service_healthy
- Tag versions: Use specific version tags, not :latest
Master Docker with Expert Mentorship
Our Data Engineering program covers Docker and containerization for data applications. Deploy production-ready pipelines with guidance from industry experts.
Explore Data Engineering Program