What is Terraform?
Terraform is an Infrastructure as Code (IaC) tool that lets you define and provision cloud infrastructure using a declarative configuration language called HCL (HashiCorp Configuration Language).
For data engineers, Terraform enables reproducible infrastructure: data lakes, warehouses, Kafka clusters, and processing pipelines — all defined in code and version-controlled.
Why IaC for Data Engineering?
- Reproducibility: Recreate entire environments from code
- Version Control: Track infrastructure changes in Git
- Consistency: Dev, staging, and prod from same templates
- Automation: CI/CD for infrastructure changes
- Documentation: Code IS the documentation
- Cost Control: Easily spin up/down environments
Getting Started
# Install Terraform
# macOS (Homebrew)
brew install terraform

# Windows (Chocolatey)
choco install terraform

# Linux (Debian/Ubuntu) — Terraform is NOT in the default apt
# repositories; add HashiCorp's official repository first.
wget -O - https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt-get update && sudo apt-get install terraform

# Verify installation
terraform --version

# Basic workflow
terraform init     # Initialize working dir, download providers
terraform plan     # Preview changes before applying
terraform apply    # Apply changes to real infrastructure
terraform destroy  # Tear down everything this config manages
# Project structure
data-platform/
├── main.tf # Main configuration
├── variables.tf # Variable definitions
├── outputs.tf # Output values
├── terraform.tfvars # Variable values
├── modules/ # Reusable modules
│ ├── s3-bucket/
│ ├── redshift/
│ └── glue/
└── environments/
├── dev/
├── staging/
└── prod/
AWS Data Lake Infrastructure
# main.tf - Data Lake on AWS
# Terraform settings: required providers and remote-state backend.
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
# Pin to the 5.x major series so provider upgrades with breaking
# changes are never pulled in implicitly.
version = "~> 5.0"
}
}
# Store state remotely in S3 so it survives individual machines and
# can be shared across a team.
# NOTE(review): no `encrypt` or state locking configured here — see the
# State Management section below for the recommended backend settings.
backend "s3" {
bucket = "my-terraform-state"
key = "data-lake/terraform.tfstate"
region = "us-east-1"
}
}
# Region is a variable so the same configuration deploys to any region.
provider "aws" {
region = var.aws_region
}
# S3 Bucket for Data Lake
# Primary data-lake bucket. The name embeds project and environment so
# dev/staging/prod deployments never collide on bucket names.
resource "aws_s3_bucket" "data_lake" {
bucket = "${var.project_name}-data-lake-${var.environment}"
tags = {
Environment = var.environment
Project = var.project_name
}
}
# Bucket versioning for data protection
# Keeps prior object versions, guarding against accidental overwrite or
# deletion of lake data.
resource "aws_s3_bucket_versioning" "data_lake" {
bucket = aws_s3_bucket.data_lake.id
versioning_configuration {
status = "Enabled"
}
}
# Lifecycle rules for cost optimization
# Applies only to objects under the bronze/ prefix (raw ingested data):
# after 30 days move to Infrequent Access, after 90 days to Glacier.
resource "aws_s3_bucket_lifecycle_configuration" "data_lake" {
bucket = aws_s3_bucket.data_lake.id
rule {
id = "bronze-tier"
status = "Enabled"
filter {
prefix = "bronze/"
}
transition {
days = 30
storage_class = "STANDARD_IA"
}
transition {
days = 90
storage_class = "GLACIER"
}
}
}
# Glue Catalog Database
# Glue Data Catalog database — the metadata layer (table definitions,
# schemas) over the data stored in S3.
resource "aws_glue_catalog_database" "analytics" {
name = "${var.project_name}_${var.environment}"
description = "Data catalog for analytics"
}
# IAM role that Glue jobs/crawlers run as. The trust policy allows only
# the Glue service principal to assume this role.
resource "aws_iam_role" "glue_role" {
name = "${var.project_name}-glue-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "glue.amazonaws.com"
}
}]
})
}
# Attach the AWS-managed baseline Glue service policy to the role.
resource "aws_iam_role_policy_attachment" "glue_service" {
role = aws_iam_role.glue_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}
Redshift Data Warehouse
# Redshift cluster
# Redshift cluster serving as the analytics warehouse.
resource "aws_redshift_cluster" "warehouse" {
cluster_identifier = "${var.project_name}-warehouse"
database_name = "analytics"
# NOTE(review): make sure these credential variables are declared with
# sensitive = true and supplied from a secrets manager or TF_VAR_*
# environment variables — never committed in tfvars.
master_username = var.redshift_master_user
master_password = var.redshift_master_password
node_type = var.redshift_node_type
cluster_type = "multi-node"
number_of_nodes = var.redshift_nodes
vpc_security_group_ids = [aws_security_group.redshift.id]
# NOTE(review): aws_redshift_subnet_group.main is referenced but not
# defined in this snippet — it must exist elsewhere in the config.
cluster_subnet_group_name = aws_redshift_subnet_group.main.name
# Non-prod clusters skip the final snapshot on destroy; prod keeps one.
skip_final_snapshot = var.environment != "prod"
final_snapshot_identifier = var.environment == "prod" ? "${var.project_name}-final-snapshot" : null
# Encryption at rest with a customer-managed KMS key.
# NOTE(review): aws_kms_key.redshift is not defined in this snippet.
encrypted = true
kms_key_id = aws_kms_key.redshift.arn
# Enhanced VPC routing forces S3 traffic (COPY/UNLOAD) through the VPC.
enhanced_vpc_routing = true
tags = {
Environment = var.environment
}
}
# Security group: only the application security group may reach the
# cluster on the Redshift port (5439); all egress is allowed.
# NOTE(review): aws_vpc.main and aws_security_group.app are referenced
# but not defined in this snippet.
resource "aws_security_group" "redshift" {
name = "${var.project_name}-redshift-sg"
description = "Security group for Redshift"
vpc_id = aws_vpc.main.id
ingress {
from_port = 5439
to_port = 5439
protocol = "tcp"
security_groups = [aws_security_group.app.id]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
# Associate an IAM role with the cluster so Redshift Spectrum can read
# S3 via the Glue catalog.
# NOTE(review): aws_iam_role.redshift_spectrum is not defined in this
# snippet — define it or this reference fails at plan time.
resource "aws_redshift_cluster_iam_roles" "spectrum" {
cluster_identifier = aws_redshift_cluster.warehouse.cluster_identifier
iam_role_arns = [aws_iam_role.redshift_spectrum.arn]
}
Variables and Outputs
# variables.tf
# Region to deploy all resources into.
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}

# Deployment environment; validated to one of the three known values so
# a typo cannot silently create a fourth environment.
variable "environment" {
  description = "Environment name"
  type        = string
  validation {
    condition     = contains(["dev", "staging", "prod"], var.environment)
    error_message = "Environment must be dev, staging, or prod."
  }
}

# Prefix used in every resource name for uniqueness and cost tracking.
variable "project_name" {
  description = "Project name for resource naming"
  type        = string
}

variable "redshift_node_type" {
  description = "Redshift node type"
  type        = string
  default     = "dc2.large"
}

variable "redshift_nodes" {
  description = "Number of Redshift nodes"
  type        = number
  default     = 2
}

# Credentials for the Redshift master user. These are referenced by the
# cluster resource but were previously never declared. Marked sensitive
# so Terraform redacts them from plan/apply output; supply them via a
# secrets manager or TF_VAR_* environment variables, not tfvars.
variable "redshift_master_user" {
  description = "Master username for the Redshift cluster"
  type        = string
  sensitive   = true
}

variable "redshift_master_password" {
  description = "Master password for the Redshift cluster"
  type        = string
  sensitive   = true
}
# outputs.tf
# Exported values for use by other configurations, CI pipelines, or
# `terraform output` on the command line.
output "data_lake_bucket" {
description = "S3 bucket for data lake"
value = aws_s3_bucket.data_lake.bucket
}
# Marked sensitive so the endpoint is redacted from console output.
output "redshift_endpoint" {
description = "Redshift cluster endpoint"
value = aws_redshift_cluster.warehouse.endpoint
sensitive = true
}
output "glue_database" {
description = "Glue catalog database name"
value = aws_glue_catalog_database.analytics.name
}
# terraform.tfvars
# Values for a dev deployment — keep one tfvars file per environment.
# Do NOT put secrets (e.g. the Redshift password) in this file.
environment = "dev"
project_name = "analytics"
aws_region = "us-east-1"
redshift_nodes = 2
Modules for Reusability
# modules/s3-data-bucket/main.tf
# Module inputs: the full bucket name and the environment tag value.
variable "bucket_name" {
type = string
}
variable "environment" {
type = string
}
# The bucket itself, tagged with the environment for cost tracking.
resource "aws_s3_bucket" "this" {
bucket = var.bucket_name
tags = {
Environment = var.environment
}
}
# Every bucket created by this module gets versioning enabled.
resource "aws_s3_bucket_versioning" "this" {
bucket = aws_s3_bucket.this.id
versioning_configuration {
status = "Enabled"
}
}
# Module outputs consumed by callers (e.g. for IAM policies).
output "bucket_arn" {
value = aws_s3_bucket.this.arn
}
output "bucket_name" {
value = aws_s3_bucket.this.bucket
}
# Using the module in main.tf
# Three tiered lake buckets (bronze/silver/gold) from the same module —
# each instantiation differs only in the bucket-name prefix.
module "bronze_bucket" {
source = "./modules/s3-data-bucket"
bucket_name = "${var.project_name}-bronze-${var.environment}"
environment = var.environment
}
module "silver_bucket" {
source = "./modules/s3-data-bucket"
bucket_name = "${var.project_name}-silver-${var.environment}"
environment = var.environment
}
module "gold_bucket" {
source = "./modules/s3-data-bucket"
bucket_name = "${var.project_name}-gold-${var.environment}"
environment = var.environment
}
State Management
# Remote state storage (recommended for teams)
# Team-grade backend: state encrypted at rest in S3, with a DynamoDB
# table preventing two people from applying concurrently.
terraform {
backend "s3" {
bucket = "terraform-state-bucket"
key = "data-platform/terraform.tfstate"
region = "us-east-1"
encrypt = true
# NOTE(review): newer Terraform releases offer S3-native locking via
# `use_lockfile` as an alternative — verify against your version.
dynamodb_table = "terraform-locks" # For state locking
}
}
# DynamoDB table for state locking
# Lock table used by the S3 backend above. The backend requires a table
# whose hash key is a string attribute named exactly "LockID".
# On-demand billing: lock traffic is tiny and sporadic.
resource "aws_dynamodb_table" "terraform_locks" {
name = "terraform-locks"
billing_mode = "PAY_PER_REQUEST"
hash_key = "LockID"
attribute {
name = "LockID"
type = "S"
}
}
# Workspaces for multiple environments
# terraform workspace new dev
# terraform workspace new prod
# terraform workspace select dev
# Use workspace in configuration
# Derive the environment name from the current workspace instead of a
# variable, so `terraform workspace select` switches environments.
locals {
environment = terraform.workspace
}
# Bucket name varies per workspace (e.g. data-dev, data-prod).
resource "aws_s3_bucket" "data" {
bucket = "data-${local.environment}"
}
Best Practices
- Use remote state: Store state in S3/GCS with locking
- Modularize: Create reusable modules for common patterns
- Use variables: Never hardcode values
- Sensitive data: Supply secrets via a secrets manager or environment variables (TF_VAR_*); never commit them in tfvars files
- Plan before apply: Always review terraform plan
- Version lock: Pin provider and module versions
- Tagging: Tag all resources for cost tracking
Master Terraform
Our Data Engineering program covers IaC and cloud infrastructure provisioning.
Explore Data Engineering Program