A Docker Compose file is only an orchestration file
As I have mentioned several times, I am using Docker Compose to orchestrate multiple services. The goal is not to build anything for real production; Compose is simply good practice for development and testing. (The commands to bring the stack up are shown right after the compose file below.)
Focus on Docker images and containers, which are the foundational concepts of Docker.
version: '3.8'

services:
  # Data Ingestion (Apache Kafka)
  zookeeper:
    image: confluentinc/cp-zookeeper:7.3.0
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
    ports:
      - "2181:2181"

  kafka:
    image: confluentinc/cp-kafka:7.3.0
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1  # required for a single-broker setup
      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
    ports:
      - "29092:29092"
      - "9092:9092"

  # Data Storage (PostgreSQL and MinIO)
  postgres:
    image: postgres:14
    environment:
      POSTGRES_USER: admin
      POSTGRES_PASSWORD: admin
      POSTGRES_DB: dataplatform
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data

  minio:
    image: minio/minio
    command: server /data --console-address ":9001"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    ports:
      - "9000:9000"
      - "9001:9001"
    volumes:
      - minio_data:/data

  # ETL (Apache Spark)
  spark:
    build: ./spark
    depends_on:
      - kafka
      - postgres
    volumes:
      - ./spark/jobs:/opt/spark/jobs

  # Data Visualization (Superset)
  superset:
    image: apache/superset:latest
    depends_on:
      - postgres
    environment:
      SUPERSET_SECRET_KEY: 'your-secret-key'
      SUPERSET_FEATURE_EMBEDDED_SUPERSET: 'true'
    ports:
      - "8088:8088"
    volumes:
      - superset_data:/app/superset_home
    command: ["/app/docker/docker-bootstrap.sh"]

  # Model Training (Jupyter Notebook)
  jupyter:
    image: jupyter/datascience-notebook:latest
    volumes:
      - ./notebooks:/home/jovyan/work
    ports:
      - "8888:8888"
    environment:
      JUPYTER_TOKEN: "dataplatform"

volumes:
  postgres_data:
  minio_data:
  superset_data:
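With the compose file in place, a quick way to spin the stack up for local testing (assuming a recent Docker Engine with the Compose v2 plugin; substitute docker-compose if you are on the standalone v1 binary):

docker compose up -d --build   # build the Spark image and start every service
docker compose ps              # verify that all containers are running
docker compose logs -f kafka   # follow the broker logs while testing
docker compose down -v         # stop everything and remove the named volumes

Next is the Dockerfile for the custom Spark image, spark/Dockerfile: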
FROM apache/spark:3.4.0

# Install the PostgreSQL JDBC driver
RUN curl -L https://jdbc.postgresql.org/download/postgresql-42.6.0.jar \
    -o /opt/spark/jars/postgresql-42.6.0.jar

# Install the Kafka connector for Structured Streaming
RUN curl -L https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.4.0/spark-sql-kafka-0-10_2.12-3.4.0.jar \
    -o /opt/spark/jars/spark-sql-kafka-0-10_2.12-3.4.0.jar

WORKDIR /opt/spark
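One caveat: the spark-sql-kafka connector also has transitive dependencies (kafka-clients, spark-token-provider-kafka, commons-pool2) that the single curl above does not cover. A simple workaround is to let spark-submit resolve them. The command below is only a sketch, assuming the container can reach Maven Central and that the image's default entrypoint passes the command through unchanged:

docker compose run --rm spark /opt/spark/bin/spark-submit \
  --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0 \
  /opt/spark/jobs/etl_job.py

The project is laid out as follows, with the streaming job in spark/jobs/etl_job.py shown right after the layout.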
.
├── docker-compose.yml
├── spark
│   ├── Dockerfile
│   └── jobs
│       └── etl_job.py
└── notebooks
    └── model_training.ipynb
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("KafkaToPostgresETL") \
    .getOrCreate()

# Read from Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "input_topic") \
    .load()

# Perform transformations
# Add your ETL logic here; casting the raw Kafka value to a string is the minimum
processed_df = df.selectExpr("CAST(value AS STRING)")

# Write each micro-batch to PostgreSQL
def write_to_postgres(batch_df, batch_id):
    batch_df.write \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://postgres:5432/dataplatform") \
        .option("dbtable", "processed_data") \
        .option("user", "admin") \
        .option("password", "admin") \
        .mode("append") \
        .save()

processed_df.writeStream \
    .foreachBatch(write_to_postgres) \
    .start() \
    .awaitTermination()
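Once the job has written a few micro-batches, the Jupyter service can pick the data up for model training. A minimal sketch of how notebooks/model_training.ipynb might start, assuming the processed_data table exists and that a PostgreSQL driver is available in the notebook image (pip install psycopg2-binary if it is not; pandas and SQLAlchemy ship with jupyter/datascience-notebook):

import pandas as pd
from sqlalchemy import create_engine

# Inside the Compose network the hostname is "postgres"; from the host it would be "localhost".
engine = create_engine("postgresql+psycopg2://admin:admin@postgres:5432/dataplatform")

# Load the rows written by the Spark ETL job and take a first look.
df = pd.read_sql("SELECT * FROM processed_data", engine)
df.head()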