# data-platform-in-a-box/docker-compose.yml
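#
# Pipeline at a glance: a client application produces JSON events to Redpanda;
# a Kafka Connect worker runs the Tabular Iceberg sink, committing the
# event-stream topic to an Iceberg table in MinIO via the REST catalog;
# Spark and StarRocks then query that same table.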

services:
  # Ingest
  redpanda:
    image: redpandadata/redpanda:v24.3.4
    container_name: redpanda
    ports:
      - "9092:9092"
      - "8081:8081"
      - "8082:8082"
      - "29092:29092"
    command:
      - redpanda
      - start
      - --overprovisioned
      - --smp
      - "1"
      - --memory
      - "1G"
      - --reserve-memory
      - "0M"
      - --node-id
      - "0"
      - --kafka-addr
      - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092
      - --advertise-kafka-addr
      - PLAINTEXT://redpanda:29092,OUTSIDE://localhost:9092
      - --check=false
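  # Quick sanity check once the broker is up (a sketch; rpk ships in the
  # Redpanda image, and localhost:9092 matches the OUTSIDE listener above):
  #   docker exec -it redpanda rpk topic consume event-stream -n 1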
  client_application:
    container_name: client_application
    build:
      context: ./client-application
    restart: unless-stopped
    environment:
      RUST_LOG: info
    command:
      - produce
      - --host=redpanda:29092
      - --topic=event-stream
      - --delay-ms=500
    depends_on:
      - connect
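  # The producer's flags mirror the listener setup above: inside the compose
  # network it talks to redpanda:29092, while host-side tools use
  # localhost:9092. Tail its output with:
  #   docker compose logs -f client_application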
  connect:
    image: confluentinc/cp-kafka-connect-base:7.8.0
    depends_on:
      - redpanda
    hostname: connect
    container_name: connect
    ports:
      - 8083:8083
    environment:
      CONNECT_BOOTSTRAP_SERVERS: 'redpanda:29092'
      CONNECT_REST_ADVERTISED_HOST_NAME: "connect"
      CONNECT_REST_PORT: 8083
      CONNECT_GROUP_ID: connect-cluster-group
      CONNECT_CONFIG_STORAGE_TOPIC: _kafka-connect-configs
      CONNECT_OFFSET_STORAGE_TOPIC: _kafka-connect-offsets
      CONNECT_STATUS_STORAGE_TOPIC: _kafka-connect-status
      CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
      CONNECT_KEY_CONVERTER_SCHEMAS_ENABLE: "false"
      CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
      CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: "false"
      CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.storage.StringConverter"
      CONNECT_INTERNAL_KEY_CONVERTER_SCHEMAS_ENABLE: "false"
      CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_INTERNAL_VALUE_CONVERTER_SCHEMAS_ENABLE: "false"
      CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO"
      CONNECT_LOG4J_LOGGERS: "org.apache.kafka.connect.runtime.rest=WARN,org.reflections=ERROR"
      CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: "1"
      CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: "1"
      CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: "1"
      CONNECT_PLUGIN_PATH: '/usr/share/java,/usr/share/confluent-hub-components/,/connectors/'
      AWS_ACCESS_KEY_ID: "minioadmin"
      AWS_SECRET_ACCESS_KEY: "minioadmin"
    command:
      - bash
      - -c
      - |
        #
        echo "Installing connector plugins"
        confluent-hub install --no-prompt tabular/iceberg-kafka-connect:0.4.11
        #
        echo "Launching Kafka Connect worker"
        /etc/confluent/docker/run &
        #
        echo "Waiting for Kafka Connect to start listening on localhost ⏳"
        while : ; do
          curl_status=$$(curl -s -o /dev/null -w %{http_code} http://localhost:8083/connectors)
          echo -e $$(date) " Kafka Connect listener HTTP state: " $$curl_status " (waiting for 200)"
          if [ $$curl_status -eq 200 ] ; then
            break
          fi
          sleep 5
        done
        echo -e "\n--\n+> Creating connector"
        curl -X PUT \
          -H 'Content-Type: application/json' \
          -H 'Accept: application/json' http://localhost:8083/connectors/IcebergSinkConnector/config \
          -d '{
                "tasks.max": "1",
                "topics": "event-stream",
                "connector.class": "io.tabular.iceberg.connect.IcebergSinkConnector",
                "iceberg.catalog.s3.endpoint": "http://minio:9000",
                "iceberg.catalog.s3.secret-access-key": "minioadmin",
                "iceberg.catalog.s3.access-key-id": "minioadmin",
                "iceberg.catalog.s3.path-style-access": "true",
                "iceberg.catalog.uri": "http://rest:8181",
                "iceberg.catalog.warehouse": "s3://warehouse/",
                "iceberg.catalog.client.region": "eu-west-1",
                "iceberg.catalog.type": "rest",
                "iceberg.control.commitIntervalMs": "1000",
                "iceberg.tables": "marketing.ad_clicks",
                "value.converter.schemas.enable": "false",
                "value.converter": "org.apache.kafka.connect.json.JsonConverter",
                "key.converter": "org.apache.kafka.connect.storage.StringConverter",
                "schemas.enable": "false"
              }'
        sleep infinity
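  # Once the loop above sees HTTP 200 the connector config has been PUT; its
  # health can be checked from the host via the standard Connect REST API
  # (connector name taken from the request above):
  #   curl -s http://localhost:8083/connectors/IcebergSinkConnector/status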
  console:
    image: redpandadata/console:v2.1.1
    entrypoint: /bin/sh
    command: -c "echo \"$$CONSOLE_CONFIG_FILE\" > /tmp/config.yml; /app/console"
    environment:
      CONFIG_FILEPATH: /tmp/config.yml
      CONSOLE_CONFIG_FILE: |
        kafka:
          brokers: ["redpanda:29092"]
          schemaRegistry:
            enabled: true
            urls: ["http://redpanda:8081"]
        connect:
          enabled: true
          clusters:
            - name: local-connect-cluster
              url: http://connect:8083
        redpanda:
          adminApi:
            enabled: true
            urls: ["http://redpanda:9644"]
    ports:
      - 18080:8080
    depends_on:
      - redpanda
      - connect
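  # The Console UI is published on the host at http://localhost:18080;
  # container port 8080 is remapped because spark-iceberg below claims 8080
  # on the host.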
  # Buckets
  minio:
    image: minio/minio
    hostname: minio
    container_name: minio
    environment:
      - MINIO_ROOT_USER=minioadmin
      - MINIO_ROOT_PASSWORD=minioadmin
      - MINIO_DOMAIN=minio
    networks:
      default:
        aliases:
          - warehouse.minio
    ports:
      - 9001:9001
      - 9000:9000
    command: ["server", "/data", "--console-address", ":9001"]
  aws:
    image: amazon/aws-cli
    container_name: aws-cli
    command: |
      -c "sleep 2 && \
      aws --endpoint-url http://minio:9000 s3 mb s3://warehouse --region eu-west-1 || exit 0"
    entrypoint: [/bin/bash]
    environment:
      AWS_ACCESS_KEY_ID: "minioadmin"
      AWS_SECRET_ACCESS_KEY: "minioadmin"
    depends_on:
      - minio
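  # This one-shot container just creates the warehouse bucket and exits.
  # To inspect what the sink has written (a sketch; assumes the aws CLI is
  # installed on the host):
  #   AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin \
  #   aws --endpoint-url http://localhost:9000 s3 ls s3://warehouse/ --recursive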
  # Batch & Iceberg manipulation
  spark-iceberg:
    image: tabulario/spark-iceberg:3.5.1_1.5.0
    hostname: spark-iceberg
    container_name: spark-iceberg
    build: spark/
    depends_on:
      - rest
      - minio
    environment:
      AWS_ACCESS_KEY_ID: minioadmin
      AWS_SECRET_ACCESS_KEY: minioadmin
      AWS_REGION: eu-west-1
      SPARK_DEFAULTS: |
        spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
        spark.sql.catalog.iceberg org.apache.iceberg.spark.SparkCatalog
        spark.sql.catalog.iceberg.catalog-impl org.apache.iceberg.rest.RESTCatalog
        spark.sql.catalog.iceberg.uri http://rest:8181
        spark.sql.catalog.iceberg.io-impl org.apache.iceberg.aws.s3.S3FileIO
        spark.sql.catalog.iceberg.warehouse s3://warehouse/wh/
        spark.sql.catalog.iceberg.s3.endpoint http://minio:9000
        spark.sql.catalog.iceberg.s3.path-style-access true
        spark.sql.defaultCatalog iceberg
        spark.sql.catalogImplementation in-memory
        spark.eventLog.enabled true
        spark.eventLog.dir /home/iceberg/spark-events
        spark.history.fs.logDirectory /home/iceberg/spark-events
        spark.jars.packages org.apache.hadoop:hadoop-aws:3.2.0
    ports:
      - 8888:8888
      - 8080:8080
      - 10000:10000
      - 10001:10001
    volumes:
      - ./spark:/home/iceberg/scripts
      - ./notebooks:/home/iceberg/notebooks/notebooks
    command: ["echo \"$$SPARK_DEFAULTS\" > /opt/spark/conf/spark-defaults.conf && spark-submit /home/iceberg/scripts/create_table.py && notebook"]
  # Catalog
  rest:
    image: tabulario/iceberg-rest
    hostname: rest
    container_name: rest
    ports:
      - 8181:8181
    environment:
      - AWS_ACCESS_KEY_ID=minioadmin
      - AWS_SECRET_ACCESS_KEY=minioadmin
      - AWS_REGION=eu-west-1
      - CATALOG_WAREHOUSE=s3://warehouse/
      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
      - CATALOG_S3_ENDPOINT=http://minio:9000
      - CATALOG_S3_PATH__STYLE__ACCESS=True
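  # A quick liveness probe for the catalog (GET /v1/config is part of the
  # Iceberg REST catalog spec):
  #   curl -s http://localhost:8181/v1/config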
  # Query layer
  starrocks-fe:
    image: starrocks/fe-ubuntu:3.3.5
    hostname: starrocks-fe
    container_name: starrocks-fe
    restart: unless-stopped
    user: root
    command: |
      bash /opt/starrocks/fe/bin/start_fe.sh --host_type FQDN
    ports:
      - 8030:8030
      - 9020:9020
      - 9030:9030
    environment:
      - AWS_ACCESS_KEY_ID=minioadmin
      - AWS_SECRET_ACCESS_KEY=minioadmin
      - AWS_REGION=eu-west-1
    healthcheck:
      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW FRONTENDS\G" | grep "Alive: true"'
      interval: 10s
      timeout: 5s
      retries: 3
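  # The FE speaks the MySQL protocol on 9030; connect from the host with:
  #   mysql -h 127.0.0.1 -P 9030 -u root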
  starrocks-init-tables:
    image: starrocks/fe-ubuntu:3.3.5
    hostname: starrocks-init-tables
    container_name: starrocks-init-tables
    user: root
    restart: "no"
    environment:
      SETUP_SQL: |
        CREATE EXTERNAL CATALOG 'iceberg'
        COMMENT "Iceberg catalog"
        PROPERTIES
        (
          "type"="iceberg",
          "iceberg.catalog.type"="rest",
          "iceberg.catalog.uri"="http://rest:8181",
          "iceberg.catalog.warehouse"="warehouse",
          "aws.s3.access_key"="minioadmin",
          "aws.s3.secret_key"="minioadmin",
          "aws.s3.endpoint"="http://minio:9000",
          "aws.s3.enable_path_style_access"="true",
          "client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
        );
    depends_on:
      starrocks-fe:
        condition: service_healthy
    command: |
      bash -c '
      until mysql -P 9030 -h starrocks-fe -u root -e "SELECT 1" >/dev/null 2>&1; do
        echo "Waiting for StarRocks FE to be ready..."
        sleep 5
      done
      if ! mysql -P 9030 -h starrocks-fe -u root -e "SHOW CATALOGS" | grep -q iceberg; then
        echo "Creating Iceberg catalog..."
        mysql -P 9030 -h starrocks-fe -u root -e "$$SETUP_SQL"
        echo "Iceberg catalog created successfully"
      else
        echo "Iceberg catalog already exists"
      fi
      '
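  # After the catalog exists, StarRocks can query the same Iceberg table the
  # sink writes (a sketch; table name from the connector config above):
  #   mysql -h 127.0.0.1 -P 9030 -u root \
  #     -e "SELECT COUNT(*) FROM iceberg.marketing.ad_clicks"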
  starrocks-be:
    image: starrocks/be-ubuntu:3.3.5
    command:
      - /bin/bash
      - -c
      - |
        ulimit -u 65535;
        ulimit -n 65535;
        echo "# Enable data cache" >> /opt/starrocks/be/conf/be.conf
        echo "block_cache_enable = true" >> /opt/starrocks/be/conf/be.conf
        echo "block_cache_mem_size = 536870912" >> /opt/starrocks/be/conf/be.conf
        echo "block_cache_disk_size = 1073741824" >> /opt/starrocks/be/conf/be.conf
        sleep 15s
        mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "ALTER SYSTEM ADD BACKEND \"starrocks-be:9050\";"
        bash /opt/starrocks/be/bin/start_be.sh
    ports:
      - 8040:8040
    hostname: starrocks-be
    container_name: starrocks-be
    user: root
    restart: unless-stopped
    depends_on:
      - starrocks-fe
    healthcheck:
      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW BACKENDS\G" | grep "Alive: true"'
      interval: 10s
      timeout: 5s
      retries: 3
    environment:
      - HOST_TYPE=FQDN
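  # Bring the stack up with `docker compose up -d`. The FE healthcheck gates
  # starrocks-init-tables until the frontend reports "Alive: true", and this
  # BE registers itself with the FE via the ALTER SYSTEM call above.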