diff --git a/docker-exec.sh b/docker-exec.sh new file mode 100755 index 0000000..4d83cf5 --- /dev/null +++ b/docker-exec.sh @@ -0,0 +1,8 @@ +#!/bin/zsh + +docker compose \ + -f templates/docker-compose.rising-wave.yaml \ + -f templates/docker-compose.iceberg.yaml \ + -f templates/docker-compose.clickhouse.yaml \ + -f templates/docker-compose.yaml \ + "$@" diff --git a/templates/create_iceberg_sink.sql b/templates/create_iceberg_sink.sql new file mode 100644 index 0000000..44209cf --- /dev/null +++ b/templates/create_iceberg_sink.sql @@ -0,0 +1,15 @@ +CREATE SINK clicks_iceberg_sink +FROM + clicks WITH ( + connector = 'iceberg', + type = 'upsert', + primary_key = 'user_id, ad_id, click_timestamp, impression_timestamp', + warehouse.path = 's3://hummock001/iceberg-data', + s3.endpoint = 'http://minio-0:9301', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + s3.region = 'us-east-1', + catalog.type = 'storage', + database.name = 'demo_db', + table.name = 'demo_table' +); diff --git a/templates/create_mvs.sql b/templates/create_mvs.sql index 1d34d1f..dc2ad82 100644 --- a/templates/create_mvs.sql +++ b/templates/create_mvs.sql @@ -11,3 +11,12 @@ where and impression_timestamp + interval '1' minute >= click_timestamp group by ad_id; + +create materialized view clicks as +select + user_id, + ad_id, + click_timestamp, + impression_timestamp +from + ad_source; diff --git a/templates/docker-compose.iceberg.yaml b/templates/docker-compose.iceberg.yaml new file mode 100644 index 0000000..e151b28 --- /dev/null +++ b/templates/docker-compose.iceberg.yaml @@ -0,0 +1,63 @@ +x-spark-common: + &spark-air + build: + context: . 
+ dockerfile: spark.Dockerfile + target: spark + +services: + spark: + <<: *spark-air + environment: + - SPARK_MODE=master + ports: + - '7077:7077' + configs: + - source: run_sql + target: /spark-script/run_sql.sh + mode: 0755 + - source: create_table + target: /spark-script/create-table.sql + mode: 0755 + - source: query_table + target: /spark-script/query-table.sql + mode: 0755 + spark-worker: + <<: *spark-air + environment: + - SPARK_MODE=worker + - SPARK_MASTER_URL=spark://spark:7077 + - SPARK_WORKER_MEMORY=1G + - SPARK_WORKER_CORES=1 +configs: + run_sql: + content: | + set -ex + + spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.1.0,org.apache.hadoop:hadoop-aws:3.3.2 \ + --master spark://spark:7077 \ + --conf spark.sql.catalog.demo=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.demo.type=hadoop \ + --conf spark.sql.catalog.demo.warehouse=s3a://hummock001/iceberg-data \ + --conf spark.sql.catalog.demo.hadoop.fs.s3a.endpoint=http://minio-0:9301 \ + --conf spark.sql.catalog.demo.hadoop.fs.s3a.path.style.access=true \ + --conf spark.sql.catalog.demo.hadoop.fs.s3a.access.key=hummockadmin \ + --conf spark.sql.catalog.demo.hadoop.fs.s3a.secret.key=hummockadmin \ + --conf spark.sql.defaultCatalog=demo \ + -f /spark-script/$1.sql + + query_table: + content: | + SELECT * FROM demo.demo_db.demo_table; + create_table: + content: | + drop table if exists demo.demo_db.demo_table; + + CREATE TABLE demo.demo_db.demo_table + ( + user_id bigint, + ad_id bigint, + click_timestamp timestamp, + impression_timestamp timestamp + ) TBLPROPERTIES ('format-version'='2'); +name: iceberg diff --git a/templates/docker-compose.rising-wave.yaml b/templates/docker-compose.rising-wave.yaml index 652e488..630c1df 100644 --- a/templates/docker-compose.rising-wave.yaml +++ b/templates/docker-compose.rising-wave.yaml @@ -58,6 +58,7 @@ services: target: /risingwave.toml environment: RUST_BACKTRACE: "1" + RUST_LOG: debug # If ENABLE_TELEMETRY is not set, 
telemetry will start by default ENABLE_TELEMETRY: true RW_TELEMETRY_TYPE: "docker-compose" diff --git a/templates/spark.Dockerfile b/templates/spark.Dockerfile new file mode 100644 index 0000000..792f927 --- /dev/null +++ b/templates/spark.Dockerfile @@ -0,0 +1,16 @@ +FROM bitnami/spark:3.3 AS spark + +USER root + +RUN apt-get update && \ + apt-get install -y curl && \ + apt-get clean + +USER 1001 + +RUN rm -r /opt/bitnami/spark/jars && \ + curl --fail https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz | \ + tar --extract --gzip --strip=1 --directory /opt/bitnami/spark/ spark-3.3.1-bin-hadoop3/jars/ +RUN curl --fail https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.231/aws-java-sdk-bundle-1.12.231.jar --output /opt/bitnami/spark/jars/aws-java-sdk-bundle-1.12.231.jar +RUN curl --fail https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar --output /opt/bitnami/spark/jars/hadoop-aws-3.3.1.jar +RUN curl --fail https://repo1.maven.org/maven2/net/java/dev/jets3t/jets3t/0.9.4/jets3t-0.9.4.jar --output /opt/bitnami/spark/jars/jets3t-0.9.4.jar