feat: add iceberg with spark
Signed-off-by: kjuulh <contact@kjuulh.io>
This commit is contained in:
parent
a0be74c4d8
commit
f345a956c9
8
docker-exec.sh
Executable file
8
docker-exec.sh
Executable file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/zsh
# Wrapper around `docker compose` that layers every template compose file,
# so any compose subcommand (up, down, exec, logs, ...) can be passed
# straight through, e.g.: ./docker-exec.sh up -d
set -e

docker compose \
  -f templates/docker-compose.rising-wave.yaml \
  -f templates/docker-compose.iceberg.yaml \
  -f templates/docker-compose.clickhouse.yaml \
  -f templates/docker-compose.yaml \
  "$@"
|
15
templates/create_iceberg_sink.sql
Normal file
15
templates/create_iceberg_sink.sql
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
-- Sink the `clicks` materialized view into an Apache Iceberg table stored on
-- MinIO (S3-compatible endpoint), using the storage-backed catalog.
-- Credentials/endpoint here match the demo MinIO service (hummockadmin).
CREATE SINK clicks_iceberg_sink
FROM
    clicks WITH (
        connector = 'iceberg',
        -- Upsert semantics require a primary key covering the sink rows.
        type = 'upsert',
        primary_key = 'user_id, ad_id, click_timestamp, impression_timestamp',
        warehouse.path = 's3://hummock001/iceberg-data',
        s3.endpoint = 'http://minio-0:9301',
        s3.access.key = 'hummockadmin',
        s3.secret.key = 'hummockadmin',
        s3.region = 'us-east-1',
        catalog.type = 'storage',
        database.name = 'demo_db',
        table.name = 'demo_table'
    );
|
@ -11,3 +11,12 @@ where
|
|||||||
and impression_timestamp + interval '1' minute >= click_timestamp
|
and impression_timestamp + interval '1' minute >= click_timestamp
|
||||||
group by
|
group by
|
||||||
ad_id;
|
ad_id;
|
||||||
|
|
||||||
|
-- Pass-through view exposing the raw click/impression events from
-- `ad_source` so downstream sinks can consume them by a stable name.
CREATE MATERIALIZED VIEW clicks AS
SELECT
    user_id,
    ad_id,
    click_timestamp,
    impression_timestamp
FROM
    ad_source;
|
||||||
|
63
templates/docker-compose.iceberg.yaml
Normal file
63
templates/docker-compose.iceberg.yaml
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
# Spark master/worker services used to create and query the Iceberg table
# that RisingWave sinks into. `spark-air` is the shared build definition.
x-spark-common: &spark-air
  build:
    context: .
    dockerfile: spark.Dockerfile
    target: spark

services:
  spark:
    <<: *spark-air
    environment:
      - SPARK_MODE=master
    ports:
      - '7077:7077'
    # Helper scripts mounted into the master container; run as:
    #   docker compose exec spark /spark-script/run_sql.sh create-table
    configs:
      - source: run_sql
        target: /spark-script/run_sql.sh
        mode: 0755
      - source: create_table
        target: /spark-script/create-table.sql
        mode: 0755
      - source: query_table
        target: /spark-script/query-table.sql
        mode: 0755
  spark-worker:
    <<: *spark-air
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1

configs:
  run_sql:
    content: |
      set -ex

      # NOTE: keep the space before each trailing backslash; without it the
      # shell line-continuation glues the next flag onto the previous
      # argument (e.g. "...hadoop-aws:3.3.2--master").
      spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.1.0,org.apache.hadoop:hadoop-aws:3.3.2 \
        --master spark://spark:7077 \
        --conf spark.sql.catalog.demo=org.apache.iceberg.spark.SparkCatalog \
        --conf spark.sql.catalog.demo.type=hadoop \
        --conf spark.sql.catalog.demo.warehouse=s3a://hummock001/iceberg-data \
        --conf spark.sql.catalog.demo.hadoop.fs.s3a.endpoint=http://minio-0:9301 \
        --conf spark.sql.catalog.demo.hadoop.fs.s3a.path.style.access=true \
        --conf spark.sql.catalog.demo.hadoop.fs.s3a.access.key=hummockadmin \
        --conf spark.sql.catalog.demo.hadoop.fs.s3a.secret.key=hummockadmin \
        --conf spark.sql.defaultCatalog=demo \
        -f /spark-script/$1.sql

  query_table:
    content: |
      SELECT * from demo.demo_db.demo_table;
  create_table:
    content: |
      drop table if exists demo.demo_db.demo_table;

      CREATE TABLE demo.demo_db.demo_table
      (
        user_id bigint,
        ad_id bigint,
        click_timestamp timestamp,
        impression_timestamp timestamp
      ) TBLPROPERTIES ('format-version'='2');

name: iceberg
|
@ -58,6 +58,7 @@ services:
|
|||||||
target: /risingwave.toml
|
target: /risingwave.toml
|
||||||
environment:
|
environment:
|
||||||
RUST_BACKTRACE: "1"
|
RUST_BACKTRACE: "1"
|
||||||
|
RUST_LOG: debug
|
||||||
# If ENABLE_TELEMETRY is not set, telemetry will start by default
|
# If ENABLE_TELEMETRY is not set, telemetry will start by default
|
||||||
ENABLE_TELEMETRY: true
|
ENABLE_TELEMETRY: true
|
||||||
RW_TELEMETRY_TYPE: "docker-compose"
|
RW_TELEMETRY_TYPE: "docker-compose"
|
||||||
|
16
templates/spark.Dockerfile
Normal file
16
templates/spark.Dockerfile
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
FROM bitnami/spark:3.3 AS spark

USER root

RUN apt-get update && \
    apt-get install -y curl && \
    apt-get clean

USER 1001

# Replace the Bitnami-bundled jars with the upstream Spark 3.3.1 jar set so
# the Iceberg/S3 packages pulled at runtime resolve consistently.
# --fail makes curl exit non-zero on HTTP errors instead of saving an error
# page, so a bad download fails the build here rather than at runtime.
RUN rm -r /opt/bitnami/spark/jars && \
    curl --fail https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz | \
    tar --extract --gzip --strip=1 --directory /opt/bitnami/spark/ spark-3.3.1-bin-hadoop3/jars/
RUN curl --fail https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.231/aws-java-sdk-bundle-1.12.231.jar --output /opt/bitnami/spark/jars/aws-java-sdk-bundle-1.12.231.jar
RUN curl --fail https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar --output /opt/bitnami/spark/jars/hadoop-aws-3.3.1.jar
RUN curl --fail https://repo1.maven.org/maven2/net/java/dev/jets3t/jets3t/0.9.4/jets3t-0.9.4.jar --output /opt/bitnami/spark/jars/jets3t-0.9.4.jar
|
Loading…
Reference in New Issue
Block a user