diff --git a/docker-compose.yml b/docker-compose.yml
index 154d34e..2dbf4de 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,6 @@
services:
+
+ # Ingest
redpanda:
image: redpandadata/redpanda:v24.3.4
container_name: redpanda
@@ -129,6 +131,7 @@ services:
- redpanda
- connect
+ # Buckets
minio:
image: minio/minio
hostname: minio
@@ -159,6 +162,7 @@ services:
depends_on:
- minio
+ # Batch & Iceberg manipulation
spark-iceberg:
image: tabulario/spark-iceberg
hostname: spark-iceberg
@@ -196,6 +200,7 @@ services:
- ./notebooks:/home/iceberg/notebooks/notebooks
command: ["echo \"$$SPARK_DEFAULTS\" > /opt/spark/conf/spark-defaults.conf && spark-submit /home/iceberg/scripts/create_table.py && notebook"]
+ # Catalog
rest:
image: tabulario/iceberg-rest
hostname: rest
@@ -210,3 +215,99 @@ services:
- CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
- CATALOG_S3_ENDPOINT=http://minio:9000
- CATALOG_S3_PATH__STYLE__ACCESS=True
+
+ # Query layer
+
+  starrocks-fe:  # StarRocks frontend; the other StarRocks services reach it on port 9030
+    image: starrocks/fe-ubuntu:3.3.5
+    hostname: starrocks-fe
+    container_name: starrocks-fe
+    restart: unless-stopped
+    user: root
+    command: |
+      bash /opt/starrocks/fe/bin/start_fe.sh --host_type FQDN
+    ports:  # quoted so a YAML parser can never retype a host:container pair
+      - "8030:8030"
+      - "9020:9020"
+      - "9030:9030"  # MySQL-protocol port used by the healthcheck below
+    environment:  # same key pair as the Iceberg catalog SQL; presumably the MinIO credentials
+      - AWS_ACCESS_KEY_ID=minioadmin
+      - AWS_SECRET_ACCESS_KEY=minioadmin
+      - AWS_REGION=eu-west-1
+    healthcheck:  # healthy once SHOW FRONTENDS reports a frontend with "Alive: true"
+      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW FRONTENDS\G" |grep "Alive: true"'
+      interval: 10s
+      timeout: 5s
+      retries: 3
+
+  starrocks-init-tables:  # one-shot job: registers the Iceberg REST catalog in StarRocks
+    image: starrocks/fe-ubuntu:3.3.5
+    hostname: starrocks-init-tables
+    container_name: starrocks-init-tables
+    user: root
+    restart: "no"  # must be quoted: a bare no is a boolean under YAML 1.1
+    environment:
+      SETUP_SQL: |
+        CREATE EXTERNAL CATALOG 'iceberg'
+        COMMENT "Iceberg table"
+        PROPERTIES
+        (
+          "type"="iceberg",
+          "iceberg.catalog.type"="rest",
+          "iceberg.catalog.uri"="http://rest:8181",
+          "iceberg.catalog.warehouse"="warehouse",
+          "aws.s3.access_key"="minioadmin",
+          "aws.s3.secret_key"="minioadmin",
+          "aws.s3.endpoint"="http://minio:9000",
+          "aws.s3.enable_path_style_access"="true",
+          "client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
+        );
+    depends_on:  # start only after the FE healthcheck passes
+      starrocks-fe:
+        condition: service_healthy
+    command: |  # wait for FE, then create the catalog unless SHOW CATALOGS already lists it
+      bash -c '
+        until mysql -P 9030 -h starrocks-fe -u root -e "SELECT 1" >/dev/null 2>&1; do
+          echo "Waiting for StarRocks FE to be ready..."
+          sleep 5
+        done
+
+        if ! mysql -P 9030 -h starrocks-fe -u root -e "SHOW CATALOGS" | grep -q iceberg; then
+          echo "Creating Iceberg catalog..."
+          mysql -P 9030 -h starrocks-fe -u root -e "$$SETUP_SQL" || { echo "Failed to create Iceberg catalog" >&2; exit 1; }
+          echo "Iceberg catalog created successfully"
+        else
+          echo "Iceberg catalog already exists"
+        fi
+      '
+
+  starrocks-be:  # StarRocks backend; registers itself with the FE, then starts the BE process
+    image: starrocks/be-ubuntu:3.3.5
+    command:  # raise ulimits, append cache settings to be.conf, wait for FE, register, start
+      - /bin/bash
+      - -c
+      - |
+        ulimit -u 65535;
+        ulimit -n 65535;
+        echo "# Enable data cache" >> /opt/starrocks/be/conf/be.conf
+        echo "block_cache_enable = true" >> /opt/starrocks/be/conf/be.conf
+        echo "block_cache_mem_size = 536870912" >> /opt/starrocks/be/conf/be.conf
+        echo "block_cache_disk_size = 1073741824" >> /opt/starrocks/be/conf/be.conf
+        until mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "SELECT 1" >/dev/null 2>&1; do echo "Waiting for StarRocks FE to be ready..."; sleep 5; done
+        mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "ALTER SYSTEM ADD BACKEND \"starrocks-be:9050\";"
+        bash /opt/starrocks/be/bin/start_be.sh
+    ports:  # quoted so a YAML parser can never retype the host:container pair
+      - "8040:8040"
+    hostname: starrocks-be
+    container_name: starrocks-be
+    user: root
+    restart: unless-stopped
+    depends_on:  # start order only; FE readiness is handled by the wait loop in command
+      - starrocks-fe
+    healthcheck:  # asks the FE (not this container) whether a backend reports "Alive: true"
+      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW BACKENDS\G" |grep "Alive: true"'
+      interval: 10s
+      timeout: 5s
+      retries: 3
+    environment:
+      - HOST_TYPE=FQDN
diff --git a/notebooks/iceberg.ipynb b/notebooks/iceberg.ipynb
index 2452242..b754616 100644
--- a/notebooks/iceberg.ipynb
+++ b/notebooks/iceberg.ipynb
@@ -1,103 +1,320 @@
{
- "cells": [
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "29f8d24e-e4bf-484d-afd4-cb82ff6cd50d",
+ "metadata": {},
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "id": "29f8d24e-e4bf-484d-afd4-cb82ff6cd50d",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sql\n",
- "\n",
- "SHOW DATABASES"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "25/01/31 22:57:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "70349765-e5f1-43a5-a141-cc2d54c69a58",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sql\n",
- "\n",
- "SHOW TABLES FROM orders"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fabaed9c-9049-4996-9d26-b20f66303911",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sql\n",
- "\n",
- "SHOW TBLPROPERTIES orders.payments"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6317d9c6-140e-4a63-890e-2173fbb9503e",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sql\n",
- "\n",
- "SELECT COUNT(*)\n",
- "FROM orders.payments"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2a1ff132-dc65-4943-a9be-416ba5a13c26",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sql\n",
- "\n",
- "SELECT *\n",
- "FROM orders.payments\n",
- "LIMIT 10"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a2688a95-594c-45ad-9d49-70a1bcd59a1b",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sql\n",
- "\n",
- "SELECT * \n",
- "FROM orders.payments.partitions\n",
- "ORDER BY record_count DESC\n",
- "LIMIT 10"
+ "data": {
+ "text/html": [
+ "
\n",
+ " \n",
+ " \n",
+ " namespace | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " orders | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "+-----------+\n",
+ "| namespace |\n",
+ "+-----------+\n",
+ "| orders |\n",
+ "+-----------+"
]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.17"
- }
+ ],
+ "source": [
+ "%%sql\n",
+ "\n",
+ "SHOW DATABASES"
+ ]
},
- "nbformat": 4,
- "nbformat_minor": 5
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "70349765-e5f1-43a5-a141-cc2d54c69a58",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " namespace | \n",
+ " tableName | \n",
+ " isTemporary | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " orders | \n",
+ " payments | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "+-----------+-----------+-------------+\n",
+ "| namespace | tableName | isTemporary |\n",
+ "+-----------+-----------+-------------+\n",
+ "| orders | payments | False |\n",
+ "+-----------+-----------+-------------+"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "\n",
+ "SHOW TABLES FROM orders"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fabaed9c-9049-4996-9d26-b20f66303911",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " key | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " current-snapshot-id | \n",
+ " none | \n",
+ "
\n",
+ " \n",
+ " format | \n",
+ " iceberg/parquet | \n",
+ "
\n",
+ " \n",
+ " format-version | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " write.parquet.compression-codec | \n",
+ " zstd | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "+---------------------------------+-----------------+\n",
+ "| key | value |\n",
+ "+---------------------------------+-----------------+\n",
+ "| current-snapshot-id | none |\n",
+ "| format | iceberg/parquet |\n",
+ "| format-version | 2 |\n",
+ "| write.parquet.compression-codec | zstd |\n",
+ "+---------------------------------+-----------------+"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "\n",
+ "SHOW TBLPROPERTIES orders.payments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "6317d9c6-140e-4a63-890e-2173fbb9503e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " count(1) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "+----------+\n",
+ "| count(1) |\n",
+ "+----------+\n",
+ "| 0 |\n",
+ "+----------+"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "\n",
+ "SELECT COUNT(*)\n",
+ "FROM orders.payments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "2a1ff132-dc65-4943-a9be-416ba5a13c26",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " id | \n",
+ " type | \n",
+ " created_at | \n",
+ " document | \n",
+ " payer | \n",
+ " amount | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "+----+------+------------+----------+-------+--------+\n",
+ "| id | type | created_at | document | payer | amount |\n",
+ "+----+------+------------+----------+-------+--------+\n",
+ "+----+------+------------+----------+-------+--------+"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "\n",
+ "SELECT *\n",
+ "FROM orders.payments\n",
+ "LIMIT 10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a2688a95-594c-45ad-9d49-70a1bcd59a1b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " partition | \n",
+ " spec_id | \n",
+ " record_count | \n",
+ " file_count | \n",
+ " total_data_file_size_in_bytes | \n",
+ " position_delete_record_count | \n",
+ " position_delete_file_count | \n",
+ " equality_delete_record_count | \n",
+ " equality_delete_file_count | \n",
+ " last_updated_at | \n",
+ " last_updated_snapshot_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n",
+ "| partition | spec_id | record_count | file_count | total_data_file_size_in_bytes | position_delete_record_count | position_delete_file_count | equality_delete_record_count | equality_delete_file_count | last_updated_at | last_updated_snapshot_id |\n",
+ "+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n",
+ "+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "\n",
+ "SELECT * \n",
+ "FROM orders.payments.partitions\n",
+ "ORDER BY record_count DESC\n",
+ "LIMIT 10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72b17cc4-6c92-47b7-9b86-ae6a7f73af4d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}