From aa1cec9986e49e8e1bf500f9ba05d118f142c519 Mon Sep 17 00:00:00 2001
From: kjuulh
Date: Sat, 1 Feb 2025 00:28:53 +0100
Subject: [PATCH] feat: added starrocks

---
 docker-compose.yml      | 101 ++++++++++
 notebooks/iceberg.ipynb | 407 ++++++++++++++++++++++++++++++----------
 2 files changed, 413 insertions(+), 95 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 154d34e..2dbf4de 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,6 @@
 services:
+
+  # Ingest
   redpanda:
     image: redpandadata/redpanda:v24.3.4
     container_name: redpanda
@@ -129,6 +131,7 @@ services:
       - redpanda
       - connect
 
+  # Buckets
   minio:
     image: minio/minio
     hostname: minio
@@ -159,6 +162,7 @@ services:
     depends_on:
       - minio
 
+  # Batch & Iceberg manipulation
   spark-iceberg:
     image: tabulario/spark-iceberg
     hostname: spark-iceberg
@@ -196,6 +200,7 @@ services:
       - ./notebooks:/home/iceberg/notebooks/notebooks
     command: ["echo \"$$SPARK_DEFAULTS\" > /opt/spark/conf/spark-defaults.conf && spark-submit /home/iceberg/scripts/create_table.py && notebook"]
 
+  # Catalog
   rest:
     image: tabulario/iceberg-rest
     hostname: rest
@@ -210,3 +215,99 @@ services:
       - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
       - CATALOG_S3_ENDPOINT=http://minio:9000
       - CATALOG_S3_PATH__STYLE__ACCESS=True
+
+  # Query layer
+
+  starrocks-fe:  # StarRocks FE node; the other services reach it with the mysql client on port 9030
+    image: starrocks/fe-ubuntu:3.3.5
+    hostname: starrocks-fe
+    container_name: starrocks-fe
+    restart: unless-stopped
+    user: root
+    command: |
+      bash /opt/starrocks/fe/bin/start_fe.sh --host_type FQDN
+    ports:  # quoted so the mappings are always read as strings
+      - "8030:8030"
+      - "9020:9020"
+      - "9030:9030"
+    environment:
+      - AWS_ACCESS_KEY_ID=minioadmin
+      - AWS_SECRET_ACCESS_KEY=minioadmin
+      - AWS_REGION=eu-west-1
+    healthcheck:  # healthy once SHOW FRONTENDS reports "Alive: true"
+      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW FRONTENDS\G" |grep "Alive: true"'
+      interval: 10s
+      timeout: 5s
+      retries: 3
+
+  starrocks-init-tables:  # one-shot job: creates the external Iceberg catalog in StarRocks if it is missing
+    image: starrocks/fe-ubuntu:3.3.5
+    hostname: starrocks-init-tables
+    container_name: starrocks-init-tables
+    user: root
+    restart: "no"  # must be quoted: a bare no is a YAML 1.1 boolean, not the restart policy string
+    environment:
+      SETUP_SQL: |
+        CREATE EXTERNAL CATALOG 'iceberg'
+        COMMENT "Iceberg table"
+        PROPERTIES
+        (
+          "type"="iceberg",
+          "iceberg.catalog.type"="rest",
+          "iceberg.catalog.uri"="http://rest:8181",
+          "iceberg.catalog.warehouse"="warehouse",
+          "aws.s3.access_key"="minioadmin",
+          "aws.s3.secret_key"="minioadmin",
+          "aws.s3.endpoint"="http://minio:9000",
+          "aws.s3.enable_path_style_access"="true",
+          "client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
+        );
+    depends_on:
+      starrocks-fe:
+        condition: service_healthy
+    command: |
+      bash -c '
+      until mysql -P 9030 -h starrocks-fe -u root -e "SELECT 1" >/dev/null 2>&1; do
+        echo "Waiting for StarRocks FE to be ready..."
+        sleep 5
+      done
+
+      if ! mysql -P 9030 -h starrocks-fe -u root -e "SHOW CATALOGS" | grep -q iceberg; then
+        echo "Creating Iceberg catalog..."
+        mysql -P 9030 -h starrocks-fe -u root -e "$$SETUP_SQL"
+        echo "Iceberg catalog created successfully"
+      else
+        echo "Iceberg catalog already exists"
+      fi
+      '
+
+  starrocks-be:  # StarRocks BE node; registers itself with the FE, then starts the backend process
+    image: starrocks/be-ubuntu:3.3.5
+    command:
+      - /bin/bash
+      - -c
+      - |
+        ulimit -u 65535;
+        ulimit -n 65535;
+        echo "# Enable data cache" >> /opt/starrocks/be/conf/be.conf
+        echo "block_cache_enable = true" >> /opt/starrocks/be/conf/be.conf
+        echo "block_cache_mem_size = 536870912" >> /opt/starrocks/be/conf/be.conf
+        echo "block_cache_disk_size = 1073741824" >> /opt/starrocks/be/conf/be.conf
+        until mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "SELECT 1" >/dev/null 2>&1; do sleep 2; done  # poll the FE instead of a fixed sleep, so registration cannot run too early
+        mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "ALTER SYSTEM ADD BACKEND \"starrocks-be:9050\";"
+        bash /opt/starrocks/be/bin/start_be.sh
+    ports:
+      - "8040:8040"
+    hostname: starrocks-be
+    container_name: starrocks-be
+    user: root
+    restart: unless-stopped
+    depends_on:
+      - starrocks-fe
+    healthcheck:  # healthy once SHOW BACKENDS (asked of the FE) reports "Alive: true"
+      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW BACKENDS\G" |grep "Alive: true"'
+      interval: 10s
+      timeout: 5s
+      retries: 3
+    environment:
+      - HOST_TYPE=FQDN
diff --git a/notebooks/iceberg.ipynb b/notebooks/iceberg.ipynb
index 2452242..b754616 100644
--- 
a/notebooks/iceberg.ipynb +++ b/notebooks/iceberg.ipynb @@ -1,103 +1,320 @@ { - "cells": [ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "29f8d24e-e4bf-484d-afd4-cb82ff6cd50d", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "id": "29f8d24e-e4bf-484d-afd4-cb82ff6cd50d", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SHOW DATABASES" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/31 22:57:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "id": "70349765-e5f1-43a5-a141-cc2d54c69a58", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SHOW TABLES FROM orders" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fabaed9c-9049-4996-9d26-b20f66303911", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SHOW TBLPROPERTIES orders.payments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6317d9c6-140e-4a63-890e-2173fbb9503e", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SELECT COUNT(*)\n", - "FROM orders.payments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a1ff132-dc65-4943-a9be-416ba5a13c26", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SELECT *\n", - "FROM orders.payments\n", - "LIMIT 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2688a95-594c-45ad-9d49-70a1bcd59a1b", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SELECT * \n", - "FROM orders.payments.partitions\n", - "ORDER BY record_count DESC\n", - "LIMIT 10" + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namespace
orders
" + ], + "text/plain": [ + "+-----------+\n", + "| namespace |\n", + "+-----------+\n", + "| orders |\n", + "+-----------+" ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } + ], + "source": [ + "%%sql\n", + "\n", + "SHOW DATABASES" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "cell_type": "code", + "execution_count": 2, + "id": "70349765-e5f1-43a5-a141-cc2d54c69a58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namespacetableNameisTemporary
orderspaymentsFalse
" + ], + "text/plain": [ + "+-----------+-----------+-------------+\n", + "| namespace | tableName | isTemporary |\n", + "+-----------+-----------+-------------+\n", + "| orders | payments | False |\n", + "+-----------+-----------+-------------+" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SHOW TABLES FROM orders" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fabaed9c-9049-4996-9d26-b20f66303911", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keyvalue
current-snapshot-idnone
formaticeberg/parquet
format-version2
write.parquet.compression-codeczstd
" + ], + "text/plain": [ + "+---------------------------------+-----------------+\n", + "| key | value |\n", + "+---------------------------------+-----------------+\n", + "| current-snapshot-id | none |\n", + "| format | iceberg/parquet |\n", + "| format-version | 2 |\n", + "| write.parquet.compression-codec | zstd |\n", + "+---------------------------------+-----------------+" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SHOW TBLPROPERTIES orders.payments" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6317d9c6-140e-4a63-890e-2173fbb9503e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count(1)
0
" + ], + "text/plain": [ + "+----------+\n", + "| count(1) |\n", + "+----------+\n", + "| 0 |\n", + "+----------+" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SELECT COUNT(*)\n", + "FROM orders.payments" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2a1ff132-dc65-4943-a9be-416ba5a13c26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypecreated_atdocumentpayeramount
" + ], + "text/plain": [ + "+----+------+------------+----------+-------+--------+\n", + "| id | type | created_at | document | payer | amount |\n", + "+----+------+------------+----------+-------+--------+\n", + "+----+------+------------+----------+-------+--------+" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM orders.payments\n", + "LIMIT 10" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a2688a95-594c-45ad-9d49-70a1bcd59a1b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
partitionspec_idrecord_countfile_counttotal_data_file_size_in_bytesposition_delete_record_countposition_delete_file_countequality_delete_record_countequality_delete_file_countlast_updated_atlast_updated_snapshot_id
" + ], + "text/plain": [ + "+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n", + "| partition | spec_id | record_count | file_count | total_data_file_size_in_bytes | position_delete_record_count | position_delete_file_count | equality_delete_record_count | equality_delete_file_count | last_updated_at | last_updated_snapshot_id |\n", + "+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n", + "+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SELECT * \n", + "FROM orders.payments.partitions\n", + "ORDER BY record_count DESC\n", + "LIMIT 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72b17cc4-6c92-47b7-9b86-ae6a7f73af4d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 }