feat: add adclicks example

This commit is contained in:
Kasper Juul Hermansen 2025-02-01 00:34:01 +01:00
parent aa1cec9986
commit 50aa9c7a14
Signed by: kjuulh
SSH Key Fingerprint: SHA256:RjXh0p7U6opxnfd3ga/Y9TCo18FYlHFdSpRIV72S/QM
8 changed files with 1726 additions and 58 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
.cuddle/
target/

1237
client-application/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
# Manifest for the `client-application` binary: a CLI that publishes
# generated ad-click events to Kafka.
[package]
name = "client-application"
version = "0.1.0"
edition = "2021"
[dependencies]
# Error handling (`anyhow::Result`, `.context(...)`).
anyhow = "1.0.95"
# UTC timestamps for the generated events.
chrono = { version = "0.4.39", features = ["serde"] }
# Command-line parsing via the derive API.
clap = { version = "4.5.27", features = ["derive", "env"] }
# Loads a `.env` file at startup if one exists.
dotenvy = "0.15.7"
# Kafka producer client.
kafka = "0.10.0"
# Fixed-interval job scheduling.
nodrift = "0.3.0"
# Random user/ad ids for the generated events.
rand = "0.9.0"
# JSON serialization of the event payload.
serde = { version = "1.0.217", features = ["derive"] }
serde_json = "1.0.138"
# Async runtime, signal handling and timers.
tokio = { version = "1.43.0", features = ["full"] }
# Logging.
tracing = { version = "0.1.41", features = ["log"] }
tracing-subscriber = "0.3.19"
# NOTE(review): `uuid` does not appear to be used by the `main.rs` added in
# this commit; confirm against the rest of the crate before removing.
uuid = { version = "1.12.1", features = ["serde"] }

View File

@ -0,0 +1,17 @@
# Build stage: compile the release binary.
# NOTE(review): the builder tracks an unpinned nightly toolchain; nothing in the
# visible sources obviously requires nightly. Consider pinning to a dated nightly
# or a stable `rust:<version>` tag for reproducible builds; confirm first.
FROM rustlang/rust:nightly AS builder
WORKDIR /mnt/src
COPY Cargo.toml Cargo.toml
COPY Cargo.lock Cargo.lock
COPY src/ src/
# --locked: build exactly what the copied Cargo.lock describes, and fail instead
# of silently re-resolving dependencies if the lock file is out of date.
RUN cargo build --release --locked

# Runtime stage: only the binary plus the shared libraries it links against.
FROM debian:bookworm AS production
# apt-get (not apt) is the interface intended for scripts. Only the OpenSSL
# runtime library is installed; the -dev package with headers is not needed to
# run the binary. The package lists are removed so they do not end up in the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends libssl3 \
    && rm -rf /var/lib/apt/lists/*
COPY --from=builder /mnt/src/target/release/client-application /usr/local/bin/client-application
ENTRYPOINT ["/usr/local/bin/client-application"]

View File

@ -0,0 +1,111 @@
use std::time::Duration;
use anyhow::Context;
use chrono::{TimeDelta, Utc};
use clap::{Parser, Subcommand};
use kafka::producer::Record;
use rand::Rng;
use serde::Serialize;
// Top-level command-line interface, parsed with clap's derive API.
// Plain `//` comments are used here on purpose: clap's derive reads `///` doc
// comments and uses them as help text.
#[derive(Parser)]
#[command(author, version, about, long_about = None, subcommand_required = true)]
struct Command {
// The selected subcommand. Declared as `Option` even though
// `subcommand_required = true` is set on the command above.
#[command(subcommand)]
command: Option<Commands>,
}
// Subcommands understood by the CLI.
// Plain `//` comments are used here on purpose: clap's derive reads `///` doc
// comments and uses them as help text.
#[derive(Subcommand)]
enum Commands {
// Publish generated events to a Kafka topic at a fixed interval.
Produce {
// Kafka broker address passed to the producer (`--host`).
#[arg(long)]
host: String,
// Name of the topic the events are sent to (`--topic`).
#[arg(long)]
topic: String,
// Interval between two events, in milliseconds (`--delay-ms`).
#[arg(long = "delay-ms")]
delay_ms: u64,
},
}
/// A single ad-click event; `main` serializes it to JSON and sends it to Kafka.
#[derive(Clone, Serialize, Debug)]
struct AdSource {
/// Identifier of the user; `main` fills it with a random value in `0..64`.
user_id: i64,
/// Identifier of the ad; `main` fills it with a random value in `0..64`.
ad_id: i64,
/// Click time as text; `main` formats it as `%Y-%m-%dT%H:%M:%S` from the
/// current UTC time plus 500 ms.
click_timestamp: String,
/// Impression time as text; `main` formats it as `%Y-%m-%dT%H:%M:%S` from the
/// current UTC time.
impression_timestamp: String,
}
/// Entry point: parses the command line and runs the selected subcommand.
///
/// `produce` schedules a recurring job, fired every `--delay-ms` milliseconds,
/// that builds one randomly generated [`AdSource`] event, serializes it to JSON
/// and sends it to the configured Kafka topic. The process then waits until
/// either the scheduled job is cancelled or Ctrl-C is received.
///
/// # Errors
///
/// Returns an error if no subcommand is present after parsing.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // A missing `.env` file is fine; the result is deliberately ignored.
    dotenvy::dotenv().ok();
    tracing_subscriber::fmt::init();

    let cli = Command::parse();
    tracing::debug!("Starting cli");

    // The field is an `Option`, so report a proper error instead of panicking
    // if it is ever empty.
    match cli.command.context("a subcommand is required")? {
        Commands::Produce {
            topic,
            delay_ms,
            host,
        } => {
            /// Text format shared by both event timestamps.
            const TIMESTAMP_FORMAT: &str = "%Y-%m-%dT%H:%M:%S";

            let send_event = nodrift::schedule(Duration::from_millis(delay_ms), move || {
                // The closure runs once per tick, so each run needs its own copies.
                let host = host.clone();
                let topic = topic.clone();

                async move {
                    tracing::info!("sending event");

                    let mut rng = rand::rng();

                    // NOTE(review): a new producer is created on every tick. Reusing
                    // one producer across ticks would avoid repeated connection
                    // setup; confirm what the scheduler's closure bounds allow.
                    let mut producer = kafka::producer::Producer::from_hosts(vec![host])
                        .with_ack_timeout(Duration::from_secs(1))
                        .with_required_acks(kafka::client::RequiredAcks::One)
                        .create()
                        .map_err(|e| nodrift::DriftError::JobError(e.into()))?;

                    // Read the clock once so the click is always exactly 500 ms
                    // after the impression, instead of comparing two separate
                    // clock reads.
                    let impression_at = Utc::now();
                    let click_at = impression_at + TimeDelta::milliseconds(500);

                    let msg = AdSource {
                        user_id: rng.random_range(0..64),
                        ad_id: rng.random_range(0..64),
                        click_timestamp: click_at.format(TIMESTAMP_FORMAT).to_string(),
                        impression_timestamp: impression_at.format(TIMESTAMP_FORMAT).to_string(),
                    };

                    let payload = serde_json::to_string(&msg)
                        .context("failed to serialize type")
                        .map_err(nodrift::DriftError::JobError)?;

                    producer
                        .send(&Record::from_value(&topic, payload))
                        .map_err(|e| nodrift::DriftError::JobError(e.into()))?;

                    Ok(())
                }
            });

            println!("waiting for closure press ctrl-c to cancel");

            tokio::select! {
                _ = send_event.cancelled() => {
                    tracing::warn!("scheduled job was cancelled, shutting down");
                    // Wait a little before exiting, as the original code did.
                    tokio::time::sleep(Duration::from_secs(5)).await;
                }
                _ = tokio::signal::ctrl_c() => {
                    send_event.cancel();
                }
            }

            Ok(())
        }
    }
}

View File

@ -27,6 +27,21 @@ services:
- PLAINTEXT://redpanda:29092,OUTSIDE://localhost:9092
- --check=false
# Event producer: built from ./client-application and run with the `produce`
# subcommand against the redpanda broker.
client_application:
container_name: client_application
build:
context: ./client-application
restart: unless-stopped
environment:
# Log level picked up by the application's tracing subscriber.
RUST_LOG: info
command:
- produce
- --host=redpanda:29092
# Same topic name as the "topics" setting of the sink connector below.
- --topic=event-stream
# One event every 500 ms.
- --delay-ms=500
depends_on:
- connect
connect:
image: confluentinc/cp-kafka-connect-base:7.8.0
depends_on:
@ -85,7 +100,7 @@ services:
-H 'Accept: application/json' http://localhost:8083/connectors/IcebergSinkConnector/config \
-d '{
"tasks.max": "1",
"topics": "payments",
"topics": "event-stream",
"connector.class": "io.tabular.iceberg.connect.IcebergSinkConnector",
"iceberg.catalog.s3.endpoint": "http://minio:9000",
"iceberg.catalog.s3.secret-access-key": "minioadmin",
@ -96,7 +111,7 @@ services:
"iceberg.catalog.client.region": "eu-west-1",
"iceberg.catalog.type": "rest",
"iceberg.control.commitIntervalMs": "1000",
"iceberg.tables": "orders.payments",
"iceberg.tables": "marketing.ad_clicks",
"value.converter.schemas.enable": "false",
"value.converter": "org.apache.kafka.connect.json.JsonConverter",
"key.converter": "org.apache.kafka.connect.storage.StringConverter",
@ -164,7 +179,7 @@ services:
# Batch & Iceberg manipulation
spark-iceberg:
image: tabulario/spark-iceberg
image: tabulario/spark-iceberg:3.5.1_1.5.0
hostname: spark-iceberg
container_name: spark-iceberg
build: spark/

View File

@ -10,7 +10,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"25/01/31 22:57:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
"25/02/01 00:22:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
]
},
{
@ -24,7 +24,7 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>orders</td>\n",
" <td>marketing</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
@ -33,7 +33,7 @@
"+-----------+\n",
"| namespace |\n",
"+-----------+\n",
"| orders |\n",
"| marketing |\n",
"+-----------+"
]
},
@ -50,10 +50,17 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "70349765-e5f1-43a5-a141-cc2d54c69a58",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"25/02/01 00:37:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
]
},
{
"data": {
"text/html": [
@ -67,8 +74,8 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>orders</td>\n",
" <td>payments</td>\n",
" <td>marketing</td>\n",
" <td>ad_clicks</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
@ -78,11 +85,11 @@
"+-----------+-----------+-------------+\n",
"| namespace | tableName | isTemporary |\n",
"+-----------+-----------+-------------+\n",
"| orders | payments | False |\n",
"| marketing | ad_clicks | False |\n",
"+-----------+-----------+-------------+"
]
},
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@ -90,12 +97,12 @@
"source": [
"%%sql\n",
"\n",
"SHOW TABLES FROM orders"
"SHOW TABLES FROM marketing"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 31,
"id": "fabaed9c-9049-4996-9d26-b20f66303911",
"metadata": {},
"outputs": [
@ -112,7 +119,7 @@
" <tbody>\n",
" <tr>\n",
" <td>current-snapshot-id</td>\n",
" <td>none</td>\n",
" <td>6641965456052712871</td>\n",
" </tr>\n",
" <tr>\n",
" <td>format</td>\n",
@ -130,17 +137,17 @@
"</table>"
],
"text/plain": [
"+---------------------------------+-----------------+\n",
"+---------------------------------+---------------------+\n",
"| key | value |\n",
"+---------------------------------+-----------------+\n",
"| current-snapshot-id | none |\n",
"+---------------------------------+---------------------+\n",
"| current-snapshot-id | 6641965456052712871 |\n",
"| format | iceberg/parquet |\n",
"| format-version | 2 |\n",
"| write.parquet.compression-codec | zstd |\n",
"+---------------------------------+-----------------+"
"+---------------------------------+---------------------+"
]
},
"execution_count": 3,
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
@ -148,12 +155,12 @@
"source": [
"%%sql\n",
"\n",
"SHOW TBLPROPERTIES orders.payments"
"SHOW TBLPROPERTIES marketing.ad_clicks"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 33,
"id": "6317d9c6-140e-4a63-890e-2173fbb9503e",
"metadata": {},
"outputs": [
@ -168,7 +175,7 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>637</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
@ -177,11 +184,11 @@
"+----------+\n",
"| count(1) |\n",
"+----------+\n",
"| 0 |\n",
"| 637 |\n",
"+----------+"
]
},
"execution_count": 4,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@ -190,12 +197,12 @@
"%%sql\n",
"\n",
"SELECT COUNT(*)\n",
"FROM orders.payments"
"FROM marketing.ad_clicks"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 22,
"id": "2a1ff132-dc65-4943-a9be-416ba5a13c26",
"metadata": {},
"outputs": [
@ -205,26 +212,94 @@
"<table>\n",
" <thead>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>type</th>\n",
" <th>created_at</th>\n",
" <th>document</th>\n",
" <th>payer</th>\n",
" <th>amount</th>\n",
" <th>user_id</th>\n",
" <th>ad_id</th>\n",
" <th>click_timestamp</th>\n",
" <th>impression_timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>21</td>\n",
" <td>51</td>\n",
" <td>2025-02-01 00:42:09</td>\n",
" <td>2025-02-01 00:42:09</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>55</td>\n",
" <td>2025-02-01 00:42:10</td>\n",
" <td>2025-02-01 00:42:09</td>\n",
" </tr>\n",
" <tr>\n",
" <td>31</td>\n",
" <td>32</td>\n",
" <td>2025-02-01 00:42:10</td>\n",
" <td>2025-02-01 00:42:10</td>\n",
" </tr>\n",
" <tr>\n",
" <td>63</td>\n",
" <td>59</td>\n",
" <td>2025-02-01 00:42:04</td>\n",
" <td>2025-02-01 00:42:03</td>\n",
" </tr>\n",
" <tr>\n",
" <td>60</td>\n",
" <td>29</td>\n",
" <td>2025-02-01 00:42:04</td>\n",
" <td>2025-02-01 00:42:04</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>31</td>\n",
" <td>2025-02-01 00:42:03</td>\n",
" <td>2025-02-01 00:42:03</td>\n",
" </tr>\n",
" <tr>\n",
" <td>52</td>\n",
" <td>18</td>\n",
" <td>2025-02-01 00:41:25</td>\n",
" <td>2025-02-01 00:41:25</td>\n",
" </tr>\n",
" <tr>\n",
" <td>32</td>\n",
" <td>27</td>\n",
" <td>2025-02-01 00:41:52</td>\n",
" <td>2025-02-01 00:41:51</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2025-02-01 00:41:46</td>\n",
" <td>2025-02-01 00:41:46</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>13</td>\n",
" <td>2025-02-01 00:40:30</td>\n",
" <td>2025-02-01 00:40:29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"+----+------+------------+----------+-------+--------+\n",
"| id | type | created_at | document | payer | amount |\n",
"+----+------+------------+----------+-------+--------+\n",
"+----+------+------------+----------+-------+--------+"
"+---------+-------+---------------------+----------------------+\n",
"| user_id | ad_id | click_timestamp | impression_timestamp |\n",
"+---------+-------+---------------------+----------------------+\n",
"| 21 | 51 | 2025-02-01 00:42:09 | 2025-02-01 00:42:09 |\n",
"| 17 | 55 | 2025-02-01 00:42:10 | 2025-02-01 00:42:09 |\n",
"| 31 | 32 | 2025-02-01 00:42:10 | 2025-02-01 00:42:10 |\n",
"| 63 | 59 | 2025-02-01 00:42:04 | 2025-02-01 00:42:03 |\n",
"| 60 | 29 | 2025-02-01 00:42:04 | 2025-02-01 00:42:04 |\n",
"| 6 | 31 | 2025-02-01 00:42:03 | 2025-02-01 00:42:03 |\n",
"| 52 | 18 | 2025-02-01 00:41:25 | 2025-02-01 00:41:25 |\n",
"| 32 | 27 | 2025-02-01 00:41:52 | 2025-02-01 00:41:51 |\n",
"| 10 | 6 | 2025-02-01 00:41:46 | 2025-02-01 00:41:46 |\n",
"| 17 | 13 | 2025-02-01 00:40:30 | 2025-02-01 00:40:29 |\n",
"+---------+-------+---------------------+----------------------+"
]
},
"execution_count": 5,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@ -233,13 +308,13 @@
"%%sql\n",
"\n",
"SELECT *\n",
"FROM orders.payments\n",
"FROM marketing.ad_clicks\n",
"LIMIT 10"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 19,
"id": "a2688a95-594c-45ad-9d49-70a1bcd59a1b",
"metadata": {},
"outputs": [
@ -263,17 +338,157 @@
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>Row(ad_id=42)</td>\n",
" <td>0</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>5429</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:49.202000</td>\n",
" <td>7965471739473975852</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=38)</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>1582</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:42:13.249000</td>\n",
" <td>6641965456052712871</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=10)</td>\n",
" <td>0</td>\n",
" <td>16</td>\n",
" <td>4</td>\n",
" <td>5286</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:31.587000</td>\n",
" <td>4059346813755015811</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=3)</td>\n",
" <td>0</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>1543</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:42:13.249000</td>\n",
" <td>6641965456052712871</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=49)</td>\n",
" <td>0</td>\n",
" <td>15</td>\n",
" <td>4</td>\n",
" <td>5359</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:43.138000</td>\n",
" <td>1865904111199103577</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=5)</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>1</td>\n",
" <td>1526</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:08.813000</td>\n",
" <td>2155865929954566188</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=2)</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" <td>4105</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:42:06.311000</td>\n",
" <td>827301497454031138</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=41)</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>4</td>\n",
" <td>5253</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:41.144000</td>\n",
" <td>1472536140048912459</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=34)</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>1</td>\n",
" <td>1508</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:08.813000</td>\n",
" <td>2155865929954566188</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Row(ad_id=60)</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>3</td>\n",
" <td>4007</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2025-02-01 00:41:17.518000</td>\n",
" <td>3047889973353044630</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n",
"+---------------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+----------------------------+--------------------------+\n",
"| partition | spec_id | record_count | file_count | total_data_file_size_in_bytes | position_delete_record_count | position_delete_file_count | equality_delete_record_count | equality_delete_file_count | last_updated_at | last_updated_snapshot_id |\n",
"+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n",
"+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+"
"+---------------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+----------------------------+--------------------------+\n",
"| Row(ad_id=42) | 0 | 19 | 4 | 5429 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:49.202000 | 7965471739473975852 |\n",
"| Row(ad_id=38) | 0 | 17 | 1 | 1582 | 0 | 0 | 0 | 0 | 2025-02-01 00:42:13.249000 | 6641965456052712871 |\n",
"| Row(ad_id=10) | 0 | 16 | 4 | 5286 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:31.587000 | 4059346813755015811 |\n",
"| Row(ad_id=3) | 0 | 15 | 1 | 1543 | 0 | 0 | 0 | 0 | 2025-02-01 00:42:13.249000 | 6641965456052712871 |\n",
"| Row(ad_id=49) | 0 | 15 | 4 | 5359 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:43.138000 | 1865904111199103577 |\n",
"| Row(ad_id=5) | 0 | 14 | 1 | 1526 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:08.813000 | 2155865929954566188 |\n",
"| Row(ad_id=2) | 0 | 14 | 3 | 4105 | 0 | 0 | 0 | 0 | 2025-02-01 00:42:06.311000 | 827301497454031138 |\n",
"| Row(ad_id=41) | 0 | 13 | 4 | 5253 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:41.144000 | 1472536140048912459 |\n",
"| Row(ad_id=34) | 0 | 13 | 1 | 1508 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:08.813000 | 2155865929954566188 |\n",
"| Row(ad_id=60) | 0 | 13 | 3 | 4007 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:17.518000 | 3047889973353044630 |\n",
"+---------------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+----------------------------+--------------------------+"
]
},
"execution_count": 6,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@ -282,11 +497,58 @@
"%%sql\n",
"\n",
"SELECT * \n",
"FROM orders.payments.partitions\n",
"FROM marketing.ad_clicks.partitions\n",
"ORDER BY record_count DESC\n",
"LIMIT 10"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "89c67f0b-6bed-44fe-9c3d-99dda30477a9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
" <thead>\n",
" <tr>\n",
" <th>rewritten_data_files_count</th>\n",
" <th>added_data_files_count</th>\n",
" <th>rewritten_bytes_count</th>\n",
" <th>failed_data_files_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>61</td>\n",
" <td>11</td>\n",
" <td>78409</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"+----------------------------+------------------------+-----------------------+-------------------------+\n",
"| rewritten_data_files_count | added_data_files_count | rewritten_bytes_count | failed_data_files_count |\n",
"+----------------------------+------------------------+-----------------------+-------------------------+\n",
"| 61 | 11 | 78409 | 0 |\n",
"+----------------------------+------------------------+-----------------------+-------------------------+"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"CALL system.rewrite_data_files(table => 'marketing.ad_clicks')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -294,6 +556,14 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "80a37e69-713f-418a-9f19-f154f00408aa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@ -3,18 +3,16 @@ from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("").getOrCreate()
print("creating database")
spark.sql('CREATE DATABASE IF NOT EXISTS orders')
spark.sql('CREATE DATABASE IF NOT EXISTS marketing')
print("creating table")
spark.sql('''
CREATE TABLE IF NOT EXISTS orders.payments (
id STRING,
type STRING,
created_at TIMESTAMP,
document STRING,
payer STRING,
amount INT
CREATE TABLE IF NOT EXISTS marketing.ad_clicks (
user_id INT,
ad_id INT,
click_timestamp TIMESTAMP,
impression_timestamp TIMESTAMP
)
USING iceberg
PARTITIONED BY (document)
PARTITIONED BY (ad_id)
''')