feat: add adclicks example
This commit is contained in:
parent
aa1cec9986
commit
50aa9c7a14
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
||||
.cuddle/
|
||||
target/
|
||||
|
1237
client-application/Cargo.lock
generated
Normal file
1237
client-application/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
19
client-application/Cargo.toml
Normal file
19
client-application/Cargo.toml
Normal file
@ -0,0 +1,19 @@
|
||||
[package]
|
||||
name = "client-application"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.95"
|
||||
chrono = { version = "0.4.39", features = ["serde"] }
|
||||
clap = { version = "4.5.27", features = ["derive", "env"] }
|
||||
dotenvy = "0.15.7"
|
||||
kafka = "0.10.0"
|
||||
nodrift = "0.3.0"
|
||||
rand = "0.9.0"
|
||||
serde = { version = "1.0.217", features = ["derive"] }
|
||||
serde_json = "1.0.138"
|
||||
tokio = { version = "1.43.0", features = ["full"] }
|
||||
tracing = { version = "0.1.41", features = ["log"] }
|
||||
tracing-subscriber = "0.3.19"
|
||||
uuid = { version = "1.12.1", features = ["serde"] }
|
17
client-application/Dockerfile
Normal file
17
client-application/Dockerfile
Normal file
@ -0,0 +1,17 @@
|
||||
# Build stage: compile the release binary with the nightly Rust toolchain.
FROM rustlang/rust:nightly AS builder

WORKDIR /mnt/src

COPY Cargo.toml Cargo.toml
COPY Cargo.lock Cargo.lock
COPY src/ src/

# --locked makes the build fail instead of silently re-resolving dependencies
# when Cargo.lock is out of date with Cargo.toml.
RUN cargo build --release --locked

# Runtime stage: only the compiled binary and its shared-library dependency.
FROM debian:bookworm AS production

# apt-get is the script-stable front end (plain `apt` warns about its unstable
# CLI). The package lists are removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends libssl-dev \
    && rm -rf /var/lib/apt/lists/*

COPY --from=builder /mnt/src/target/release/client-application /usr/local/bin/client-application

ENTRYPOINT ["/usr/local/bin/client-application"]
|
111
client-application/src/main.rs
Normal file
111
client-application/src/main.rs
Normal file
@ -0,0 +1,111 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use chrono::{TimeDelta, Utc};
|
||||
use clap::{Parser, Subcommand};
|
||||
use kafka::producer::Record;
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None, subcommand_required = true)]
|
||||
struct Command {
|
||||
#[command(subcommand)]
|
||||
command: Option<Commands>,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
Produce {
|
||||
#[arg(long)]
|
||||
host: String,
|
||||
|
||||
#[arg(long)]
|
||||
topic: String,
|
||||
|
||||
#[arg(long = "delay-ms")]
|
||||
delay_ms: u64,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Debug)]
|
||||
struct AdSource {
|
||||
user_id: i64,
|
||||
ad_id: i64,
|
||||
click_timestamp: String,
|
||||
impression_timestamp: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
dotenvy::dotenv().ok();
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let cli = Command::parse();
|
||||
tracing::debug!("Starting cli");
|
||||
|
||||
match cli.command.unwrap() {
|
||||
Commands::Produce {
|
||||
topic,
|
||||
delay_ms,
|
||||
host,
|
||||
} => {
|
||||
let send_event =
|
||||
nodrift::schedule(std::time::Duration::from_millis(delay_ms), move || {
|
||||
let host = host.clone();
|
||||
let topic = topic.clone();
|
||||
|
||||
async move {
|
||||
tracing::info!("sending event");
|
||||
let mut rng = rand::rng();
|
||||
|
||||
let mut producer = kafka::producer::Producer::from_hosts(vec![host])
|
||||
.with_ack_timeout(Duration::from_secs(1))
|
||||
.with_required_acks(kafka::client::RequiredAcks::One)
|
||||
.create()
|
||||
.map_err(|e| nodrift::DriftError::JobError(e.into()))?;
|
||||
|
||||
let msg = AdSource {
|
||||
user_id: rng.random_range(0..64),
|
||||
ad_id: rng.random_range(0..64),
|
||||
click_timestamp: format!(
|
||||
"{}",
|
||||
Utc::now()
|
||||
.checked_add_signed(TimeDelta::milliseconds(500))
|
||||
.unwrap()
|
||||
.format("%Y-%m-%dT%H:%M:%S")
|
||||
),
|
||||
impression_timestamp: format!(
|
||||
"{}",
|
||||
Utc::now().to_utc().format("%Y-%m-%dT%H:%M:%S")
|
||||
),
|
||||
};
|
||||
|
||||
producer
|
||||
.send(&Record::from_value(
|
||||
&topic,
|
||||
serde_json::to_string(&msg)
|
||||
.context("failed to serialize type")
|
||||
.map_err(nodrift::DriftError::JobError)?,
|
||||
))
|
||||
.map_err(|e| nodrift::DriftError::JobError(e.into()))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
});
|
||||
|
||||
println!("waiting for closure press ctrl-c to cancel");
|
||||
|
||||
tokio::select! {
|
||||
_ = send_event.cancelled() => {
|
||||
tokio::time::sleep(Duration::from_secs(5)).await;
|
||||
return Ok(())
|
||||
}
|
||||
_ = tokio::signal::ctrl_c() => {
|
||||
send_event.cancel();
|
||||
return Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -27,6 +27,21 @@ services:
|
||||
- PLAINTEXT://redpanda:29092,OUTSIDE://localhost:9092
|
||||
- --check=false
|
||||
|
||||
client_application:
|
||||
container_name: client_application
|
||||
build:
|
||||
context: ./client-application
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
RUST_LOG: info
|
||||
command:
|
||||
- produce
|
||||
- --host=redpanda:29092
|
||||
- --topic=event-stream
|
||||
- --delay-ms=500
|
||||
depends_on:
|
||||
- connect
|
||||
|
||||
connect:
|
||||
image: confluentinc/cp-kafka-connect-base:7.8.0
|
||||
depends_on:
|
||||
@ -85,7 +100,7 @@ services:
|
||||
-H 'Accept: application/json' http://localhost:8083/connectors/IcebergSinkConnector/config \
|
||||
-d '{
|
||||
"tasks.max": "1",
|
||||
"topics": "payments",
|
||||
"topics": "event-stream",
|
||||
"connector.class": "io.tabular.iceberg.connect.IcebergSinkConnector",
|
||||
"iceberg.catalog.s3.endpoint": "http://minio:9000",
|
||||
"iceberg.catalog.s3.secret-access-key": "minioadmin",
|
||||
@ -96,7 +111,7 @@ services:
|
||||
"iceberg.catalog.client.region": "eu-west-1",
|
||||
"iceberg.catalog.type": "rest",
|
||||
"iceberg.control.commitIntervalMs": "1000",
|
||||
"iceberg.tables": "orders.payments",
|
||||
"iceberg.tables": "marketing.ad_clicks",
|
||||
"value.converter.schemas.enable": "false",
|
||||
"value.converter": "org.apache.kafka.connect.json.JsonConverter",
|
||||
"key.converter": "org.apache.kafka.connect.storage.StringConverter",
|
||||
@ -164,7 +179,7 @@ services:
|
||||
|
||||
# Batch & Iceberg manipulation
|
||||
spark-iceberg:
|
||||
image: tabulario/spark-iceberg
|
||||
image: tabulario/spark-iceberg:3.5.1_1.5.0
|
||||
hostname: spark-iceberg
|
||||
container_name: spark-iceberg
|
||||
build: spark/
|
||||
|
@ -10,7 +10,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"25/01/31 22:57:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
|
||||
"25/02/01 00:22:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -24,7 +24,7 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>orders</td>\n",
|
||||
" <td>marketing</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>"
|
||||
@ -33,7 +33,7 @@
|
||||
"+-----------+\n",
|
||||
"| namespace |\n",
|
||||
"+-----------+\n",
|
||||
"| orders |\n",
|
||||
"| marketing |\n",
|
||||
"+-----------+"
|
||||
]
|
||||
},
|
||||
@ -50,10 +50,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"id": "70349765-e5f1-43a5-a141-cc2d54c69a58",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"25/02/01 00:37:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
@ -67,8 +74,8 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>orders</td>\n",
|
||||
" <td>payments</td>\n",
|
||||
" <td>marketing</td>\n",
|
||||
" <td>ad_clicks</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
@ -78,11 +85,11 @@
|
||||
"+-----------+-----------+-------------+\n",
|
||||
"| namespace | tableName | isTemporary |\n",
|
||||
"+-----------+-----------+-------------+\n",
|
||||
"| orders | payments | False |\n",
|
||||
"| marketing | ad_clicks | False |\n",
|
||||
"+-----------+-----------+-------------+"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -90,12 +97,12 @@
|
||||
"source": [
|
||||
"%%sql\n",
|
||||
"\n",
|
||||
"SHOW TABLES FROM orders"
|
||||
"SHOW TABLES FROM marketing"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 31,
|
||||
"id": "fabaed9c-9049-4996-9d26-b20f66303911",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -112,7 +119,7 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>current-snapshot-id</td>\n",
|
||||
" <td>none</td>\n",
|
||||
" <td>6641965456052712871</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>format</td>\n",
|
||||
@ -130,17 +137,17 @@
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"+---------------------------------+-----------------+\n",
|
||||
"+---------------------------------+---------------------+\n",
|
||||
"| key | value |\n",
|
||||
"+---------------------------------+-----------------+\n",
|
||||
"| current-snapshot-id | none |\n",
|
||||
"+---------------------------------+---------------------+\n",
|
||||
"| current-snapshot-id | 6641965456052712871 |\n",
|
||||
"| format | iceberg/parquet |\n",
|
||||
"| format-version | 2 |\n",
|
||||
"| write.parquet.compression-codec | zstd |\n",
|
||||
"+---------------------------------+-----------------+"
|
||||
"+---------------------------------+---------------------+"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -148,12 +155,12 @@
|
||||
"source": [
|
||||
"%%sql\n",
|
||||
"\n",
|
||||
"SHOW TBLPROPERTIES orders.payments"
|
||||
"SHOW TBLPROPERTIES marketing.ad_clicks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 33,
|
||||
"id": "6317d9c6-140e-4a63-890e-2173fbb9503e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -168,7 +175,7 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>637</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>"
|
||||
@ -177,11 +184,11 @@
|
||||
"+----------+\n",
|
||||
"| count(1) |\n",
|
||||
"+----------+\n",
|
||||
"| 0 |\n",
|
||||
"| 637 |\n",
|
||||
"+----------+"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -190,12 +197,12 @@
|
||||
"%%sql\n",
|
||||
"\n",
|
||||
"SELECT COUNT(*)\n",
|
||||
"FROM orders.payments"
|
||||
"FROM marketing.ad_clicks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 22,
|
||||
"id": "2a1ff132-dc65-4943-a9be-416ba5a13c26",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -205,26 +212,94 @@
|
||||
"<table>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>type</th>\n",
|
||||
" <th>created_at</th>\n",
|
||||
" <th>document</th>\n",
|
||||
" <th>payer</th>\n",
|
||||
" <th>amount</th>\n",
|
||||
" <th>user_id</th>\n",
|
||||
" <th>ad_id</th>\n",
|
||||
" <th>click_timestamp</th>\n",
|
||||
" <th>impression_timestamp</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>21</td>\n",
|
||||
" <td>51</td>\n",
|
||||
" <td>2025-02-01 00:42:09</td>\n",
|
||||
" <td>2025-02-01 00:42:09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>55</td>\n",
|
||||
" <td>2025-02-01 00:42:10</td>\n",
|
||||
" <td>2025-02-01 00:42:09</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>31</td>\n",
|
||||
" <td>32</td>\n",
|
||||
" <td>2025-02-01 00:42:10</td>\n",
|
||||
" <td>2025-02-01 00:42:10</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>63</td>\n",
|
||||
" <td>59</td>\n",
|
||||
" <td>2025-02-01 00:42:04</td>\n",
|
||||
" <td>2025-02-01 00:42:03</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>60</td>\n",
|
||||
" <td>29</td>\n",
|
||||
" <td>2025-02-01 00:42:04</td>\n",
|
||||
" <td>2025-02-01 00:42:04</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>31</td>\n",
|
||||
" <td>2025-02-01 00:42:03</td>\n",
|
||||
" <td>2025-02-01 00:42:03</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>52</td>\n",
|
||||
" <td>18</td>\n",
|
||||
" <td>2025-02-01 00:41:25</td>\n",
|
||||
" <td>2025-02-01 00:41:25</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>32</td>\n",
|
||||
" <td>27</td>\n",
|
||||
" <td>2025-02-01 00:41:52</td>\n",
|
||||
" <td>2025-02-01 00:41:51</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>2025-02-01 00:41:46</td>\n",
|
||||
" <td>2025-02-01 00:41:46</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>2025-02-01 00:40:30</td>\n",
|
||||
" <td>2025-02-01 00:40:29</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"+----+------+------------+----------+-------+--------+\n",
|
||||
"| id | type | created_at | document | payer | amount |\n",
|
||||
"+----+------+------------+----------+-------+--------+\n",
|
||||
"+----+------+------------+----------+-------+--------+"
|
||||
"+---------+-------+---------------------+----------------------+\n",
|
||||
"| user_id | ad_id | click_timestamp | impression_timestamp |\n",
|
||||
"+---------+-------+---------------------+----------------------+\n",
|
||||
"| 21 | 51 | 2025-02-01 00:42:09 | 2025-02-01 00:42:09 |\n",
|
||||
"| 17 | 55 | 2025-02-01 00:42:10 | 2025-02-01 00:42:09 |\n",
|
||||
"| 31 | 32 | 2025-02-01 00:42:10 | 2025-02-01 00:42:10 |\n",
|
||||
"| 63 | 59 | 2025-02-01 00:42:04 | 2025-02-01 00:42:03 |\n",
|
||||
"| 60 | 29 | 2025-02-01 00:42:04 | 2025-02-01 00:42:04 |\n",
|
||||
"| 6 | 31 | 2025-02-01 00:42:03 | 2025-02-01 00:42:03 |\n",
|
||||
"| 52 | 18 | 2025-02-01 00:41:25 | 2025-02-01 00:41:25 |\n",
|
||||
"| 32 | 27 | 2025-02-01 00:41:52 | 2025-02-01 00:41:51 |\n",
|
||||
"| 10 | 6 | 2025-02-01 00:41:46 | 2025-02-01 00:41:46 |\n",
|
||||
"| 17 | 13 | 2025-02-01 00:40:30 | 2025-02-01 00:40:29 |\n",
|
||||
"+---------+-------+---------------------+----------------------+"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -233,13 +308,13 @@
|
||||
"%%sql\n",
|
||||
"\n",
|
||||
"SELECT *\n",
|
||||
"FROM orders.payments\n",
|
||||
"FROM marketing.ad_clicks\n",
|
||||
"LIMIT 10"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 19,
|
||||
"id": "a2688a95-594c-45ad-9d49-70a1bcd59a1b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -263,17 +338,157 @@
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=42)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>19</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>5429</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:49.202000</td>\n",
|
||||
" <td>7965471739473975852</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=38)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1582</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:42:13.249000</td>\n",
|
||||
" <td>6641965456052712871</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=10)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>16</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>5286</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:31.587000</td>\n",
|
||||
" <td>4059346813755015811</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=3)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>15</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1543</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:42:13.249000</td>\n",
|
||||
" <td>6641965456052712871</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=49)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>15</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>5359</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:43.138000</td>\n",
|
||||
" <td>1865904111199103577</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=5)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>14</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1526</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:08.813000</td>\n",
|
||||
" <td>2155865929954566188</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=2)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>14</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>4105</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:42:06.311000</td>\n",
|
||||
" <td>827301497454031138</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=41)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>5253</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:41.144000</td>\n",
|
||||
" <td>1472536140048912459</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=34)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1508</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:08.813000</td>\n",
|
||||
" <td>2155865929954566188</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>Row(ad_id=60)</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>4007</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2025-02-01 00:41:17.518000</td>\n",
|
||||
" <td>3047889973353044630</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n",
|
||||
"+---------------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+----------------------------+--------------------------+\n",
|
||||
"| partition | spec_id | record_count | file_count | total_data_file_size_in_bytes | position_delete_record_count | position_delete_file_count | equality_delete_record_count | equality_delete_file_count | last_updated_at | last_updated_snapshot_id |\n",
|
||||
"+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+\n",
|
||||
"+-----------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+-----------------+--------------------------+"
|
||||
"+---------------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+----------------------------+--------------------------+\n",
|
||||
"| Row(ad_id=42) | 0 | 19 | 4 | 5429 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:49.202000 | 7965471739473975852 |\n",
|
||||
"| Row(ad_id=38) | 0 | 17 | 1 | 1582 | 0 | 0 | 0 | 0 | 2025-02-01 00:42:13.249000 | 6641965456052712871 |\n",
|
||||
"| Row(ad_id=10) | 0 | 16 | 4 | 5286 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:31.587000 | 4059346813755015811 |\n",
|
||||
"| Row(ad_id=3) | 0 | 15 | 1 | 1543 | 0 | 0 | 0 | 0 | 2025-02-01 00:42:13.249000 | 6641965456052712871 |\n",
|
||||
"| Row(ad_id=49) | 0 | 15 | 4 | 5359 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:43.138000 | 1865904111199103577 |\n",
|
||||
"| Row(ad_id=5) | 0 | 14 | 1 | 1526 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:08.813000 | 2155865929954566188 |\n",
|
||||
"| Row(ad_id=2) | 0 | 14 | 3 | 4105 | 0 | 0 | 0 | 0 | 2025-02-01 00:42:06.311000 | 827301497454031138 |\n",
|
||||
"| Row(ad_id=41) | 0 | 13 | 4 | 5253 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:41.144000 | 1472536140048912459 |\n",
|
||||
"| Row(ad_id=34) | 0 | 13 | 1 | 1508 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:08.813000 | 2155865929954566188 |\n",
|
||||
"| Row(ad_id=60) | 0 | 13 | 3 | 4007 | 0 | 0 | 0 | 0 | 2025-02-01 00:41:17.518000 | 3047889973353044630 |\n",
|
||||
"+---------------+---------+--------------+------------+-------------------------------+------------------------------+----------------------------+------------------------------+----------------------------+----------------------------+--------------------------+"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -282,11 +497,58 @@
|
||||
"%%sql\n",
|
||||
"\n",
|
||||
"SELECT * \n",
|
||||
"FROM orders.payments.partitions\n",
|
||||
"FROM marketing.ad_clicks.partitions\n",
|
||||
"ORDER BY record_count DESC\n",
|
||||
"LIMIT 10"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "89c67f0b-6bed-44fe-9c3d-99dda30477a9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th>rewritten_data_files_count</th>\n",
|
||||
" <th>added_data_files_count</th>\n",
|
||||
" <th>rewritten_bytes_count</th>\n",
|
||||
" <th>failed_data_files_count</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>61</td>\n",
|
||||
" <td>11</td>\n",
|
||||
" <td>78409</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"+----------------------------+------------------------+-----------------------+-------------------------+\n",
|
||||
"| rewritten_data_files_count | added_data_files_count | rewritten_bytes_count | failed_data_files_count |\n",
|
||||
"+----------------------------+------------------------+-----------------------+-------------------------+\n",
|
||||
"| 61 | 11 | 78409 | 0 |\n",
|
||||
"+----------------------------+------------------------+-----------------------+-------------------------+"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%sql\n",
|
||||
"\n",
|
||||
"CALL system.rewrite_data_files(table => 'marketing.ad_clicks')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@ -294,6 +556,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "80a37e69-713f-418a-9f19-f154f00408aa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -3,18 +3,16 @@ from pyspark.sql import SparkSession
|
||||
spark = SparkSession.builder.appName("").getOrCreate()
|
||||
|
||||
print("creating database")
|
||||
spark.sql('CREATE DATABASE IF NOT EXISTS orders')
|
||||
spark.sql('CREATE DATABASE IF NOT EXISTS marketing')
|
||||
|
||||
print("creating table")
|
||||
spark.sql('''
|
||||
CREATE TABLE IF NOT EXISTS orders.payments (
|
||||
id STRING,
|
||||
type STRING,
|
||||
created_at TIMESTAMP,
|
||||
document STRING,
|
||||
payer STRING,
|
||||
amount INT
|
||||
CREATE TABLE IF NOT EXISTS marketing.ad_clicks (
|
||||
user_id INT,
|
||||
ad_id INT,
|
||||
click_timestamp TIMESTAMP,
|
||||
impression_timestamp TIMESTAMP
|
||||
)
|
||||
USING iceberg
|
||||
PARTITIONED BY (document)
|
||||
PARTITIONED BY (ad_id)
|
||||
''')
|
||||
|
Loading…
x
Reference in New Issue
Block a user