diff --git a/.gitignore b/.gitignore index 5d65ab34b..259888fdf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,24 @@ *.parquet hits.csv hits.tsv + +# Per-system runtime artifacts produced by benchmark.sh +result.csv +log.txt +load_out.txt +server.log +server.pid +arc_token.txt +data-size.txt +.doris_home +.sirius_env + +# Per-system data files +hits.db +mydb +hits.hyper +hits.vortex +*.vortex + +# Python venvs created by install scripts +myenv/ diff --git a/arc/benchmark.sh b/arc/benchmark.sh index d1f13caa7..b85187617 100755 --- a/arc/benchmark.sh +++ b/arc/benchmark.sh @@ -1,204 +1,5 @@ #!/bin/bash -# Arc ClickBench Complete Benchmark Script (Go Binary Version) -set -e - -# ============================================================ -# 1. INSTALL ARC FROM .DEB PACKAGE -# ============================================================ -echo "Installing Arc from .deb package..." - -# Fetch latest Arc version from GitHub releases -echo "Fetching latest Arc version..." -ARC_VERSION=$(curl -s https://api.github.com/repos/Basekick-Labs/arc/releases/latest | grep -oP '"tag_name": "v\K[^"]+') -if [ -z "$ARC_VERSION" ]; then - echo "Error: Could not fetch latest Arc version from GitHub" - exit 1 -fi -echo "Latest Arc version: $ARC_VERSION" - -ARCH=$(uname -m) -if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then - DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/arc_${ARC_VERSION}_arm64.deb" - DEB_FILE="arc_${ARC_VERSION}_arm64.deb" -else - DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/arc_${ARC_VERSION}_amd64.deb" - DEB_FILE="arc_${ARC_VERSION}_amd64.deb" -fi - -echo "Detected architecture: $ARCH -> $DEB_FILE" - -if [ ! -f "$DEB_FILE" ]; then - wget -q "$DEB_URL" -O "$DEB_FILE" -fi - -sudo dpkg -i "$DEB_FILE" || sudo apt-get install -f -y -echo "[OK] Arc installed" - -# ============================================================ -# 2. PRINT SYSTEM INFO (Arc defaults) -# ============================================================ -CORES=$(nproc) -TOTAL_MEM_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}') -TOTAL_MEM_GB=$((TOTAL_MEM_KB / 1024 / 1024)) -MEM_LIMIT_GB=$((TOTAL_MEM_GB * 80 / 100)) # 80% of system RAM - -echo "" -echo "System Configuration:" -echo " CPU cores: $CORES" -echo " Connections: $((CORES * 2)) (cores × 2)" -echo " Threads: $CORES (same as cores)" -echo " Memory limit: ${MEM_LIMIT_GB}GB (80% of ${TOTAL_MEM_GB}GB total)" -echo "" - -# ============================================================ -# 3. START ARC AND CAPTURE TOKEN FROM LOGS -# ============================================================ -echo "Starting Arc service..." - -# Check if we already have a valid token from a previous run -if [ -f "arc_token.txt" ]; then - EXISTING_TOKEN=$(cat arc_token.txt) - echo "Found existing token file, will verify after Arc starts..." -fi - -sudo systemctl start arc - -# Wait for Arc to be ready -echo "Waiting for Arc to be ready..." -for i in {1..30}; do - if curl -sf http://localhost:8000/health > /dev/null 2>&1; then - echo "[OK] Arc is ready!" 
- break - fi - if [ $i -eq 30 ]; then - echo "Error: Arc failed to start within 30 seconds" - sudo journalctl -u arc --no-pager | tail -50 - exit 1 - fi - sleep 1 -done - -# Try to get token - either from existing file or from logs (first run) -ARC_TOKEN="" - -# First, check if existing token works -if [ -n "$EXISTING_TOKEN" ]; then - if curl -sf http://localhost:8000/health -H "x-api-key: $EXISTING_TOKEN" > /dev/null 2>&1; then - ARC_TOKEN="$EXISTING_TOKEN" - echo "[OK] Using existing token from arc_token.txt" - else - echo "Existing token invalid, looking for new token in logs..." - fi -fi - -# If no valid token yet, try to extract from logs (first run scenario) -if [ -z "$ARC_TOKEN" ]; then - ARC_TOKEN=$(sudo journalctl -u arc --no-pager | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' | head -1) - if [ -n "$ARC_TOKEN" ]; then - echo "[OK] Captured new token from logs" - echo "$ARC_TOKEN" > arc_token.txt - else - echo "Error: Could not find or validate API token" - echo "If this is not the first run, Arc's database may need to be reset:" - echo " sudo rm -rf /var/lib/arc/data/arc.db" - exit 1 - fi -fi - -echo "Token: ${ARC_TOKEN:0:20}..." - -# ============================================================ -# 4. DOWNLOAD DATASET -# ============================================================ -DATASET_FILE="hits.parquet" -DATASET_URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet" -EXPECTED_SIZE=14779976446 - -if [ -f "$DATASET_FILE" ]; then - CURRENT_SIZE=$(stat -c%s "$DATASET_FILE" 2>/dev/null || stat -f%z "$DATASET_FILE" 2>/dev/null) - if [ "$CURRENT_SIZE" -eq "$EXPECTED_SIZE" ]; then - echo "[OK] Dataset already downloaded (14GB)" - else - echo "Re-downloading dataset (size mismatch)..." - rm -f "$DATASET_FILE" - wget --continue --progress=dot:giga "$DATASET_URL" - fi -else - echo "Downloading ClickBench dataset (14GB)..." - wget --continue --progress=dot:giga "$DATASET_URL" -fi - -# ============================================================ -# 5. LOAD DATA INTO ARC -# ============================================================ -echo "Loading data into Arc..." - -# Determine Arc's data directory (default: /var/lib/arc/data) -ARC_DATA_DIR="/var/lib/arc/data" -TARGET_DIR="$ARC_DATA_DIR/clickbench/hits" -TARGET_FILE="$TARGET_DIR/hits.parquet" - -sudo mkdir -p "$TARGET_DIR" - -if [ -f "$TARGET_FILE" ]; then - SOURCE_SIZE=$(stat -c%s "$DATASET_FILE" 2>/dev/null || stat -f%z "$DATASET_FILE" 2>/dev/null) - TARGET_SIZE=$(stat -c%s "$TARGET_FILE" 2>/dev/null || stat -f%z "$TARGET_FILE" 2>/dev/null) - if [ "$SOURCE_SIZE" -eq "$TARGET_SIZE" ]; then - echo "[OK] Data already loaded" - else - echo "Reloading data (size mismatch)..." - sudo cp "$DATASET_FILE" "$TARGET_FILE" - fi -else - sudo cp "$DATASET_FILE" "$TARGET_FILE" - echo "[OK] Data loaded to $TARGET_FILE" -fi - -# ============================================================ -# 6. SET ENVIRONMENT AND RUN BENCHMARK -# ============================================================ -export ARC_URL="http://localhost:8000" -export ARC_API_KEY="$ARC_TOKEN" -export DATABASE="clickbench" -export TABLE="hits" - -echo "" -echo "Running ClickBench queries (true cold runs)..." -echo "================================================" -./run.sh 2>&1 | tee log.txt - -# ============================================================ -# 7. STOP ARC AND FORMAT RESULTS -# ============================================================ -echo "Stopping Arc..." 
-sudo systemctl stop arc - -# Format results as proper JSON array -cat log.txt | grep -oE '^[0-9]+\.[0-9]+|^null' | \ - awk '{ - if (NR % 3 == 1) printf "["; - printf "%s", $1; - if (NR % 3 == 0) print "],"; - else printf ", "; - }' > results.txt - -echo "" -echo "[OK] Benchmark complete!" -echo "================================================" -echo "Load time: 0" -echo "Data size: $EXPECTED_SIZE" -cat results.txt -echo "================================================" - -# ============================================================ -# 8. CLEANUP -# ============================================================ -echo "Cleaning up..." - -# Uninstall Arc package -sudo dpkg -r arc || true - -# Remove Arc data directory -sudo rm -rf /var/lib/arc - -echo "[OK] Cleanup complete" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/arc/check b/arc/check new file mode 100755 index 000000000..2ba2f8851 --- /dev/null +++ b/arc/check @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +ARC_URL="${ARC_URL:-http://localhost:8000}" +TOKEN=$(cat arc_token.txt 2>/dev/null || true) + +if [ -n "$TOKEN" ]; then + curl -sf "$ARC_URL/health" -H "x-api-key: $TOKEN" >/dev/null +else + curl -sf "$ARC_URL/health" >/dev/null +fi diff --git a/arc/data-size b/arc/data-size new file mode 100755 index 000000000..d37e32e8e --- /dev/null +++ b/arc/data-size @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# Source parquet file size (loaded into Arc's data directory). +F="/var/lib/arc/data/clickbench/hits/hits.parquet" +if [ -f "$F" ]; then + sudo stat -c%s "$F" +else + echo 14779976446 +fi diff --git a/arc/install b/arc/install new file mode 100755 index 000000000..eb79fb4bf --- /dev/null +++ b/arc/install @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +# Install Arc from a .deb release. Idempotent. +if dpkg -l arc 2>/dev/null | grep -q '^ii '; then + exit 0 +fi + +ARC_VERSION=$(curl -s https://api.github.com/repos/Basekick-Labs/arc/releases/latest \ + | grep -oP '"tag_name": "v\K[^"]+') +if [ -z "$ARC_VERSION" ]; then + echo "Error: Could not fetch latest Arc version from GitHub" >&2 + exit 1 +fi + +ARCH=$(uname -m) +if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then + DEB_FILE="arc_${ARC_VERSION}_arm64.deb" +else + DEB_FILE="arc_${ARC_VERSION}_amd64.deb" +fi +DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/${DEB_FILE}" + +if [ ! -f "$DEB_FILE" ]; then + wget -q "$DEB_URL" -O "$DEB_FILE" +fi + +sudo dpkg -i "$DEB_FILE" || sudo apt-get install -f -y diff --git a/arc/load b/arc/load new file mode 100755 index 000000000..b46a4e326 --- /dev/null +++ b/arc/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +# Arc loads the parquet file into its data directory and indexes it on startup. +ARC_DATA_DIR="/var/lib/arc/data" +TARGET_DIR="$ARC_DATA_DIR/clickbench/hits" +TARGET_FILE="$TARGET_DIR/hits.parquet" + +sudo mkdir -p "$TARGET_DIR" + +if [ -f "$TARGET_FILE" ] && \ + [ "$(stat -c%s hits.parquet)" -eq "$(stat -c%s "$TARGET_FILE")" ]; then + : # already loaded +else + sudo cp hits.parquet "$TARGET_FILE" +fi + +# Free up local space. +rm -f hits.parquet +sync diff --git a/arc/query b/arc/query new file mode 100755 index 000000000..da3619df9 --- /dev/null +++ b/arc/query @@ -0,0 +1,49 @@ +#!/bin/bash +# Reads a SQL query from stdin, POSTs it to Arc's HTTP API. +# Stdout: query response body (JSON). 
+# Stderr: query runtime in fractional seconds on the last line (extracted +# from Arc's journal log line `execution_time_ms=N`). +# Exit non-zero on error. +set -e + +ARC_URL="${ARC_URL:-http://localhost:8000}" +ARC_API_KEY="${ARC_API_KEY:-$(cat arc_token.txt 2>/dev/null)}" + +query=$(cat) + +# Build JSON payload with proper escaping. +JSON_PAYLOAD=$(jq -Rs '{sql: .}' <<<"$query") + +# Mark journal position so we can locate the matching execution_time_ms entry. +LOG_MARKER=$(date -u +"%Y-%m-%dT%H:%M:%S") + +RESPONSE=$(curl -s -w "\n%{http_code}" \ + -X POST "$ARC_URL/api/v1/query" \ + -H "x-api-key: $ARC_API_KEY" \ + -H "Content-Type: application/json" \ + -d "$JSON_PAYLOAD" \ + --max-time 300) + +HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -1) +BODY=$(printf '%s\n' "$RESPONSE" | head -n -1) + +if [ "$HTTP_CODE" != "200" ]; then + printf 'arc query failed: HTTP %s\n%s\n' "$HTTP_CODE" "$BODY" >&2 + exit 1 +fi + +# Result body to stdout. +printf '%s\n' "$BODY" + +# Extract execution_time_ms from Arc's journal — give it a moment to flush. +sleep 0.1 +EXEC_MS=$(sudo journalctl -u arc --since="$LOG_MARKER" --no-pager 2>/dev/null \ + | grep -oP 'execution_time_ms=\K[0-9]+' | tail -1) + +if [ -z "$EXEC_MS" ]; then + echo "Could not extract execution_time_ms from arc journal" >&2 + exit 1 +fi + +# Convert ms -> seconds and emit on stderr. +awk -v ms="$EXEC_MS" 'BEGIN { printf "%.4f\n", ms / 1000 }' >&2 diff --git a/arc/run.sh b/arc/run.sh deleted file mode 100755 index 4145c9c5b..000000000 --- a/arc/run.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash -# Arc ClickBench Benchmark Runner - TRUE COLD RUNS -# Restarts Arc service and clears OS cache before EACH QUERY (not each run) -# Pattern: restart -> run query 3 times -> next query - -TRIES=3 -ARC_URL="${ARC_URL:-http://localhost:8000}" -ARC_API_KEY="${ARC_API_KEY:-$(cat arc_token.txt 2>/dev/null)}" - -echo "Running benchmark with TRUE COLD RUNS (restart + cache clear before each query)" >&2 -echo "API endpoint: $ARC_URL" >&2 -echo "API key: ${ARC_API_KEY:0:20}..." >&2 - -# Function to restart Arc and clear caches -restart_arc() { - # Stop Arc - sudo systemctl stop arc - - # Clear OS page cache - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - # Start Arc - sudo systemctl start arc - - # Wait for Arc to be ready - for i in {1..30}; do - if curl -sf "$ARC_URL/health" > /dev/null 2>&1; then - sleep 0.2 # Extra delay to ensure server is fully ready - return 0 - fi - sleep 0.5 - done - echo "Error: Arc failed to restart" >&2 - return 1 -} - -# Read queries line by line -cat queries.sql | while read -r query; do - # Skip empty lines and comments - [[ -z "$query" || "$query" =~ ^-- ]] && continue - - # TRUE COLD RUN: Restart Arc and clear OS cache ONCE per query - restart_arc - - echo "$query" >&2 - - # Run the query 3 times (first is cold, 2-3 benefit from warm DB caches) - for i in $(seq 1 $TRIES); do - # Mark the log position before query - LOG_MARKER=$(date -u +"%Y-%m-%dT%H:%M:%S") - - # Build JSON payload properly using printf to escape the query - JSON_PAYLOAD=$(printf '{"sql": %s}' "$(echo "$query" | jq -Rs .)") - - # Execute query - RESPONSE=$(curl -s -w "\n%{http_code}" \ - -X POST "$ARC_URL/api/v1/query" \ - -H "x-api-key: $ARC_API_KEY" \ - -H "Content-Type: application/json" \ - -d "$JSON_PAYLOAD" \ - --max-time 300 2>/dev/null) - - HTTP_CODE=$(echo "$RESPONSE" | tail -1) - - if [ "$HTTP_CODE" = "200" ]; then - # Extract execution_time_ms from Arc logs - # Log format: 2025-11-28T14:20:44Z INF ... execution_time_ms=97 ... 
- sleep 0.1 # Small delay to ensure log is written - EXEC_TIME_MS=$(sudo journalctl -u arc --since="$LOG_MARKER" --no-pager 2>/dev/null | \ - grep -oP 'execution_time_ms=\K[0-9]+' | tail -1) - - if [ -n "$EXEC_TIME_MS" ]; then - # Convert ms to seconds with 4 decimal places - EXEC_TIME_SEC=$(echo "scale=4; $EXEC_TIME_MS / 1000" | bc) - printf "%.4f\n" "$EXEC_TIME_SEC" - else - echo "null" - echo "Warning: Could not extract execution_time_ms from logs" >&2 - fi - else - echo "null" - if [ "$i" -eq 1 ]; then - echo "Query failed (HTTP $HTTP_CODE): ${query:0:50}..." >&2 - echo "Response: $(echo "$RESPONSE" | head -n -1 | head -c 200)" >&2 - fi - fi - done -done - -echo "Benchmark complete!" >&2 diff --git a/arc/start b/arc/start new file mode 100755 index 000000000..d06f81cab --- /dev/null +++ b/arc/start @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +ARC_URL="${ARC_URL:-http://localhost:8000}" + +# Idempotent: if already up and we have a working token, do nothing. +if [ -f arc_token.txt ]; then + TOKEN=$(cat arc_token.txt) + if curl -sf "$ARC_URL/health" -H "x-api-key: $TOKEN" >/dev/null 2>&1; then + exit 0 + fi +fi + +sudo systemctl start arc + +# Wait for the HTTP endpoint to come up before we try to read the token. +for _ in $(seq 1 30); do + if curl -sf "$ARC_URL/health" >/dev/null 2>&1; then + break + fi + sleep 1 +done + +# On first start, Arc prints its admin token to its journal; capture it. +if [ ! -f arc_token.txt ] || \ + ! curl -sf "$ARC_URL/health" -H "x-api-key: $(cat arc_token.txt)" >/dev/null 2>&1; then + TOKEN=$(sudo journalctl -u arc --no-pager \ + | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' \ + | head -1) + if [ -z "$TOKEN" ]; then + echo "Error: Could not extract Arc admin API token from journal" >&2 + exit 1 + fi + echo "$TOKEN" > arc_token.txt +fi diff --git a/arc/stop b/arc/stop new file mode 100755 index 000000000..98db475d8 --- /dev/null +++ b/arc/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop arc || true diff --git a/byconity/benchmark.sh b/byconity/benchmark.sh index af59bdff2..531bd6503 100755 --- a/byconity/benchmark.sh +++ b/byconity/benchmark.sh @@ -1,45 +1,5 @@ -#!/bin/bash -e - -sudo apt-get update -y -sudo apt-get install -y ca-certificates curl gnupg -sudo install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --yes --dearmor -o /etc/apt/keyrings/docker.gpg -sudo chmod a+r /etc/apt/keyrings/docker.gpg -echo \ - "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - "$(. 
/etc/os-release && echo "$VERSION_CODENAME")" stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - -sudo apt-get update -y -sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Make sure docker is running -sudo systemctl start docker - -docker compose up -d -sleep 5 - -hdfs/create_users.sh - -function byconity() -{ - docker compose exec -T server clickhouse-client --port 52145 "$@" -} -export -f byconity - -byconity --time -n < create.sql -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -pigz -fkd hits.tsv.gz - -START=$(date +%s) -byconity --database bench --query "INSERT INTO hits FORMAT TSV" < hits.tsv -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -# NOTE: sometimes may hung due to docker-compose, using docker directly may help -./run.sh - -echo -n "Data size: " -byconity --enable_multiple_tables_for_cnch_parts=1 --query "SELECT sum(bytes_on_disk) FROM system.cnch_parts WHERE table = 'hits' AND database = 'bench'" - -docker compose down --volumes +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/byconity/check b/byconity/check new file mode 100755 index 000000000..55e22e34f --- /dev/null +++ b/byconity/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +docker compose exec -T server clickhouse-client --port 52145 \ + --query "SELECT 1" >/dev/null diff --git a/byconity/data-size b/byconity/data-size new file mode 100755 index 000000000..30383c1dd --- /dev/null +++ b/byconity/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +docker compose exec -T server clickhouse-client --port 52145 \ + --enable_multiple_tables_for_cnch_parts=1 \ + --query "SELECT sum(bytes_on_disk) FROM system.cnch_parts WHERE table = 'hits' AND database = 'bench'" diff --git a/byconity/install b/byconity/install new file mode 100755 index 000000000..dd787c963 --- /dev/null +++ b/byconity/install @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +# Install Docker (required by byconity's compose stack). Idempotent. +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y ca-certificates curl gnupg + sudo install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg \ + | sudo gpg --yes --dearmor -o /etc/apt/keyrings/docker.gpg + sudo chmod a+r /etc/apt/keyrings/docker.gpg + { + echo -n "deb [arch=$(dpkg --print-architecture) " + echo -n "signed-by=/etc/apt/keyrings/docker.gpg] " + echo -n "https://download.docker.com/linux/ubuntu " + echo "$(. /etc/os-release && echo "$VERSION_CODENAME") stable" + } | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null + + sudo apt-get update -y + sudo apt-get install -y docker-ce docker-ce-cli containerd.io \ + docker-buildx-plugin docker-compose-plugin +fi + +sudo systemctl start docker diff --git a/byconity/load b/byconity/load new file mode 100755 index 000000000..c1b5a103c --- /dev/null +++ b/byconity/load @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +byconity() { + docker compose exec -T server clickhouse-client --port 52145 "$@" +} + +# Schema (creates database `bench` and table `bench.hits`). +byconity --multiquery < create.sql + +# Ingest from the downloaded TSV. 
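+# (hits.tsv.gz is assumed to have been fetched beforehand by the driver via the
+# download-hits-tsv script named in benchmark.sh.)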
+pigz -fkd hits.tsv.gz +byconity --database bench --query "INSERT INTO hits FORMAT TSV" < hits.tsv + +rm -f hits.tsv hits.tsv.gz +sync diff --git a/byconity/query b/byconity/query new file mode 100755 index 000000000..eb1df09b0 --- /dev/null +++ b/byconity/query @@ -0,0 +1,11 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client inside the +# byconity server container. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +set -e + +query=$(cat) +docker compose exec -T server clickhouse-client --port 52145 \ + --database bench --time --query="$query" diff --git a/byconity/run.sh b/byconity/run.sh deleted file mode 100755 index 720c67860..000000000 --- a/byconity/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(byconity --database bench --time --format=Null --query="$query" &1 ||:) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/byconity/start b/byconity/start new file mode 100755 index 000000000..492b4aecd --- /dev/null +++ b/byconity/start @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# Bring the byconity stack up. Idempotent: if the server already responds, +# do nothing. +if docker compose exec -T server clickhouse-client --port 52145 \ + --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi + +docker compose up -d +sleep 5 + +# HDFS user setup is required for ingestion. +hdfs/create_users.sh diff --git a/byconity/stop b/byconity/stop new file mode 100755 index 000000000..a6a2c3661 --- /dev/null +++ b/byconity/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +docker compose down --volumes || true diff --git a/cedardb-parquet/benchmark.sh b/cedardb-parquet/benchmark.sh index 341652332..b85187617 100755 --- a/cedardb-parquet/benchmark.sh +++ b/cedardb-parquet/benchmark.sh @@ -1,39 +1,5 @@ -#!/bin/bash -e - -# docker -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -# download dataset -../download-hits-parquet-single data -chmod -R 777 data -rm -rf db -mkdir db - -# get and configure CedarDB image -echo "Starting CedarDB..." -docker run --rm -p 5432:5432 -v ./data:/data -v ./db:/var/lib/cedardb/data -e CEDAR_PASSWORD=test --name cedardb cedardb/cedardb:latest > /dev/null 2>&1 & - -# wait for container to start -until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - -# create view over the parquet file -PGPASSWORD=test psql -h localhost -U postgres -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# data size = parquet file size; load time = 0 (no ingestion) -echo -n "Data size: " -stat -c%s data/hits.parquet -echo "Load time: 0" - -# run benchmark -echo "running benchmark..." -./run.sh 2>&1 | tee log.txt - -cat log.txt | \ - grep -oP 'Time: \d+\.\d+ ms|psql: error' | \ - sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | \ - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
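lib/benchmark-common.sh is not shown in this diff, so the exact driver flow is an assumption. A minimal sketch of what the thin shims and the per-system install/start/check/stop/load/data-size/query scripts imply (TRIES, the ordering, and the reporting format are guesses) could look like this:

    #!/bin/bash
    # Hypothetical driver: illustrates the per-system script contract only;
    # the real lib/benchmark-common.sh may differ.
    set -e
    ./install
    if [ -n "$BENCH_DOWNLOAD_SCRIPT" ]; then
        "../$BENCH_DOWNLOAD_SCRIPT"
    fi
    ./start
    ./check
    echo -n "Load time: "
    command time -f '%e' ./load
    TRIES=3
    while read -r query; do
        [ -z "$query" ] && continue
        # Cold run per query: restartable systems are stopped, OS caches dropped,
        # then started again; embedded systems only get the cache drop.
        if [ "$BENCH_RESTARTABLE" = yes ]; then
            ./stop
        fi
        sync
        echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
        if [ "$BENCH_RESTARTABLE" = yes ]; then
            ./start
            ./check
        fi
        times=()
        for _ in $(seq 1 "$TRIES"); do
            # query contract: SQL on stdin, result on stdout,
            # runtime in fractional seconds on the last line of stderr.
            if err=$( { printf '%s' "$query" | ./query >/dev/null; } 2>&1 ); then
                times+=("$(printf '%s\n' "$err" | tail -n1)")
            else
                times+=(null)
            fi
        done
        echo "[${times[0]}, ${times[1]}, ${times[2]}],"
    done < queries.sql
    echo -n "Data size: "
    ./data-size
    ./stop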
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cedardb-parquet/check b/cedardb-parquet/check new file mode 100755 index 000000000..f161a0820 --- /dev/null +++ b/cedardb-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null diff --git a/cedardb-parquet/data-size b/cedardb-parquet/data-size new file mode 100755 index 000000000..7a49e4474 --- /dev/null +++ b/cedardb-parquet/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# No ingestion — reported size is the parquet file itself. +stat -c%s data/hits.parquet diff --git a/cedardb-parquet/install b/cedardb-parquet/install new file mode 100755 index 000000000..2c6f09a04 --- /dev/null +++ b/cedardb-parquet/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull cedardb/cedardb:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/cedardb-parquet/load b/cedardb-parquet/load new file mode 100755 index 000000000..ee17527ef --- /dev/null +++ b/cedardb-parquet/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# Stage parquet file under ./data so the docker container sees /data/hits.parquet. +mkdir -p data +mv hits.parquet data/ +chmod -R 777 data + +# create.sql defines a view over the parquet file — no ingestion needed. +PGPASSWORD=test psql -h localhost -U postgres -t < create.sql + +sync diff --git a/cedardb-parquet/query b/cedardb-parquet/query new file mode 100755 index 000000000..3261388dc --- /dev/null +++ b/cedardb-parquet/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against CedarDB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? 
+ +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/cedardb-parquet/run.sh b/cedardb-parquet/run.sh deleted file mode 100755 index f76d6409e..000000000 --- a/cedardb-parquet/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart $(docker ps -a -q) - - # wait for the server quietly so retry-loop messages don't pollute log.txt - # (the awk filter in benchmark.sh treats any `psql: error` line as a failed query) - until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - until PGPASSWORD=test psql -h localhost -U postgres -c "SELECT 'Ok';" > /dev/null 2>&1; do sleep 1; done - - echo "$query"; - for i in $(seq 1 $TRIES); do - PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done -done diff --git a/cedardb-parquet/start b/cedardb-parquet/start new file mode 100755 index 000000000..ad1d71439 --- /dev/null +++ b/cedardb-parquet/start @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo docker stop cedardb >/dev/null 2>&1 || true +sudo docker rm cedardb >/dev/null 2>&1 || true + +sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + -v "$(pwd)/db:/var/lib/cedardb/data" \ + -e CEDAR_PASSWORD=test \ + --name cedardb cedardb/cedardb:latest >/dev/null + +until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do + sleep 1 +done diff --git a/cedardb-parquet/stop b/cedardb-parquet/stop new file mode 100755 index 000000000..5d6ade0a8 --- /dev/null +++ b/cedardb-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop cedardb >/dev/null 2>&1 || true diff --git a/cedardb/benchmark.sh b/cedardb/benchmark.sh index a6e1c2502..531bd6503 100755 --- a/cedardb/benchmark.sh +++ b/cedardb/benchmark.sh @@ -1,43 +1,5 @@ -#!/bin/bash -e - -# docker -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client gzip - -# download dataset -../download-hits-tsv -mkdir data -mv hits.tsv data -chmod -R 777 data -rm -rf db -mkdir db - -# get and configure CedarDB image -echo "Starting CedarDB..." -docker run --rm -p 5432:5432 -v ./data:/data -v ./db:/var/lib/cedardb/data -e CEDAR_PASSWORD=test --name cedardb cedardb/cedardb:latest > /dev/null 2>&1 & - -# wait for container to start -until pg_isready -h localhost --dbname postgres -U postgres > /dev/null 2>&1; do sleep 1; done - -# create table and ingest data -PGPASSWORD=test psql -h localhost -U postgres -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -echo "Inserting data..." -echo -n "Load time: " -PGPASSWORD=test command time -f '%e' psql -h localhost -U postgres -q -t -c "COPY hits FROM '/data/hits.tsv';" - -# get ingested data size -echo -n "Data size: " -PGPASSWORD=test psql -h localhost -U postgres -q -t -c "SELECT pg_total_relation_size('hits');" - -# run benchmark -echo "running benchmark..." 
-./run.sh 2>&1 | tee log.txt - -cat log.txt | \ - grep -oP 'Time: \d+\.\d+ ms|psql: error' | \ - sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | \ - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cedardb/check b/cedardb/check new file mode 100755 index 000000000..f161a0820 --- /dev/null +++ b/cedardb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null diff --git a/cedardb/data-size b/cedardb/data-size new file mode 100755 index 000000000..da53638e4 --- /dev/null +++ b/cedardb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +PGPASSWORD=test psql -h localhost -U postgres -q -t -A -c "SELECT pg_total_relation_size('hits');" diff --git a/cedardb/install b/cedardb/install new file mode 100755 index 000000000..0043c4c84 --- /dev/null +++ b/cedardb/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client gzip + +sudo docker pull cedardb/cedardb:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/cedardb/load b/cedardb/load new file mode 100755 index 000000000..eeb6fd3a7 --- /dev/null +++ b/cedardb/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +mkdir -p data +mv hits.tsv data/ +chmod -R 777 data + +PGPASSWORD=test psql -h localhost -U postgres -t < create.sql + +PGPASSWORD=test psql -h localhost -U postgres -q -t -c "COPY hits FROM '/data/hits.tsv';" + +rm -f data/hits.tsv +sync diff --git a/cedardb/query b/cedardb/query new file mode 100755 index 000000000..3261388dc --- /dev/null +++ b/cedardb/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against CedarDB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? 
+ +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/cedardb/run.sh b/cedardb/run.sh deleted file mode 100755 index 80e7a0616..000000000 --- a/cedardb/run.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart $(docker ps -a -q) - - retry_count=0 - while [ $retry_count -lt 120 ]; do - if PGPASSWORD=test psql -h localhost -U postgres -c "SELECT 'Ok';"; then - break - fi - - retry_count=$((retry_count+1)) - sleep 1 - done - - echo "$query"; - for i in $(seq 1 $TRIES); do - PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done -done diff --git a/cedardb/start b/cedardb/start new file mode 100755 index 000000000..0f4c8b56f --- /dev/null +++ b/cedardb/start @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# `docker run --rm` cleans up container on exit; we run detached. +sudo docker stop cedardb >/dev/null 2>&1 || true +sudo docker rm cedardb >/dev/null 2>&1 || true + +sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + -v "$(pwd)/db:/var/lib/cedardb/data" \ + -e CEDAR_PASSWORD=test \ + --name cedardb cedardb/cedardb:latest >/dev/null + +until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do + sleep 1 +done diff --git a/cedardb/stop b/cedardb/stop new file mode 100755 index 000000000..5d6ade0a8 --- /dev/null +++ b/cedardb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop cedardb >/dev/null 2>&1 || true diff --git a/chdb-dataframe/benchmark.sh b/chdb-dataframe/benchmark.sh index 0bb86a8ea..fc4bacc8f 100755 --- a/chdb-dataframe/benchmark.sh +++ b/chdb-dataframe/benchmark.sh @@ -1,20 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas pyarrow -pip install chdb - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chdb-dataframe/check b/chdb-dataframe/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/chdb-dataframe/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/chdb-dataframe/data-size b/chdb-dataframe/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/chdb-dataframe/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/chdb-dataframe/install b/chdb-dataframe/install new file mode 100755 index 000000000..d1c83816b --- /dev/null +++ b/chdb-dataframe/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas pyarrow chdb fastapi uvicorn diff --git a/chdb-dataframe/load b/chdb-dataframe/load new file mode 100755 index 000000000..ceba6beca --- /dev/null +++ b/chdb-dataframe/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/chdb-dataframe/query b/chdb-dataframe/query new file mode 100755 index 000000000..a4fe4abfb --- /dev/null +++ b/chdb-dataframe/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running chdb server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
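+# Example of how the two streams separate (the timing value is made up):
+#   echo "SELECT COUNT(*) FROM Python(hits);" | ./query >body.json 2>times.txt
+#   tail -n1 times.txt   # -> 0.123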
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/chdb-dataframe/query.py b/chdb-dataframe/query.py deleted file mode 100755 index f350c8ad5..000000000 --- a/chdb-dataframe/query.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -import pandas as pd -import timeit -import datetime -import json -import subprocess -import chdb - -start = timeit.default_timer() -hits = pd.read_parquet("hits.parquet") -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") - -dataframe_size = hits.memory_usage().sum() - -# print("Dataframe(numpy) size:", dataframe_size, "bytes") - -# fix some types -hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") -hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") - -# fix all object columns to string -start = timeit.default_timer() -for col in hits.columns: - if hits[col].dtype == "O": - hits[col] = hits[col].astype(str) - -print("Dataframe(numpy) normalization time:", timeit.default_timer() - start) - -queries = [] -with open("queries.sql") as f: - queries = f.readlines() - -# conn = chdb.connect("./tmp?verbose&log-level=test") -conn = chdb.connect("./tmp") -for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) - - times = [] - for _ in range(3): - start = timeit.default_timer() - result = conn.query(q, "Null") - end = timeit.default_timer() - times.append(round(end - start, 3)) - print(times) diff --git a/chdb-dataframe/server.py b/chdb-dataframe/server.py new file mode 100644 index 000000000..38365b786 --- /dev/null +++ b/chdb-dataframe/server.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around chDB so it conforms to the ClickBench +install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the DataFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs it via + chdb against the loaded DataFrame, returns + {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (memory_usage) + +The query strings (43 of them, addressing Python(hits)) match the previous +chdb-dataframe/queries.sql, exposed over HTTP. +""" + +import os +import timeit + +import chdb +import pandas as pd +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits: pd.DataFrame | None = None # noqa: F841 — referenced by chdb's Python() table function +conn = None + + +def _make_runner(sql: str): + return lambda _df: conn.query(sql, "Null") + + +# 43 ClickBench queries — chdb addresses the in-process pandas DataFrame named +# `hits` via the Python() table function. SQL strings come straight from the +# prior chdb-dataframe/queries.sql. 
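+# NOTE: /query matches the incoming SQL against these strings verbatim (after
+# stripping surrounding whitespace), so the driver must send exactly this text.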
+_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM Python(hits);", + "SELECT COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM Python(hits);", + "SELECT AVG(UserID) FROM Python(hits);", + "SELECT COUNT(DISTINCT UserID) FROM Python(hits);", + "SELECT COUNT(DISTINCT SearchPhrase) FROM Python(hits);", + "SELECT MIN(EventDate), MAX(EventDate) FROM Python(hits);", + "SELECT AdvEngineID, COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM Python(hits) GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM Python(hits) GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM Python(hits) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM Python(hits) WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM Python(hits) WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM Python(hits) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM Python(hits) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM Python(hits) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM Python(hits) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM Python(hits) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 
4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM Python(hits);", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM Python(hits) GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM Python(hits) GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM Python(hits) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND 
EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, conn + start = timeit.default_timer() + df = pd.read_parquet("hits.parquet") + df["EventTime"] = pd.to_datetime(df["EventTime"], unit="s") + df["EventDate"] = pd.to_datetime(df["EventDate"], unit="D") + for col in df.columns: + if df[col].dtype == "O": + df[col] = df[col].astype(str) + hits = df + # chdb addresses `hits` via Python(hits); the connection picks up the + # variable from the module globals at query time. 
+ conn = chdb.connect("./tmp") + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + sql = QUERIES[idx][0] + start = timeit.default_timer() + conn.query(sql, "Null") + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if hits is None: + return {"bytes": 0} + return {"bytes": int(hits.memory_usage().sum())} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_CHDB_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/chdb-dataframe/start b/chdb-dataframe/start new file mode 100755 index 000000000..e3fab7273 --- /dev/null +++ b/chdb-dataframe/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/chdb-dataframe/stop b/chdb-dataframe/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/chdb-dataframe/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/chdb-parquet-partitioned/benchmark.sh b/chdb-parquet-partitioned/benchmark.sh index db3290d0c..3b63e772a 100755 --- a/chdb-parquet-partitioned/benchmark.sh +++ b/chdb-parquet-partitioned/benchmark.sh @@ -1,23 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install psutil pyarrow -pip install chdb - -# Load the data -../download-hits-parquet-partitioned - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -echo "Load time: 0" -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chdb-parquet-partitioned/check b/chdb-parquet-partitioned/check new file mode 100755 index 000000000..cd67e7c07 --- /dev/null +++ b/chdb-parquet-partitioned/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import chdb; chdb.query('SELECT 1')" >/dev/null diff --git a/chdb-parquet-partitioned/data-size b/chdb-parquet-partitioned/data-size new file mode 100755 index 000000000..2d6921ab6 --- /dev/null +++ b/chdb-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/chdb-parquet-partitioned/install b/chdb-parquet-partitioned/install new file mode 100755 index 000000000..5232a3de5 --- /dev/null +++ b/chdb-parquet-partitioned/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install psutil pyarrow chdb diff --git a/chdb-parquet-partitioned/load b/chdb-parquet-partitioned/load new file mode 100755 index 000000000..5a0c7a9c7 --- /dev/null +++ b/chdb-parquet-partitioned/load @@ -0,0 +1,5 @@ +#!/bin/bash +# chdb-parquet-partitioned queries the parquet files directly via file(). +# Nothing to load; the dataset is already in CWD as hits_*.parquet. +set -e +sync diff --git a/chdb-parquet-partitioned/query b/chdb-parquet-partitioned/query new file mode 100755 index 000000000..1e8f1b4dc --- /dev/null +++ b/chdb-parquet-partitioned/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via chdb against the partitioned +# parquet files in CWD. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. 
+set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +import sys +import timeit +import chdb + +query = sys.stdin.read() + +conn = chdb.connect() + +start = timeit.default_timer() +try: + res = conn.query(query, "CSV") + end = timeit.default_timer() + out = str(res) + if out: + sys.stdout.write(out) + if not out.endswith("\n"): + sys.stdout.write("\n") +finally: + conn.close() + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/chdb-parquet-partitioned/query.py b/chdb-parquet-partitioned/query.py deleted file mode 100755 index 1f9c3a048..000000000 --- a/chdb-parquet-partitioned/query.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 - -import chdb -import timeit -import sys - -query = sys.stdin.read() -print(query) - -conn = chdb.connect() -for try_num in range(3): - start = timeit.default_timer() - conn.query(query, "Null") - end = timeit.default_timer() - print(round(end - start, 3)) - -conn.close() diff --git a/chdb-parquet-partitioned/run.sh b/chdb-parquet-partitioned/run.sh deleted file mode 100755 index 02cb4f6d7..000000000 --- a/chdb-parquet-partitioned/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/chdb-parquet-partitioned/start b/chdb-parquet-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/chdb-parquet-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/chdb-parquet-partitioned/stop b/chdb-parquet-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/chdb-parquet-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/chdb/benchmark.sh b/chdb/benchmark.sh index 3f888cbfa..b0b9f4775 100755 --- a/chdb/benchmark.sh +++ b/chdb/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install psutil pyarrow -pip install chdb - -# Load the data -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' ./load.py - -# Run the queries -./run.sh 2>&1 | tee log.txt - -# Process the log.txt -cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo -n "Data size: " -du -bcs .clickbench | grep total +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chdb/check b/chdb/check new file mode 100755 index 000000000..cd67e7c07 --- /dev/null +++ b/chdb/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import chdb; chdb.query('SELECT 1')" >/dev/null diff --git a/chdb/data-size b/chdb/data-size new file mode 100755 index 000000000..226c121b8 --- /dev/null +++ b/chdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs .clickbench | awk '/total$/ { print $1 }' diff --git a/chdb/install b/chdb/install new file mode 100755 index 000000000..6dcb72afd --- /dev/null +++ b/chdb/install @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +# chdb is a Python package (ClickHouse embedded). Install it into a venv. +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! 
-d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install psutil pyarrow chdb diff --git a/chdb/load.py b/chdb/load similarity index 51% rename from chdb/load.py rename to chdb/load index 4b780538e..8a516b3e3 100755 --- a/chdb/load.py +++ b/chdb/load @@ -1,5 +1,13 @@ -#!/usr/bin/env python3 +#!/bin/bash +set -e +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Idempotent: blow away any prior data dir. +rm -rf .clickbench + +python3 - <<'PY' from chdb import dbapi con = dbapi.connect(path=".clickbench") @@ -10,3 +18,7 @@ cur.close() con.close() +PY + +rm -f hits.csv +sync diff --git a/chdb/query b/chdb/query new file mode 100755 index 000000000..0ef6aeb27 --- /dev/null +++ b/chdb/query @@ -0,0 +1,34 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via chdb against the .clickbench dir. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +import sys +import timeit +from chdb import dbapi + +query = sys.stdin.read() + +con = dbapi.connect(path=".clickbench") +cur = con.cursor() + +start = timeit.default_timer() +try: + cur._cursor.execute(query) + end = timeit.default_timer() + rows = cur.fetchall() if cur.description else [] + for row in rows: + print(row) +finally: + cur.close() + con.close() + +# Last line of stderr: fractional seconds. +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/chdb/query.py b/chdb/query.py deleted file mode 100755 index cca7676a1..000000000 --- a/chdb/query.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -import timeit -import sys -import os -import glob -from chdb import dbapi - -def main(): - query = sys.stdin.read() - print(query) - - con = dbapi.connect(path=".clickbench") - cur = con.cursor() - - for try_num in range(3): - start = timeit.default_timer() - cur._cursor.execute(query) - end = timeit.default_timer() - print(round(end - start, 3)) - - cur.close() - con.close() - -if __name__ == "__main__": - main() diff --git a/chdb/run.sh b/chdb/run.sh deleted file mode 100755 index 02cb4f6d7..000000000 --- a/chdb/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/chdb/start b/chdb/start new file mode 100755 index 000000000..71836e5f8 --- /dev/null +++ b/chdb/start @@ -0,0 +1,3 @@ +#!/bin/bash +# chdb is an embedded library — no daemon to start. +exit 0 diff --git a/chdb/stop b/chdb/stop new file mode 100755 index 000000000..1e5556318 --- /dev/null +++ b/chdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# chdb is an embedded library — no daemon to stop. +exit 0 diff --git a/chyt/benchmark.sh b/chyt/benchmark.sh index d4ee82c46..21a189925 100755 --- a/chyt/benchmark.sh +++ b/chyt/benchmark.sh @@ -1,28 +1,8 @@ #!/bin/bash - +# Thin shim — actual flow is in lib/benchmark-common.sh. +# CHYT executes against a remote YT cluster ($YT_PROXY); no local download. 
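+# The caller is expected to provide the cluster location, e.g. (hostname is
+# purely illustrative): YT_PROXY=yt.example.internal ./benchmark.sh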
export YT_USE_HOSTS=0 -export CHYT_ALIAS=*ch_public - -echo "----------------" -# Create table -echo "Creating table" -command time -f '%e' yt clickhouse execute "$(cat create.sql)" --alias $CHYT_ALIAS --proxy $YT_PROXY -echo "----------------" - -echo "----------------" -# Fill table -echo -n "Load time: " -command time -f '%e' yt clickhouse execute "$(cat fill_data.sql)" --alias $CHYT_ALIAS --proxy $YT_PROXY -echo "----------------" - -echo "----------------" -# Sort table -echo -n "Load time: " -command time -f '%e' yt sort --src //home/hits --dst //home/hits --sort-by "CounterID" --sort-by "EventDate" --sort-by "UserID" --sort-by "EventTime" --sort-by "WatchID" --proxy $YT_PROXY -echo "----------------" - -echo "----------------" -# Run benchmark -echo "Starting benchmark" -./run.sh -echo "----------------" +export CHYT_ALIAS="${CHYT_ALIAS:-*ch_public}" +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/chyt/check b/chyt/check new file mode 100755 index 000000000..a31144aa4 --- /dev/null +++ b/chyt/check @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" +: "${CHYT_ALIAS:=*ch_public}" + +yt clickhouse execute "SELECT 1" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" >/dev/null diff --git a/chyt/data-size b/chyt/data-size new file mode 100755 index 000000000..723231ca8 --- /dev/null +++ b/chyt/data-size @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" + +# Report the byte size of the //home/hits table on the YT cluster. +yt get "//home/hits/@uncompressed_data_size" --proxy "$YT_PROXY" diff --git a/chyt/install b/chyt/install new file mode 100755 index 000000000..d7512fad1 --- /dev/null +++ b/chyt/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# CHYT runs on a remote YT cluster — install the YT Python client locally to +# drive it. Idempotent. +if [ -d myenv ] && [ -x myenv/bin/yt ]; then + exit 0 +fi + +sudo apt-get install -y python3-pip python3-venv +python3 -m venv myenv +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --upgrade pip +pip install ytsaurus-client ytsaurus-yson diff --git a/chyt/load b/chyt/load new file mode 100755 index 000000000..2e23011d9 --- /dev/null +++ b/chyt/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" +: "${CHYT_ALIAS:=*ch_public}" + +# Create the table on the remote cluster. +yt clickhouse execute "$(cat create.sql)" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" + +# Fill the table from a public dataset URL (see fill_data.sql). +yt clickhouse execute "$(cat fill_data.sql)" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" + +# Sort the resulting table to give CHYT useful primary-key ordering. +yt sort --src //home/hits --dst //home/hits \ + --sort-by CounterID --sort-by EventDate --sort-by UserID \ + --sort-by EventTime --sort-by WatchID \ + --proxy "$YT_PROXY" + +sync diff --git a/chyt/query b/chyt/query new file mode 100755 index 000000000..ff5f82646 --- /dev/null +++ b/chyt/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `yt clickhouse execute` against a +# remote CHYT clique. Stdout: query result. Stderr: query runtime in fractional +# seconds on the last line (extracted from the JSON `statistics.elapsed`). +# Exit non-zero on error. 
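+# Assumes the jq CLI is available on the host for parsing the JSON envelope; ./install does not provide it.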
+set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +: "${YT_PROXY:?YT_PROXY is required}" +: "${CHYT_ALIAS:=*ch_public}" + +query=$(cat) + +# `yt clickhouse execute --format JSON` returns a JSON envelope with timing +# info. We need to capture both data (stdout) and elapsed (stderr). +out=$(yt clickhouse execute "$query" --alias "$CHYT_ALIAS" --proxy "$YT_PROXY" --format JSON) + +# Result body to stdout. +printf '%s\n' "$out" + +# elapsed seconds → stderr. +printf '%s\n' "$out" | jq -r '.statistics.elapsed' >&2 diff --git a/chyt/run.sh b/chyt/run.sh deleted file mode 100755 index 5de7977fb..000000000 --- a/chyt/run.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash - -trap ctrl_c INT - -CPU_HIGH=12 -CPU_LOW=10 -RAM_HIGH=51539607552 -RAM_LOW=42949672960 - -apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install ytsaurus-client --break-system-packages -pip install ytsaurus-yson --break-system-packages - -function ctrl_c() { - echo "Exiting benchmark. Ctrl+C" - exit 1 -} - -throbber() { - local pid=$1 - local sp="\|/-" - local i=0 - - while kill -0 $pid > /dev/null; do - printf "\rWaiting... %c" "${sp:$i:1}" - ((i = (i + 1) % 4)) - sleep 0.1 - done - printf "\rWaiting... Done!\n" -} - - -stop_public_clique() { - yt clickhouse ctl --address $YT_CONTROLLER --proxy $YT_PROXY --cluster-name $CLUSTER_NAME stop *ch_public -} - -create_clique() { - yt clickhouse ctl create --speclet-options "{"active" = %true;"enable_geodata" = %false;"family" = "chyt";"instance_count" = 1;"instance_cpu" = 12;"instance_total_memory" = 51539607552;"pool" = "research";"restart_on_speclet_change" = %true;"stage" = "production";}" --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME clickbench -} - -insert_data() { - yt query --settings '{"clique"="clickbench"}' --format json --async chyt "$(cat fill_data.sql)" > fill_query_id -} - -data_filling_waiting() { - for _ in {1..300} - do - COUNT=$(yt clickhouse execute --alias *clickbench 'select count(*) as c from `//home/hits`') - if [[ "$COUNT" == 99997497 ]]; then - yt abort-query $(cat fill_query_id) - break - else - sleep 60 - fi - done -} - - -fill_data() { -command time -f '%e' yt clickhouse execute "$(cat create.sql)" --alias *clickbench --proxy $YT_PROXY -insert_data -data_filling_waiting & -throbber $! -yt sort --src //home/hits --dst //home/hits --sort-by "CounterID" --sort-by "EventDate" --sort-by "UserID" --sort-by "EventTime" --sort-by "WatchID" --proxy $YT_PROXY -} - - -check_ready() { - TOTAL_JOBS=$(yt list-operations --proxy $YT_PROXY --filter clique --filter clickbench --state running --format json | jq .operations[0].brief_progress.jobs.total) - RUNNING_JOBS=$(yt list-operations --proxy $YT_PROXY --filter clique --filter clickbench --state running --format json | jq .operations[0].brief_progress.jobs.running) - STATE=$(yt clickhouse ctl status --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME clickbench | grep "state" | head -n 1 | sed "s/^[ \t]*//") - HEALTH=$(yt clickhouse ctl status --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME clickbench | grep "health" | head -n 1 | sed "s/^[ \t]*//") - - if [[ "$STATE" == "\"state\" = \"active\";" && "$HEALTH" == "\"health\" = \"good\";" && "$TOTAL_JOBS" -eq "$RUNNING_JOBS" ]]; then - return 0 - else - return 1 - - fi -} - -change_clique_size() { - echo "Changing size. 
Instance count $1, vCPU $2, RAM $3 " - yt clickhouse ctl set-speclet --address $YT_CONTROLLER --cluster-name $CLUSTER_NAME --alias clickbench "{"active" = %true;"enable_geodata" = %false;"family" = "chyt";"instance_count" = $1;"instance_cpu" = $2;"instance_total_memory" = $3;"pool" = "research";"restart_on_speclet_change" = %true;"stage" = "production";}" -} - - -run() { - TRIES=3 - QUERY_NUM=1 - TOTAL_LINES=$(wc -l < queries.sql) - cat queries.sql | while read query; do - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(yt clickhouse execute "$query" --alias *clickbench@0 --proxy $YT_PROXY --format JSON | jq .statistics.elapsed 2>&1) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - done - if [[ $QUERY_NUM == $TOTAL_LINES ]] - then echo "]" - else - echo "]," - fi - QUERY_NUM=$((QUERY_NUM + 1)) - done - -} - -clique_waiting() { - for _ in {1..300} - do - if check_ready; then - echo "Clique is almost ready. Waiting 1 minute to stabilize" - sleep 60 - break - else - echo "Clique not ready. Waiting for 10 seconds" - sleep 10 - fi - done -} - -echo "-------------------------------------" -echo "Stopping public clique" -stop_public_clique - -create_clique - -change_clique_size 1 $CPU_HIGH $RAM_HIGH - -clique_waiting -echo "-------------------------------------" -echo -n "Load time: " -command time -f '%e' fill_data -echo "-------------------------------------" - -for i in "1 $CPU_HIGH $RAM_HIGH 48GB" "2 $CPU_HIGH $RAM_HIGH 96GB" "4 $CPU_HIGH $RAM_HIGH 192GB" "9 $CPU_LOW $RAM_LOW 360GB" -do - set -- $i - echo "Running test for $4 clique" - change_clique_size $1 $2 $3 - clique_waiting - run -done diff --git a/chyt/start b/chyt/start new file mode 100755 index 000000000..d1ada551c --- /dev/null +++ b/chyt/start @@ -0,0 +1,3 @@ +#!/bin/bash +# CHYT runs on a remote YT cluster ($YT_PROXY); nothing to start locally. +exit 0 diff --git a/chyt/stop b/chyt/stop new file mode 100755 index 000000000..43bcdcb09 --- /dev/null +++ b/chyt/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# CHYT runs on a remote YT cluster; nothing to stop locally. +exit 0 diff --git a/citus/benchmark.sh b/citus/benchmark.sh index b082a31df..531bd6503 100755 --- a/citus/benchmark.sh +++ b/citus/benchmark.sh @@ -1,33 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y docker.io -sudo apt-get install -y postgresql-client - -export PGPASSWORD=mypass -sudo docker run -d --name citus -p 5432:5432 -e POSTGRES_PASSWORD=$PGPASSWORD citusdata/citus:11.0 - -../download-hits-tsv - -echo "*:*:*:*:mypass" > .pgpass -chmod 400 .pgpass - -psql -U postgres -h localhost -d postgres -t -c 'CREATE DATABASE test' -psql -U postgres -h localhost -d postgres test -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -echo -n "Load time: " -command time -f '%e' psql -U postgres -h localhost -d postgres test -q -t -c "\\copy hits FROM 'hits.tsv'" - -# COPY 99997497 -# Time: 1579203.482 ms (26:19.203) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec -i citus du -bcs /var/lib/postgresql/data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
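+# BENCH_DOWNLOAD_SCRIPT names the dataset fetcher in the repository root; BENCH_RESTARTABLE=yes signals that the server can be stopped and restarted between queries (assumed contract of lib/benchmark-common.sh).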
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/citus/check b/citus/check new file mode 100755 index 000000000..d8b098776 --- /dev/null +++ b/citus/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD=${PGPASSWORD:-mypass} +psql -U postgres -h localhost -d postgres -t -c 'SELECT 1' >/dev/null diff --git a/citus/data-size b/citus/data-size new file mode 100755 index 000000000..73535f867 --- /dev/null +++ b/citus/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-citus} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data | grep total | awk '{print $1}' diff --git a/citus/install b/citus/install new file mode 100755 index 000000000..0caf5deac --- /dev/null +++ b/citus/install @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-citus} +CITUS_VERSION=${CITUS_VERSION:-11.0} +PGPASSWORD=${PGPASSWORD:-mypass} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "citusdata/citus:$CITUS_VERSION" + +# (Re)create container so install is idempotent. +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD="$PGPASSWORD" \ + "citusdata/citus:$CITUS_VERSION" + +# Persist the password for psql clients invoked from this directory. +echo "*:*:*:*:$PGPASSWORD" > .pgpass +chmod 600 .pgpass diff --git a/citus/load b/citus/load new file mode 100755 index 000000000..e91fbb13c --- /dev/null +++ b/citus/load @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +export PGPASSWORD=${PGPASSWORD:-mypass} + +psql -U postgres -h localhost -d postgres -t -c "DROP DATABASE IF EXISTS test" +psql -U postgres -h localhost -d postgres -t -c "CREATE DATABASE test" + +# create.sql for citus relies on the columnar access method, which the citus +# extension provides. Ensure it's enabled in the test DB. +psql -U postgres -h localhost -d test -t -c "CREATE EXTENSION IF NOT EXISTS citus" +psql -U postgres -h localhost -d test -v ON_ERROR_STOP=1 -t < create.sql + +psql -U postgres -h localhost -d test -v ON_ERROR_STOP=1 -t -c "\\copy hits FROM 'hits.tsv'" +psql -U postgres -h localhost -d test -v ON_ERROR_STOP=1 -t -c 'VACUUM ANALYZE hits' + +rm -f hits.tsv +sync diff --git a/citus/query b/citus/query new file mode 100755 index 000000000..349a1d48f --- /dev/null +++ b/citus/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD=${PGPASSWORD:-mypass} +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -U postgres -h localhost -d test -t 2>&1) +status=$? 
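+# With \timing enabled, psql appends "Time: <N> ms" lines; keep them out of the result and convert the last one to fractional seconds for stderr.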
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/citus/run.sh b/citus/run.sh deleted file mode 100755 index 0952225f6..000000000 --- a/citus/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -U postgres -h localhost -d postgres --no-password -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/citus/start b/citus/start new file mode 100755 index 000000000..f72133d4d --- /dev/null +++ b/citus/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-citus} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/citus/stop b/citus/stop new file mode 100755 index 000000000..a1bdaef32 --- /dev/null +++ b/citus/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-citus} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/clickhouse-datalake-partitioned/benchmark.sh b/clickhouse-datalake-partitioned/benchmark.sh index c4fec4d80..33e6ce27b 100755 --- a/clickhouse-datalake-partitioned/benchmark.sh +++ b/clickhouse-datalake-partitioned/benchmark.sh @@ -1,19 +1,6 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -# Configure - -RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) -> clickhouse-local.yaml echo " -page_cache_max_size: ${RAM} -" - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: 14737666736" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-datalake-partitioned/check b/clickhouse-datalake-partitioned/check new file mode 100755 index 000000000..39c3b4570 --- /dev/null +++ b/clickhouse-datalake-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --path . --query "SELECT 1" >/dev/null diff --git a/clickhouse-datalake-partitioned/data-size b/clickhouse-datalake-partitioned/data-size new file mode 100755 index 000000000..ec242ca74 --- /dev/null +++ b/clickhouse-datalake-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored in S3 — fixed size (100 partitioned parquet files). +echo 14737666736 diff --git a/clickhouse-datalake-partitioned/install b/clickhouse-datalake-partitioned/install new file mode 100755 index 000000000..ee46804c1 --- /dev/null +++ b/clickhouse-datalake-partitioned/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi + +# Use a userspace page cache sized to ~80% of RAM for S3 object reads. +RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) +cat > clickhouse-local.yaml <> result.csv - i=$((i+1)) - done <<< "$(./clickhouse local --path . 
--time --format Null --use_page_cache_for_object_storage 1 --query "$query; $query; $query" 2>&1)" - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done -./clickhouse local --path . --query="DROP TABLE hits" diff --git a/clickhouse-datalake-partitioned/start b/clickhouse-datalake-partitioned/start new file mode 100755 index 000000000..a726d9347 --- /dev/null +++ b/clickhouse-datalake-partitioned/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-datalake-partitioned/stop b/clickhouse-datalake-partitioned/stop new file mode 100755 index 000000000..766128568 --- /dev/null +++ b/clickhouse-datalake-partitioned/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. +exit 0 diff --git a/clickhouse-datalake/benchmark.sh b/clickhouse-datalake/benchmark.sh index 8b4d0718d..33e6ce27b 100755 --- a/clickhouse-datalake/benchmark.sh +++ b/clickhouse-datalake/benchmark.sh @@ -1,19 +1,6 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -# Configure - -RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) -> clickhouse-local.yaml echo " -page_cache_max_size: ${RAM} -" - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: 14779976446" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-datalake/check b/clickhouse-datalake/check new file mode 100755 index 000000000..39c3b4570 --- /dev/null +++ b/clickhouse-datalake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --path . --query "SELECT 1" >/dev/null diff --git a/clickhouse-datalake/data-size b/clickhouse-datalake/data-size new file mode 100755 index 000000000..eeeeea560 --- /dev/null +++ b/clickhouse-datalake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored in S3 — fixed size (single parquet). +echo 14779976446 diff --git a/clickhouse-datalake/install b/clickhouse-datalake/install new file mode 100755 index 000000000..ee46804c1 --- /dev/null +++ b/clickhouse-datalake/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi + +# Use a userspace page cache sized to ~80% of RAM for S3 object reads. +RAM=$(awk '/MemTotal/ {print int($2 * 0.8 * 1024)}' /proc/meminfo) +cat > clickhouse-local.yaml <> result.csv - i=$((i+1)) - done <<< "$(./clickhouse local --path . --time --format Null --use_page_cache_for_object_storage 1 --query "$query; $query; $query" 2>&1)" - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done -./clickhouse local --path . --query="DROP TABLE hits" diff --git a/clickhouse-datalake/start b/clickhouse-datalake/start new file mode 100755 index 000000000..a726d9347 --- /dev/null +++ b/clickhouse-datalake/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-datalake/stop b/clickhouse-datalake/stop new file mode 100755 index 000000000..766128568 --- /dev/null +++ b/clickhouse-datalake/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. 
+exit 0 diff --git a/clickhouse-parquet-partitioned/benchmark.sh b/clickhouse-parquet-partitioned/benchmark.sh index 6ade86759..3b63e772a 100755 --- a/clickhouse-parquet-partitioned/benchmark.sh +++ b/clickhouse-parquet-partitioned/benchmark.sh @@ -1,17 +1,5 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -../download-hits-parquet-partitioned - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -# Use for ClickHouse (Parquet, single) -# du -b hits.parquet +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-parquet-partitioned/check b/clickhouse-parquet-partitioned/check new file mode 100755 index 000000000..86d2609b6 --- /dev/null +++ b/clickhouse-parquet-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --query "SELECT 1" >/dev/null diff --git a/clickhouse-parquet-partitioned/data-size b/clickhouse-parquet-partitioned/data-size new file mode 100755 index 000000000..2d6921ab6 --- /dev/null +++ b/clickhouse-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/clickhouse-parquet-partitioned/install b/clickhouse-parquet-partitioned/install new file mode 100755 index 000000000..43a2ea1c3 --- /dev/null +++ b/clickhouse-parquet-partitioned/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi diff --git a/clickhouse-parquet-partitioned/load b/clickhouse-parquet-partitioned/load new file mode 100755 index 000000000..e49617559 --- /dev/null +++ b/clickhouse-parquet-partitioned/load @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# clickhouse-local with File(Parquet, 'hits_*.parquet') reads the parquet files +# in place, so there's no separate ingest step. Keep the downloaded +# hits_*.parquet files in this directory. +sync diff --git a/clickhouse-parquet-partitioned/query b/clickhouse-parquet-partitioned/query new file mode 100755 index 000000000..a157a84bf --- /dev/null +++ b/clickhouse-parquet-partitioned/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-local with the table +# definition from create.sql prepended. Stdout: query result. Stderr: query +# runtime in fractional seconds on the last line. Exit non-zero on error. +set -e + +query=$(cat) +./clickhouse local --time --query="$(cat create.sql); ${query}" diff --git a/clickhouse-parquet-partitioned/run.sh b/clickhouse-parquet-partitioned/run.sh deleted file mode 100755 index b0fd1faa5..000000000 --- a/clickhouse-parquet-partitioned/run.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./clickhouse local --time --format Null --query="$(cat create.sql); $query" 2>&1 | tail -n1) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. 
- - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-parquet-partitioned/start b/clickhouse-parquet-partitioned/start new file mode 100755 index 000000000..a726d9347 --- /dev/null +++ b/clickhouse-parquet-partitioned/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-parquet-partitioned/stop b/clickhouse-parquet-partitioned/stop new file mode 100755 index 000000000..766128568 --- /dev/null +++ b/clickhouse-parquet-partitioned/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. +exit 0 diff --git a/clickhouse-parquet/benchmark.sh b/clickhouse-parquet/benchmark.sh index d6845a14b..fc4bacc8f 100755 --- a/clickhouse-parquet/benchmark.sh +++ b/clickhouse-parquet/benchmark.sh @@ -1,14 +1,5 @@ #!/bin/bash - -# Install - -curl https://clickhouse.com/ | sh - -../download-hits-parquet-single - -# Run the queries - -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/clickhouse-parquet/check b/clickhouse-parquet/check new file mode 100755 index 000000000..86d2609b6 --- /dev/null +++ b/clickhouse-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./clickhouse local --query "SELECT 1" >/dev/null diff --git a/clickhouse-parquet/data-size b/clickhouse-parquet/data-size new file mode 100755 index 000000000..1a38db62c --- /dev/null +++ b/clickhouse-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits.parquet | awk 'END { print $1 }' diff --git a/clickhouse-parquet/install b/clickhouse-parquet/install new file mode 100755 index 000000000..43a2ea1c3 --- /dev/null +++ b/clickhouse-parquet/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +if [ ! -x ./clickhouse ]; then + curl https://clickhouse.com/ | sh +fi diff --git a/clickhouse-parquet/load b/clickhouse-parquet/load new file mode 100755 index 000000000..0d7d5fc29 --- /dev/null +++ b/clickhouse-parquet/load @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# clickhouse-local with File(Parquet) engine reads the parquet file in place, +# so there's no separate ingest step. The "load" is implicit — just keep the +# downloaded hits.parquet in this directory. +sync diff --git a/clickhouse-parquet/query b/clickhouse-parquet/query new file mode 100755 index 000000000..a157a84bf --- /dev/null +++ b/clickhouse-parquet/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-local with the table +# definition from create.sql prepended. Stdout: query result. Stderr: query +# runtime in fractional seconds on the last line. Exit non-zero on error. +set -e + +query=$(cat) +./clickhouse local --time --query="$(cat create.sql); ${query}" diff --git a/clickhouse-parquet/run.sh b/clickhouse-parquet/run.sh deleted file mode 100755 index b0fd1faa5..000000000 --- a/clickhouse-parquet/run.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./clickhouse local --time --format Null --query="$(cat create.sql); $query" 2>&1 | tail -n1) # (*) - [[ "$?" 
== "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-parquet/start b/clickhouse-parquet/start new file mode 100755 index 000000000..a726d9347 --- /dev/null +++ b/clickhouse-parquet/start @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to start. +exit 0 diff --git a/clickhouse-parquet/stop b/clickhouse-parquet/stop new file mode 100755 index 000000000..766128568 --- /dev/null +++ b/clickhouse-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# clickhouse-local is invoked per-query — no daemon to stop. +exit 0 diff --git a/clickhouse-tencent/benchmark.sh b/clickhouse-tencent/benchmark.sh index 755ab5e26..6a7f45d3a 100755 --- a/clickhouse-tencent/benchmark.sh +++ b/clickhouse-tencent/benchmark.sh @@ -1,32 +1,5 @@ #!/bin/bash - -if [ ! -x /usr/bin/clickhouse ] -then -wget --continue --progress=dot:giga https://clickhouse-builds.s3.amazonaws.com/PRs/81944/e3a48c0de6d188232cc544244ba6862b63eb4762/build_amd_release/clickhouse-common-static-25.9.1.1-amd64.tgz -O clickhouse-tencent.tgz - mkdir -p clickhouse-tencent && tar -xzf clickhouse-tencent.tgz -C clickhouse-tencent - sudo clickhouse-tencent/clickhouse-common-static-25.9.1.1/usr/bin/clickhouse install --noninteractive -fi - -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -clickhouse-client < create.sql - -../download-hits-parquet-partitioned -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet - -echo -n "Load time: " -clickhouse-client --time --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) - -# Run the queries - -./run.sh "$1" - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/clickhouse-tencent/check b/clickhouse-tencent/check new file mode 100755 index 000000000..febe4e0de --- /dev/null +++ b/clickhouse-tencent/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT 1" >/dev/null diff --git a/clickhouse-tencent/data-size b/clickhouse-tencent/data-size new file mode 100755 index 000000000..7770f6efb --- /dev/null +++ b/clickhouse-tencent/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/clickhouse-tencent/install b/clickhouse-tencent/install new file mode 100755 index 000000000..a51514c9a --- /dev/null +++ b/clickhouse-tencent/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +# Install a specific Tencent-built clickhouse package (PR build). +if [ ! 
-x /usr/bin/clickhouse ]; then + wget --continue --progress=dot:giga \ + https://clickhouse-builds.s3.amazonaws.com/PRs/81944/e3a48c0de6d188232cc544244ba6862b63eb4762/build_amd_release/clickhouse-common-static-25.9.1.1-amd64.tgz \ + -O clickhouse-tencent.tgz + mkdir -p clickhouse-tencent && tar -xzf clickhouse-tencent.tgz -C clickhouse-tencent + sudo clickhouse-tencent/clickhouse-common-static-25.9.1.1/usr/bin/clickhouse install --noninteractive +fi diff --git a/clickhouse-tencent/load b/clickhouse-tencent/load new file mode 100755 index 000000000..4a423a9b4 --- /dev/null +++ b/clickhouse-tencent/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +clickhouse-client < create.sql + +sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ +sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet + +clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))" + +sudo rm -f /var/lib/clickhouse/user_files/hits_*.parquet +sync diff --git a/clickhouse-tencent/query b/clickhouse-tencent/query new file mode 100755 index 000000000..9ef756b1f --- /dev/null +++ b/clickhouse-tencent/query @@ -0,0 +1,9 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +set -e + +query=$(cat) +clickhouse-client --time --query="$query" diff --git a/clickhouse-tencent/run.sh b/clickhouse-tencent/run.sh deleted file mode 100755 index 0dce71cf9..000000000 --- a/clickhouse-tencent/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(clickhouse-client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-tencent/start b/clickhouse-tencent/start new file mode 100755 index 000000000..54819af9c --- /dev/null +++ b/clickhouse-tencent/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Idempotent: if already up, do nothing. +if clickhouse-client --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo clickhouse start diff --git a/clickhouse-tencent/stop b/clickhouse-tencent/stop new file mode 100755 index 000000000..ea9d529c3 --- /dev/null +++ b/clickhouse-tencent/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo clickhouse stop || true diff --git a/clickhouse-web/benchmark.sh b/clickhouse-web/benchmark.sh index eb927e5ef..21c0f79d7 100755 --- a/clickhouse-web/benchmark.sh +++ b/clickhouse-web/benchmark.sh @@ -1,32 +1,6 @@ #!/bin/bash - -# The benchmark should be run in the eu-central-1 (Frankfurt) region. -# Allocate a network-optimized ("n") machine, e.g. c5n.4xlarge. 
- -# Install - -curl https://clickhouse.com/ | sh -sudo ./clickhouse install --noninteractive -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -# A directory for cache -sudo mkdir /dev/shm/clickhouse -sudo chown clickhouse:clickhouse /dev/shm/clickhouse - -# Load the data - -echo -n "Load time: " -clickhouse-client --time < create.sql - -# Run the queries - -./run.sh - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read from a remote ClickHouse-hosted web disk; no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/clickhouse-web/check b/clickhouse-web/check new file mode 100755 index 000000000..febe4e0de --- /dev/null +++ b/clickhouse-web/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT 1" >/dev/null diff --git a/clickhouse-web/data-size b/clickhouse-web/data-size new file mode 100755 index 000000000..7770f6efb --- /dev/null +++ b/clickhouse-web/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/clickhouse-web/install b/clickhouse-web/install new file mode 100755 index 000000000..eb2362953 --- /dev/null +++ b/clickhouse-web/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# Note: this benchmark expects to run in eu-central-1 (Frankfurt) on an +# n-class network-optimized machine (e.g. c5n.4xlarge), since data is fetched +# over HTTP from a public ClickHouse-hosted dataset. + +if [ ! -x /usr/bin/clickhouse ]; then + curl https://clickhouse.com/ | sh + sudo ./clickhouse install --noninteractive +fi + +# Cache directory used by the web disk. +sudo mkdir -p /dev/shm/clickhouse +sudo chown clickhouse:clickhouse /dev/shm/clickhouse diff --git a/clickhouse-web/load b/clickhouse-web/load new file mode 100755 index 000000000..8b928b8f5 --- /dev/null +++ b/clickhouse-web/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# create.sql is an ATTACH TABLE that points to a remote web disk; nothing is +# downloaded or written here, the table is materialized on-demand at query +# time, with /dev/shm/clickhouse/ as a local cache. +clickhouse-client < create.sql +sync diff --git a/clickhouse-web/query b/clickhouse-web/query new file mode 100755 index 000000000..72a6eda1e --- /dev/null +++ b/clickhouse-web/query @@ -0,0 +1,12 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +# +# The web-disk cache is dropped before the query so timings are cold. 
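+# The cache drop happens on every invocation, so repeated tries of the same query all run against a cold filesystem cache.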
+set -e + +query=$(cat) +clickhouse-client --query "SYSTEM DROP FILESYSTEM CACHE" >/dev/null +clickhouse-client --time --query="$query" diff --git a/clickhouse-web/run.sh b/clickhouse-web/run.sh deleted file mode 100755 index 502332872..000000000 --- a/clickhouse-web/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - clickhouse-client --query "SYSTEM DROP FILESYSTEM CACHE" - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(clickhouse-client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse-web/start b/clickhouse-web/start new file mode 100755 index 000000000..54819af9c --- /dev/null +++ b/clickhouse-web/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Idempotent: if already up, do nothing. +if clickhouse-client --query "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo clickhouse start diff --git a/clickhouse-web/stop b/clickhouse-web/stop new file mode 100755 index 000000000..ea9d529c3 --- /dev/null +++ b/clickhouse-web/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo clickhouse stop || true diff --git a/clickhouse/benchmark.sh b/clickhouse/benchmark.sh index 18fcf86ef..6a7f45d3a 100755 --- a/clickhouse/benchmark.sh +++ b/clickhouse/benchmark.sh @@ -1,52 +1,5 @@ #!/bin/bash - -# Install - -if [ ! -x /usr/bin/clickhouse ] -then - curl https://clickhouse.com/ | sh - sudo ./clickhouse install --noninteractive -fi - -# Optional: if you want to use higher compression: -if (( 0 )); then - echo " -compression: - case: - method: zstd - " | sudo tee /etc/clickhouse-server/config.d/compression.yaml -fi; - -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -elif [ ! -z "$1" ]; then - SUFFIX="-$1" -fi - -# Load the data - -clickhouse-client < create"$SUFFIX".sql - -../download-hits-parquet-partitioned -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet - -echo -n "Load time: " -clickhouse-client --time --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) - -# Run the queries - -./run.sh "$1" - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/clickhouse/check b/clickhouse/check new file mode 100755 index 000000000..febe4e0de --- /dev/null +++ b/clickhouse/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT 1" >/dev/null diff --git a/clickhouse/data-size b/clickhouse/data-size new file mode 100755 index 000000000..7770f6efb --- /dev/null +++ b/clickhouse/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/clickhouse/install b/clickhouse/install new file mode 100755 index 000000000..7541fda11 --- /dev/null +++ b/clickhouse/install @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +if [ ! -x /usr/bin/clickhouse ]; then + curl https://clickhouse.com/ | sh + sudo ./clickhouse install --noninteractive +fi diff --git a/clickhouse/load b/clickhouse/load new file mode 100755 index 000000000..4a423a9b4 --- /dev/null +++ b/clickhouse/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +clickhouse-client < create.sql + +sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ +sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet + +clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))" + +sudo rm -f /var/lib/clickhouse/user_files/hits_*.parquet +sync diff --git a/clickhouse/query b/clickhouse/query new file mode 100755 index 000000000..c6abe5b81 --- /dev/null +++ b/clickhouse/query @@ -0,0 +1,8 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via clickhouse-client. +# Stdout: query result (default format). +# Stderr: query runtime in fractional seconds on the last line (from --time). +# Exit non-zero on error. +set -e + +clickhouse-client --time diff --git a/clickhouse/run.sh b/clickhouse/run.sh deleted file mode 100755 index 820a39e3b..000000000 --- a/clickhouse/run.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -else if [ ! -z "$1" ]; then - SUFFIX="-$1" -fi -fi - -TRIES=3 -QUERY_NUM=1 -cat queries"$SUFFIX".sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(clickhouse-client --host "${FQDN:=localhost}" --password "${PASSWORD:=}" ${PASSWORD:+--secure} --time --format=Null --query="$query" --progress 0 2>&1 ||:) # (*) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - # (*) --format=Null is client-side formatting. The query result is still sent back to the client. 
- - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/clickhouse/start b/clickhouse/start new file mode 100755 index 000000000..a3aa66fe7 --- /dev/null +++ b/clickhouse/start @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo clickhouse start diff --git a/clickhouse/stop b/clickhouse/stop new file mode 100755 index 000000000..ea9d529c3 --- /dev/null +++ b/clickhouse/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo clickhouse stop || true diff --git a/cloudberry/benchmark.sh b/cloudberry/benchmark.sh index 4271c6030..531bd6503 100755 --- a/cloudberry/benchmark.sh +++ b/cloudberry/benchmark.sh @@ -1,126 +1,5 @@ #!/bin/bash -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -if [[ $1 == '' ]]; then - echo "SELINUX=disabled" > /etc/selinux/config - SHMALL=$(expr $(getconf _PHYS_PAGES) / 2) - SHMAX=$(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE)) - echo "Using shmall=$SHMALL, shmax=$SHMAX" - echo " -kernel.shmall = $SHMALL -kernel.shmmax = $SHMAX -kernel.shmmni = 4096 -vm.overcommit_memory = 2 # See Segment Host Memory -vm.overcommit_ratio = 95 # See Segment Host Memory -net.ipv4.ip_local_port_range = 10000 65535 # See Port Settings -kernel.sem = 250 2048000 200 8192 -kernel.sysrq = 1 -kernel.core_uses_pid = 1 -kernel.msgmnb = 65536 -kernel.msgmax = 65536 -kernel.msgmni = 2048 -net.ipv4.tcp_syncookies = 1 -net.ipv4.conf.default.accept_source_route = 0 -net.ipv4.tcp_max_syn_backlog = 4096 -net.ipv4.conf.all.arp_filter = 1 -net.ipv4.ipfrag_high_thresh = 41943040 -net.ipv4.ipfrag_low_thresh = 31457280 -net.ipv4.ipfrag_time = 60 -net.core.netdev_max_backlog = 10000 -net.core.rmem_max = 2097152 -net.core.wmem_max = 2097152 -vm.swappiness = 10 -vm.zone_reclaim_mode = 0 -vm.dirty_expire_centisecs = 500 -vm.dirty_writeback_centisecs = 100 -vm.dirty_background_ratio = 0 # See System Memory -vm.dirty_ratio = 0 -vm.dirty_background_bytes = 1610612736 -vm.dirty_bytes = 4294967296 -" >> /etc/sysctl.conf - sysctl -p - - echo " -* soft nofile 524288 -* hard nofile 524288 -* soft nproc 131072 -* hard nproc 131072 -" > /etc/security/limits.conf - - echo " -RemoveIPC=no -" > /etc/systemd/logind.conf - - groupadd gpadmin - useradd gpadmin -r -m -g gpadmin - sudo -u gpadmin ssh-keygen -t rsa -b 4096 - usermod -aG wheel gpadmin - echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers - - grubby --update-kernel=ALL --args="transparent_hugepage=never" - - echo "Please reboot now. Then launch the script with the 'db-install' parameter." 
- -elif [[ $1 == 'db-install' ]]; then - echo "Database installation" - yum install -y go - export GOPROXY=https://goproxy.io,direct - yum -y install R apr apr-devel apr-util automake autoconf bash bison bison-devel bzip2 bzip2-devel flex flex-devel gcc gcc-c++ git gdb iproute krb5-devel less libevent libevent-devel libxml2 libxml2-devel libyaml libzstd-devel libzstd make openldap openssh openssh-clients openssh-server openssl openssl-devel openssl-libs perl python3-devel readline readline-devel rsync sed sudo tar vim wget which zip zlib python3-pip python3-venv python3-psycopg2 postgresql15 libpq-devel psutils - yum install curl libcurl-devel --allowerasing - yum install https://cdn.amazonlinux.com/2/core/2.0/x86_64/6b0225ccc542f3834c95733dcf321ab9f1e77e6ca6817469771a8af7c49efe6c/../../../../../blobstore/4846e71174e99f1b7f0985aa01631de003633d3a5f1a950812323c175214ae16/xerces-c-3.1.1-10.amzn2.x86_64.rpm - yum install https://cdn.amazonlinux.com/2/core/2.0/x86_64/6b0225ccc542f3834c95733dcf321ab9f1e77e6ca6817469771a8af7c49efe6c/../../../../../blobstore/53208ffe95cd1e38bba94984661e79134b3cc1b039922e828c40df7214ecaee8/xerces-c-devel-3.1.1-10.amzn2.x86_64.rpm - - python3 -m venv myenv - source myenv/bin/activate - pip install PygreSQL psutil - if [[ $2 != 'no_dl' ]]; then wget --continue --progress=dot:giga https://github.com/cloudberrydb/cloudberrydb/archive/refs/tags/1.5.3.tar.gz; fi - tar -xzf 1.5.3.tar.gz - cd cloudberrydb-1.5.3/ - echo -e "/usr/local/lib \n/usr/local/lib64" >> /etc/ld.so.conf - ldconfig - ./configure --prefix=/usr/local/cloudberrydb - make -j8 - make -j8 install - chown -R gpadmin:gpadmin /usr/local - chown -R gpadmin:gpadmin /usr/local/cloudberry* - echo "source /usr/local/cloudberrydb/greenplum_path.sh" >> /home/gpadmin/.bashrc - echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config - systemctl restart sshd - passwd gpadmin - sudo -iu gpadmin ssh-copy-id localhost - echo "localhost" > /home/gpadmin/hosts - mkdir -p /data0/primary/ - mkdir -p /data0/mirror/ - mkdir -p /data0/coordinator/ - chown -R gpadmin:gpadmin /data0 - echo "export COORDINATOR_DATA_DIRECTORY=/data0/coordinator/gpseg-1" >> /home/gpadmin/.bashrc - cp $SCRIPT_DIR/gpinitsystem_config /home/gpadmin/ - chown gpadmin:gpadmin /home/gpadmin/* - sudo -iu gpadmin gpinitsystem -c gpinitsystem_config -h hosts - echo "Database should be up. 
Run the script with the 'test' paramater to run the tests" - -elif [[ $1 == 'test' ]]; then - echo "Will run tests" - cd $SCRIPT_DIR - cp $SCRIPT_DIR/create.sql /home/gpadmin/ - cp $SCRIPT_DIR/queries.sql /home/gpadmin/ - cp $SCRIPT_DIR/run.sh /home/gpadmin/ - chmod +x /home/gpadmin/run.sh - chown gpadmin:gpadmin /home/gpadmin/* - if [[ $2 != 'no_dl' ]]; then sudo -iu gpadmin wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz'; fi - if [[ $2 != 'no_dl' ]]; then sudo -iu gpadmin gzip -d -f hits.tsv.gz; fi - sudo -iu gpadmin chmod 777 ~ hits.tsv - sudo -iu gpadmin psql -d postgres -f /home/gpadmin/create.sql 2>&1 | tee load_out.txt - if grep 'ERROR' load_out.txt - then - exit 1 - fi - sudo -iu gpadmin nohup gpfdist & - if [[ $2 != 'no_dl' ]]; then echo -n "Load time: " - command time -f '%e' sudo -iu gpadmin psql -d postgres -t -c "insert into hits select * from hits_ext;"; fi - if [[ $2 != 'no_dl' ]]; then echo -n "Load time: " - command time -f '%e' sudo -iu gpadmin psql -d postgres -t -c "ANALYZE hits;"; fi - du -sh /data0* - sudo -iu gpadmin /home/gpadmin/run.sh 2>&1 | tee log.txt - cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' |awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -fi +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cloudberry/check b/cloudberry/check new file mode 100755 index 000000000..4a2e7a728 --- /dev/null +++ b/cloudberry/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -iu gpadmin bash -lc 'psql -d postgres -t -c "SELECT 1"' >/dev/null diff --git a/cloudberry/data-size b/cloudberry/data-size new file mode 100755 index 000000000..92acac459 --- /dev/null +++ b/cloudberry/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo du -bcs /data0 2>/dev/null | grep total | awk '{print $1}' diff --git a/cloudberry/install b/cloudberry/install new file mode 100755 index 000000000..44eefd3d5 --- /dev/null +++ b/cloudberry/install @@ -0,0 +1,126 @@ +#!/bin/bash +# Install Cloudberry DB. The full flow needs to be split across one or more +# reboots and password setup; this script encodes the original steps. +# +# Phases (controlled by the first argument; default is all-prereqs): +# System config; afterwards the host must be rebooted before +# running ./install db-install. +# db-install Build/install Cloudberry from source under user gpadmin. +# +# The original benchmark.sh ran phases interactively; we mirror that here. +set -eu + +PHASE=${1:-prereqs} + +if [ "$PHASE" = "prereqs" ]; then + echo "SELINUX=disabled" | sudo tee /etc/selinux/config + + SHMALL=$(expr $(getconf _PHYS_PAGES) / 2) + SHMAX=$(expr $(getconf _PHYS_PAGES) / 2 \* $(getconf PAGE_SIZE)) + + sudo tee -a /etc/sysctl.conf </dev/null 2>&1; then + sudo useradd gpadmin -r -m -g gpadmin + sudo -u gpadmin ssh-keygen -t rsa -b 4096 -N '' -f /home/gpadmin/.ssh/id_rsa + fi + sudo usermod -aG wheel gpadmin || true + echo "%wheel ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers + sudo grubby --update-kernel=ALL --args="transparent_hugepage=never" || true + + echo "Prereqs installed. 
Reboot, then re-run: ./install db-install" + exit 0 +fi + +if [ "$PHASE" = "db-install" ]; then + SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" + sudo yum install -y go + export GOPROXY=https://goproxy.io,direct + sudo yum -y install R apr apr-devel apr-util automake autoconf bash bison \ + bison-devel bzip2 bzip2-devel flex flex-devel gcc gcc-c++ git gdb iproute \ + krb5-devel less libevent libevent-devel libxml2 libxml2-devel libyaml \ + libzstd-devel libzstd make openldap openssh openssh-clients openssh-server \ + openssl openssl-devel openssl-libs perl python3-devel readline readline-devel \ + rsync sed sudo tar vim wget which zip zlib python3-pip python3-venv \ + python3-psycopg2 postgresql15 libpq-devel psutils + sudo yum install -y curl libcurl-devel --allowerasing + + python3 -m venv /tmp/cloudberry-venv + # shellcheck disable=SC1091 + source /tmp/cloudberry-venv/bin/activate + pip install PygreSQL psutil + + if [ ! -f 1.5.3.tar.gz ]; then + wget --continue --progress=dot:giga https://github.com/cloudberrydb/cloudberrydb/archive/refs/tags/1.5.3.tar.gz + fi + tar -xzf 1.5.3.tar.gz + cd cloudberrydb-1.5.3/ + echo -e "/usr/local/lib \n/usr/local/lib64" | sudo tee -a /etc/ld.so.conf + sudo ldconfig + ./configure --prefix=/usr/local/cloudberrydb + make -j$(nproc) + sudo make -j$(nproc) install + sudo chown -R gpadmin:gpadmin /usr/local + echo "source /usr/local/cloudberrydb/greenplum_path.sh" | sudo tee -a /home/gpadmin/.bashrc + + echo "PasswordAuthentication yes" | sudo tee -a /etc/ssh/sshd_config + sudo systemctl restart sshd + sudo -iu gpadmin ssh-copy-id -o StrictHostKeyChecking=no localhost || true + + sudo mkdir -p /data0/primary /data0/mirror /data0/coordinator + sudo chown -R gpadmin:gpadmin /data0 + echo "export COORDINATOR_DATA_DIRECTORY=/data0/coordinator/gpseg-1" | sudo tee -a /home/gpadmin/.bashrc + + echo "localhost" | sudo tee /home/gpadmin/hosts >/dev/null + sudo cp "$SCRIPT_DIR/gpinitsystem_config" /home/gpadmin/ + sudo chown gpadmin:gpadmin /home/gpadmin/hosts /home/gpadmin/gpinitsystem_config + + sudo -iu gpadmin gpinitsystem -c gpinitsystem_config -h hosts + exit 0 +fi + +echo "Unknown phase: $PHASE" >&2 +exit 1 diff --git a/cloudberry/load b/cloudberry/load new file mode 100755 index 000000000..b2b7ea92d --- /dev/null +++ b/cloudberry/load @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +# Make hits.tsv readable by gpadmin and gpfdist. +sudo cp -f hits.tsv /home/gpadmin/hits.tsv +sudo chown gpadmin:gpadmin /home/gpadmin/hits.tsv +sudo chmod 644 /home/gpadmin/hits.tsv + +sudo cp -f create.sql /home/gpadmin/create.sql +sudo chown gpadmin:gpadmin /home/gpadmin/create.sql + +sudo -iu gpadmin bash -lc 'psql -d postgres -v ON_ERROR_STOP=1 -f /home/gpadmin/create.sql' + +# Run gpfdist for the foreign-table-driven external load. +sudo -iu gpadmin bash -lc 'pgrep -u gpadmin gpfdist || nohup gpfdist >/tmp/gpfdist.log 2>&1 &' + +sudo -iu gpadmin bash -lc 'psql -d postgres -v ON_ERROR_STOP=1 -t -c "INSERT INTO hits SELECT * FROM hits_ext;"' +sudo -iu gpadmin bash -lc 'psql -d postgres -v ON_ERROR_STOP=1 -t -c "ANALYZE hits;"' + +sudo rm -f /home/gpadmin/hits.tsv hits.tsv +sync diff --git a/cloudberry/query b/cloudberry/query new file mode 100755 index 000000000..a22f5bc47 --- /dev/null +++ b/cloudberry/query @@ -0,0 +1,29 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB +# as gpadmin (so the Greenplum/Cloudberry env is loaded). +# Stdout: query result. 
+# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Pipe '\timing' + query through psql under gpadmin's shell. +out=$(printf '\\timing\n%s\n' "$query" | sudo -iu gpadmin bash -lc 'psql -d postgres -t' 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/cloudberry/run.sh b/cloudberry/run.sh deleted file mode 100755 index 23a2756b7..000000000 --- a/cloudberry/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - echo '\timing' > /tmp/query_temp.sql - echo "$query" >> /tmp/query_temp.sql - psql -d postgres -t -f /tmp/query_temp.sql 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/cloudberry/start b/cloudberry/start new file mode 100755 index 000000000..37fef8a82 --- /dev/null +++ b/cloudberry/start @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +# Cloudberry start: ensure the gpadmin user can launch and the coordinator +# is running. gpstart is a no-op if the cluster is already up. +sudo -iu gpadmin bash -lc 'gpstart -a' || true diff --git a/cloudberry/stop b/cloudberry/stop new file mode 100755 index 000000000..5b7053ac0 --- /dev/null +++ b/cloudberry/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo -iu gpadmin bash -lc 'gpstop -a' 2>/dev/null || true diff --git a/cockroachdb/benchmark.sh b/cockroachdb/benchmark.sh index 4951f6cfe..1aa9264b9 100755 --- a/cockroachdb/benchmark.sh +++ b/cockroachdb/benchmark.sh @@ -1,47 +1,5 @@ #!/bin/bash - -CRDBVERSION=25.1.6 -CRDBDATADIR=/var/lib/cockroach-data - -sudo apt-get update -y -# Includes unbuffer utility program -sudo apt-get install -y expect-dev - -wget --continue --progress=dot:giga https://binaries.cockroachdb.com/cockroach-v$CRDBVERSION.linux-$(dpkg --print-architecture).tgz -tar -xvzf cockroach-v$CRDBVERSION.linux-$(dpkg --print-architecture).tgz -sudo cp -r cockroach-v$CRDBVERSION.linux-$(dpkg --print-architecture)/* /usr/local/bin/ -# Build Tag: v25.1.6 -cockroach version | grep "^Build Tag" -sudo mkdir -p $CRDBDATADIR -# Increase cache size to 25% for better read performance -# For details see https://www.cockroachlabs.com/docs/v25.1/recommended-production-settings#cache-and-sql-memory-size -sudo cockroach start-single-node --insecure --listen-addr=localhost --background --store=$CRDBDATADIR --cache=.25 --pid-file=crdb.pid - -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' -O /tmp/hits.csv.gz -# Make data file available in "extern" directory, so it can be loaded via nodelocal -sudo mkdir -p $CRDBDATADIR/extern -gzip -d -c /tmp/hits.csv.gz | sudo tee $CRDBDATADIR/extern/hits.csv > /dev/null - -# Deactivate query plan cache -# For details see https://www.cockroachlabs.com/docs/v25.1/cost-based-optimizer#query-plan-cache -cockroach sql --insecure --host=localhost --execute='SET CLUSTER SETTING sql.query_cache.enabled = false;' - -cockroach sql --insecure --host=localhost --execute='CREATE DATABASE test;' -cockroach sql --insecure --host=localhost --database=test --file='create.sql' 
-START=$(date +%s) -cockroach sql --insecure --host=localhost --database=test --execute="IMPORT INTO hits(WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) CSV DATA ('nodelocal://1/hits.csv');" -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -cockroach sql --insecure --host=localhost --database=test --execute="SELECT SUM(range_size) FROM [SHOW RANGES FROM TABLE hits WITH DETAILS];" | tail -n1 - -# Values might be given in ms or s, depending on their magnitude -grep -oP 'Time: \K[\d.]+s|Time: \K\d+ms' log.txt | - sed -E 's/([0-9]+(\.[0-9]+)?)s/\1/; s/([0-9]+)ms/\1\/1000/' | - awk '{if ($0 ~ /\//) {print $1/1000} else {print $0}}' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -sudo killall cockroach +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/cockroachdb/check b/cockroachdb/check new file mode 100755 index 000000000..436921498 --- /dev/null +++ b/cockroachdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +cockroach sql --insecure --host=localhost -e 'SELECT 1' >/dev/null 2>&1 diff --git a/cockroachdb/data-size b/cockroachdb/data-size new file mode 100755 index 000000000..ee02785da --- /dev/null +++ b/cockroachdb/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +cockroach sql --insecure --host=localhost --database=test --format=tsv \ + --execute="SELECT SUM(range_size) FROM [SHOW RANGES FROM TABLE hits WITH DETAILS];" \ + | tail -n1 diff --git a/cockroachdb/install b/cockroachdb/install new file mode 100755 index 000000000..60cbf116e --- /dev/null +++ b/cockroachdb/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +CRDBVERSION=${CRDBVERSION:-25.1.6} + +sudo apt-get update -y +# expect-dev provides `unbuffer` (used in query script). +sudo apt-get install -y expect-dev wget bc + +if [ ! 
-x /usr/local/bin/cockroach ]; then + arch=$(dpkg --print-architecture) + wget --continue --progress=dot:giga \ + "https://binaries.cockroachdb.com/cockroach-v${CRDBVERSION}.linux-${arch}.tgz" + tar -xvzf "cockroach-v${CRDBVERSION}.linux-${arch}.tgz" + sudo cp -r "cockroach-v${CRDBVERSION}.linux-${arch}/"* /usr/local/bin/ +fi + +sudo mkdir -p /var/lib/cockroach-data diff --git a/cockroachdb/load b/cockroachdb/load new file mode 100755 index 000000000..2afaf6709 --- /dev/null +++ b/cockroachdb/load @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +CRDBDATADIR=/var/lib/cockroach-data + +# Stage data into cockroach's "extern" directory so it can be loaded via nodelocal://. +sudo mkdir -p "$CRDBDATADIR/extern" +sudo cp hits.csv "$CRDBDATADIR/extern/hits.csv" + +cockroach sql --insecure --host=localhost --execute='DROP DATABASE IF EXISTS test CASCADE;' +cockroach sql --insecure --host=localhost --execute='CREATE DATABASE test;' +cockroach sql --insecure --host=localhost --database=test --file='create.sql' + +cockroach sql --insecure --host=localhost --database=test --execute="IMPORT INTO hits(WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) CSV DATA ('nodelocal://1/hits.csv');" + +sudo rm -f "$CRDBDATADIR/extern/hits.csv" +rm -f hits.csv +sync diff --git a/cockroachdb/query b/cockroachdb/query new file mode 100755 index 000000000..7dc4737d1 --- /dev/null +++ b/cockroachdb/query @@ -0,0 +1,41 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via cockroach sql. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# the "Time:" footer; cockroach prints either "Ns" or "Nms"). +# Exit non-zero on error. +# +# Note: cockroach sql only emits the elapsed-time footer when stdout is a TTY. +# We use `unbuffer` to fool isatty(). +set -e + +query=$(cat) + +raw=$(unbuffer cockroach sql --insecure --host=localhost --database=test \ + --execute="$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qiE '^ERROR:|^pq:'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +# Pull the LAST "Time: " line. CockroachDB uses s, ms, µs. 
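+# The awk block below normalizes whichever unit appears into fractional
+# seconds on stderr; illustrative conversions (not from a real run):
+#   "Time: 1.23s" -> 1.230, "Time: 45ms" -> 0.045, "Time: 678µs" -> 0.000678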
+# Magnitude examples: "Time: 1.23s", "Time: 45ms", "Time: 678µs". +t_line=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+(s|ms|µs|us)' | tail -n1) + +if [ -z "$t_line" ]; then + echo "no Time: footer in cockroach output" >&2 + exit 1 +fi + +awk -v t="$t_line" 'BEGIN { + if (match(t, /[0-9.]+/)) { + v = substr(t, RSTART, RLENGTH) + u = substr(t, RSTART+RLENGTH) + if (u == "ms") { printf "%.3f\n", v / 1000 } + else if (u == "µs" || u == "us") { printf "%.6f\n", v / 1000000 } + else { printf "%.3f\n", v } + } +}' >&2 diff --git a/cockroachdb/run.sh b/cockroachdb/run.sh deleted file mode 100755 index 896025cc7..000000000 --- a/cockroachdb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - # Apparently, cockroach sql only writes the elapsed time of a statement to file descriptors that refer to a terminal (cf. isatty()). - # Since we *pipe* the output into grep, we need to use unbuffer. - unbuffer cockroach sql --insecure --host=localhost --database=test --execute="${query}" | grep 'Time' - done; -done; diff --git a/cockroachdb/start b/cockroachdb/start new file mode 100755 index 000000000..b614e3ab4 --- /dev/null +++ b/cockroachdb/start @@ -0,0 +1,16 @@ +#!/bin/bash +set -eu + +CRDBDATADIR=/var/lib/cockroach-data + +if cockroach sql --insecure --host=localhost -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# Cache=25% per CockroachDB production tuning recommendations. +sudo cockroach start-single-node --insecure --listen-addr=localhost --background \ + --store="$CRDBDATADIR" --cache=.25 --pid-file=crdb.pid + +# Disable plan cache to keep timings honest run-over-run. +cockroach sql --insecure --host=localhost \ + --execute='SET CLUSTER SETTING sql.query_cache.enabled = false;' diff --git a/cockroachdb/stop b/cockroachdb/stop new file mode 100755 index 000000000..3440031f5 --- /dev/null +++ b/cockroachdb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +# cockroach has no clean single-node stop; killall is the documented approach. +sudo killall cockroach 2>/dev/null || true diff --git a/cratedb/benchmark.sh b/cratedb/benchmark.sh index 7d5307b29..a427fca3e 100755 --- a/cratedb/benchmark.sh +++ b/cratedb/benchmark.sh @@ -1,79 +1,11 @@ #!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Set CRATEDB_MODE=tuned to use create-tuned.sql + queries-tuned.sql. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes -# Tuned execution if "tuned" is passed, default mode otherwise -MODE=$1 - -if [[ $MODE == "tuned" ]]; then - CREATE_FILE="create-tuned.sql" - EMPTY_STRING_AS_NULL=TRUE -else - CREATE_FILE="create.sql" - EMPTY_STRING_AS_NULL=FALSE -fi; - -# Install prerequisites. -sudo apt-get update -y -sudo apt-get install -y apt-transport-https apt-utils curl gnupg lsb-release - -# Import the public GPG key for verifying the package signatures. 
-curl -sS https://cdn.crate.io/downloads/debian/DEB-GPG-KEY-crate | \ - sudo tee /etc/apt/trusted.gpg.d/cratedb.asc - -# Add CrateDB repository to Apt -echo "deb https://cdn.crate.io/downloads/debian/testing/ default main" | \ - sudo tee /etc/apt/sources.list.d/crate-stable.list - -sudo apt-get update -y -sudo apt-get install -y postgresql-client crate - -sudo systemctl start crate - -for _ in {1..300} -do - psql -U crate -h localhost --no-password -t -c 'SELECT 1' && break - sleep 1 -done - -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -O /tmp/hits.tsv.gz -gzip -d -f /tmp/hits.tsv.gz -chmod 444 /tmp/hits.tsv - -psql -U crate -h localhost --no-password -t < $CREATE_FILE 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 +if [ "${CRATEDB_MODE:-default}" = "tuned" ]; then + export BENCH_QUERIES_FILE="queries-tuned.sql" fi -START=$(date +%s) -command time -f '%e' psql -U crate -h localhost --no-password -q -t -c " - COPY hits - FROM 'file:///tmp/hits.tsv' - WITH - ( - "delimiter"=e'\t', - "format"='csv', - "header"=false, - "empty_string_as_null"=${EMPTY_STRING_AS_NULL} - ) - RETURN SUMMARY;" -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -# One record did not load: -# 99997496 -# {"Missing closing quote for value\n at [Source: UNKNOWN; line: 1, column: 1069]":{"count":1,"line_numbers":[93557187]}} -# Time: 10687056.069 ms (02:58:07.056) - -if [[ $MODE == "tuned" ]]; then - psql -U crate -h localhost --no-password -t -c "REFRESH TABLE hits; OPTIMIZE TABLE hits;" -fi; - -# Some queries don't fit into the available heap space and raise an CircuitBreakingException -./run.sh "$MODE" 2>&1 | tee log.txt - -# Look up shard sizes from system tables. Only consider primary shards in case of multi-node setups with replication. -echo -n "Data size: " -psql -U crate -h localhost --no-password -q -t -c "SELECT SUM(size) FROM sys.shards WHERE table_name = 'hits' AND primary = TRUE;" - -grep -oP 'Time: \d+\.\d+ ms|ERROR' < log.txt | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | - awk '{ if ($1 == "ERROR") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : ($1 / 1000); if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +exec ../lib/benchmark-common.sh diff --git a/cratedb/check b/cratedb/check new file mode 100755 index 000000000..a29db6dad --- /dev/null +++ b/cratedb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql -U crate -h localhost --no-password -t -c 'SELECT 1' >/dev/null diff --git a/cratedb/data-size b/cratedb/data-size new file mode 100755 index 000000000..66d3be5c9 --- /dev/null +++ b/cratedb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +psql -U crate -h localhost --no-password -q -t -A \ + -c "SELECT SUM(size) FROM sys.shards WHERE table_name = 'hits' AND primary = TRUE;" diff --git a/cratedb/install b/cratedb/install new file mode 100755 index 000000000..7f74b4569 --- /dev/null +++ b/cratedb/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y apt-transport-https apt-utils curl gnupg lsb-release + +if [ ! -f /etc/apt/trusted.gpg.d/cratedb.asc ]; then + curl -sS https://cdn.crate.io/downloads/debian/DEB-GPG-KEY-crate \ + | sudo tee /etc/apt/trusted.gpg.d/cratedb.asc >/dev/null +fi + +if [ ! 
-f /etc/apt/sources.list.d/crate-stable.list ]; then + echo "deb https://cdn.crate.io/downloads/debian/testing/ default main" \ + | sudo tee /etc/apt/sources.list.d/crate-stable.list + sudo apt-get update -y +fi + +sudo apt-get install -y postgresql-client crate diff --git a/cratedb/load b/cratedb/load new file mode 100755 index 000000000..2b0b58c46 --- /dev/null +++ b/cratedb/load @@ -0,0 +1,37 @@ +#!/bin/bash +set -eu + +# CrateDB has two schemas (default vs tuned); MODE env selects. +MODE=${CRATEDB_MODE:-default} +if [[ $MODE == "tuned" ]]; then + CREATE_FILE="create-tuned.sql" + EMPTY_STRING_AS_NULL=TRUE +else + CREATE_FILE="create.sql" + EMPTY_STRING_AS_NULL=FALSE +fi + +# Stage data into a known location. +mv hits.tsv /tmp/hits.tsv +chmod 444 /tmp/hits.tsv + +psql -U crate -h localhost --no-password -t < "$CREATE_FILE" + +psql -U crate -h localhost --no-password -q -t -c " + COPY hits + FROM 'file:///tmp/hits.tsv' + WITH + ( + \"delimiter\"=e'\t', + \"format\"='csv', + \"header\"=false, + \"empty_string_as_null\"=${EMPTY_STRING_AS_NULL} + ) + RETURN SUMMARY;" + +if [[ $MODE == "tuned" ]]; then + psql -U crate -h localhost --no-password -t -c "REFRESH TABLE hits; OPTIMIZE TABLE hits;" +fi + +rm -f /tmp/hits.tsv +sync diff --git a/cratedb/query b/cratedb/query new file mode 100755 index 000000000..eb208e78d --- /dev/null +++ b/cratedb/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against CrateDB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(psql -U crate -h localhost --no-password -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/cratedb/run.sh b/cratedb/run.sh deleted file mode 100755 index 89e6afbb1..000000000 --- a/cratedb/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -MODE=$1 -TRIES=3 - -if [[ $MODE == "tuned" ]]; then - FILE_NAME="queries-tuned.sql" -else - FILE_NAME="queries.sql" -fi; - -cat $FILE_NAME | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -U crate -h localhost --no-password -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/cratedb/start b/cratedb/start new file mode 100755 index 000000000..4489d806b --- /dev/null +++ b/cratedb/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if psql -U crate -h localhost --no-password -t -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi +sudo systemctl start crate diff --git a/cratedb/stop b/cratedb/stop new file mode 100755 index 000000000..181c6fa60 --- /dev/null +++ b/cratedb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop crate || true diff --git a/daft-parquet-partitioned/benchmark.sh b/daft-parquet-partitioned/benchmark.sh index 4d1a3920e..3b63e772a 100755 --- a/daft-parquet-partitioned/benchmark.sh +++ b/daft-parquet-partitioned/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv 
myenv -source myenv/bin/activate -pip install pandas -pip install packaging -pip install daft==0.7.4 - -../download-hits-parquet-partitioned - -mode=partitioned -echo "Running $mode mode..." -./run.sh $machine_name $mode 2>&1 | tee "daft_log_${mode}.txt" - -echo "Load time: 0" -echo "Data size: $(du -bcs hits*.parquet | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/daft-parquet-partitioned/check b/daft-parquet-partitioned/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/daft-parquet-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/daft-parquet-partitioned/data-size b/daft-parquet-partitioned/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/daft-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/daft-parquet-partitioned/install b/daft-parquet-partitioned/install new file mode 100755 index 000000000..54c195180 --- /dev/null +++ b/daft-parquet-partitioned/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas packaging 'daft==0.7.4' fastapi uvicorn diff --git a/daft-parquet-partitioned/load b/daft-parquet-partitioned/load new file mode 100755 index 000000000..65019c662 --- /dev/null +++ b/daft-parquet-partitioned/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. Daft is lazy; reading just +# builds the plan and types — actual data is loaded on first query. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +sync diff --git a/daft-parquet-partitioned/query b/daft-parquet-partitioned/query new file mode 100755 index 000000000..6366d7160 --- /dev/null +++ b/daft-parquet-partitioned/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running daft server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/daft-parquet-partitioned/query.py b/daft-parquet-partitioned/query.py deleted file mode 100755 index b76176750..000000000 --- a/daft-parquet-partitioned/query.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -import daft -import os -import sys -import timeit -import traceback -from daft import col, DataType - -hits = None -current_dir = os.path.dirname(os.path.abspath(__file__)) -query_idx = int(sys.argv[1]) - 1 -is_single_mode = len(sys.argv) > 2 and sys.argv[2] == "single" -parquet_path = os.path.join( - current_dir, - "hits.parquet" if is_single_mode else "hits_*.parquet" -) - -with open("queries.sql") as f: - sql_list = [q.strip() for q in f.read().split(';') if q.strip()] - -def run_single_query(sql, i): - try: - start = timeit.default_timer() - - global hits - if hits is None: - hits = daft.read_parquet(parquet_path) - hits = hits.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) - hits = hits.with_column("EventDate", col("EventDate").cast(DataType.date())) - hits = hits.with_column("URL", col("URL").decode("utf-8")) - hits = hits.with_column("Title", col("Title").decode("utf-8")) - hits = hits.with_column("Referer", col("Referer").decode("utf-8")) - hits = hits.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) - hits = hits.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) - - result = daft.sql(sql) - result.collect() - - run_time = round(timeit.default_timer() - start, 3) - return run_time - except Exception as e: - print(f"Error executing query {query_idx}: {str(e)[:100]}", file=sys.stderr) - traceback.print_exc() - return None - -if __name__ == "__main__": - sql = sql_list[query_idx] - times = [] - for i in range(3): - elapsed = run_single_query(sql, i) - times.append(f"{elapsed}" if elapsed else "") - print(','.join(times)) diff --git a/daft-parquet-partitioned/run.sh b/daft-parquet-partitioned/run.sh deleted file mode 100755 index 86d1a512c..000000000 --- a/daft-parquet-partitioned/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -mode=${1} - -TRIES=3 -QUERY_COUNT=43 - -declare -a results=() -for ((i=0; i /dev/null - - output=$(python3 query.py $q $mode 2>&1) - IFS=',' read -r t1 t2 t3 <<< "$(echo "$output" | tail -1)" - - results[$((q-1))]="[${t1:-null},${t2:-null},${t3:-null}]" -done - -IFS=, printf '%s,\n' "${results[@]}" | sed '$s/,$//' diff --git a/daft-parquet-partitioned/server.py b/daft-parquet-partitioned/server.py new file mode 100644 index 000000000..40dd8f913 --- /dev/null +++ b/daft-parquet-partitioned/server.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around Daft (partitioned parquet) so it conforms to the +ClickBench install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits_*.parquet from the working directory, casts + types, holds the Daft DataFrame in memory, registers + it as `hits` for daft.sql, returns {"elapsed": ...} + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching callable via daft.sql, returns + {"elapsed": }. 
+ GET /data-size -> total file size of hits_*.parquet at load time. + +The 43 SQL strings come straight from the prior +daft-parquet-partitioned/queries.sql. +""" + +import os +import timeit + +import daft +import uvicorn +from daft import DataType, col +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits = None +data_bytes = 0 + +PARQUET_GLOB = os.environ.get("BENCH_DAFT_PARQUET", "hits_*.parquet") + + +def _make_runner(sql: str): + return lambda _df: daft.sql(sql).collect() + + +# 43 ClickBench queries — daft.sql against the registered `hits` view. +_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM hits;", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + "SELECT AVG(UserID) FROM hits;", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + "SELECT MIN(EventDate) as m1, MAX(EventDate) as m2 FROM hits;", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + 
"SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) as m FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth) AS s0, SUM(ResolutionWidth + 1) AS s1, SUM(ResolutionWidth + 2) AS s2, SUM(ResolutionWidth + 3) AS s3, SUM(ResolutionWidth + 4) AS s4, SUM(ResolutionWidth + 5) AS s5, SUM(ResolutionWidth + 6) AS s6, SUM(ResolutionWidth + 7) AS s7, SUM(ResolutionWidth + 8) AS s8, SUM(ResolutionWidth + 9) AS s9, SUM(ResolutionWidth + 10) AS s10, SUM(ResolutionWidth + 11) AS s11, SUM(ResolutionWidth + 12) AS s12, SUM(ResolutionWidth + 13) AS s13, SUM(ResolutionWidth + 14) AS s14, SUM(ResolutionWidth + 15) AS s15, SUM(ResolutionWidth + 16) AS s16, SUM(ResolutionWidth + 17) AS s17, SUM(ResolutionWidth + 18) AS s18, SUM(ResolutionWidth + 19) AS s19, SUM(ResolutionWidth + 20) AS s20, SUM(ResolutionWidth + 21) AS s21, SUM(ResolutionWidth + 22) AS s22, SUM(ResolutionWidth + 23) AS s23, SUM(ResolutionWidth + 24) AS s24, SUM(ResolutionWidth + 25) AS s25, SUM(ResolutionWidth + 26) AS s26, SUM(ResolutionWidth + 27) AS s27, SUM(ResolutionWidth + 28) AS s28, SUM(ResolutionWidth + 29) AS s29, SUM(ResolutionWidth + 30) AS s30, SUM(ResolutionWidth + 31) AS s31, SUM(ResolutionWidth + 32) AS s32, SUM(ResolutionWidth + 33) AS s33, SUM(ResolutionWidth + 34) AS s34, SUM(ResolutionWidth + 35) AS s35, SUM(ResolutionWidth + 36) AS s36, SUM(ResolutionWidth + 37) AS s37, SUM(ResolutionWidth + 38) AS s38, SUM(ResolutionWidth + 39) AS s39, SUM(ResolutionWidth + 40) AS s40, SUM(ResolutionWidth + 41) AS s41, SUM(ResolutionWidth + 42) AS s42, SUM(ResolutionWidth + 43) AS s43, SUM(ResolutionWidth + 44) AS s44, SUM(ResolutionWidth + 45) AS s45, SUM(ResolutionWidth + 46) AS s46, SUM(ResolutionWidth + 47) AS s47, SUM(ResolutionWidth + 48) AS s48, SUM(ResolutionWidth + 49) AS s49, SUM(ResolutionWidth + 50) AS s50, SUM(ResolutionWidth + 51) AS s51, SUM(ResolutionWidth + 52) AS s52, SUM(ResolutionWidth + 53) AS s53, SUM(ResolutionWidth + 54) AS s54, SUM(ResolutionWidth + 55) AS s55, SUM(ResolutionWidth + 56) AS s56, SUM(ResolutionWidth + 57) AS s57, SUM(ResolutionWidth + 58) AS s58, SUM(ResolutionWidth + 59) AS s59, SUM(ResolutionWidth + 60) AS s60, SUM(ResolutionWidth + 61) AS s61, SUM(ResolutionWidth + 62) AS s62, SUM(ResolutionWidth + 63) AS s63, SUM(ResolutionWidth + 64) AS s64, SUM(ResolutionWidth + 65) AS s65, SUM(ResolutionWidth + 66) AS s66, SUM(ResolutionWidth + 67) AS s67, SUM(ResolutionWidth + 68) AS s68, SUM(ResolutionWidth + 69) AS s69, SUM(ResolutionWidth + 70) AS s70, SUM(ResolutionWidth + 71) AS s71, SUM(ResolutionWidth + 72) AS s72, SUM(ResolutionWidth + 73) AS s73, SUM(ResolutionWidth + 74) AS s74, SUM(ResolutionWidth + 75) AS s75, SUM(ResolutionWidth + 76) AS s76, SUM(ResolutionWidth + 77) AS s77, SUM(ResolutionWidth + 78) AS s78, SUM(ResolutionWidth + 79) AS s79, SUM(ResolutionWidth + 80) AS s80, SUM(ResolutionWidth + 81) AS s81, SUM(ResolutionWidth + 82) AS s82, SUM(ResolutionWidth + 83) AS s83, SUM(ResolutionWidth + 84) AS s84, SUM(ResolutionWidth + 85) AS s85, SUM(ResolutionWidth + 86) AS s86, SUM(ResolutionWidth + 87) AS s87, SUM(ResolutionWidth + 88) AS s88, SUM(ResolutionWidth + 89) AS s89 FROM hits;", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), 
AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +def _data_size_bytes() -> int: + import glob + total = 0 + for p in glob.glob(PARQUET_GLOB): + try: + total += os.path.getsize(p) + except OSError: + pass + return total + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, data_bytes + start = timeit.default_timer() + data_bytes = _data_size_bytes() + df = daft.read_parquet(PARQUET_GLOB) + df = df.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) + df = df.with_column("EventDate", col("EventDate").cast(DataType.date())) + df = df.with_column("URL", col("URL").decode("utf-8")) + df = df.with_column("Title", col("Title").decode("utf-8")) + df = df.with_column("Referer", col("Referer").decode("utf-8")) + df = 
df.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) + df = df.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) + hits = df + # Register so daft.sql can see `hits`. + try: + daft.catalog.register_table("hits", df) # type: ignore[attr-defined] + except Exception: + pass + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if data_bytes: + return {"bytes": int(data_bytes)} + # Fall back to the on-disk size if /load hasn't run yet. + return {"bytes": _data_size_bytes()} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DAFT_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/daft-parquet-partitioned/start b/daft-parquet-partitioned/start new file mode 100755 index 000000000..e3fab7273 --- /dev/null +++ b/daft-parquet-partitioned/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/daft-parquet-partitioned/stop b/daft-parquet-partitioned/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/daft-parquet-partitioned/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/daft-parquet/benchmark.sh b/daft-parquet/benchmark.sh index e456b9a73..fc4bacc8f 100755 --- a/daft-parquet/benchmark.sh +++ b/daft-parquet/benchmark.sh @@ -1,20 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas -pip install packaging -pip install daft - -../download-hits-parquet-single - -# Run the queries -mode=single -echo "Running $mode mode..." -./run.sh $mode 2>&1 | tee "daft_log_${mode}.txt" - -echo "Load time: 0" -echo "Data size: $(du -bcs hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/daft-parquet/check b/daft-parquet/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/daft-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/daft-parquet/data-size b/daft-parquet/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/daft-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/daft-parquet/install b/daft-parquet/install new file mode 100755 index 000000000..8d49eef9f --- /dev/null +++ b/daft-parquet/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas packaging daft fastapi uvicorn diff --git a/daft-parquet/load b/daft-parquet/load new file mode 100755 index 000000000..65019c662 --- /dev/null +++ b/daft-parquet/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. Daft is lazy; reading just +# builds the plan and types — actual data is loaded on first query. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +sync diff --git a/daft-parquet/query b/daft-parquet/query new file mode 100755 index 000000000..6366d7160 --- /dev/null +++ b/daft-parquet/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running daft server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
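+#
+# Failure path (illustrative): on a non-200 response the script writes
+# "query failed: HTTP <code>: <body>" to stderr and exits 1; an SQL string
+# that is not in the server's QUERIES list comes back as HTTP 404 with a
+# {"detail": "unknown query: ..."} body.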
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/daft-parquet/query.py b/daft-parquet/query.py deleted file mode 100755 index b76176750..000000000 --- a/daft-parquet/query.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -import daft -import os -import sys -import timeit -import traceback -from daft import col, DataType - -hits = None -current_dir = os.path.dirname(os.path.abspath(__file__)) -query_idx = int(sys.argv[1]) - 1 -is_single_mode = len(sys.argv) > 2 and sys.argv[2] == "single" -parquet_path = os.path.join( - current_dir, - "hits.parquet" if is_single_mode else "hits_*.parquet" -) - -with open("queries.sql") as f: - sql_list = [q.strip() for q in f.read().split(';') if q.strip()] - -def run_single_query(sql, i): - try: - start = timeit.default_timer() - - global hits - if hits is None: - hits = daft.read_parquet(parquet_path) - hits = hits.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) - hits = hits.with_column("EventDate", col("EventDate").cast(DataType.date())) - hits = hits.with_column("URL", col("URL").decode("utf-8")) - hits = hits.with_column("Title", col("Title").decode("utf-8")) - hits = hits.with_column("Referer", col("Referer").decode("utf-8")) - hits = hits.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) - hits = hits.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) - - result = daft.sql(sql) - result.collect() - - run_time = round(timeit.default_timer() - start, 3) - return run_time - except Exception as e: - print(f"Error executing query {query_idx}: {str(e)[:100]}", file=sys.stderr) - traceback.print_exc() - return None - -if __name__ == "__main__": - sql = sql_list[query_idx] - times = [] - for i in range(3): - elapsed = run_single_query(sql, i) - times.append(f"{elapsed}" if elapsed else "") - print(','.join(times)) diff --git a/daft-parquet/run.sh b/daft-parquet/run.sh deleted file mode 100755 index 86d1a512c..000000000 --- a/daft-parquet/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -mode=${1} - -TRIES=3 -QUERY_COUNT=43 - -declare -a results=() -for ((i=0; i /dev/null - - output=$(python3 query.py $q $mode 2>&1) - IFS=',' read -r t1 t2 t3 <<< "$(echo "$output" | tail -1)" - - results[$((q-1))]="[${t1:-null},${t2:-null},${t3:-null}]" -done - -IFS=, printf '%s,\n' "${results[@]}" | sed '$s/,$//' diff --git a/daft-parquet/server.py b/daft-parquet/server.py new file mode 100644 index 000000000..33b6cde92 --- /dev/null +++ b/daft-parquet/server.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around Daft (single-file parquet) so it conforms to the +ClickBench install/start/check/stop/load/query interface. + +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, casts + types, holds the Daft DataFrame in memory, registers + it as `hits` for daft.sql, returns {"elapsed": ...} + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching callable via daft.sql, returns + {"elapsed": }. + GET /data-size -> file size of hits.parquet at load time. + +The 43 SQL strings come straight from the prior daft-parquet/queries.sql. 
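+
+Illustrative session (values are placeholders; 8000 is the BENCH_DAFT_PORT
+default):
+
+    curl -sf http://127.0.0.1:8000/health
+    curl -sS -X POST http://127.0.0.1:8000/load
+    curl -sS -X POST --data-binary "SELECT COUNT(*) FROM hits;" http://127.0.0.1:8000/query
+    # -> {"elapsed": 0.42, "index": 0}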
+""" + +import os +import timeit + +import daft +import uvicorn +from daft import DataType, col +from fastapi import FastAPI, HTTPException, Request + +app = FastAPI() +hits = None +data_bytes = 0 + +PARQUET_GLOB = os.environ.get("BENCH_DAFT_PARQUET", "hits.parquet") + + +def _make_runner(sql: str): + return lambda _df: daft.sql(sql).collect() + + +# 43 ClickBench queries — daft.sql against the registered `hits` view. +_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM hits;", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + "SELECT AVG(UserID) FROM hits;", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + "SELECT MIN(EventDate) as m1, MAX(EventDate) as m2 FROM hits;", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) as m FROM hits WHERE 
Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth) AS s0, SUM(ResolutionWidth + 1) AS s1, SUM(ResolutionWidth + 2) AS s2, SUM(ResolutionWidth + 3) AS s3, SUM(ResolutionWidth + 4) AS s4, SUM(ResolutionWidth + 5) AS s5, SUM(ResolutionWidth + 6) AS s6, SUM(ResolutionWidth + 7) AS s7, SUM(ResolutionWidth + 8) AS s8, SUM(ResolutionWidth + 9) AS s9, SUM(ResolutionWidth + 10) AS s10, SUM(ResolutionWidth + 11) AS s11, SUM(ResolutionWidth + 12) AS s12, SUM(ResolutionWidth + 13) AS s13, SUM(ResolutionWidth + 14) AS s14, SUM(ResolutionWidth + 15) AS s15, SUM(ResolutionWidth + 16) AS s16, SUM(ResolutionWidth + 17) AS s17, SUM(ResolutionWidth + 18) AS s18, SUM(ResolutionWidth + 19) AS s19, SUM(ResolutionWidth + 20) AS s20, SUM(ResolutionWidth + 21) AS s21, SUM(ResolutionWidth + 22) AS s22, SUM(ResolutionWidth + 23) AS s23, SUM(ResolutionWidth + 24) AS s24, SUM(ResolutionWidth + 25) AS s25, SUM(ResolutionWidth + 26) AS s26, SUM(ResolutionWidth + 27) AS s27, SUM(ResolutionWidth + 28) AS s28, SUM(ResolutionWidth + 29) AS s29, SUM(ResolutionWidth + 30) AS s30, SUM(ResolutionWidth + 31) AS s31, SUM(ResolutionWidth + 32) AS s32, SUM(ResolutionWidth + 33) AS s33, SUM(ResolutionWidth + 34) AS s34, SUM(ResolutionWidth + 35) AS s35, SUM(ResolutionWidth + 36) AS s36, SUM(ResolutionWidth + 37) AS s37, SUM(ResolutionWidth + 38) AS s38, SUM(ResolutionWidth + 39) AS s39, SUM(ResolutionWidth + 40) AS s40, SUM(ResolutionWidth + 41) AS s41, SUM(ResolutionWidth + 42) AS s42, SUM(ResolutionWidth + 43) AS s43, SUM(ResolutionWidth + 44) AS s44, SUM(ResolutionWidth + 45) AS s45, SUM(ResolutionWidth + 46) AS s46, SUM(ResolutionWidth + 47) AS s47, SUM(ResolutionWidth + 48) AS s48, SUM(ResolutionWidth + 49) AS s49, SUM(ResolutionWidth + 50) AS s50, SUM(ResolutionWidth + 51) AS s51, SUM(ResolutionWidth + 52) AS s52, SUM(ResolutionWidth + 53) AS s53, SUM(ResolutionWidth + 54) AS s54, SUM(ResolutionWidth + 55) AS s55, SUM(ResolutionWidth + 56) AS s56, SUM(ResolutionWidth + 57) AS s57, SUM(ResolutionWidth + 58) AS s58, SUM(ResolutionWidth + 59) AS s59, SUM(ResolutionWidth + 60) AS s60, SUM(ResolutionWidth + 61) AS s61, SUM(ResolutionWidth + 62) AS s62, SUM(ResolutionWidth + 63) AS s63, SUM(ResolutionWidth + 64) AS s64, SUM(ResolutionWidth + 65) AS s65, SUM(ResolutionWidth + 66) AS s66, SUM(ResolutionWidth + 67) AS s67, SUM(ResolutionWidth + 68) AS s68, SUM(ResolutionWidth + 69) AS s69, SUM(ResolutionWidth + 70) AS s70, SUM(ResolutionWidth + 71) AS s71, SUM(ResolutionWidth + 72) AS s72, SUM(ResolutionWidth + 73) AS s73, SUM(ResolutionWidth + 74) AS s74, SUM(ResolutionWidth + 75) AS s75, SUM(ResolutionWidth + 76) AS s76, SUM(ResolutionWidth + 77) AS s77, SUM(ResolutionWidth + 78) AS s78, SUM(ResolutionWidth + 79) AS s79, SUM(ResolutionWidth + 80) AS s80, SUM(ResolutionWidth + 81) AS s81, SUM(ResolutionWidth + 82) AS s82, SUM(ResolutionWidth + 83) AS s83, SUM(ResolutionWidth + 84) AS s84, SUM(ResolutionWidth + 85) AS s85, SUM(ResolutionWidth + 86) AS s86, SUM(ResolutionWidth + 87) AS s87, SUM(ResolutionWidth + 88) AS s88, SUM(ResolutionWidth + 89) AS s89 FROM hits;", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), 
AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +def _data_size_bytes() -> int: + import glob + total = 0 + for p in glob.glob(PARQUET_GLOB): + try: + total += os.path.getsize(p) + except OSError: + pass + return total + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, data_bytes + start = timeit.default_timer() + data_bytes = _data_size_bytes() + df = daft.read_parquet(PARQUET_GLOB) + df = df.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s"))) + df = df.with_column("EventDate", col("EventDate").cast(DataType.date())) + df = df.with_column("URL", col("URL").decode("utf-8")) + df = df.with_column("Title", col("Title").decode("utf-8")) + df = df.with_column("Referer", col("Referer").decode("utf-8")) + df = df.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8")) + df = df.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8")) + hits = df + # 
Register so daft.sql can see `hits`. + try: + daft.catalog.register_table("hits", df) # type: ignore[attr-defined] + except Exception: + pass + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if data_bytes: + return {"bytes": int(data_bytes)} + # Fall back to the on-disk size if /load hasn't run yet. + return {"bytes": _data_size_bytes()} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DAFT_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/daft-parquet/start b/daft-parquet/start new file mode 100755 index 000000000..e3fab7273 --- /dev/null +++ b/daft-parquet/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/daft-parquet/stop b/daft-parquet/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/daft-parquet/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! 
kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/databend/benchmark.sh b/databend/benchmark.sh index 1582a7ec0..531bd6503 100755 --- a/databend/benchmark.sh +++ b/databend/benchmark.sh @@ -1,74 +1,5 @@ #!/bin/bash - -curl -LJO 'https://github.com/datafuselabs/databend/releases/download/v0.9.53-nightly/databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' -tar xzvf 'databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' - -cat > config.toml << CONF -[storage] -type = "fs" - -[storage.fs] -data_path = "./_data" - -[meta] -endpoints = ["127.0.0.1:9191"] -username = "root" -password = "root" -client_timeout_in_second = 60 -auto_sync_interval = 60 -CONF - -# databend starts with meta service -./bin/databend-meta --single > meta.log 2>&1 & -./bin/databend-query -c config.toml > query.log 2>&1 & - -# Load the data -# Docs: https://databend.rs/doc/use-cases/analyze-hits-dataset-with-databend -for _ in {1..600} -do - curl -sS 'http://default@localhost:8124/' --data-binary @create.sql && break - sleep 1 -done - -../download-hits-tsv - -## Aws gp2 write performance is not stable, we must load the data when disk's write around ~500MB/s (Don't know much about the rules of gp2) -# Load Data -START=$(date +%s) -curl -sS -XPUT 'http://root:@127.0.0.1:8000/v1/streaming_load' -H 'insert_sql: insert into hits FILE_FORMAT = (type = TSV)' -F 'upload=@"./hits.tsv"' -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -## in c5.4x large, it's 368s -# {"id":"17477ed9-9f1a-46d9-b6cf-12a5971f4450","state":"SUCCESS","stats":{"rows":99997497,"bytes":74807831229},"error":null,"files":["hits.tsv"]} -# real 6m8.975s -# user 0m4.327s -# sys 0m36.185s - -## in c6a.4xlarge it's ~360s -# {"id":"f7506581-a4da-4684-850c-4bd03530314d","state":"SUCCESS","stats":{"rows":99997497,"bytes":74807831229},"error":null,"files":["hits.tsv"]} -# real 5m57.800s -# user 0m2.106s -# sys 0m33.507s - -## in c6a.metal it's ~70s -# {"id":"2564bd91-1b36-4cf2-a95e-de46c5aff0c6","state":"SUCCESS","stats":{"rows":99997497,"bytes":74807831229},"error":null,"files":["hits.tsv"]} -# real 1m10.347s -# user 0m0.953s -# sys 0m20.401s - - - -## check data is correct -curl -sS 'http://default@localhost:8124/' --data-binary "select count() from hits" - -echo -n "Data size: " -du -bcs _data | grep total -# 20922561953 _data -# 20922561953 total - -# If you wants to get the data size(without metadata and indexes) -# curl 'http://default@localhost:8124/' --data-binary "select humanize_size(bytes_compressed) from fuse_snapshot('default', 'hits') order by timestamp desc limit 1" -# 18.48 GiB - -./run.sh 2>&1 | tee log.txt +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/databend/check b/databend/check new file mode 100755 index 000000000..18125ca66 --- /dev/null +++ b/databend/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf 'http://default@localhost:8124/' --data-binary 'SELECT 1' >/dev/null diff --git a/databend/data-size b/databend/data-size new file mode 100755 index 000000000..2d6c82971 --- /dev/null +++ b/databend/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs _data | grep total | awk '{print $1}' diff --git a/databend/install b/databend/install new file mode 100755 index 000000000..e04c849be --- /dev/null +++ b/databend/install @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +if [ ! 
-d ./bin ]; then + curl -LJO 'https://github.com/datafuselabs/databend/releases/download/v0.9.53-nightly/databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' + tar xzvf 'databend-v0.9.53-nightly-x86_64-unknown-linux-musl.tar.gz' +fi + +cat > config.toml <<'CONF' +[storage] +type = "fs" + +[storage.fs] +data_path = "./_data" + +[meta] +endpoints = ["127.0.0.1:9191"] +username = "root" +password = "root" +client_timeout_in_second = 60 +auto_sync_interval = 60 +CONF diff --git a/databend/load b/databend/load new file mode 100755 index 000000000..0c083afbc --- /dev/null +++ b/databend/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# Create the table. +curl -sS 'http://default@localhost:8124/' --data-binary @create.sql + +# Load via the streaming_load HTTP endpoint. +curl -sS -XPUT 'http://root:@127.0.0.1:8000/v1/streaming_load' \ + -H 'insert_sql: insert into hits FILE_FORMAT = (type = TSV)' \ + -F 'upload=@./hits.tsv' + +rm -f hits.tsv +sync diff --git a/databend/query b/databend/query new file mode 100755 index 000000000..15698283e --- /dev/null +++ b/databend/query @@ -0,0 +1,25 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via databend's clickhouse-compatible HTTP. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (curl wall-clock). +# Exit non-zero on error. +set -e + +query=$(cat) + +body=$(mktemp) +trap 'rm -f "$body"' EXIT + +stats=$(curl -sS -o "$body" -w 'HTTP:%{http_code} TIME:%{time_total}\n' \ + 'http://default@localhost:8124' --data "$query") +http_code=$(echo "$stats" | grep -oP 'HTTP:\K[0-9]+') +res=$(echo "$stats" | grep -oP 'TIME:\K[0-9.]+') + +if [ "$http_code" != "200" ] || grep -qiE '"error"|exception|error code' "$body"; then + cat "$body" >&2 + exit 1 +fi + +cat "$body" + +awk -v t="$res" 'BEGIN { printf "%.3f\n", t }' >&2 diff --git a/databend/run.sh b/databend/run.sh deleted file mode 100755 index 98e766242..000000000 --- a/databend/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - BODY=$(mktemp) - STATS=$(curl -sS -o "$BODY" -w 'HTTP:%{http_code} TIME:%{time_total}\n' "http://default@localhost:8124" -d "${query}" 2>&1) - CURL_EXIT=$? - HTTP_CODE=$(echo "$STATS" | grep -oP 'HTTP:\K[0-9]+') - RES=$(echo "$STATS" | grep -oP 'TIME:\K[0-9.]+') - - if [[ "$CURL_EXIT" == "0" && "$HTTP_CODE" == "200" && -n "${RES}" ]] && ! grep -qiE '"error"|exception|error code' "$BODY" - then - echo -n "${RES}" - else - echo -n "null" - RES="" - fi - rm -f "$BODY" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/databend/start b/databend/start new file mode 100755 index 000000000..869ab376d --- /dev/null +++ b/databend/start @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# Idempotent: if HTTP API is already up, do nothing. +if curl -sSf 'http://default@localhost:8124/' --data-binary 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# databend has two daemons: meta service + query service. 
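+# Readiness is not awaited here; the shared driver is expected to poll ./check,
+# which issues `SELECT 1` over the ClickHouse-compatible HTTP port, e.g.
+#   curl -sSf 'http://default@localhost:8124/' --data-binary 'SELECT 1'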
+nohup ./bin/databend-meta --single > meta.log 2>&1 & +disown +nohup ./bin/databend-query -c config.toml > query.log 2>&1 & +disown diff --git a/databend/stop b/databend/stop new file mode 100755 index 000000000..49788ddc4 --- /dev/null +++ b/databend/stop @@ -0,0 +1,12 @@ +#!/bin/bash + +pkill -x databend-query 2>/dev/null || true +pkill -x databend-meta 2>/dev/null || true + +# Wait briefly, escalate to KILL if needed. +for _ in $(seq 1 15); do + pgrep -x databend-query >/dev/null 2>&1 || pgrep -x databend-meta >/dev/null 2>&1 || exit 0 + sleep 1 +done +pkill -9 -x databend-query 2>/dev/null || true +pkill -9 -x databend-meta 2>/dev/null || true diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh index 1d59008b4..3b63e772a 100755 --- a/datafusion-partitioned/benchmark.sh +++ b/datafusion-partitioned/benchmark.sh @@ -1,51 +1,5 @@ #!/bin/bash - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -WITH_SWAP=false - -if [ $(free -g | awk '/^Mem:/{print $2}') -lt 12 ]; then - echo "LOW MEMORY MODE" - # Enable swap if not already enabled. This is needed both for rustc and until we have a better - # solution for low memory machines, see - # https://github.com/apache/datafusion/issues/18473 - if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then - echo "Enabling 8G swap" - sudo fallocate -l 8G /swapfile - sudo chmod 600 /swapfile - sudo mkswap /swapfile - sudo swapon /swapfile - WITH_SWAP=true - fi -fi - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y gcc - -echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 53.1.0 -CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli -export PATH="`pwd`/target/release:$PATH" -cd .. - -echo "Download benchmark target data, partitioned" -../download-hits-parquet-partitioned partitioned - -echo "Run benchmarks for partitioned" -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs partitioned | grep total)" - -if [ "$WITH_SWAP" = true ]; then - echo "Disable swap" - sudo swapoff /swapfile - sudo rm /swapfile -fi +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/datafusion-partitioned/check b/datafusion-partitioned/check new file mode 100755 index 000000000..52f2d2586 --- /dev/null +++ b/datafusion-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DF=arrow-datafusion/target/release/datafusion-cli +"$DF" -c "SELECT 1" >/dev/null diff --git a/datafusion-partitioned/data-size b/datafusion-partitioned/data-size new file mode 100755 index 000000000..7be44d792 --- /dev/null +++ b/datafusion-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs partitioned | awk '/total$/ { print $1 }' diff --git a/datafusion-partitioned/install b/datafusion-partitioned/install new file mode 100755 index 000000000..20f376193 --- /dev/null +++ b/datafusion-partitioned/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -e + +if [ ! -x arrow-datafusion/target/release/datafusion-cli ]; then + if ! 
command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y + fi + export HOME=${HOME:=~} + # shellcheck disable=SC1091 + source "$HOME/.cargo/env" + + if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi + fi + + sudo apt-get update -y + sudo apt-get install -y gcc git + + if [ ! -d arrow-datafusion ]; then + git clone https://github.com/apache/arrow-datafusion.git + fi + cd arrow-datafusion + git checkout 53.1.0 + CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \ + cargo build --release --package datafusion-cli --bin datafusion-cli +fi diff --git a/datafusion-partitioned/load b/datafusion-partitioned/load new file mode 100755 index 000000000..275c2c6ae --- /dev/null +++ b/datafusion-partitioned/load @@ -0,0 +1,9 @@ +#!/bin/bash +# datafusion queries the parquet files via an external table at LOCATION +# 'partitioned' (see create.sql). The shared bench_download fetches the +# parquet files into CWD; move them into the expected subdir. +set -e + +mkdir -p partitioned +mv hits_*.parquet partitioned/ 2>/dev/null || true +sync diff --git a/datafusion-partitioned/make-json.sh b/datafusion-partitioned/make-json.sh deleted file mode 100755 index 7973d394f..000000000 --- a/datafusion-partitioned/make-json.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# This script converts the raw `result.csv` data from `benchmark.sh` into the -# final json format used by the benchmark dashboard. -# -# usage : ./make-json.sh -# -# example (save results/c6a.4xlarge.json) -# ./make-json.sh c6a.4xlarge - -MACHINE=$1 -OUTPUT_FILE="results/${MACHINE}.json" -SYSTEM_NAME="DataFusion (Parquet, partitioned)" -DATE=$(date +%Y-%m-%d) - - -# Read the CSV and build the result array using sed -RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE -{ - "system": "$SYSTEM_NAME", - "date": "$DATE", - "machine": "$MACHINE", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "hardware": "cpu", - "tags": ["Rust","column-oriented","embedded","stateless"], - "load_time": 0, - "data_size": 14737666736, - "result": [ - $RESULT_ARRAY - ] -} -EOF diff --git a/datafusion-partitioned/query b/datafusion-partitioned/query new file mode 100755 index 000000000..c3625f4b1 --- /dev/null +++ b/datafusion-partitioned/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via datafusion-cli using create.sql +# to define the hits view, then the query. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +DF=arrow-datafusion/target/release/datafusion-cli + +query=$(cat) +tmp=$(mktemp /tmp/datafusion.XXXXXX.sql) +trap 'rm -f "$tmp"' EXIT +printf '%s\n' "$query" > "$tmp" + +out=$("$DF" -f create.sql "$tmp" 2>&1) && status=0 || status=$? 
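+# (The `&& status=0 || status=$?` above captures datafusion-cli's exit code
+#  without letting `set -e` abort before we can surface its output on stderr.)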
+ +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +printf '%s\n' "$out" | grep -v 'Elapsed' || true + +printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2 diff --git a/datafusion-partitioned/run.sh b/datafusion-partitioned/run.sh deleted file mode 100755 index 2e1c36109..000000000 --- a/datafusion-partitioned/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -echo $1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo "$query" > /tmp/query.sql - - echo -n "[" - for i in $(seq 1 $TRIES); do - # 1. there will be two query result, one for creating table another for executing the select statement - # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines - # 3. use sed to take the second line - # 4. use awk to take the number we want - RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | grep "Elapsed" |tail -1| awk '{ print $2 }') - [[ $RES != "" ]] && \ - echo -n "$RES" || \ - echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/datafusion-partitioned/start b/datafusion-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/datafusion-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/datafusion-partitioned/stop b/datafusion-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/datafusion-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index 3296289e7..fc4bacc8f 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -1,52 +1,5 @@ #!/bin/bash - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -WITH_SWAP=false - -if [ $(free -g | awk '/^Mem:/{print $2}') -lt 12 ]; then - echo "LOW MEMORY MODE" - # Enable swap if not already enabled. This is needed both for rustc and until we have a better - # solution for low memory machines, see - # https://github.com/apache/datafusion/issues/18473 - if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then - echo "Enabling 8G swap" - sudo fallocate -l 8G /swapfile - sudo chmod 600 /swapfile - sudo mkswap /swapfile - sudo swapon /swapfile - WITH_SWAP=true - fi -fi - - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y gcc - -echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 53.1.0 -CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli -export PATH="`pwd`/target/release:$PATH" -cd .. - -echo "Download benchmark target data, single file" -../download-hits-parquet-single - -echo "Run benchmarks" -./run.sh - -echo "Load time: 0" -echo "Data size: $(du -bcs hits.parquet)" - -if [ "$WITH_SWAP" = true ]; then - echo "Disable swap" - sudo swapoff /swapfile - sudo rm /swapfile -fi +# Thin shim — actual flow is in lib/benchmark-common.sh. 
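+# BENCH_DOWNLOAD_SCRIPT names the dataset fetcher in the repo root;
+# BENCH_RESTARTABLE tells the driver whether it may restart the system between
+# queries (see druid/benchmark.sh for the restartable=yes case).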
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/datafusion/check b/datafusion/check new file mode 100755 index 000000000..52f2d2586 --- /dev/null +++ b/datafusion/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DF=arrow-datafusion/target/release/datafusion-cli +"$DF" -c "SELECT 1" >/dev/null diff --git a/datafusion/data-size b/datafusion/data-size new file mode 100755 index 000000000..708c0b72e --- /dev/null +++ b/datafusion/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/datafusion/install b/datafusion/install new file mode 100755 index 000000000..8f4cee6f1 --- /dev/null +++ b/datafusion/install @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +# Build datafusion-cli from source. Idempotent: only build if the binary +# isn't already in arrow-datafusion/target/release/datafusion-cli. + +if [ ! -x arrow-datafusion/target/release/datafusion-cli ]; then + # Rust toolchain. + if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y + fi + export HOME=${HOME:=~} + # shellcheck disable=SC1091 + source "$HOME/.cargo/env" + + # Low-memory hosts need swap to compile datafusion-cli. + if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi + fi + + sudo apt-get update -y + sudo apt-get install -y gcc git + + if [ ! -d arrow-datafusion ]; then + git clone https://github.com/apache/arrow-datafusion.git + fi + cd arrow-datafusion + git checkout 53.1.0 + CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \ + cargo build --release --package datafusion-cli --bin datafusion-cli +fi diff --git a/datafusion/load b/datafusion/load new file mode 100755 index 000000000..96ed6eea5 --- /dev/null +++ b/datafusion/load @@ -0,0 +1,6 @@ +#!/bin/bash +# datafusion queries hits.parquet directly via an external table created +# inline in each query (see create.sql, executed by ./query). No persistent +# database to load. +set -e +sync diff --git a/datafusion/make-json.sh b/datafusion/make-json.sh deleted file mode 100755 index 67afcee6c..000000000 --- a/datafusion/make-json.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# This script converts the raw `result.csv` data from `benchmark.sh` into the -# final json format used by the benchmark dashboard. 
-# -# usage : ./make-json.sh -# -# example ./make-json.sh c6a.4xlarge # saves results/c6a.4xlarge.json -# - -MACHINE=$1 -OUTPUT_FILE="results/${MACHINE}.json" -SYSTEM_NAME="DataFusion (Parquet, single)" -DATE=$(date +%Y-%m-%d) - - -# Read the CSV and build the result array using sed -RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE -{ - "system": "$SYSTEM_NAME", - "date": "$DATE", - "machine": "$MACHINE", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "hardware": "cpu", - "tags": ["Rust","column-oriented","embedded","stateless"], - "load_time": 0, - "data_size": 14779976446, - "result": [ - $RESULT_ARRAY - ] -} -EOF diff --git a/datafusion/query b/datafusion/query new file mode 100755 index 000000000..65cc944ea --- /dev/null +++ b/datafusion/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via datafusion-cli using create.sql +# to define the hits view, then the query. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +DF=arrow-datafusion/target/release/datafusion-cli + +query=$(cat) +tmp=$(mktemp /tmp/datafusion.XXXXXX.sql) +trap 'rm -f "$tmp"' EXIT +printf '%s\n' "$query" > "$tmp" + +out=$("$DF" -f create.sql "$tmp" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +# Print everything that's not an "Elapsed" timing line as the result. +printf '%s\n' "$out" | grep -v 'Elapsed' || true + +# datafusion-cli prints `... Elapsed X.YYY seconds.` for each statement; the +# last one is for the actual query. +printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2 diff --git a/datafusion/run.sh b/datafusion/run.sh deleted file mode 100755 index cd1059ac3..000000000 --- a/datafusion/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -TRIES=3 -QUERY_NUM=1 -echo $1 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo "$query" > /tmp/query.sql - - echo -n "[" - for i in $(seq 1 $TRIES); do - # 1. there will be two query result, one for creating table another for executing the select statement - # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines - # 3. use sed to take the second line - # 4. use awk to take the number we want - RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | grep "Elapsed" |tail -1 | awk '{ print $2 }') - [[ $RES != "" ]] && \ - echo -n "$RES" || \ - echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/datafusion/start b/datafusion/start new file mode 100755 index 000000000..e53151aba --- /dev/null +++ b/datafusion/start @@ -0,0 +1,3 @@ +#!/bin/bash +# datafusion-cli is an embedded CLI tool — no daemon to start. 
+exit 0 diff --git a/datafusion/stop b/datafusion/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/datafusion/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/doris-parquet/benchmark.sh b/doris-parquet/benchmark.sh index e287ac3e1..6a7f45d3a 100755 --- a/doris-parquet/benchmark.sh +++ b/doris-parquet/benchmark.sh @@ -1,94 +1,5 @@ #!/bin/bash - -set -e - -# This benchmark should run on Ubuntu 22.04 - -# Install -url='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-3.0.5-bin-x64.tar.gz' -# Download -file_name="$(basename ${url})" -if [[ "$url" == "http"* ]]; then - if [[ ! -f $file_name ]]; then - wget --continue --progress=dot:giga ${url} - else - echo "$file_name already exists, no need to download." - fi -fi -dir_name="${file_name/.tar.gz/}" - -# Try to stop Doris and remove it first if execute this script multiple times -set +e -"$dir_name"/apache-doris-3.0.5-bin-x64/fe/bin/stop_fe.sh -"$dir_name"/apache-doris-3.0.5-bin-x64/be/bin/stop_be.sh -rm -rf "$dir_name" -set -e - -# Uncompress -mkdir "$dir_name" -tar zxf "$file_name" -C "$dir_name" -DORIS_HOME="$dir_name/apache-doris-3.0.5-bin-x64" -export DORIS_HOME - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jdk -sudo apt-get install -y mysql-client -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -sudo systemctl disable unattended-upgrades -sudo systemctl stop unattended-upgrades - -"$DORIS_HOME"/fe/bin/start_fe.sh --daemon - -# Start Backend -sudo sysctl -w vm.max_map_count=2000000 -ulimit -n 65535 -"$DORIS_HOME"/be/bin/start_be.sh --daemon - -# Wait for Frontend ready -for _ in {1..300} -do - fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' | cut -f16 | sed -n '2,$p') - if [[ -n "${fe_version}" ]] && [[ "${fe_version}" != "NULL" ]]; then - echo "Frontend version: ${fe_version}" - break - else - echo 'Wait for Frontend ready ...' - sleep 2 - fi -done - -# Setup cluster, add Backend to cluster -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050' " - -# Wait for Backend ready -for _ in {1..300} -do - be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' | cut -f22 | sed -n '2,$p') - if [[ -n "${be_version}" ]]; then - echo "Backend version: ${be_version}" - break - else - echo 'Wait for Backend ready ...' - sleep 2 - fi -done - -# Download Parquet files -../download-hits-parquet-partitioned "$DORIS_HOME/be" - -# Run the queries -mysql -h127.1 -P9030 -uroot -vvv < create.sql - -./run.sh 2>&1 | tee -a log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Load time: 0" -echo "Data size: $(find "$DORIS_HOME/be/" -name '*.parquet' | xargs wc -c | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/doris-parquet/check b/doris-parquet/check new file mode 100755 index 000000000..c6e836c8c --- /dev/null +++ b/doris-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null diff --git a/doris-parquet/data-size b/doris-parquet/data-size new file mode 100755 index 000000000..992250bc6 --- /dev/null +++ b/doris-parquet/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +find "$DORIS_HOME/be/" -name 'hits_*.parquet' -printf '%s\n' \ + | awk '{ s += $1 } END { print s+0 }' diff --git a/doris-parquet/get-result-json.sh b/doris-parquet/get-result-json.sh deleted file mode 100755 index cf4cd7792..000000000 --- a/doris-parquet/get-result-json.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# set -x -if [[ ! -d results ]]; then mkdir results; fi - -echo -e "{ - \"system\": \"Apache Doris (Parquet, partitioned)\", - \"date\": \"$(date '+%Y-%m-%d')\", - \"machine\": \"$(sudo dmidecode -s system-product-name)\", - \"cluster_size\": 1, - \"comment\": \"\", - \"tags\": [\"C++\", \"column-oriented\", \"MySQL compatible\", \"ClickHouse derivative\"], - \"load_time\": 0, - \"data_size\": 14737666736, - \"result\": [ -$( - r=$(sed -r -e 's/query[0-9]+,/[/; s/$/],/' result.csv) - echo "${r%?}" -) - ] -} -" | tee results/"$(sudo dmidecode -s system-product-name).json" diff --git a/doris-parquet/install b/doris-parquet/install new file mode 100755 index 000000000..7dc603e91 --- /dev/null +++ b/doris-parquet/install @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +# This benchmark runs on Ubuntu 22.04+ +ROOT=$(pwd) +URL='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-3.0.5-bin-x64.tar.gz' + +file_name="$(basename "$URL")" +dir_name="${file_name/.tar.gz/}" +DORIS_HOME="$ROOT/$dir_name/apache-doris-3.0.5-bin-x64" + +if [ ! -d "$DORIS_HOME" ]; then + if [ ! -f "$file_name" ]; then + wget --continue --progress=dot:giga "$URL" + fi + mkdir -p "$dir_name" + tar zxf "$file_name" -C "$dir_name" +fi + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jdk mysql-client bc + +sudo systemctl disable unattended-upgrades 2>/dev/null || true +sudo systemctl stop unattended-upgrades 2>/dev/null || true +sudo sysctl -w vm.max_map_count=2000000 + +echo "$DORIS_HOME" > .doris_home diff --git a/doris-parquet/load b/doris-parquet/load new file mode 100755 index 000000000..820105499 --- /dev/null +++ b/doris-parquet/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +ROOT=$(pwd) +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME + +# The dataset must be visible to the BE process (TVF reads local files +# relative to the BE working dir). +"$ROOT/../download-hits-parquet-partitioned" "$DORIS_HOME/be" + +# Create the view that wraps a local() TVF over the parquet files. Idempotent +# (CREATE OR REPLACE / IF NOT EXISTS). +mysql -h127.0.0.1 -P9030 -uroot < "$ROOT/create.sql" + +# Pre-set parquet flags to match original benchmark. +mysql -h127.0.0.1 -P9030 -uroot \ + -e 'set global enable_parquet_filter_by_min_max=true; set global enable_parquet_lazy_materialization=true;' + +# Note: data files remain — for parquet-as-storage there's no separate copy, +# the TVF reads them directly. 
+sync diff --git a/doris-parquet/query b/doris-parquet/query new file mode 100755 index 000000000..67aebec9c --- /dev/null +++ b/doris-parquet/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against Doris's `hits` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Clear the FE/BE caches before each query (parquet path). +curl -sS http://127.0.0.1:8040/api/clear_cache/all >/dev/null 2>&1 || true + +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/doris-parquet/run.sh b/doris-parquet/run.sh deleted file mode 100755 index a6438a8b7..000000000 --- a/doris-parquet/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -mysql -h127.1 -P9030 -uroot -e 'set global enable_parquet_filter_by_min_max=true; set global enable_parquet_lazy_materialization=true;' -while read -r query; do - curl -sS http://127.0.0.1:8040/api/clear_cache/all - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" - done -done < queries.sql diff --git a/doris-parquet/start b/doris-parquet/start new file mode 100755 index 000000000..79ac83727 --- /dev/null +++ b/doris-parquet/start @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +ulimit -n 65535 + +"$DORIS_HOME"/fe/bin/start_fe.sh --daemon +"$DORIS_HOME"/be/bin/start_be.sh --daemon + +for _ in $(seq 1 300); do + fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' 2>/dev/null | cut -f16 | sed -n '2,$p') + if [ -n "$fe_version" ] && [ "$fe_version" != "NULL" ]; then + break + fi + sleep 2 +done + +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'" 2>/dev/null || true + +for _ in $(seq 1 300); do + be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' 2>/dev/null | cut -f22 | sed -n '2,$p') + if [ -n "$be_version" ]; then + break + fi + sleep 2 +done diff --git a/doris-parquet/stop b/doris-parquet/stop new file mode 100755 index 000000000..d8d0385b7 --- /dev/null +++ b/doris-parquet/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +DORIS_HOME=$(cat .doris_home 2>/dev/null) || exit 0 +"$DORIS_HOME"/fe/bin/stop_fe.sh 2>/dev/null || true +"$DORIS_HOME"/be/bin/stop_be.sh 2>/dev/null || true +exit 0 diff --git a/doris/benchmark.sh b/doris/benchmark.sh index 3f445758c..6a7f45d3a 100755 --- a/doris/benchmark.sh +++ b/doris/benchmark.sh @@ -1,255 +1,5 @@ #!/bin/bash -set -e - -# This benchmark should run on Ubuntu 20.04 - -# Install -ROOT=$(pwd) - -if [[ -n "$1" ]]; then - url="$1" -else - 
url='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-4.1.0-rc01-bin-x64.tar.gz' -fi -# Download -file_name="$(basename ${url})" -if [[ "$url" == "http"* ]]; then - if [[ ! -f $file_name ]]; then - wget --continue --progress=dot:giga ${url} - else - echo "$file_name already exists, no need to download." - fi -fi -dir_name="${file_name/.tar.gz/}" - -# Try to stop Doris and remove it first if execute this script multiple times -set +e -"$dir_name"/"$dir_name"/fe/bin/stop_fe.sh -"$dir_name"/"$dir_name"/be/bin/stop_be.sh -rm -rf "$dir_name" -set -e - -# Uncompress -mkdir "$dir_name" -tar zxf "$file_name" -C "$dir_name" -DORIS_HOME="$ROOT/$dir_name/$dir_name" -export DORIS_HOME - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jdk mysql-client -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -sudo systemctl disable unattended-upgrades -sudo systemctl stop unattended-upgrades - -"$DORIS_HOME"/fe/bin/start_fe.sh --daemon - -# Start Backend -sudo sysctl -w vm.max_map_count=2000000 -ulimit -n 65535 -# Disable internal caches so that the cold run (1st of 3 tries) is actually cold. -# Without this, the BE process keeps decoded data in its own in-memory page cache -# (`storage_page_cache`, default ~20% of RAM) and segment cache, which `drop_caches` -# does not clear, so first-run timings reflect a warm cache and underreport -# cold-run latency. -printf "\ndisable_storage_page_cache = true\n" >> "$DORIS_HOME"/be/conf/be.conf -printf "\nsegment_cache_capacity = 0\n" >> "$DORIS_HOME"/be/conf/be.conf -"$DORIS_HOME"/be/bin/start_be.sh --daemon - -# Wait for Frontend ready -for _ in {1..300} -do - fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' | cut -f16 | sed -n '2,$p') - if [[ -n "${fe_version}" ]] && [[ "${fe_version}" != "NULL" ]]; then - echo "Frontend version: ${fe_version}" - break - else - echo 'Wait for Frontend ready ...' - sleep 2 - fi -done - -# Setup cluster, add Backend to cluster -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050' " - -# Wait for Backend ready -for _ in {1..300} -do - be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' | cut -f22 | sed -n '2,$p') - if [[ -n "${be_version}" ]]; then - echo "Backend version: ${be_version}" - break - else - echo 'Wait for Backend ready ...' - sleep 2 - fi -done - -echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - -# Create Database and table -mysql -h 127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" -sleep 5 -mysql -h 127.0.0.1 -P9030 -uroot hits <"$ROOT"/create.sql - -# Download data -BE_DATA_DIR="$DORIS_HOME/be/" - -"$ROOT"/../download-hits-parquet-partitioned "$BE_DATA_DIR/user_files_secure" - -BE_ID=$(mysql -h127.0.0.1 -P9030 -uroot -N -e 'show backends' | awk '{print $1}' | head -1) - -CORES=$(nproc) -PARALLEL_NUM=$((CORES / 4)) -if [ "$PARALLEL_NUM" -lt 1 ]; then - echo "Computed parallel_pipeline_task_num ($PARALLEL_NUM) is less than 1 based on $CORES cores; clamping to 1." - PARALLEL_NUM=1 -fi -echo "Setting parallel_pipeline_task_num to $PARALLEL_NUM (cpu cores: $CORES, computed as CORES/4 with min 1)" - -echo "start loading hits.parquet using TVF, estimated to take about 3 minutes ..." 
-START=$(date +%s) -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SET parallel_pipeline_task_num = $PARALLEL_NUM;\ -INSERT INTO hits SELECT - CounterID, - DATE_ADD('1970-01-01', INTERVAL EventDate DAY) AS EventDate, - UserID, - FROM_UNIXTIME(EventTime) AS EventTime, - WatchID, - JavaEnable, - Title, - GoodEvent, - ClientIP, - RegionID, - CounterClass, - OS, - UserAgent, - URL, - Referer, - IsRefresh, - RefererCategoryID, - RefererRegionID, - URLCategoryID, - URLRegionID, - ResolutionWidth, - ResolutionHeight, - ResolutionDepth, - FlashMajor, - FlashMinor, - FlashMinor2, - NetMajor, - NetMinor, - UserAgentMajor, - UserAgentMinor, - CookieEnable, - JavascriptEnable, - IsMobile, - MobilePhone, - MobilePhoneModel, - Params, - IPNetworkID, - TraficSourceID, - SearchEngineID, - SearchPhrase, - AdvEngineID, - IsArtifical, - WindowClientWidth, - WindowClientHeight, - ClientTimeZone, - FROM_UNIXTIME(ClientEventTime) AS ClientEventTime, - SilverlightVersion1, - SilverlightVersion2, - SilverlightVersion3, - SilverlightVersion4, - PageCharset, - CodeVersion, - IsLink, - IsDownload, - IsNotBounce, - FUniqID, - OriginalURL, - HID, - IsOldCounter, - IsEvent, - IsParameter, - DontCountHits, - WithHash, - HitColor, - FROM_UNIXTIME(LocalEventTime) AS LocalEventTime, - Age, - Sex, - Income, - Interests, - Robotness, - RemoteIP, - WindowName, - OpenerName, - HistoryLength, - BrowserLanguage, - BrowserCountry, - SocialNetwork, - SocialAction, - HTTPError, - SendTiming, - DNSTiming, - ConnectTiming, - ResponseStartTiming, - ResponseEndTiming, - FetchTiming, - SocialSourceNetworkID, - SocialSourcePage, - ParamPrice, - ParamOrderID, - ParamCurrency, - ParamCurrencyID, - OpenstatServiceName, - OpenstatCampaignID, - OpenstatAdID, - OpenstatSourceID, - UTMSource, - UTMMedium, - UTMCampaign, - UTMContent, - UTMTerm, - FromTag, - HasGCLID, - RefererHash, - URLHash, - CLID -FROM local( - \"file_path\" = \"user_files_secure/hits_*.parquet\", - \"backend_id\" = \"$BE_ID\", - \"format\" = \"parquet\" -) -" -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" -echo "$LOADTIME" > loadtime - - -du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size -echo "Data size: $(cat storage_size)" - -mysql -h 127.0.0.1 -P9030 -uroot hits -e "set global enable_sql_cache = false" -# Dataset contains 99997497 rows, storage size is about 13319588503 bytes -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" - -# Run queries -TRIES=3 -while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" 2>&1 | tee -a log.txt - done -done /dev/null diff --git a/doris/data-size b/doris/data-size new file mode 100755 index 000000000..e99a148af --- /dev/null +++ b/doris/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +du -bs "$DORIS_HOME"/be/storage/ | cut -f1 diff --git a/doris/get-result-json.sh b/doris/get-result-json.sh deleted file mode 100755 index 646eb5204..000000000 --- a/doris/get-result-json.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# set -x -if [[ ! 
-d results ]]; then mkdir results; fi - -echo -e "{ - \"system\": \"Apache Doris\", - \"date\": \"$(date '+%Y-%m-%d')\", - \"machine\": \"$(sudo dmidecode -s system-product-name)\", - \"cluster_size\": 1, - \"comment\": \"\", - \"tags\": [\"C++\", \"column-oriented\", \"MySQL compatible\", \"ClickHouse derivative\"], - \"load_time\": $(cat loadtime), - \"data_size\": $(cat storage_size), - \"result\": [ -$( - r=$(sed -r -e 's/query[0-9]+,/[/; s/$/],/' result.csv) - echo "${r%?}" -) - ] -} -" | tee results/"$(sudo dmidecode -s system-product-name).json" diff --git a/doris/install b/doris/install new file mode 100755 index 000000000..05c69d8f3 --- /dev/null +++ b/doris/install @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +# This benchmark runs on Ubuntu 20.04+ +ROOT=$(pwd) +URL='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-4.1.0-rc01-bin-x64.tar.gz' + +file_name="$(basename "$URL")" +dir_name="${file_name/.tar.gz/}" +DORIS_HOME="$ROOT/$dir_name/$dir_name" + +# Idempotent: skip if already extracted. +if [ ! -d "$DORIS_HOME" ]; then + if [ ! -f "$file_name" ]; then + wget --continue --progress=dot:giga "$URL" + fi + mkdir -p "$dir_name" + tar zxf "$file_name" -C "$dir_name" + + # Disable internal caches so cold runs are actually cold. + printf "\ndisable_storage_page_cache = true\n" >> "$DORIS_HOME"/be/conf/be.conf + printf "\nsegment_cache_capacity = 0\n" >> "$DORIS_HOME"/be/conf/be.conf +fi + +# Install dependencies (idempotent — apt-get is fine to re-run). +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jdk mysql-client bc + +sudo systemctl disable unattended-upgrades 2>/dev/null || true +sudo systemctl stop unattended-upgrades 2>/dev/null || true + +sudo sysctl -w vm.max_map_count=2000000 + +echo "$DORIS_HOME" > .doris_home diff --git a/doris/load b/doris/load new file mode 100755 index 000000000..e95d24a21 --- /dev/null +++ b/doris/load @@ -0,0 +1,142 @@ +#!/bin/bash +set -e + +ROOT=$(pwd) +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +# Idempotent: drop+create database. 
+mysql -h127.0.0.1 -P9030 -uroot -e "DROP DATABASE IF EXISTS hits" +mysql -h127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" +sleep 5 +mysql -h127.0.0.1 -P9030 -uroot hits < "$ROOT/create.sql" + +BE_DATA_DIR="$DORIS_HOME/be/" +"$ROOT/../download-hits-parquet-partitioned" "$BE_DATA_DIR/user_files_secure" + +BE_ID=$(mysql -h127.0.0.1 -P9030 -uroot -N -e 'show backends' | awk '{print $1}' | head -1) +CORES=$(nproc) +PARALLEL_NUM=$((CORES / 4)) +[ "$PARALLEL_NUM" -lt 1 ] && PARALLEL_NUM=1 + +mysql -h127.0.0.1 -P9030 -uroot hits -e "SET parallel_pipeline_task_num = $PARALLEL_NUM;\ +INSERT INTO hits SELECT + CounterID, + DATE_ADD('1970-01-01', INTERVAL EventDate DAY) AS EventDate, + UserID, + FROM_UNIXTIME(EventTime) AS EventTime, + WatchID, + JavaEnable, + Title, + GoodEvent, + ClientIP, + RegionID, + CounterClass, + OS, + UserAgent, + URL, + Referer, + IsRefresh, + RefererCategoryID, + RefererRegionID, + URLCategoryID, + URLRegionID, + ResolutionWidth, + ResolutionHeight, + ResolutionDepth, + FlashMajor, + FlashMinor, + FlashMinor2, + NetMajor, + NetMinor, + UserAgentMajor, + UserAgentMinor, + CookieEnable, + JavascriptEnable, + IsMobile, + MobilePhone, + MobilePhoneModel, + Params, + IPNetworkID, + TraficSourceID, + SearchEngineID, + SearchPhrase, + AdvEngineID, + IsArtifical, + WindowClientWidth, + WindowClientHeight, + ClientTimeZone, + FROM_UNIXTIME(ClientEventTime) AS ClientEventTime, + SilverlightVersion1, + SilverlightVersion2, + SilverlightVersion3, + SilverlightVersion4, + PageCharset, + CodeVersion, + IsLink, + IsDownload, + IsNotBounce, + FUniqID, + OriginalURL, + HID, + IsOldCounter, + IsEvent, + IsParameter, + DontCountHits, + WithHash, + HitColor, + FROM_UNIXTIME(LocalEventTime) AS LocalEventTime, + Age, + Sex, + Income, + Interests, + Robotness, + RemoteIP, + WindowName, + OpenerName, + HistoryLength, + BrowserLanguage, + BrowserCountry, + SocialNetwork, + SocialAction, + HTTPError, + SendTiming, + DNSTiming, + ConnectTiming, + ResponseStartTiming, + ResponseEndTiming, + FetchTiming, + SocialSourceNetworkID, + SocialSourcePage, + ParamPrice, + ParamOrderID, + ParamCurrency, + ParamCurrencyID, + OpenstatServiceName, + OpenstatCampaignID, + OpenstatAdID, + OpenstatSourceID, + UTMSource, + UTMMedium, + UTMCampaign, + UTMContent, + UTMTerm, + FromTag, + HasGCLID, + RefererHash, + URLHash, + CLID +FROM local( + \"file_path\" = \"user_files_secure/hits_*.parquet\", + \"backend_id\" = \"$BE_ID\", + \"format\" = \"parquet\" +) +" + +mysql -h127.0.0.1 -P9030 -uroot hits -e "set global enable_sql_cache = false" + +# Clean up downloaded parquet inputs. +rm -f "$BE_DATA_DIR"/user_files_secure/hits_*.parquet +sync diff --git a/doris/query b/doris/query new file mode 100755 index 000000000..ed73900d9 --- /dev/null +++ b/doris/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against Doris's `hits` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Use mysql -vvv which prints "X rows in set (Y.YY sec)" or "ERROR ...". +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +# Strip the timing line from stdout output. +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +# Parse the last "(X.XX sec)" or "X min Y.ZZ sec" line. 
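+# e.g. "2 rows in set (1 min 2.34 sec)" -> grep "(1 min 2.34 sec)" -> sed "1 2.34" -> awk 62.34
+#      "(0.12 sec)"                     -> 0.12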
+secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/doris/start b/doris/start new file mode 100755 index 000000000..0ad1d9086 --- /dev/null +++ b/doris/start @@ -0,0 +1,39 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +# Idempotent: if FE replies, do nothing. +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +ulimit -n 65535 + +"$DORIS_HOME"/fe/bin/start_fe.sh --daemon +"$DORIS_HOME"/be/bin/start_be.sh --daemon + +# Wait for FE. +for _ in $(seq 1 300); do + fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' 2>/dev/null | cut -f16 | sed -n '2,$p') + if [ -n "$fe_version" ] && [ "$fe_version" != "NULL" ]; then + break + fi + sleep 2 +done + +# Add backend to cluster (idempotent — ignore "already exists"). +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'" 2>/dev/null || true + +# Wait for BE. +for _ in $(seq 1 300); do + be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' 2>/dev/null | cut -f22 | sed -n '2,$p') + if [ -n "$be_version" ]; then + break + fi + sleep 2 +done diff --git a/doris/stop b/doris/stop new file mode 100755 index 000000000..d8d0385b7 --- /dev/null +++ b/doris/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +DORIS_HOME=$(cat .doris_home 2>/dev/null) || exit 0 +"$DORIS_HOME"/fe/bin/stop_fe.sh 2>/dev/null || true +"$DORIS_HOME"/be/bin/stop_be.sh 2>/dev/null || true +exit 0 diff --git a/drill/benchmark.sh b/drill/benchmark.sh index 4728ccaf7..fc4bacc8f 100755 --- a/drill/benchmark.sh +++ b/drill/benchmark.sh @@ -1,14 +1,5 @@ -# Install - -sudo apt-get update -y -sudo apt-get install -y docker.io - -../download-hits-parquet-single - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '\([\d\.]+ seconds\)|Errors' | sed -r -e 's/Errors:/null/; s/^.+\(([.0-9]+) seconds\)/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/drill/check b/drill/check new file mode 100755 index 000000000..f73b79539 --- /dev/null +++ b/drill/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo docker image inspect apache/drill >/dev/null diff --git a/drill/data-size b/drill/data-size new file mode 100755 index 000000000..708c0b72e --- /dev/null +++ b/drill/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/drill/install b/drill/install new file mode 100755 index 000000000..cf1bd6868 --- /dev/null +++ b/drill/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi + +# Pre-pull the apache/drill image so query timing isn't dominated by image +# pull on the first run. 
+sudo docker pull apache/drill diff --git a/drill/load b/drill/load new file mode 100755 index 000000000..8ef141f24 --- /dev/null +++ b/drill/load @@ -0,0 +1,5 @@ +#!/bin/bash +# Drill queries hits.parquet directly via the dfs filesystem plugin (mounted +# into the docker container per query). No persistent DB to load. +set -e +sync diff --git a/drill/query b/drill/query new file mode 100755 index 000000000..77ccd6fe6 --- /dev/null +++ b/drill/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via Apache Drill (in a one-shot +# docker container) against hits.parquet (mounted as /hits.parquet). +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +query=$(cat) +# Drill needs the full dfs path, not a bare table name. +query=${query//hits/dfs.\`/hits.parquet\`} + +out=$(printf '%s\r' "$query" | sudo docker run -i --rm \ + -v "$(pwd)/hits.parquet:/hits.parquet" apache/drill 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +# Drill output ends each query with `(X.YYY seconds)`. +printf '%s\n' "$out" | grep -v -E '\([0-9.]+ seconds\)' || true + +printf '%s\n' "$out" | grep -oE '\([0-9.]+ seconds\)' | grep -oE '[0-9.]+' | tail -n1 >&2 diff --git a/drill/run.sh b/drill/run.sh deleted file mode 100755 index f7168ac10..000000000 --- a/drill/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -cat queries.sql | sed -r -e 's@hits@dfs.`/hits.parquet`@g' | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo "${query}" - sudo docker run -i --rm --name drill -v $(pwd)/hits.parquet:/hits.parquet apache/drill <<< "${query}"$'\r'"${query}"$'\r'"${query}"$'\r' - echo -done diff --git a/drill/start b/drill/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/drill/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/drill/stop b/drill/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/drill/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/druid/benchmark.sh b/druid/benchmark.sh index c18e545dd..ce50709e3 100755 --- a/druid/benchmark.sh +++ b/druid/benchmark.sh @@ -1,44 +1,8 @@ -#!/bin/bash -e - -sudo apt-get update -y -sudo apt-get install -y openjdk-11-jdk -sudo update-alternatives --config java - -# Install - -VERSION=33.0.0 - -wget -O"apache-druid-${VERSION}-bin.tar.gz" "https://dlcdn.apache.org/druid/${VERSION}/apache-druid-${VERSION}-bin.tar.gz" -tar xf apache-druid-${VERSION}-bin.tar.gz -./apache-druid-${VERSION}/bin/verify-java - -# Have to increase indexer memory limit -sed -i 's MaxDirectMemorySize=1g MaxDirectMemorySize=5g g' apache-druid-$VERSION/conf/druid/single-server/medium/middleManager/runtime.properties - -# Disable cache to test query performance -sed -i 's druid.historical.cache.useCache=true druid.historical.cache.useCache=false g' apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties -sed -i 's druid.historical.cache.populateCache=true druid.historical.cache.populateCache=false g' apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties -sed -i 's druid.processing.buffer.sizeBytes=500MiB druid.processing.buffer.sizeBytes=1000MiB g' apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties - -echo "druid.query.groupBy.maxMergingDictionarySize=5000000000" >> apache-druid-$VERSION/conf/druid/single-server/medium/historical/runtime.properties -# Druid launcher does not start Druid 
as a daemon. Run it in background -./apache-druid-${VERSION}/bin/start-single-server-medium & - -# Load the data - -../download-hits-tsv - -echo -n "Load time: " -command time -f '%e' ./apache-druid-${VERSION}/bin/post-index-task --file ingest.json --url http://localhost:8081 - -# The command above will fail due to timeout but still continue to run in background. -# The loading time should be checked from the logs. - -# Run the queries -./run.sh - -# stop Druid services -kill %1 - -echo -n "Data size: " -du -bcs ./apache-druid-${VERSION}/var | grep total +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +# Druid degrades after some queries; the shared driver restarts between +# queries when restartable=yes (matches the original `pkill -f historical` +# hack now folded into stop). +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/druid/check b/druid/check new file mode 100755 index 000000000..1c5210451 --- /dev/null +++ b/druid/check @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# Trivial query against the Druid SQL endpoint. +RES=$(curl -sf -o /dev/null -w '%{http_code}' \ + -XPOST -H'Content-Type: application/json' \ + http://localhost:8888/druid/v2/sql/ \ + -d '{"query": "SELECT 1"}') + +[ "$RES" = "200" ] diff --git a/druid/data-size b/druid/data-size new file mode 100755 index 000000000..3a668c8b8 --- /dev/null +++ b/druid/data-size @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +VERSION=33.0.0 +DRUID_DIR="apache-druid-${VERSION}" + +du -bcs "./${DRUID_DIR}/var" | awk '/total$/ {print $1}' diff --git a/druid/install b/druid/install new file mode 100755 index 000000000..1f7d2b31c --- /dev/null +++ b/druid/install @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +VERSION=33.0.0 +DRUID_DIR="apache-druid-${VERSION}" + +if [ ! -d "$DRUID_DIR" ]; then + sudo apt-get update -y + sudo apt-get install -y openjdk-11-jdk curl + + if [ ! -f "${DRUID_DIR}-bin.tar.gz" ]; then + wget --continue --progress=dot:giga -O"${DRUID_DIR}-bin.tar.gz" \ + "https://dlcdn.apache.org/druid/${VERSION}/${DRUID_DIR}-bin.tar.gz" + fi + tar xf "${DRUID_DIR}-bin.tar.gz" + "./${DRUID_DIR}/bin/verify-java" + + # Have to increase indexer memory limit + sed -i 's MaxDirectMemorySize=1g MaxDirectMemorySize=5g g' \ + "${DRUID_DIR}/conf/druid/single-server/medium/middleManager/runtime.properties" + + # Disable cache to test query performance + sed -i 's druid.historical.cache.useCache=true druid.historical.cache.useCache=false g' \ + "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties" + sed -i 's druid.historical.cache.populateCache=true druid.historical.cache.populateCache=false g' \ + "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties" + sed -i 's druid.processing.buffer.sizeBytes=500MiB druid.processing.buffer.sizeBytes=1000MiB g' \ + "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties" + + if ! 
grep -q '^druid.query.groupBy.maxMergingDictionarySize' \ + "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties"; then + echo "druid.query.groupBy.maxMergingDictionarySize=5000000000" \ + >> "${DRUID_DIR}/conf/druid/single-server/medium/historical/runtime.properties" + fi +fi diff --git a/druid/load b/druid/load new file mode 100755 index 000000000..c3984df39 --- /dev/null +++ b/druid/load @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +VERSION=33.0.0 +DRUID_DIR="apache-druid-${VERSION}" + +# post-index-task posts the spec; ingestion runs asynchronously and the +# command may exit non-zero on its own polling timeout while the task keeps +# running. We treat that as success-with-warning; check the logs for the +# actual completion. Idempotent: re-running is safe (Druid replaces the +# datasource). +"./${DRUID_DIR}/bin/post-index-task" --file ingest.json --url http://localhost:8081 || true + +# Wait until the hits datasource is queryable. +for _ in $(seq 1 600); do + cnt=$(curl -sf -XPOST -H'Content-Type: application/json' \ + http://localhost:8888/druid/v2/sql/ \ + -d '{"query": "SELECT COUNT(*) FROM hits"}' 2>/dev/null \ + | grep -oE '[0-9]+' | head -n1) || cnt="" + if [ -n "$cnt" ] && [ "$cnt" -gt 0 ]; then + break + fi + sleep 5 +done + +rm -f hits.tsv +sync diff --git a/druid/query b/druid/query new file mode 100755 index 000000000..c1001f608 --- /dev/null +++ b/druid/query @@ -0,0 +1,40 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via Druid's SQL HTTP endpoint. +# Stdout: query result JSON. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) +# Druid uses __time and dislikes trailing semicolons. +query=$(printf '%s' "$query" | sed -e 's EventTime __time g' | tr -d ';') + +# Build JSON request safely. +req=$(printf '%s' "$query" | python3 -c ' +import json, sys +q = sys.stdin.read() +print(json.dumps({"query": q, "context": {"timeout": 1000000}})) +') + +tmp_body=$(mktemp) +trap 'rm -f "$tmp_body"' EXIT + +# -w prints time_total and http_code on the last line. -o sends body to file. +status_line=$(curl -s -o "$tmp_body" \ + -w '%{http_code} %{time_total}\n' \ + -XPOST -H'Content-Type: application/json' \ + http://localhost:8888/druid/v2/sql/ \ + --data-binary "$req") + +http_code=$(echo "$status_line" | awk '{print $1}') +time_total=$(echo "$status_line" | awk '{print $2}') + +cat "$tmp_body" + +if [ "$http_code" != "200" ]; then + echo "druid query failed: HTTP $http_code" >&2 + exit 1 +fi + +# Print runtime in fractional seconds as the last stderr line. 
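+# Note: %{time_total} is curl's wall-clock for the whole HTTP round trip
+# (connect + request + response), not Druid's internally reported query time.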
+printf '%s\n' "$time_total" >&2 diff --git a/druid/run.sh b/druid/run.sh deleted file mode 100755 index 2ffb72838..000000000 --- a/druid/run.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -TRIES=3 -cat queries.sql | while read -r query; do - sync - for i in $(seq 1 100); do - CHECK=$(curl -o /dev/null -w '%{http_code}' -s -XPOST -H'Content-Type: application/json' http://localhost:8888/druid/v2/sql/ -d @check.json }) - [[ "$CHECK" == "200" ]] && break - sleep 1 - done - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo -n "[" - for i in $(seq 1 $TRIES); do - echo "{\"query\":\"$query\", \"context\": {\"timeout\": 1000000} }"| sed -e 's EventTime __time g' | tr -d ';' > query.json - curl -w '%{http_code} %{time_total}\n' -s -XPOST -H'Content-Type: application/json' http://localhost:8888/druid/v2/sql/ -d @query.json | awk '{ if ($1!="200") { printf "null" } }' - [[ "$i" != $TRIES ]] && echo -n ", " - done - echo "]," - - # Ugly hack to measure independently queries. Otherwise some queries make Druid degraded and results are incorrect. For example after Q13 even SELECT 1 works for 7 seconds - pkill -f historical - sleep 3 -done diff --git a/druid/start b/druid/start new file mode 100755 index 000000000..bde3b1ead --- /dev/null +++ b/druid/start @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +VERSION=33.0.0 +DRUID_DIR="apache-druid-${VERSION}" + +# Idempotent: if router is responsive, do nothing. +if curl -sf -o /dev/null http://localhost:8888/status 2>/dev/null; then + exit 0 +fi + +# Druid launcher does not start Druid as a daemon. Run it in background, with +# its own session so the start script can exit and leave Druid running. +nohup "./${DRUID_DIR}/bin/start-single-server-medium" \ + >> druid.log 2>&1 < /dev/null & +disown diff --git a/druid/stop b/druid/stop new file mode 100755 index 000000000..e6ce67b95 --- /dev/null +++ b/druid/stop @@ -0,0 +1,8 @@ +#!/bin/bash + +# Kill all Druid processes (the launcher and child JVMs). +pkill -f 'start-single-server-medium' 2>/dev/null || true +pkill -f 'druid' 2>/dev/null || true +# Small grace period so subsequent ./start binds cleanly. +sleep 2 +exit 0 diff --git a/duckdb-dataframe/benchmark.sh b/duckdb-dataframe/benchmark.sh index 087381c44..fc4bacc8f 100755 --- a/duckdb-dataframe/benchmark.sh +++ b/duckdb-dataframe/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas duckdb pyarrow - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-dataframe/check b/duckdb-dataframe/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/duckdb-dataframe/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/duckdb-dataframe/data-size b/duckdb-dataframe/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/duckdb-dataframe/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/duckdb-dataframe/install b/duckdb-dataframe/install new file mode 100755 index 000000000..aa1911741 --- /dev/null +++ b/duckdb-dataframe/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas duckdb pyarrow fastapi uvicorn diff --git a/duckdb-dataframe/load b/duckdb-dataframe/load new file mode 100755 index 000000000..ceba6beca --- /dev/null +++ b/duckdb-dataframe/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/duckdb-dataframe/query b/duckdb-dataframe/query new file mode 100755 index 000000000..a4e152430 --- /dev/null +++ b/duckdb-dataframe/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running duckdb server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
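+# Illustrative example (values made up; assumes the server from ./start is on 127.0.0.1:8000):
+#   $ echo 'SELECT COUNT(*) FROM hits;' | ./query
+#   {"elapsed": 0.123}     <- stdout (server response JSON)
+#   0.123                  <- stderr (timing, last line)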
+set -e
+
+query=$(cat)
+
+tmp=$(mktemp)
+status=$(curl -sS -o "$tmp" -w '%{http_code}' \
+    -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query")
+
+body=$(cat "$tmp")
+rm -f "$tmp"
+
+if [ "$status" != "200" ]; then
+    echo "query failed: HTTP $status: $body" >&2
+    exit 1
+fi
+
+echo "$body"
+echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2
diff --git a/duckdb-dataframe/query.py b/duckdb-dataframe/query.py
deleted file mode 100755
index 0139a83e5..000000000
--- a/duckdb-dataframe/query.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-
-import pandas as pd
-import timeit
-import datetime
-import subprocess
-import duckdb
-
-start = timeit.default_timer()
-hits = pd.read_parquet("hits.parquet")
-end = timeit.default_timer()
-load_time = round(end - start, 3)
-print(f"Load time: {load_time}")
-
-dataframe_size = hits.memory_usage().sum()
-
-# print("Dataframe(numpy) size:", dataframe_size, "bytes")
-
-# fix some types
-hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s")
-hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D")
-
-# fix all object columns to string
-for col in hits.columns:
-    if hits[col].dtype == "O":
-        hits[col] = hits[col].astype(str)
-
-queries = []
-with open("queries.sql") as f:
-    queries = f.readlines()
-
-conn = duckdb.connect()
-for q in queries:
-    # Flush OS page cache before first run of each query
-    subprocess.run(['sync'], check=True)
-    subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL)
-
-    times = []
-    for _ in range(3):
-        start = timeit.default_timer()
-        result = conn.execute(q).fetchall()
-        end = timeit.default_timer()
-        times.append(round(end - start, 3))
-    print(times)
diff --git a/duckdb-dataframe/server.py b/duckdb-dataframe/server.py
new file mode 100644
index 000000000..4fe187577
--- /dev/null
+++ b/duckdb-dataframe/server.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""FastAPI wrapper around DuckDB (running over a pandas DataFrame) so it
+conforms to the ClickBench install/start/check/stop/load/query interface.
+
+Routes:
+    GET  /health    -> 200 OK once the server is up
+    POST /load      -> reads hits.parquet from the working directory, fixes
+                       column types, holds the DataFrame in memory, and
+                       returns {"elapsed": <seconds>}
+    POST /query     -> body: SQL text. Looks it up in QUERIES, runs it via
+                       DuckDB against the loaded DataFrame, returns
+                       {"elapsed": <seconds>}.
+    GET  /data-size -> bytes the DataFrame currently occupies (memory_usage)
+
+The 43 SQL strings come straight from the prior duckdb-dataframe queries.sql.
+"""
+
+import os
+import timeit
+
+import duckdb
+import pandas as pd
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+
+app = FastAPI()
+hits: pd.DataFrame | None = None  # noqa: F841 — referenced by DuckDB by name
+conn = None
+
+
+def _make_runner(sql: str):
+    return lambda _df: conn.execute(sql).fetchall()
+
+
+# 43 ClickBench queries — DuckDB resolves `hits` from the session by name. SQL
+# strings come straight from the prior duckdb-dataframe/queries.sql.
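+#
+# Lookup is by exact string match (after strip), so a client must send one of
+# these statements verbatim, e.g. (illustrative only):
+#   curl -sS -X POST --data-binary 'SELECT COUNT(*) FROM hits;' \
+#        http://127.0.0.1:8000/query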
+_SQL_LIST: list[str] = [ + "SELECT COUNT(*) FROM hits;", + "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", + "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", + "SELECT AVG(UserID) FROM hits;", + "SELECT COUNT(DISTINCT UserID) FROM hits;", + "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", + "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", + "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", + "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", + "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", + "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", + "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", + "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", + "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", + "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", + "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", + "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", + "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", + "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), 
SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", + "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", + "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", + "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", + "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", + "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND 
IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", + "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", + "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", + "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", +] + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits, conn + start = timeit.default_timer() + df = pd.read_parquet("hits.parquet") + df["EventTime"] = pd.to_datetime(df["EventTime"], unit="s") + df["EventDate"] = pd.to_datetime(df["EventDate"], unit="D") + for col in df.columns: + if df[col].dtype == "O": + df[col] = df[col].astype(str) + hits = df + # DuckDB picks up pandas DataFrames from globals by name; bind explicitly + # too so the connection sees `hits`. + conn = duckdb.connect() + conn.register("hits", hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + sql = QUERIES[idx][0] + start = timeit.default_timer() + conn.execute(sql).fetchall() + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if hits is None: + return {"bytes": 0} + return {"bytes": int(hits.memory_usage().sum())} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_DUCKDB_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/duckdb-dataframe/start b/duckdb-dataframe/start new file mode 100755 index 000000000..e3fab7273 --- /dev/null +++ b/duckdb-dataframe/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! 
> server.pid diff --git a/duckdb-dataframe/stop b/duckdb-dataframe/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/duckdb-dataframe/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/duckdb-datalake-partitioned/benchmark.sh b/duckdb-datalake-partitioned/benchmark.sh index a24d2e2dd..33e6ce27b 100755 --- a/duckdb-datalake-partitioned/benchmark.sh +++ b/duckdb-datalake-partitioned/benchmark.sh @@ -1,22 +1,6 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: 14737666736" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-datalake-partitioned/check b/duckdb-datalake-partitioned/check new file mode 100755 index 000000000..3c457f3f1 --- /dev/null +++ b/duckdb-datalake-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-datalake-partitioned/data-size b/duckdb-datalake-partitioned/data-size new file mode 100755 index 000000000..7fcf52750 --- /dev/null +++ b/duckdb-datalake-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored remotely in S3 — fixed size (100 partitioned parquet files). +echo 14737666736 diff --git a/duckdb-datalake-partitioned/install b/duckdb-datalake-partitioned/install new file mode 100755 index 000000000..c59770063 --- /dev/null +++ b/duckdb-datalake-partitioned/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh +fi + +ln -sf "$HOME/.duckdb/cli/latest/duckdb" "$HOME/.local/bin/duckdb" 2>/dev/null || true diff --git a/duckdb-datalake-partitioned/load b/duckdb-datalake-partitioned/load new file mode 100755 index 000000000..03aecdd6f --- /dev/null +++ b/duckdb-datalake-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# create.sql installs httpfs and defines a VIEW directly over S3 partitioned +# parquet — no local data is loaded. Persist the view in hits.db. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-datalake-partitioned/query b/duckdb-datalake-partitioned/query new file mode 100755 index 000000000..0af71bda6 --- /dev/null +++ b/duckdb-datalake-partitioned/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over remote S3 partitioned parquet files). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. 
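+#
+# Illustrative run (not part of the harness): the result goes to stdout and the
+# timing line on stderr is what the driver is assumed to collect:
+#   echo "SELECT COUNT(*) FROM hits;" | ./query >/dev/null 2>elapsed.txt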
+set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-datalake-partitioned/run.sh b/duckdb-datalake-partitioned/run.sh deleted file mode 100755 index 61d016b4d..000000000 --- a/duckdb-datalake-partitioned/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-datalake-partitioned/start b/duckdb-datalake-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-datalake-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-datalake-partitioned/stop b/duckdb-datalake-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-datalake-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-datalake/benchmark.sh b/duckdb-datalake/benchmark.sh index c0c96825c..33e6ce27b 100755 --- a/duckdb-datalake/benchmark.sh +++ b/duckdb-datalake/benchmark.sh @@ -1,22 +1,6 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: 14779976446" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# Data is read directly from S3, no local download. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-datalake/check b/duckdb-datalake/check new file mode 100755 index 000000000..3c457f3f1 --- /dev/null +++ b/duckdb-datalake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-datalake/data-size b/duckdb-datalake/data-size new file mode 100755 index 000000000..351ceea7b --- /dev/null +++ b/duckdb-datalake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Hits dataset stored remotely in S3 — fixed size (single parquet). +echo 14779976446 diff --git a/duckdb-datalake/install b/duckdb-datalake/install new file mode 100755 index 000000000..c59770063 --- /dev/null +++ b/duckdb-datalake/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +if ! 
command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh +fi + +ln -sf "$HOME/.duckdb/cli/latest/duckdb" "$HOME/.local/bin/duckdb" 2>/dev/null || true diff --git a/duckdb-datalake/load b/duckdb-datalake/load new file mode 100755 index 000000000..376474284 --- /dev/null +++ b/duckdb-datalake/load @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# create.sql installs httpfs and defines a VIEW directly over S3 — no local +# data is loaded. Persist the view in hits.db. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-datalake/query b/duckdb-datalake/query new file mode 100755 index 000000000..492e96356 --- /dev/null +++ b/duckdb-datalake/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over a remote S3 parquet). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-datalake/run.sh b/duckdb-datalake/run.sh deleted file mode 100755 index 61d016b4d..000000000 --- a/duckdb-datalake/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-datalake/start b/duckdb-datalake/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-datalake/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-datalake/stop b/duckdb-datalake/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-datalake/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-memory/benchmark.sh b/duckdb-memory/benchmark.sh index 7c201d6bc..fc4bacc8f 100755 --- a/duckdb-memory/benchmark.sh +++ b/duckdb-memory/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install duckdb psutil - -# Load the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -v ./query.py 2>&1 | tee log.txt - -echo -n "Load time: " -cat log.txt | grep -P '^\d|Killed|Segmentation' | head -n1 - -cat log.txt | grep -P '^\d|Killed|Segmentation' | tail -n+2 | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo -n "Data size: " -grep -F 'Maximum resident set size' log.txt | grep -o -P '\d+$' | awk '{ print $1 * 1024 }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-memory/check b/duckdb-memory/check new file mode 100755 index 000000000..ef56e3554 --- /dev/null +++ b/duckdb-memory/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import duckdb; duckdb.sql('SELECT 1').fetchall()" >/dev/null diff --git a/duckdb-memory/data-size b/duckdb-memory/data-size new file mode 100755 index 000000000..e8e06f0d4 --- /dev/null +++ b/duckdb-memory/data-size @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +if [ -f data-size.txt ]; then + cat data-size.txt +else + # Fallback: measure now if `load` wasn't run. + # shellcheck disable=SC1091 + source myenv/bin/activate + /usr/bin/time -v python3 memory.py 2> /tmp/memory.log + grep -F 'Maximum resident set size' /tmp/memory.log \ + | grep -o -P '\d+$' \ + | awk '{ print $1 * 1024 }' +fi diff --git a/duckdb-memory/install b/duckdb-memory/install new file mode 100755 index 000000000..8024a13c5 --- /dev/null +++ b/duckdb-memory/install @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +# duckdb-memory uses the DuckDB Python module to fully materialize the dataset +# in memory. We install it inside a venv so we don't pollute the system Python. +sudo apt-get install -y python3-pip python3-venv + +if [ ! -x myenv/bin/python ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --upgrade pip +pip install duckdb psutil diff --git a/duckdb-memory/load b/duckdb-memory/load new file mode 100755 index 000000000..fbdf48f4b --- /dev/null +++ b/duckdb-memory/load @@ -0,0 +1,23 @@ +#!/bin/bash +set -e + +# State doesn't persist across `query` invocations (the table lives in +# :memory:), so the load is performed inside each query script. Here we just +# verify the source parquet is in place and measure peak memory once via +# memory.py — that becomes our `data-size` answer. +if [ ! -f hits.parquet ]; then + echo "load: hits.parquet missing in $(pwd)" >&2 + exit 1 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Run memory.py under `time -v` to capture the peak RSS, save it to +# data-size.txt for use by ./data-size. +/usr/bin/time -v python3 memory.py 2> memory.log +grep -F 'Maximum resident set size' memory.log \ + | grep -o -P '\d+$' \ + | awk '{ print $1 * 1024 }' > data-size.txt + +sync diff --git a/duckdb-memory/query b/duckdb-memory/query new file mode 100755 index 000000000..f6860f324 --- /dev/null +++ b/duckdb-memory/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the DuckDB Python module against +# an in-memory hits table loaded fresh from hits.parquet for each invocation. +# Stdout: query result (Python repr). Stderr: query runtime in fractional +# seconds on the last line. Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) + +python3 - "$query" <<'PY' +import duckdb, sys, timeit +query = sys.argv[1] + +con = duckdb.connect(':memory:') +con.execute("PRAGMA enable_progress_bar;") +con.execute("SET preserve_insertion_order = false;") +con.execute(open("create.sql").read()) +con.execute(open("load.sql").read()) + +start = timeit.default_timer() +result = con.sql(query).fetchall() +elapsed = timeit.default_timer() - start + +# Result on stdout. +for row in result: + print(row) + +# Timing on stderr — last line, fractional seconds. 
+print(f"{elapsed:.4f}", file=sys.stderr) +PY diff --git a/duckdb-memory/query.py b/duckdb-memory/query.py deleted file mode 100755 index b22f1d650..000000000 --- a/duckdb-memory/query.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 - -import duckdb -import timeit -import sys -import os - -con = duckdb.connect(':memory:') - -# enable the progress bar -con.execute('PRAGMA enable_progress_bar;') -con.execute('PRAGMA enable_print_progress_bar;') -# disable preservation of insertion order -con.execute("SET preserve_insertion_order = false;") - -# perform the actual load -print("Will load the data") -start = timeit.default_timer() -con.execute(open("create.sql").read()) -con.execute(open("load.sql").read()) -end = timeit.default_timer() -print(round(end - start, 3)) - -with open('queries.sql', 'r') as file: - for query in file: - print(query) - - for try_num in range(3): - start = timeit.default_timer() - results = con.sql(query).fetchall() - end = timeit.default_timer() - print(round(end - start, 3)) - del results diff --git a/duckdb-memory/start b/duckdb-memory/start new file mode 100755 index 000000000..058589d84 --- /dev/null +++ b/duckdb-memory/start @@ -0,0 +1,3 @@ +#!/bin/bash +# Embedded Python — no daemon to start. +exit 0 diff --git a/duckdb-memory/stop b/duckdb-memory/stop new file mode 100755 index 000000000..38c84edd2 --- /dev/null +++ b/duckdb-memory/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# Embedded Python — no daemon to stop. +exit 0 diff --git a/duckdb-parquet-partitioned/benchmark.sh b/duckdb-parquet-partitioned/benchmark.sh index d2db40aac..3b63e772a 100755 --- a/duckdb-parquet-partitioned/benchmark.sh +++ b/duckdb-parquet-partitioned/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -# Load the data -../download-hits-parquet-partitioned - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-parquet-partitioned/check b/duckdb-parquet-partitioned/check new file mode 100755 index 000000000..3c457f3f1 --- /dev/null +++ b/duckdb-parquet-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-parquet-partitioned/data-size b/duckdb-parquet-partitioned/data-size new file mode 100755 index 000000000..2d6921ab6 --- /dev/null +++ b/duckdb-parquet-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/duckdb-parquet-partitioned/install b/duckdb-parquet-partitioned/install new file mode 100755 index 000000000..c59770063 --- /dev/null +++ b/duckdb-parquet-partitioned/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +if ! 
command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh +fi + +ln -sf "$HOME/.duckdb/cli/latest/duckdb" "$HOME/.local/bin/duckdb" 2>/dev/null || true diff --git a/duckdb-parquet-partitioned/load b/duckdb-parquet-partitioned/load new file mode 100755 index 000000000..54176f57c --- /dev/null +++ b/duckdb-parquet-partitioned/load @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# create.sql defines a VIEW over hits_*.parquet — no ingestion happens. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-parquet-partitioned/query b/duckdb-parquet-partitioned/query new file mode 100755 index 000000000..a21f56770 --- /dev/null +++ b/duckdb-parquet-partitioned/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over hits_*.parquet). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-parquet-partitioned/run.sh b/duckdb-parquet-partitioned/run.sh deleted file mode 100755 index 61d016b4d..000000000 --- a/duckdb-parquet-partitioned/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-parquet-partitioned/start b/duckdb-parquet-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-parquet-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-parquet-partitioned/stop b/duckdb-parquet-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-parquet-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-parquet/benchmark.sh b/duckdb-parquet/benchmark.sh index e8ad1d894..fc4bacc8f 100755 --- a/duckdb-parquet/benchmark.sh +++ b/duckdb-parquet/benchmark.sh @@ -1,25 +1,5 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -# Load the data -../download-hits-parquet-single - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql - -echo "Data size: $(du -bcs hits*.parquet | grep total)" - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-parquet/check b/duckdb-parquet/check new file mode 100755 index 000000000..3c457f3f1 --- /dev/null +++ b/duckdb-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-parquet/data-size b/duckdb-parquet/data-size new file mode 100755 index 000000000..1aecba4a1 --- /dev/null +++ b/duckdb-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits*.parquet | awk '/total$/ { print $1 }' diff --git a/duckdb-parquet/install b/duckdb-parquet/install new file mode 100755 index 000000000..df91f5eb9 --- /dev/null +++ b/duckdb-parquet/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh +fi + +# Make duckdb available on PATH for sibling scripts in this directory. +ln -sf "$HOME/.duckdb/cli/latest/duckdb" "$HOME/.local/bin/duckdb" 2>/dev/null || true diff --git a/duckdb-parquet/load b/duckdb-parquet/load new file mode 100755 index 000000000..99b8db36e --- /dev/null +++ b/duckdb-parquet/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# create.sql defines a VIEW over hits.parquet — no ingestion happens, the +# parquet file is read in place at query time. We persist the view in +# hits.db so subsequent query invocations see it. +rm -f hits.db +duckdb hits.db -f create.sql +sync diff --git a/duckdb-parquet/query b/duckdb-parquet/query new file mode 100755 index 000000000..46a748cb7 --- /dev/null +++ b/duckdb-parquet/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (which +# contains a VIEW over hits.parquet). +# Stdout: query result. Stderr: query runtime in fractional seconds on the +# last line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "SET parquet_metadata_cache=true" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-parquet/run.sh b/duckdb-parquet/run.sh deleted file mode 100755 index 61d016b4d..000000000 --- a/duckdb-parquet/run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("SET parquet_metadata_cache=true") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb-parquet/start b/duckdb-parquet/start new file mode 100755 index 000000000..c1d4b2fca --- /dev/null +++ b/duckdb-parquet/start @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to start. +exit 0 diff --git a/duckdb-parquet/stop b/duckdb-parquet/stop new file mode 100755 index 000000000..7af43b828 --- /dev/null +++ b/duckdb-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to stop. 
+exit 0 diff --git a/duckdb-vortex-partitioned/benchmark.sh b/duckdb-vortex-partitioned/benchmark.sh index 0353f04f1..3b63e772a 100755 --- a/duckdb-vortex-partitioned/benchmark.sh +++ b/duckdb-vortex-partitioned/benchmark.sh @@ -1,53 +1,5 @@ #!/bin/bash - -# Install -sudo apt-get update -y -sudo apt-get install -y ninja-build cmake build-essential make ccache pip clang pkg-config - -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --no-modify-path - -export CC=clang -export CXX=clang++ -git clone https://github.com/vortex-data/duckdb-vortex --recursive -cd duckdb-vortex -git fetch --tags -git checkout v0.44.0 -git submodule update --init --recursive -GEN=ninja NATIVE_ARCH=1 LTO=thin make -export PATH="`pwd`/build/release/:$PATH" -cd .. - -# Load the data -../download-hits-parquet-partitioned - -# Convert parquet files to vortex partitioned -echo -n "Load time: " -seq 0 99 | command time -f '%e' xargs -P"$(nproc)" -I{} bash -c ' - if [ ! -f "hits_{}.vortex" ]; then - duckdb -c " - COPY ( - SELECT * - REPLACE ( - make_date(EventDate) AS EventDate, - epoch_ms(EventTime * 1000) as EventTime - ) - FROM read_parquet('"'"'hits_{}.parquet'"'"', binary_as_string=True) - ) - TO '"'"'hits_{}.vortex'"'"' (FORMAT VORTEX) - " - fi -' - -echo -n "Load time: " -command time -f '%e' duckdb hits-partitioned.db -c "CREATE VIEW hits AS SELECT * FROM read_vortex('hits_*.vortex')"; - -# Run the queries -echo 'partitioned' - -./run.sh 'hits-partitioned.db' 2>&1 | tee log-p.txt -cat log-p.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -bcs hits_*.vortex | grep total)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-vortex-partitioned/check b/duckdb-vortex-partitioned/check new file mode 100755 index 000000000..43c60a61e --- /dev/null +++ b/duckdb-vortex-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DUCKDB="$(pwd)/duckdb-vortex/build/release/duckdb" +"$DUCKDB" -c "SELECT 1" >/dev/null diff --git a/duckdb-vortex-partitioned/data-size b/duckdb-vortex-partitioned/data-size new file mode 100755 index 000000000..4bb0a059a --- /dev/null +++ b/duckdb-vortex-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.vortex | awk '/total$/ { print $1 }' diff --git a/duckdb-vortex-partitioned/install b/duckdb-vortex-partitioned/install new file mode 100755 index 000000000..34f11d1bc --- /dev/null +++ b/duckdb-vortex-partitioned/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -e + +# Build duckdb-vortex from source. Idempotent. +if [ -x duckdb-vortex/build/release/duckdb ]; then + exit 0 +fi + +sudo apt-get update -y +sudo apt-get install -y ninja-build cmake build-essential make ccache pip clang pkg-config + +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain stable --no-modify-path +fi +# shellcheck disable=SC1091 +[ -f "$HOME/.cargo/env" ] && source "$HOME/.cargo/env" + +export CC=clang +export CXX=clang++ + +if [ ! 
-d duckdb-vortex ]; then + git clone https://github.com/vortex-data/duckdb-vortex --recursive +fi + +( + cd duckdb-vortex + git fetch --tags + git checkout v0.44.0 + git submodule update --init --recursive + GEN=ninja NATIVE_ARCH=1 LTO=thin make +) diff --git a/duckdb-vortex-partitioned/load b/duckdb-vortex-partitioned/load new file mode 100755 index 000000000..282755fc0 --- /dev/null +++ b/duckdb-vortex-partitioned/load @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + +DUCKDB="$(pwd)/duckdb-vortex/build/release/duckdb" + +# Convert each parquet partition to a vortex file in parallel. +seq 0 99 | xargs -P"$(nproc)" -I{} bash -c ' + if [ ! -f "hits_{}.vortex" ]; then + "'"$DUCKDB"'" -c " + COPY ( + SELECT * + REPLACE ( + make_date(EventDate) AS EventDate, + epoch_ms(EventTime * 1000) AS EventTime + ) + FROM read_parquet('"'"'hits_{}.parquet'"'"', binary_as_string=True) + ) + TO '"'"'hits_{}.vortex'"'"' (FORMAT VORTEX) + " + fi +' + +# Build a persistent VIEW over the resulting vortex files. +rm -f hits.db +"$DUCKDB" hits.db -c "CREATE VIEW hits AS SELECT * FROM read_vortex('hits_*.vortex')" + +# Free the source parquet files. +rm -f hits_*.parquet +sync diff --git a/duckdb-vortex-partitioned/query b/duckdb-vortex-partitioned/query new file mode 100755 index 000000000..3e67e1977 --- /dev/null +++ b/duckdb-vortex-partitioned/query @@ -0,0 +1,21 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the custom-built duckdb-vortex +# binary against hits.db (a VIEW over hits_*.vortex). +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +DUCKDB="$(pwd)/duckdb-vortex/build/release/duckdb" +query=$(cat) + +out=$("$DUCKDB" hits.db -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? 
+ +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-vortex-partitioned/run.sh b/duckdb-vortex-partitioned/run.sh deleted file mode 100755 index 71bd5c4a5..000000000 --- a/duckdb-vortex-partitioned/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb "$1" "${cli_params[@]}" -done; diff --git a/duckdb-vortex-partitioned/start b/duckdb-vortex-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-vortex-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-vortex-partitioned/stop b/duckdb-vortex-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-vortex-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-vortex/benchmark.sh b/duckdb-vortex/benchmark.sh index 5234cb9b8..fc4bacc8f 100755 --- a/duckdb-vortex/benchmark.sh +++ b/duckdb-vortex/benchmark.sh @@ -1,31 +1,5 @@ #!/bin/bash - -set -Eeuo pipefail - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -duckdb -c "INSTALL vortex;" - -# Load the data -../download-hits-parquet-single - -# Convert parquet files to Vortex -echo -n "Load time: " -command time -f '%e' duckdb -c "LOAD vortex; COPY 'hits.parquet' TO 'hits.vortex' (FORMAT vortex);" - -# Create view and macro -echo -n "Load time: " -command time -f '%e' duckdb hits-single.db -f create.sql - -echo 'single' - -./run.sh 'hits-single.db' 2>&1 | tee log-s.txt -cat log-s.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.vortex)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb-vortex/check b/duckdb-vortex/check new file mode 100755 index 000000000..3c457f3f1 --- /dev/null +++ b/duckdb-vortex/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb-vortex/data-size b/duckdb-vortex/data-size new file mode 100755 index 000000000..2dbaf40fa --- /dev/null +++ b/duckdb-vortex/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits.vortex | awk 'END { print $1 }' diff --git a/duckdb-vortex/install b/duckdb-vortex/install new file mode 100755 index 000000000..4ace21444 --- /dev/null +++ b/duckdb-vortex/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh +fi + +ln -sf "$HOME/.duckdb/cli/latest/duckdb" "$HOME/.local/bin/duckdb" 2>/dev/null || true + +# Install the vortex extension (idempotent — safe to re-run). 
+duckdb -c "INSTALL vortex;" diff --git a/duckdb-vortex/load b/duckdb-vortex/load new file mode 100755 index 000000000..5fa24aacf --- /dev/null +++ b/duckdb-vortex/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +# Convert source parquet to Vortex format. +duckdb -c "LOAD vortex; COPY 'hits.parquet' TO 'hits.vortex' (FORMAT vortex);" + +# Create the persistent VIEW (over hits.vortex) in hits.db. +rm -f hits.db +duckdb hits.db -f create.sql + +# Source parquet no longer needed. +rm -f hits.parquet +sync diff --git a/duckdb-vortex/query b/duckdb-vortex/query new file mode 100755 index 000000000..5eef5331a --- /dev/null +++ b/duckdb-vortex/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db (a VIEW +# over hits.vortex). The vortex extension is loaded for each invocation. +# Stdout: query result. Stderr: runtime in fractional seconds on the last +# line. Exit non-zero on error. +set -e + +query=$(cat) + +out=$(duckdb hits.db -c "LOAD vortex;" -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +printf '%s\n' "$out" | grep -v '^Run Time ' +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb-vortex/run.sh b/duckdb-vortex/run.sh deleted file mode 100755 index 30484964b..000000000 --- a/duckdb-vortex/run.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -Eeuo pipefail - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=("LOAD vortex;") - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb "$1" "${cli_params[@]}" -done; diff --git a/duckdb-vortex/start b/duckdb-vortex/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-vortex/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb-vortex/stop b/duckdb-vortex/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/duckdb-vortex/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/duckdb/benchmark.sh b/duckdb/benchmark.sh index 767d7fc53..fc4bacc8f 100755 --- a/duckdb/benchmark.sh +++ b/duckdb/benchmark.sh @@ -1,24 +1,5 @@ #!/bin/bash - -# Install -export HOME=${HOME:=~} -curl https://install.duckdb.org | sh -export PATH=$HOME'/.duckdb/cli/latest':$PATH - -# Load the data -../download-hits-parquet-single - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -storage_version latest -f create.sql -f load.sql - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -wc -c hits.db - -cat log.txt | - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | - sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/duckdb/check b/duckdb/check new file mode 100755 index 000000000..3c457f3f1 --- /dev/null +++ b/duckdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +duckdb -c "SELECT 1" >/dev/null diff --git a/duckdb/data-size b/duckdb/data-size new file mode 100755 index 000000000..b0e7eef3b --- /dev/null +++ b/duckdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.db diff --git a/duckdb/install b/duckdb/install new file mode 100755 index 000000000..df91f5eb9 --- /dev/null +++ b/duckdb/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +if ! command -v duckdb >/dev/null 2>&1; then + export HOME=${HOME:=~} + curl https://install.duckdb.org | sh +fi + +# Make duckdb available on PATH for sibling scripts in this directory. +ln -sf "$HOME/.duckdb/cli/latest/duckdb" "$HOME/.local/bin/duckdb" 2>/dev/null || true diff --git a/duckdb/load.sql b/duckdb/load old mode 100644 new mode 100755 similarity index 53% rename from duckdb/load.sql rename to duckdb/load index 24891835f..ae8ee21ff --- a/duckdb/load.sql +++ b/duckdb/load @@ -1,3 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: blow away any prior partial DB. +rm -f hits.db + +# Build the database from the source parquet via inline create + load DDL. +duckdb hits.db -storage_version latest <<'SQL' +.read create.sql INSERT INTO hits SELECT * REPLACE ( make_date(EventDate) AS EventDate, @@ -5,3 +14,7 @@ SELECT * REPLACE ( epoch_ms(ClientEventTime * 1000) AS ClientEventTime, epoch_ms(LocalEventTime * 1000) AS LocalEventTime) FROM read_parquet('hits.parquet', binary_as_string=True); +SQL + +rm -f hits.parquet +sync diff --git a/duckdb/query b/duckdb/query new file mode 100755 index 000000000..51d155afd --- /dev/null +++ b/duckdb/query @@ -0,0 +1,25 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via duckdb against hits.db. +# Stdout: query result (boxed format). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# duckdb writes both the result and the `Run Time (s): real X.XXX` line to +# stdout; capture, split, redirect timing to stderr. +out=$(duckdb hits.db -c ".timer on" -c "$query" 2>/tmp/duckdb.err.$$) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + cat /tmp/duckdb.err.$$ >&2 + rm -f /tmp/duckdb.err.$$ + exit "$status" +fi +rm -f /tmp/duckdb.err.$$ + +# Stdout: everything except the Run Time line. +printf '%s\n' "$out" | grep -v '^Run Time ' + +# Stderr: the timing in seconds. +printf '%s\n' "$out" | awk '/^Run Time/ { print $5 }' | tail -n1 >&2 diff --git a/duckdb/run.sh b/duckdb/run.sh deleted file mode 100755 index 25aee48ef..000000000 --- a/duckdb/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=(".timer on") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("${query}") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/duckdb/start b/duckdb/start new file mode 100755 index 000000000..c1d4b2fca --- /dev/null +++ b/duckdb/start @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to start. 
+exit 0 diff --git a/duckdb/stop b/duckdb/stop new file mode 100755 index 000000000..7af43b828 --- /dev/null +++ b/duckdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# duckdb is an embedded CLI tool — no daemon to stop. +exit 0 diff --git a/elasticsearch/benchmark.sh b/elasticsearch/benchmark.sh index e50248e12..eec9d34a8 100755 --- a/elasticsearch/benchmark.sh +++ b/elasticsearch/benchmark.sh @@ -1,80 +1,6 @@ #!/bin/bash - -# Install prerequisite packages -sudo apt-get update -y -sudo apt-get install -y apt-transport-https ca-certificates wget gpg time jq bc - -# Add Elastic's signing key -wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg - -# Add the repository for version 9.x -echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/9.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-9.x.list - -# Update package list and install -sudo apt-get update -y -sudo apt-get install -y elasticsearch - -sudo /bin/systemctl daemon-reload -sudo /bin/systemctl enable elasticsearch.service -sudo systemctl start elasticsearch.service - -# Disable security (all other are default configs) -cat << EOF > elasticsearch.yml -path.data: /var/lib/elasticsearch -path.logs: /var/log/elasticsearch - -# Disable security features -xpack.security.enabled: false -xpack.security.http.ssl.enabled: false -xpack.security.transport.ssl.enabled: false - -cluster.initial_master_nodes: ["clickbench"] -http.host: 0.0.0.0 -EOF - -sudo cp elasticsearch.yml /etc/elasticsearch/elasticsearch.yml - -# Restart Elasticsearch with the updated configs -sudo systemctl restart elasticsearch.service - - -# Check Elasticsearch is alive - you should get a JSON response -curl -sS -X GET 'http://localhost:9200' - - -###### Create index with mappings mirroring data types in ClickHouse - -# Note: Field types were mapped as closely as possible to https://github.com/ClickHouse/ClickBench/blob/main/clickhouse/create.sql I chose "keyword" because queries are not taking advantage of freetext search. - -# Note: Elasticsearch does not have the concept of a primary key, but it does have an "index sorting" feature, which is intended to help in analytical use cases where sort order on disk matters. I set it to the same parameters as primary key for the ClickHouse tests https://github.com/ClickHouse/ClickBench/blob/main/clickhouse/create.sql - -curl -sS -X PUT "http://localhost:9200/hits?pretty" -H 'Content-Type: application/json' -d @mapping.json - - -###### Data loading (JSON dump via ES Bulk API insert) - -# Download the data -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' - -START=$(date +%s) - -# Reads and loads all the data into Elasticsearch -python3 load.py - -# check on progress -curl -sS -X GET "http://localhost:9200/hits/_stats/docs?pretty" - -# Makes sure all data is flushed to disk -curl -sS -X GET "http://localhost:9200/_flush?pretty" - -# when data loading is finished, to get all stats run -# For Load time, look at: bulk.total_time_in_millis -# For Data size, look at: store.total_data_set_size_in_bytes -curl -sS -X GET "http://localhost:9200/hits/_stats?pretty" | tee stats.json -echo "Data size: $(jq -r '._all.total.store.total_data_set_size_in_bytes' stats.json)" - -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -###### Run the queries -./run.sh +# Thin shim — actual flow is in lib/benchmark-common.sh. 
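+# BENCH_RESTARTABLE=yes: unlike the embedded DuckDB variants, Elasticsearch is
+# a systemd service, so the shared driver can presumably use ./stop and ./start
+# between runs for genuinely cold measurements.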
+# Source data is gzipped NDJSON, fetched directly inside ./load.
+export BENCH_DOWNLOAD_SCRIPT=""
+export BENCH_RESTARTABLE=yes
+exec ../lib/benchmark-common.sh
diff --git a/elasticsearch/check b/elasticsearch/check
new file mode 100755
index 000000000..ca1249471
--- /dev/null
+++ b/elasticsearch/check
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e
+
+curl -sSf 'http://localhost:9200' >/dev/null
diff --git a/elasticsearch/data-size b/elasticsearch/data-size
new file mode 100755
index 000000000..5a1637919
--- /dev/null
+++ b/elasticsearch/data-size
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -eu
+
+curl -sS -X GET 'http://localhost:9200/hits/_stats' \
+    | jq -r '._all.total.store.total_data_set_size_in_bytes'
diff --git a/elasticsearch/install b/elasticsearch/install
new file mode 100755
index 000000000..532a63d6d
--- /dev/null
+++ b/elasticsearch/install
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -eu
+
+sudo apt-get update -y
+sudo apt-get install -y apt-transport-https ca-certificates wget gpg time jq bc python3 python3-pip
+
+# Elastic signing key + repo (idempotent if already present).
+if [ ! -f /usr/share/keyrings/elasticsearch-keyring.gpg ]; then
+    wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \
+        | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
+fi
+
+if [ ! -f /etc/apt/sources.list.d/elastic-9.x.list ]; then
+    echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/9.x/apt stable main" \
+        | sudo tee /etc/apt/sources.list.d/elastic-9.x.list
+    sudo apt-get update -y
+fi
+
+sudo apt-get install -y elasticsearch
+
+# load.py uses requests.
+pip3 install --quiet --break-system-packages requests || pip3 install --quiet requests
+
+# Disable security; bind on all interfaces (matches original config).
+cat <<EOF | sudo tee /etc/elasticsearch/elasticsearch.yml >/dev/null
+path.data: /var/lib/elasticsearch
+path.logs: /var/log/elasticsearch
+
+xpack.security.enabled: false
+xpack.security.http.ssl.enabled: false
+xpack.security.transport.ssl.enabled: false
+
+cluster.initial_master_nodes: ["clickbench"]
+http.host: 0.0.0.0
+EOF
+
+sudo /bin/systemctl daemon-reload
+sudo systemctl enable elasticsearch.service
+sudo systemctl restart elasticsearch.service
diff --git a/elasticsearch/load b/elasticsearch/load
new file mode 100755
index 000000000..f9c507480
--- /dev/null
+++ b/elasticsearch/load
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -eu
+
+# Fetch source data (NDJSON, gzipped). load.py reads it directly.
+wget --continue --progress=dot:giga \
+    'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
+
+# Idempotent: drop existing index.
+curl -sS -X DELETE "http://localhost:9200/hits" >/dev/null 2>&1 || true
+
+# Create index with explicit field types (mirrors ClickHouse).
+curl -sS -X PUT "http://localhost:9200/hits?pretty" \
+    -H 'Content-Type: application/json' -d @mapping.json >/dev/null
+
+# Bulk load NDJSON (gzipped) via the ES Bulk API.
+python3 load.py
+
+# Force a flush so on-disk size is final.
+curl -sS -X GET "http://localhost:9200/_flush?pretty" >/dev/null
+
+rm -f hits.json.gz
+sync
diff --git a/elasticsearch/query b/elasticsearch/query
new file mode 100755
index 000000000..ffd08d731
--- /dev/null
+++ b/elasticsearch/query
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Reads a SQL query from stdin, translates it to Elasticsearch DSL via the
+# /_sql/translate endpoint, then runs it against /_search.
+#
+# Stdout: ES JSON response.
+# Stderr: query runtime in fractional seconds on the last line (converted
+# from the search response's "took" field, which is in milliseconds).
+# Exit non-zero on error.
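+#
+# Roughly the manual equivalent (a sketch; the script below adds error
+# handling and cache clearing):
+#   dsl=$(curl -sS -XPOST localhost:9200/_sql/translate \
+#       -H 'Content-Type: application/json' -d'{"query":"SELECT COUNT(*) FROM hits"}')
+#   curl -sS -XGET localhost:9200/hits/_search \
+#       -H 'Content-Type: application/json' -d"$dsl"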
+set -e + +query=$(cat) + +# Clear query cache to keep tries comparable. +curl -sS -X POST 'http://localhost:9200/hits/_cache/clear' >/dev/null 2>&1 || true + +# Translate SQL -> DSL. +sql_json=$(jq -nc --arg q "$query" '{query: $q}') +dsl=$(curl -sS -X POST 'http://localhost:9200/_sql/translate' \ + -H 'Content-Type: application/json' -d "$sql_json") + +if printf '%s\n' "$dsl" | jq -e 'has("error")' >/dev/null 2>&1; then + printf '%s\n' "$dsl" >&2 + exit 1 +fi + +resp=$(curl -sS -X GET 'http://localhost:9200/hits/_search' \ + -H 'Content-Type: application/json' -d "$dsl") + +if printf '%s\n' "$resp" | jq -e 'has("error")' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" + +took_ms=$(printf '%s\n' "$resp" | jq -r '.took // empty') +if [ -z "$took_ms" ] || [ "$took_ms" = "null" ]; then + echo "no .took in elasticsearch response" >&2 + exit 1 +fi + +awk -v m="$took_ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/elasticsearch/run.sh b/elasticsearch/run.sh deleted file mode 100755 index 5c2607aad..000000000 --- a/elasticsearch/run.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat 'queries.sql' | while read -r QUERY; do - sync && echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo -n "[" - - for i in $(seq 1 $TRIES); do - - # clear query cache between runs - curl -X POST 'http://localhost:9200/hits/_cache/clear?pretty' &>/dev/null - - JSON="{\"query\" : \"$QUERY\" }" - - # translate query to DSL - DSL=$(curl -s -X POST "http://localhost:9200/_sql/translate?pretty" -H 'Content-Type: application/json' -d"$JSON" ) - - # start external timer - START=`date +%s.%N` - - # Run DSL directly through search API - ES_RSP=$(curl -s -X GET "http://localhost:9200/hits/_search" -H 'Content-Type: application/json' -d"$DSL" ) - - # run query through SQL API (choosing not to use SQL API directly, because it stalls some queries w/o feedback or cancelling, e.g. 
6, 13-15, 17, 31-36) - # curl -X POST 'http://localhost:9200/_sql?format=txt&pretty' -H 'Content-Type: application/json' -d"$JSON" #&>/dev/null - - # calculate timing outside of Elasticsearch (needed for runs through SQL API which does not return the time it took to run) - END=`date +%s.%N` - RES=$( echo "$END - $START" | bc -l ) - - # retrieve timing from Elastic Search API "took" parameter and convert to seconds - ES_TIME=$(echo $ES_RSP | jq -r '.took') - ES_TIME=$(echo "scale=4; $ES_TIME / 1000" | bc) - - # output ES_TIME to console (it's more accurate), and if ES returned an error, print null - [[ "$( jq 'has("error")' <<< $ES_RSP )" == "true" ]] && echo -n "null" || echo -n "$ES_TIME" - [[ "$i" != $TRIES ]] && echo -n ", " - - done - - echo "]," - -done; diff --git a/elasticsearch/start b/elasticsearch/start new file mode 100755 index 000000000..8fa318334 --- /dev/null +++ b/elasticsearch/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if curl -sSf 'http://localhost:9200' >/dev/null 2>&1; then + exit 0 +fi +sudo systemctl start elasticsearch.service diff --git a/elasticsearch/stop b/elasticsearch/stop new file mode 100755 index 000000000..db18bc195 --- /dev/null +++ b/elasticsearch/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop elasticsearch.service || true diff --git a/gizmosql/benchmark.sh b/gizmosql/benchmark.sh index 1e4204014..b85187617 100755 --- a/gizmosql/benchmark.sh +++ b/gizmosql/benchmark.sh @@ -1,73 +1,5 @@ #!/bin/bash - -# needed by DuckDB -export HOME=/home/ubuntu - -# Install requirements -apt-get update -y -apt install openjdk-17-jre-headless unzip netcat-openbsd -y - -# Detect architecture (maps x86_64->amd64, aarch64->arm64) -ARCH=$(uname -m) -if [ "$ARCH" = "x86_64" ]; then - ARCH="amd64" -elif [ "$ARCH" = "aarch64" ]; then - ARCH="arm64" -fi - -# Server setup Install -curl -L -o gizmosql.zip "https://github.com/gizmodata/gizmosql/releases/latest/download/gizmosql_cli_linux_${ARCH}.zip" -unzip gizmosql.zip -mv gizmosql_server gizmosql_client /usr/local/bin/ - -# Install Java and the GizmoSQLLine CLI client -pushd /tmp -curl -L -o gizmosqlline https://github.com/gizmodata/gizmosqlline/releases/latest/download/gizmosqlline -chmod +x gizmosqlline -mv gizmosqlline /usr/local/bin/ -popd - -# Source our env vars and utility functions for starting/stopping gizmosql server -. util.sh - -# Start the GizmoSQL server in the background -start_gizmosql - -# Create the table -gizmosqlline \ - -u ${GIZMOSQL_SERVER_URI} \ - -n ${GIZMOSQL_USERNAME} \ - -p ${GIZMOSQL_PASSWORD} \ - -f create.sql - -# Load the data -../download-hits-parquet-single - -echo -n "Load time: " -time gizmosqlline \ - -u ${GIZMOSQL_SERVER_URI} \ - -n ${GIZMOSQL_USERNAME} \ - -p ${GIZMOSQL_PASSWORD} \ - -f load.sql - -stop_gizmosql - -# Run the queries -./run.sh 2>&1 | tee log.txt - -# Remove carriage returns from the log -sed -i 's/\r$//' log.txt - -echo -n "Data size: " -wc -c clickbench.db - -cat log.txt | \ - grep -E 'rows? selected \([0-9.]+ seconds\)|Killed|Segmentation' | \ - sed -E 's/.*rows? selected \(([0-9.]+) seconds\).*/\1/; s/.*(Killed|Segmentation).*/null/' | \ - awk '{ - if (NR % 3 == 1) printf "["; - if ($1 == "null") printf "null"; - else printf $1; - if (NR % 3 == 0) printf "],\n"; - else printf ", "; - }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/gizmosql/check b/gizmosql/check new file mode 100755 index 000000000..d16666481 --- /dev/null +++ b/gizmosql/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# A simple TCP probe is the cheapest health check; the JDBC arrow-flight URI +# is auth-gated, so we just check the listener is up. +exec nc -z localhost 31337 diff --git a/gizmosql/data-size b/gizmosql/data-size new file mode 100755 index 000000000..536ed36d9 --- /dev/null +++ b/gizmosql/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +wc -c < clickbench.db | awk '{print $1}' diff --git a/gizmosql/install b/gizmosql/install new file mode 100755 index 000000000..0096d5c93 --- /dev/null +++ b/gizmosql/install @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jre-headless unzip netcat-openbsd curl + +ARCH=$(uname -m) +if [ "$ARCH" = "x86_64" ]; then + ARCH="amd64" +elif [ "$ARCH" = "aarch64" ]; then + ARCH="arm64" +fi + +if ! command -v gizmosql_server >/dev/null 2>&1; then + curl -L -o gizmosql.zip \ + "https://github.com/gizmodata/gizmosql/releases/latest/download/gizmosql_cli_linux_${ARCH}.zip" + unzip -o gizmosql.zip + sudo mv gizmosql_server gizmosql_client /usr/local/bin/ +fi + +if ! command -v gizmosqlline >/dev/null 2>&1; then + pushd /tmp >/dev/null + curl -L -o gizmosqlline \ + https://github.com/gizmodata/gizmosqlline/releases/latest/download/gizmosqlline + chmod +x gizmosqlline + sudo mv gizmosqlline /usr/local/bin/ + popd >/dev/null +fi diff --git a/gizmosql/load b/gizmosql/load new file mode 100755 index 000000000..14c72d723 --- /dev/null +++ b/gizmosql/load @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +. ./util.sh + +# Idempotent: blow away any prior database. +rm -f clickbench.db + +# Server must be up to receive create + load via gizmosqlline. +gizmosqlline \ + -u "$GIZMOSQL_SERVER_URI" \ + -n "$GIZMOSQL_USERNAME" \ + -p "$GIZMOSQL_PASSWORD" \ + -f create.sql + +gizmosqlline \ + -u "$GIZMOSQL_SERVER_URI" \ + -n "$GIZMOSQL_USERNAME" \ + -p "$GIZMOSQL_PASSWORD" \ + -f load.sql + +rm -f hits.parquet +sync diff --git a/gizmosql/query b/gizmosql/query new file mode 100755 index 000000000..a6be9dec2 --- /dev/null +++ b/gizmosql/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via gizmosqlline. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# gizmosqlline's "N rows selected (X seconds)" footer). +# Exit non-zero on error. +set -e + +. ./util.sh +query=$(cat) + +raw=$(printf '%s\n' "$query" | gizmosqlline \ + -u "$GIZMOSQL_SERVER_URI" \ + -n "$GIZMOSQL_USERNAME" \ + -p "$GIZMOSQL_PASSWORD" 2>&1) && exit_code=0 || exit_code=$? + +# Strip carriage returns from gizmosqlline output. +clean=$(printf '%s\n' "$raw" | tr -d '\r') + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$clean" | grep -qiE 'error|exception|Killed|Segmentation'; then + printf '%s\n' "$clean" >&2 + exit 1 +fi + +printf '%s\n' "$clean" + +secs=$(printf '%s\n' "$clean" \ + | grep -oP '[0-9.]+(?= seconds\))' \ + | tail -n1) + +if [ -z "$secs" ]; then + echo "no timing in gizmosqlline output" >&2 + exit 1 +fi + +awk -v s="$secs" 'BEGIN { printf "%.3f\n", s }' >&2 diff --git a/gizmosql/run.sh b/gizmosql/run.sh deleted file mode 100755 index 34a9c8fe0..000000000 --- a/gizmosql/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Source our env vars -. 
util.sh - -TRIES=3 -TEMP_SQL_FILE="/tmp/benchmark_queries_$$.sql" - -# Ensure server is stopped on script exit -trap stop_gizmosql EXIT - -# Read queries from file -mapfile -t queries < queries.sql - -echo "Running benchmark with ${#queries[@]} queries, ${TRIES} tries each..." - -for query in "${queries[@]}"; do - > "${TEMP_SQL_FILE}" - - # Clear Linux memory caches to ensure fair benchmark comparisons - sync - echo 3 | tee /proc/sys/vm/drop_caches > /dev/null - - # Start the GizmoSQL server - start_gizmosql - - # Add a comment to identify the query in the output - echo "-- Query: ${query}" >> "${TEMP_SQL_FILE}" - - # Repeat each query TRIES times - for i in $(seq 1 ${TRIES}); do - echo "${query}" >> "${TEMP_SQL_FILE}" - done - - # Execute the query script - gizmosqlline \ - -u ${GIZMOSQL_SERVER_URI} \ - -n ${GIZMOSQL_USERNAME} \ - -p ${GIZMOSQL_PASSWORD} \ - -f "${TEMP_SQL_FILE}" - - # Stop the server before next query - stop_gizmosql -done - -# Clean up -rm -f "${TEMP_SQL_FILE}" diff --git a/gizmosql/start b/gizmosql/start new file mode 100755 index 000000000..29d3910ce --- /dev/null +++ b/gizmosql/start @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +. ./util.sh + +# Idempotent: if port 31337 is already open, do nothing. +if nc -z localhost 31337 2>/dev/null; then + exit 0 +fi + +start_gizmosql diff --git a/gizmosql/stop b/gizmosql/stop new file mode 100755 index 000000000..5a4f08d1c --- /dev/null +++ b/gizmosql/stop @@ -0,0 +1,7 @@ +#!/bin/bash + +. ./util.sh +stop_gizmosql || true + +# Belt-and-braces: kill any leftover gizmosql_server process. +pkill -x gizmosql_server 2>/dev/null || true diff --git a/glaredb-partitioned/benchmark.sh b/glaredb-partitioned/benchmark.sh index b33b578e2..3b63e772a 100755 --- a/glaredb-partitioned/benchmark.sh +++ b/glaredb-partitioned/benchmark.sh @@ -1,31 +1,5 @@ -#!/usr/bin/env bash - -set -e - -repo_root=$(git rev-parse --show-toplevel) -script_dir=$(dirname "$0") - -if [[ "$(basename "$repo_root")" == "glaredb" ]]; then - # Inside glaredb repo, build from source. - cargo build --release --bin glaredb - cp "${repo_root}/target/release/glaredb" "${script_dir}/glaredb" -else - # Not in glaredb repo, use prebuilt binary. - export GLAREDB_INSTALL_DIR="${script_dir}" - export GLAREDB_VERSION="v25.5.11" - curl -fsSL https://glaredb.com/install.sh | sh -fi - -# Get the data. -"${script_dir}/../download-hits-parquet-partitioned" "${script_dir}/data" -pushd "${script_dir}/data" -echo "Data size: $(du -bcs hits*.parquet | grep total)" -echo "Load time: 0" -popd - -# Ensure working directory is the script dir. The view that gets created uses a -# relative path. -pushd "${script_dir}" - -./run.sh partitioned -cat results.json +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/glaredb-partitioned/check b/glaredb-partitioned/check new file mode 100755 index 000000000..bf7e53000 --- /dev/null +++ b/glaredb-partitioned/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./glaredb -c "SELECT 1" >/dev/null diff --git a/glaredb-partitioned/data-size b/glaredb-partitioned/data-size new file mode 100755 index 000000000..400c51843 --- /dev/null +++ b/glaredb-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs data | awk '/total$/ { print $1 }' diff --git a/glaredb-partitioned/install b/glaredb-partitioned/install new file mode 100755 index 000000000..5891568bc --- /dev/null +++ b/glaredb-partitioned/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if [ ! -x ./glaredb ]; then + export GLAREDB_INSTALL_DIR="$(pwd)" + export GLAREDB_VERSION="v25.5.11" + curl -fsSL https://glaredb.com/install.sh | sh +fi diff --git a/glaredb-partitioned/load b/glaredb-partitioned/load new file mode 100755 index 000000000..c2afca99a --- /dev/null +++ b/glaredb-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +# glaredb-partitioned's create.sql references ./data/hits_*.parquet, so move +# the partitioned files into the expected subdir. +set -e + +mkdir -p data +mv hits_*.parquet data/ 2>/dev/null || true +sync diff --git a/glaredb-partitioned/query b/glaredb-partitioned/query new file mode 100755 index 000000000..09c4727a9 --- /dev/null +++ b/glaredb-partitioned/query @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +query=$(cat) + +out=$(./glaredb --init create.sql -c ".timer on" -c "$query" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +printf '%s\n' "$out" | grep -v '^Execution duration' || true + +printf '%s\n' "$out" | awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' | tail -n1 >&2 diff --git a/glaredb-partitioned/run.sh b/glaredb-partitioned/run.sh deleted file mode 100755 index 05d513367..000000000 --- a/glaredb-partitioned/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -set -eu -set -o pipefail - -TRIES=3 -QUERY_NUM=0 - -echo "[" > results.json -echo "query_num,iteration,duration" > results.csv - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "${QUERY_NUM}: ${query}" - - [ "${QUERY_NUM}" != 0 ] && echo "," >> results.json - echo -n " [" >> results.json - - for i in $(seq 1 $TRIES); do - output=$(./glaredb --init create.sql -c ".timer on" -c "${query}") - duration=$(awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' <<< "$output") - - echo "$output" - - if [ -z "${duration}" ]; then - echo "Query failed" - exit 1 - fi - - # JSON results - echo -n "${duration}" >> results.json - [ "${i}" != "${TRIES}" ] && echo -n "," >> results.json - - # CSV results - echo "${QUERY_NUM},${i},${duration}" >> results.csv - done - - echo -n "]" >> results.json - - QUERY_NUM=$((QUERY_NUM + 1)) -done - -echo "" >> results.csv -echo "" >> results.json -echo "]" >> results.json diff --git a/glaredb-partitioned/start b/glaredb-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/glaredb-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb-partitioned/stop b/glaredb-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/glaredb-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb/benchmark.sh 
b/glaredb/benchmark.sh index 89ec7011e..fc4bacc8f 100755 --- a/glaredb/benchmark.sh +++ b/glaredb/benchmark.sh @@ -1,32 +1,5 @@ -#!/usr/bin/env bash - -set -e - -repo_root=$(git rev-parse --show-toplevel) -script_dir=$(dirname "$0") - -if [[ "$(basename "$repo_root")" == "glaredb" ]]; then - # Inside glaredb repo, build from source. - cargo build --release --bin glaredb - cp "${repo_root}/target/release/glaredb" "${script_dir}/glaredb" -else - # Not in glaredb repo, use prebuilt binary. - export GLAREDB_INSTALL_DIR="${script_dir}" - export GLAREDB_VERSION="v25.5.11" - curl -fsSL https://glaredb.com/install.sh | sh -fi - -# Get the data. -"${script_dir}/../download-hits-parquet-single" "${script_dir}/data" -pushd "${script_dir}/data" -echo "Data size: $(du -bcs hits*.parquet | grep total)" -popd - -# Ensure working directory is the script dir. The view that gets created uses a -# relative path. -pushd "${script_dir}" - -./run.sh single -cat results.json - -echo "Load time: 0" +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/glaredb/check b/glaredb/check new file mode 100755 index 000000000..bf7e53000 --- /dev/null +++ b/glaredb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./glaredb -c "SELECT 1" >/dev/null diff --git a/glaredb/data-size b/glaredb/data-size new file mode 100755 index 000000000..400c51843 --- /dev/null +++ b/glaredb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs data | awk '/total$/ { print $1 }' diff --git a/glaredb/install b/glaredb/install new file mode 100755 index 000000000..b60f25254 --- /dev/null +++ b/glaredb/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Idempotent: only download glaredb if not already present. +if [ ! -x ./glaredb ]; then + export GLAREDB_INSTALL_DIR="$(pwd)" + export GLAREDB_VERSION="v25.5.11" + curl -fsSL https://glaredb.com/install.sh | sh +fi diff --git a/glaredb/load b/glaredb/load new file mode 100755 index 000000000..06c148945 --- /dev/null +++ b/glaredb/load @@ -0,0 +1,8 @@ +#!/bin/bash +# glaredb's create.sql references ./data/hits.parquet, so move the parquet +# file into the expected subdir. +set -e + +mkdir -p data +mv hits.parquet data/ 2>/dev/null || true +sync diff --git a/glaredb/query b/glaredb/query new file mode 100755 index 000000000..4e88d17e0 --- /dev/null +++ b/glaredb/query @@ -0,0 +1,19 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via glaredb with create.sql as init. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +query=$(cat) + +out=$(./glaredb --init create.sql -c ".timer on" -c "$query" 2>&1) && status=0 || status=$? + +if [ "$status" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$status" +fi + +# glaredb prints "Execution duration (s): X.YYY"; everything else is result. 
+printf '%s\n' "$out" | grep -v '^Execution duration' || true + +printf '%s\n' "$out" | awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' | tail -n1 >&2 diff --git a/glaredb/run.sh b/glaredb/run.sh deleted file mode 100755 index 05d513367..000000000 --- a/glaredb/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -set -eu -set -o pipefail - -TRIES=3 -QUERY_NUM=0 - -echo "[" > results.json -echo "query_num,iteration,duration" > results.csv - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "${QUERY_NUM}: ${query}" - - [ "${QUERY_NUM}" != 0 ] && echo "," >> results.json - echo -n " [" >> results.json - - for i in $(seq 1 $TRIES); do - output=$(./glaredb --init create.sql -c ".timer on" -c "${query}") - duration=$(awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' <<< "$output") - - echo "$output" - - if [ -z "${duration}" ]; then - echo "Query failed" - exit 1 - fi - - # JSON results - echo -n "${duration}" >> results.json - [ "${i}" != "${TRIES}" ] && echo -n "," >> results.json - - # CSV results - echo "${QUERY_NUM},${i},${duration}" >> results.csv - done - - echo -n "]" >> results.json - - QUERY_NUM=$((QUERY_NUM + 1)) -done - -echo "" >> results.csv -echo "" >> results.json -echo "]" >> results.json diff --git a/glaredb/start b/glaredb/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/glaredb/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/glaredb/stop b/glaredb/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/glaredb/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/greenplum/benchmark.sh b/greenplum/benchmark.sh index 79606afb7..531bd6503 100755 --- a/greenplum/benchmark.sh +++ b/greenplum/benchmark.sh @@ -1,80 +1,5 @@ #!/bin/bash - -# NOTE: it requires Ubuntu 18.04 -# Greenplum does not install on any newer system. - -echo "This script must be run from gpadmin user. Press enter to continue." 
-read -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository ppa:greenplum/db -sudo apt-get update -y -sudo apt-get install -y greenplum-db-6 -sudo rm -rf /gpmaster /gpdata* -ssh-keygen -t rsa -b 4096 -touch /home/gpadmin/.ssh/authorized_keys -chmod 600 ~/.ssh/authorized_keys -cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys -sudo echo "# kernel.shmall = _PHYS_PAGES / 2 # See Shared Memory Pages -kernel.shmall = 197951838 -# kernel.shmmax = kernel.shmall * PAGE_SIZE -kernel.shmmax = 810810728448 -kernel.shmmni = 4096 -vm.overcommit_memory = 2 # See Segment Host Memory -vm.overcommit_ratio = 95 # See Segment Host Memory - -net.ipv4.ip_local_port_range = 10000 65535 # See Port Settings -kernel.sem = 500 2048000 200 4096 -kernel.sysrq = 1 -kernel.core_uses_pid = 1 -kernel.msgmnb = 65536 -kernel.msgmax = 65536 -kernel.msgmni = 2048 -net.ipv4.tcp_syncookies = 1 -net.ipv4.conf.default.accept_source_route = 0 -net.ipv4.tcp_max_syn_backlog = 4096 -net.ipv4.conf.all.arp_filter = 1 -net.core.netdev_max_backlog = 10000 -net.core.rmem_max = 2097152 -net.core.wmem_max = 2097152 -vm.swappiness = 10 -vm.zone_reclaim_mode = 0 -vm.dirty_expire_centisecs = 500 -vm.dirty_writeback_centisecs = 100 -vm.dirty_background_ratio = 0 # See System Memory -vm.dirty_ratio = 0 -vm.dirty_background_bytes = 1610612736 -vm.dirty_bytes = 4294967296" |sudo tee -a /etc/sysctl.conf -sudo sysctl -p - -echo "* soft nofile 524288 -* hard nofile 524288 -* soft nproc 131072 -* hard nproc 131072" |sudo tee -a /etc/security/limits.conf -echo "RemoveIPC=no" |sudo tee -a /etc/systemd/logind.conf -echo "Now you need to reboot the machine. Press Enter if you already rebooted, or reboot now and run the script once again" -read -source /opt/greenplum-db-*/greenplum_path.sh -cp $GPHOME/docs/cli_help/gpconfigs/gpinitsystem_singlenode . -echo localhost > ./hostlist_singlenode -sed -i "s/MASTER_HOSTNAME=[a-z_]*/MASTER_HOSTNAME=$(hostname)/" gpinitsystem_singlenode -sed -i "s@declare -a DATA_DIRECTORY=(/gpdata1 /gpdata2)@declare -a DATA_DIRECTORY=(/gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14)@" gpinitsystem_singlenode -sudo mkdir /gpmaster /gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14 -sudo chmod 777 /gpmaster /gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14 -gpinitsystem -ac gpinitsystem_singlenode -export MASTER_DATA_DIRECTORY=/gpmaster/gpsne-1/ -../download-hits-tsv -chmod 777 ~ hits.tsv -psql -d postgres -f create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -nohup gpfdist & -echo -n "Load time: " -command time -f '%e' psql -d postgres -t -c "insert into hits select * from hits_ext;" -echo -n "Load time: " -command time -f '%e' psql -d postgres -t -c "ANALYZE hits;" -du -sh /gpdata* -./run.sh 2>&1 | tee log.txt -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' |awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/greenplum/check b/greenplum/check new file mode 100755 index 000000000..a9687658a --- /dev/null +++ b/greenplum/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source /opt/greenplum-db-*/greenplum_path.sh +export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/gpmaster/gpsne-1/} +psql -d postgres -t -c 'SELECT 1' >/dev/null diff --git a/greenplum/data-size b/greenplum/data-size new file mode 100755 index 000000000..42a054885 --- /dev/null +++ b/greenplum/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo du -bcs /gpdata* 2>/dev/null | grep total | awk '{print $1}' diff --git a/greenplum/install b/greenplum/install new file mode 100755 index 000000000..bd57ea997 --- /dev/null +++ b/greenplum/install @@ -0,0 +1,90 @@ +#!/bin/bash +# Install Greenplum DB. Requires Ubuntu 18.04 — newer releases lack +# greenplum packages. The original benchmark relies on running parts as the +# gpadmin user; this script encapsulates the steps. +# +# Phases (controlled by the first argument): +# System config; afterwards reboot before running ./install init. +# init Initialize the cluster (must be run as gpadmin). +set -eu + +PHASE=${1:-prereqs} + +if [ "$PHASE" = "prereqs" ]; then + sudo apt-get update -y + sudo apt-get install -y software-properties-common + sudo add-apt-repository -y ppa:greenplum/db + sudo apt-get update -y + sudo apt-get install -y greenplum-db-6 + sudo rm -rf /gpmaster /gpdata* + + if [ ! -f "$HOME/.ssh/id_rsa" ]; then + ssh-keygen -t rsa -b 4096 -N '' -f "$HOME/.ssh/id_rsa" + fi + touch "$HOME/.ssh/authorized_keys" + chmod 600 "$HOME/.ssh/authorized_keys" + cat "$HOME/.ssh/id_rsa.pub" >> "$HOME/.ssh/authorized_keys" + + sudo tee -a /etc/sysctl.conf < ./hostlist_singlenode + sed -i "s/MASTER_HOSTNAME=[a-z_]*/MASTER_HOSTNAME=$(hostname)/" gpinitsystem_singlenode + sed -i "s@declare -a DATA_DIRECTORY=(/gpdata1 /gpdata2)@declare -a DATA_DIRECTORY=(/gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14)@" gpinitsystem_singlenode + + sudo mkdir -p /gpmaster /gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 \ + /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14 + sudo chmod 777 /gpmaster /gpdata1 /gpdata2 /gpdata3 /gpdata4 /gpdata5 /gpdata6 \ + /gpdata7 /gpdata8 /gpdata9 /gpdata10 /gpdata11 /gpdata12 /gpdata13 /gpdata14 + + gpinitsystem -ac gpinitsystem_singlenode + exit 0 +fi + +echo "Unknown phase: $PHASE" >&2 +exit 1 diff --git a/greenplum/load b/greenplum/load new file mode 100755 index 000000000..1f4405448 --- /dev/null +++ b/greenplum/load @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +# shellcheck disable=SC1091 +source /opt/greenplum-db-*/greenplum_path.sh +export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/gpmaster/gpsne-1/} + +sudo chmod 777 ~ hits.tsv + +psql -d postgres -v ON_ERROR_STOP=1 -f create.sql + +# gpfdist serves hits.tsv to the external table referenced from create.sql. 
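+# (gpfdist is Greenplum's parallel file server; launched with no options it serves files from the current working directory on its default port, 8080.)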
+pgrep gpfdist || nohup gpfdist >/tmp/gpfdist.log 2>&1 & + +psql -d postgres -v ON_ERROR_STOP=1 -t -c "INSERT INTO hits SELECT * FROM hits_ext;" +psql -d postgres -v ON_ERROR_STOP=1 -t -c "ANALYZE hits;" + +rm -f hits.tsv +sync diff --git a/greenplum/query b/greenplum/query new file mode 100755 index 000000000..79ab3395c --- /dev/null +++ b/greenplum/query @@ -0,0 +1,31 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source /opt/greenplum-db-*/greenplum_path.sh +export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/gpmaster/gpsne-1/} + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -d postgres -t 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/greenplum/run.sh b/greenplum/run.sh deleted file mode 100755 index 23a2756b7..000000000 --- a/greenplum/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - echo '\timing' > /tmp/query_temp.sql - echo "$query" >> /tmp/query_temp.sql - psql -d postgres -t -f /tmp/query_temp.sql 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/greenplum/start b/greenplum/start new file mode 100755 index 000000000..127ce5537 --- /dev/null +++ b/greenplum/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +# Source greenplum env and start the cluster (idempotent). +# shellcheck disable=SC1091 +source /opt/greenplum-db-*/greenplum_path.sh +export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/gpmaster/gpsne-1/} +gpstart -a || true diff --git a/greenplum/stop b/greenplum/stop new file mode 100755 index 000000000..dc78d466c --- /dev/null +++ b/greenplum/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +# shellcheck disable=SC1091 +source /opt/greenplum-db-*/greenplum_path.sh 2>/dev/null || true +export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/gpmaster/gpsne-1/} +gpstop -a 2>/dev/null || true diff --git a/heavyai/benchmark.sh b/heavyai/benchmark.sh index 939073309..1aa9264b9 100755 --- a/heavyai/benchmark.sh +++ b/heavyai/benchmark.sh @@ -1,56 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y default-jre-headless apt-transport-https - -sudo apt-get install -y docker.io -sudo docker run -it --rm -v $(pwd):/host ubuntu:18.04 cp /lib/x86_64-linux-gnu/libtinfo.so.5 /host/ -sudo cp libtinfo.so.5 /usr/lib/x86_64-linux-gnu/ - -sudo useradd -U -m heavyai -sudo curl https://releases.heavy.ai/GPG-KEY-heavyai | sudo apt-key add - -echo "deb https://releases.heavy.ai/os/apt/ stable cpu" | sudo tee /etc/apt/sources.list.d/heavyai.list -sudo apt-get update -y -sudo apt-get install heavyai -y - -export HEAVYAI_USER=heavyai -export HEAVYAI_GROUP=heavyai -export HEAVYAI_STORAGE=/var/lib/heavyai -export HEAVYAI_PATH=/opt/heavyai -export HEAVYAI_LOG=/var/lib/heavyai/data/mapd_log - -pushd $HEAVYAI_PATH/systemd -./install_heavy_systemd.sh -popd - -# Press Enter multiple times. 
- -sudo systemctl start heavydb -sudo systemctl enable heavydb - -# Load the data - -../download-hits-csv -chmod 777 ~ hits.csv - -sudo bash -c "echo 'allowed-import-paths = [\"$(pwd)\"]' > /var/lib/heavyai/heavy.conf_" -sudo bash -c "cat /var/lib/heavyai/heavy.conf >> /var/lib/heavyai/heavy.conf_" -sudo bash -c "mv /var/lib/heavyai/heavy.conf_ /var/lib/heavyai/heavy.conf && chown heavyai /var/lib/heavyai/heavy.conf" -sudo systemctl restart heavydb - -/opt/heavyai/bin/heavysql -t -p HyperInteractive < create.sql -echo -n "Load time: " -command time -f '%e' /opt/heavyai/bin/heavysql -q -t -p HyperInteractive <<< "COPY hits FROM '$(pwd)/hits.csv' WITH (HEADER = 'false');" - -# Loaded: 99997497 recs, Rejected: 0 recs in 572.633 secs - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -du -bcs /var/lib/heavyai/ | grep total - -cat log.txt | grep -P 'Total time|null' | sed -r -e 's/^.*Total time: ([0-9]+) ms$/\1/' | - awk '{ if ($1 == "null") { print } else { print $1 / 1000 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/heavyai/check b/heavyai/check new file mode 100755 index 000000000..d824e08ec --- /dev/null +++ b/heavyai/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +/opt/heavyai/bin/heavysql -p HyperInteractive -q -t <<< 'SELECT 1' >/dev/null 2>&1 diff --git a/heavyai/data-size b/heavyai/data-size new file mode 100755 index 000000000..08c59ded6 --- /dev/null +++ b/heavyai/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs /var/lib/heavyai/ | grep total | awk '{print $1}' diff --git a/heavyai/install b/heavyai/install new file mode 100755 index 000000000..85cb18fb7 --- /dev/null +++ b/heavyai/install @@ -0,0 +1,45 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y default-jre-headless apt-transport-https docker.io + +# heavyai links against libtinfo.so.5 (not in modern Ubuntu); copy from 18.04. +if [ ! -e /usr/lib/x86_64-linux-gnu/libtinfo.so.5 ]; then + sudo docker run --rm -v "$(pwd):/host" ubuntu:18.04 cp /lib/x86_64-linux-gnu/libtinfo.so.5 /host/ + sudo cp libtinfo.so.5 /usr/lib/x86_64-linux-gnu/ +fi + +if ! id heavyai >/dev/null 2>&1; then + sudo useradd -U -m heavyai +fi + +if [ ! -d /opt/heavyai ]; then + sudo curl https://releases.heavy.ai/GPG-KEY-heavyai | sudo apt-key add - + echo "deb https://releases.heavy.ai/os/apt/ stable cpu" \ + | sudo tee /etc/apt/sources.list.d/heavyai.list + sudo apt-get update -y + sudo apt-get install heavyai -y + + export HEAVYAI_USER=heavyai + export HEAVYAI_GROUP=heavyai + export HEAVYAI_STORAGE=/var/lib/heavyai + export HEAVYAI_PATH=/opt/heavyai + export HEAVYAI_LOG=/var/lib/heavyai/data/mapd_log + + pushd "$HEAVYAI_PATH/systemd" >/dev/null + ./install_heavy_systemd.sh + popd >/dev/null + + sudo systemctl enable heavydb +fi + +# Allow loading data from this directory. +if [ -f /var/lib/heavyai/heavy.conf ] && \ + ! 
sudo grep -q "allowed-import-paths.*$(pwd)" /var/lib/heavyai/heavy.conf; then + sudo bash -c "echo 'allowed-import-paths = [\"$(pwd)\"]' > /var/lib/heavyai/heavy.conf_" + sudo bash -c "cat /var/lib/heavyai/heavy.conf >> /var/lib/heavyai/heavy.conf_" + sudo bash -c "mv /var/lib/heavyai/heavy.conf_ /var/lib/heavyai/heavy.conf && chown heavyai /var/lib/heavyai/heavy.conf" +fi + +sudo systemctl restart heavydb diff --git a/heavyai/load b/heavyai/load new file mode 100755 index 000000000..d2ca122ed --- /dev/null +++ b/heavyai/load @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +chmod 777 ~ hits.csv + +/opt/heavyai/bin/heavysql -t -p HyperInteractive < create.sql +/opt/heavyai/bin/heavysql -q -t -p HyperInteractive <<< "COPY hits FROM '$(pwd)/hits.csv' WITH (HEADER = 'false');" + +rm -f hits.csv +sync diff --git a/heavyai/query b/heavyai/query new file mode 100755 index 000000000..c1cfa1b37 --- /dev/null +++ b/heavyai/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via heavysql. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# heavysql's "Total time: ms" footer). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(/opt/heavyai/bin/heavysql -t -p HyperInteractive <<< "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qiE '^Exception|^Error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +ms=$(printf '%s\n' "$raw" | grep -oP 'Total time:\s*\K[0-9]+(?=\s*ms)' | tail -n1) + +if [ -z "$ms" ]; then + echo "no timing in heavysql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/heavyai/run.sh b/heavyai/run.sh deleted file mode 100755 index 516ad08ba..000000000 --- a/heavyai/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - /opt/heavyai/bin/heavysql -t -p HyperInteractive <<< "${query}" | grep 'Total time' || echo 'null' - done; -done; diff --git a/heavyai/start b/heavyai/start new file mode 100755 index 000000000..4e1345bb3 --- /dev/null +++ b/heavyai/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +if /opt/heavyai/bin/heavysql -p HyperInteractive -q -t <<< 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo systemctl start heavydb diff --git a/heavyai/stop b/heavyai/stop new file mode 100755 index 000000000..1e17acded --- /dev/null +++ b/heavyai/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop heavydb || true diff --git a/hyper-parquet/benchmark.sh b/hyper-parquet/benchmark.sh index dfe1f83a2..3b63e772a 100755 --- a/hyper-parquet/benchmark.sh +++ b/hyper-parquet/benchmark.sh @@ -1,15 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install tableauhyperapi - -../download-hits-parquet-partitioned - -./run.sh | tee log.txt -echo "Data size: $(du -bcs hits*.parquet | grep total)" -echo "Load time: 0" - -cat log.txt | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/hyper-parquet/check b/hyper-parquet/check new file mode 100755 index 000000000..23ad27458 --- /dev/null +++ b/hyper-parquet/check @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +from tableauhyperapi import HyperProcess, Telemetry, Connection +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint) as connection: + connection.execute_list_query("SELECT 1") +PY diff --git a/hyper-parquet/data-size b/hyper-parquet/data-size new file mode 100755 index 000000000..2d6921ab6 --- /dev/null +++ b/hyper-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/hyper-parquet/install b/hyper-parquet/install new file mode 100755 index 000000000..537a36ca4 --- /dev/null +++ b/hyper-parquet/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install tableauhyperapi diff --git a/hyper-parquet/load b/hyper-parquet/load new file mode 100755 index 000000000..19ff8b994 --- /dev/null +++ b/hyper-parquet/load @@ -0,0 +1,5 @@ +#!/bin/bash +# hyper-parquet queries the parquet files directly via a temp external table +# defined in create.sql, recreated per query. No persistent DB to load. +set -e +sync diff --git a/hyper-parquet/query b/hyper-parquet/query new file mode 100755 index 000000000..077de9169 --- /dev/null +++ b/hyper-parquet/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via tableau hyperapi against the +# partitioned parquet files (registered as a temp external table from +# create.sql). +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. 
+set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +import sys +import timeit +from tableauhyperapi import HyperProcess, Telemetry, Connection + +query = sys.stdin.read() + +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint) as connection: + connection.execute_command(open("create.sql").read()) + start = timeit.default_timer() + rows = connection.execute_list_query(query) + end = timeit.default_timer() + +for r in rows: + print(r) + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/hyper-parquet/query.py b/hyper-parquet/query.py deleted file mode 100755 index 2df4fb3b3..000000000 --- a/hyper-parquet/query.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -import timeit -import sys -import subprocess - -from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode, HyperException - -query = sys.stdin.read() - -with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: - with Connection(hyper.endpoint) as connection: - # Hyper only supports temporary external tables, so we need to create them on every query - connection.execute_command(open("create.sql").read()) - for try_num in range(3): - if try_num == 0: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) - - start = timeit.default_timer() - try: - connection.execute_list_query(query) - print(round(timeit.default_timer() - start, 3)) - except HyperException: - print("null") diff --git a/hyper-parquet/run.sh b/hyper-parquet/run.sh deleted file mode 100755 index 64df8c608..000000000 --- a/hyper-parquet/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/hyper-parquet/start b/hyper-parquet/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/hyper-parquet/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/hyper-parquet/stop b/hyper-parquet/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/hyper-parquet/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/hyper/benchmark.sh b/hyper/benchmark.sh index 0f6968613..b0b9f4775 100755 --- a/hyper/benchmark.sh +++ b/hyper/benchmark.sh @@ -1,20 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install tableauhyperapi - -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' ./load.py - -./run.sh | tee log.txt - -cat log.txt | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo -n "Data size: " -du -b hits.hyper +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/hyper/check b/hyper/check new file mode 100755 index 000000000..23ad27458 --- /dev/null +++ b/hyper/check @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +from tableauhyperapi import HyperProcess, Telemetry, Connection +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint) as connection: + connection.execute_list_query("SELECT 1") +PY diff --git a/hyper/data-size b/hyper/data-size new file mode 100755 index 000000000..4dce0916e --- /dev/null +++ b/hyper/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.hyper diff --git a/hyper/install b/hyper/install new file mode 100755 index 000000000..537a36ca4 --- /dev/null +++ b/hyper/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade pip +pip install tableauhyperapi diff --git a/hyper/load.py b/hyper/load similarity index 70% rename from hyper/load.py rename to hyper/load index 5380f84bd..a4a1d58cb 100755 --- a/hyper/load.py +++ b/hyper/load @@ -1,8 +1,20 @@ -#!/usr/bin/env python3 +#!/bin/bash +set -e +# shellcheck disable=SC1091 +source myenv/bin/activate + +# Idempotent: blow away any prior DB. +rm -f hits.hyper + +python3 - <<'PY' from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: with Connection(hyper.endpoint, 'hits.hyper', CreateMode.CREATE_AND_REPLACE) as connection: connection.execute_command(open("create.sql").read()) connection.execute_command("copy hits from 'hits.csv' with (format csv)") +PY + +rm -f hits.csv +sync diff --git a/hyper/query b/hyper/query new file mode 100755 index 000000000..8079a5ea0 --- /dev/null +++ b/hyper/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via tableau hyperapi against +# hits.hyper. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. 
+set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +python3 - <<'PY' +import sys +import timeit +from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode + +query = sys.stdin.read() + +with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: + with Connection(hyper.endpoint, 'hits.hyper', CreateMode.NONE) as connection: + start = timeit.default_timer() + rows = connection.execute_list_query(query) + end = timeit.default_timer() + +for r in rows: + print(r) + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/hyper/query.py b/hyper/query.py deleted file mode 100755 index e1833c0e4..000000000 --- a/hyper/query.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 -import timeit -import sys - -from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode, HyperException - -query = sys.stdin.read() - -with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: - with Connection(hyper.endpoint, 'hits.hyper', CreateMode.NONE) as connection: - for _ in range(3): - start = timeit.default_timer() - try: - connection.execute_list_query(query) - print(round(timeit.default_timer() - start, 3)) - except HyperException: - print("null") diff --git a/hyper/run.sh b/hyper/run.sh deleted file mode 100755 index 64df8c608..000000000 --- a/hyper/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/hyper/start b/hyper/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/hyper/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/hyper/stop b/hyper/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/hyper/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/infobright/benchmark.sh b/infobright/benchmark.sh index 67b2f2f3b..531bd6503 100755 --- a/infobright/benchmark.sh +++ b/infobright/benchmark.sh @@ -1,39 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y docker.io - -mkdir infobright -sudo docker run --name mysql_ib -e MYSQL_ROOT_PASSWORD=mypass -v $(pwd)/infobright:/mnt/mysql_data -p 5029:5029 -p 5555 -d flolas/infobright - -sudo docker run -i --rm --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass -e "CREATE DATABASE test" -sudo docker run -i --rm --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test -e "$(cat create.sql)" - -# Load the data - -../download-hits-tsv - -# ERROR 2 (HY000) at line 1: Wrong data or column definition. Row: 93557187, field: 100. -head -n 90000000 hits.tsv > hits90m.tsv - -echo -n "Load time: " -command time -f '%e' sudo docker run -i --rm --volume $(pwd):/workdir --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test -e "SET sql_log_bin = 0; - LOAD DATA LOCAL INFILE '/workdir/hits90m.tsv' INTO TABLE test.hits - FIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n' STARTING BY ''" - -# 38m37.466s - -echo -n "Data size: " -sudo docker exec mysql_ib du -bcs /mnt/mysql_data/ /usr/local/infobright-4.0.7-x86_64/cache | grep total - -# 13 760 341 294 - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? 
)?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/' | - awk -F, '{ if ($1 == "null") { print } else { print $1 * 86400 + $2 * 3600 + $3 * 60 + $4 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/infobright/check b/infobright/check new file mode 100755 index 000000000..65222793e --- /dev/null +++ b/infobright/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +sudo docker run --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass \ + -e "SELECT 1" >/dev/null 2>&1 diff --git a/infobright/data-size b/infobright/data-size new file mode 100755 index 000000000..49f4702e0 --- /dev/null +++ b/infobright/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +sudo docker exec mysql_ib du -bcs /mnt/mysql_data/ /usr/local/infobright-4.0.7-x86_64/cache \ + | grep total | awk '{print $1}' diff --git a/infobright/install b/infobright/install new file mode 100755 index 000000000..bb091ed98 --- /dev/null +++ b/infobright/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io + +# Pull images up-front so install captures all setup work. +sudo docker pull flolas/infobright +sudo docker pull mysql:5 + +mkdir -p infobright + +# (Re)create the container only if missing. +if ! sudo docker inspect mysql_ib >/dev/null 2>&1; then + sudo docker run --name mysql_ib \ + -e MYSQL_ROOT_PASSWORD=mypass \ + -v "$(pwd)/infobright:/mnt/mysql_data" \ + -p 5029:5029 -p 5555 \ + -d flolas/infobright +fi diff --git a/infobright/load b/infobright/load new file mode 100755 index 000000000..4d5492cf1 --- /dev/null +++ b/infobright/load @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +MYSQL_RUN="sudo docker run -i --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass" + +$MYSQL_RUN -e "DROP DATABASE IF EXISTS test" +$MYSQL_RUN -e "CREATE DATABASE test" +$MYSQL_RUN --database=test -e "$(cat create.sql)" + +# Infobright errors out on row 93557187 in the full dataset; truncate. +head -n 90000000 hits.tsv > hits90m.tsv + +sudo docker run -i --rm --volume "$(pwd):/workdir" --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test \ + -e "SET sql_log_bin = 0; + LOAD DATA LOCAL INFILE '/workdir/hits90m.tsv' INTO TABLE test.hits + FIELDS TERMINATED BY '\t' ENCLOSED BY '' ESCAPED BY '\\\\' LINES TERMINATED BY '\n' STARTING BY ''" + +rm -f hits.tsv hits90m.tsv +sync diff --git a/infobright/query b/infobright/query new file mode 100755 index 000000000..b505db1a8 --- /dev/null +++ b/infobright/query @@ -0,0 +1,39 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql:5 client (Docker) against +# the Infobright container's MySQL protocol on :5029. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo docker run -i --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test \ + -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +parsed=$(printf '%s\n' "$out" \ + | grep -P 'rows? 
in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/') + +if [ -z "$parsed" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -F, -v p="$parsed" 'BEGIN { + n = split(p, a, ",") + d = (a[1] == "") ? 0 : a[1] + h = (a[2] == "") ? 0 : a[2] + m = (a[3] == "") ? 0 : a[3] + s = (a[4] == "") ? 0 : a[4] + printf "%.3f\n", d * 86400 + h * 3600 + m * 60 + s +}' >&2 diff --git a/infobright/run.sh b/infobright/run.sh deleted file mode 100755 index b6f176b33..000000000 --- a/infobright/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo docker run --rm --network host mysql:5 mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass --database=test -vvv -e "${query}" - done; -done; diff --git a/infobright/start b/infobright/start new file mode 100755 index 000000000..007dcf154 --- /dev/null +++ b/infobright/start @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +# Idempotent: if the container is running and responsive, do nothing. +if sudo docker inspect -f '{{.State.Running}}' mysql_ib 2>/dev/null | grep -q true; then + if sudo docker run --rm --network host mysql:5 \ + mysql --host 127.0.0.1 --port 5029 --user=root --password=mypass \ + -e "SELECT 1" >/dev/null 2>&1; then + exit 0 + fi +fi + +sudo docker start mysql_ib diff --git a/infobright/stop b/infobright/stop new file mode 100755 index 000000000..8b630229b --- /dev/null +++ b/infobright/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop mysql_ib >/dev/null 2>&1 || true diff --git a/kinetica/benchmark.sh b/kinetica/benchmark.sh index 046fa0b30..b6523b583 100755 --- a/kinetica/benchmark.sh +++ b/kinetica/benchmark.sh @@ -1,35 +1,7 @@ -#!/usr/bin/bash - -# Run setup.sh (assume we are running on ubuntu) -./setup-dev-ubuntu.sh - -# download the db -export KINETICA_ADMIN_PASSWORD=admin -curl https://files.kinetica.com/install/kinetica.sh -o kinetica && chmod u+x kinetica && sudo -E ./kinetica start - -# set up the cli -wget --continue --progress=dot:giga https://github.com/kineticadb/kisql/releases/download/v7.1.7.2/kisql - -chmod u+x ./kisql - -export KI_PWD="admin" -CLI="./kisql --host localhost --user admin" - -# download the ds -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -sudo mv hits.tsv.gz ./kinetica-persist/ - -$CLI --file create.sql -$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" - -START=$(date +%s) - -$CLI --sql "load into hits from file paths 'hits.tsv.gz' format delimited text (INCLUDES HEADER=false, DELIMITER = '\t') WITH OPTIONS (NUM_TASKS_PER_RANK=16, ON ERROR=SKIP);" - -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" -echo "Data size: $(du -bcs ./kinetica-persist/gpudb | grep total)" - -# run the queries -./run.sh +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# kinetica downloads hits.tsv.gz directly inside ./load (Kinetica wants the +# gzipped form), so no central download script is used. 
+export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/kinetica/check b/kinetica/check new file mode 100755 index 000000000..1e578951e --- /dev/null +++ b/kinetica/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export KI_PWD=admin +./kisql --host localhost --user admin --sql 'SELECT 1' >/dev/null 2>&1 diff --git a/kinetica/data-size b/kinetica/data-size new file mode 100755 index 000000000..57891b0e8 --- /dev/null +++ b/kinetica/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs ./kinetica-persist/gpudb | grep total | awk '{print $1}' diff --git a/kinetica/install b/kinetica/install new file mode 100755 index 000000000..e8d9c4cbb --- /dev/null +++ b/kinetica/install @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +# setup-dev-ubuntu.sh installs docker, java (for kisql), and ripgrep. +./setup-dev-ubuntu.sh + +if [ ! -x ./kinetica ]; then + curl https://files.kinetica.com/install/kinetica.sh -o kinetica + chmod u+x kinetica +fi + +# Bring Kinetica up via the install/start script (idempotent inside). +export KINETICA_ADMIN_PASSWORD=admin +sudo -E ./kinetica start + +# Fetch the SQL CLI. +if [ ! -x ./kisql ]; then + wget --continue --progress=dot:giga \ + https://github.com/kineticadb/kisql/releases/download/v7.1.7.2/kisql + chmod u+x ./kisql +fi diff --git a/kinetica/load b/kinetica/load new file mode 100755 index 000000000..75630adb1 --- /dev/null +++ b/kinetica/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +export KI_PWD=admin +CLI="./kisql --host localhost --user admin" + +# Kinetica's `load into ... format delimited text` reads the gzipped TSV +# directly from its persist directory, so we fetch the gzip rather than the +# decompressed TSV. +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' +sudo mv hits.tsv.gz ./kinetica-persist/ + +$CLI --file create.sql +$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" + +$CLI --sql "load into hits from file paths 'hits.tsv.gz' format delimited text (INCLUDES HEADER=false, DELIMITER = '\t') WITH OPTIONS (NUM_TASKS_PER_RANK=16, ON ERROR=SKIP);" + +sudo rm -f ./kinetica-persist/hits.tsv.gz +sync diff --git a/kinetica/query b/kinetica/query new file mode 100755 index 000000000..2988e55cd --- /dev/null +++ b/kinetica/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via kisql. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# kisql's "Query Execution Time: sec" footer). +# Exit non-zero on error. +set -e + +export KI_PWD=admin +query=$(cat) + +raw=$(./kisql --host localhost --user admin --sql "$query" 2>&1) && exit_code=0 || exit_code=$? + +# kisql prints errors to stdout; sniff for them. +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qiE 'error|exception'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +# "Query Execution Time: 1.234 sec" — penultimate field is the value. 
+secs=$(printf '%s\n' "$raw" | grep -E 'Query Execution Time' | tail -n1 | awk '{print $(NF-1)}') + +if [ -z "$secs" ]; then + echo "no Query Execution Time in kisql output" >&2 + exit 1 +fi + +awk -v s="$secs" 'BEGIN { printf "%.3f\n", s }' >&2 diff --git a/kinetica/run.sh b/kinetica/run.sh deleted file mode 100755 index 13e03d14c..000000000 --- a/kinetica/run.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -export KI_PWD=admin - -TRIES=3 -QUERY_NUM=1 -cat queries.sql | while read -r query; do - [ -z "$FQDN" ] && sync - [ -z "$FQDN" ] && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - RES=$(./kisql --host localhost --user admin --sql "$query" 2>&1 | rg 'Query Execution Time' | awk '{print $(NF-1)}' ||:) - - [[ "$?" == "0" && "$RES" != "" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - - echo "${QUERY_NUM},${i},${RES}" >> result.csv - done - echo "]," - - QUERY_NUM=$((QUERY_NUM + 1)) -done diff --git a/kinetica/start b/kinetica/start new file mode 100755 index 000000000..0f831bf40 --- /dev/null +++ b/kinetica/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +export KINETICA_ADMIN_PASSWORD=admin +export KI_PWD=admin + +# Idempotent: if kisql can already speak SELECT 1, do nothing. +if ./kisql --host localhost --user admin --sql 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo -E ./kinetica start diff --git a/kinetica/stop b/kinetica/stop new file mode 100755 index 000000000..0dd5f40d8 --- /dev/null +++ b/kinetica/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +export KINETICA_ADMIN_PASSWORD=admin +sudo -E ./kinetica stop || true diff --git a/lib/benchmark-common.sh b/lib/benchmark-common.sh new file mode 100755 index 000000000..7cc807169 --- /dev/null +++ b/lib/benchmark-common.sh @@ -0,0 +1,165 @@ +#!/bin/bash + +# Shared ClickBench driver. +# +# A per-system benchmark.sh sets a few env vars and then exec's this script. +# This script is designed to be invoked from a system directory (e.g. +# clickhouse/), so all script paths below are relative to the system dir. +# +# Required env: +# BENCH_DOWNLOAD_SCRIPT Name of a top-level download-hits-* script to fetch +# the dataset (e.g. "download-hits-parquet-single"). +# Set to empty string for systems that read directly +# from a remote source (S3 datalake, remote services). +# +# Optional env: +# BENCH_RESTARTABLE "yes" (default) or "no". If "yes", the system is +# stopped+started between every query to neutralize +# warm-process effects. Set "no" for in-process / +# single-binary tools where restart would dominate +# query time (duckdb CLI, sqlite, dataframe wrappers). +# BENCH_TRIES Number of times each query is run. Default 3. +# BENCH_QUERIES_FILE Path to a queries file, one query per line. +# Default "queries.sql" (in the system dir). +# BENCH_CHECK_TIMEOUT Seconds to wait for ./check to succeed. Default 300. + +set -e + +# BENCH_DOWNLOAD_SCRIPT must be set (possibly to empty for "no download"). +: "${BENCH_DOWNLOAD_SCRIPT?BENCH_DOWNLOAD_SCRIPT is required (set empty to skip)}" +: "${BENCH_RESTARTABLE:=yes}" +: "${BENCH_TRIES:=3}" +: "${BENCH_QUERIES_FILE:=queries.sql}" +: "${BENCH_CHECK_TIMEOUT:=300}" + +# Resolve the directory containing this script so we can find sibling helpers +# (download scripts) and the system dir we were invoked from (CWD). +LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$LIB_DIR/.." 
&& pwd)" + +bench_check_loop() { + local i + for i in $(seq 1 "$BENCH_CHECK_TIMEOUT"); do + if ./check >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + echo "bench: ./check did not succeed within ${BENCH_CHECK_TIMEOUT}s" >&2 + return 1 +} + +bench_flush_caches() { + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null +} + +bench_install() { + ./install +} + +bench_start() { + # Tolerate non-zero exit from ./start: many engines' start commands return + # non-zero when the server is already up but leave the system in the + # desired state. The check loop is the authoritative readiness signal. + ./start || true + bench_check_loop +} + +bench_stop() { + ./stop +} + +bench_download() { + if [ -z "$BENCH_DOWNLOAD_SCRIPT" ]; then + return 0 + fi + "$ROOT_DIR/$BENCH_DOWNLOAD_SCRIPT" +} + +bench_load() { + local start_t end_t + start_t=$(date +%s.%N) + ./load + end_t=$(date +%s.%N) + # Print "Load time: " matching the existing log shape that + # play.clickhouse.com expects. + awk -v s="$start_t" -v e="$end_t" 'BEGIN { printf "Load time: %.3f\n", e - s }' +} + +# Run a single query script and emit a single JSON-array `[t1,t2,t3],` line. +# Per-try timing is also appended to result.csv as `,,`. +bench_run_query() { + local query="$1" + local query_num="$2" + local i raw_stderr exit_code timing + local results=() + + bench_flush_caches + if [ "$BENCH_RESTARTABLE" = "yes" ]; then + ./stop || true + ./start || true + bench_check_loop + fi + + for i in $(seq 1 "$BENCH_TRIES"); do + # The query script's contract: stdout = result, stderr's last line = + # fractional seconds, exit 0 on success. + raw_stderr=$(printf '%s\n' "$query" | ./query 2>&1 >/dev/null) && exit_code=0 || exit_code=$? + + if [ "$exit_code" -eq 0 ]; then + timing=$(printf '%s\n' "$raw_stderr" | tail -n1) + # Sanity-check it's a number (allow integers and decimals). + if ! [[ "$timing" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + timing="null" + fi + else + timing="null" + printf '%s\n' "$raw_stderr" >&2 + fi + results+=("$timing") + echo "${query_num},${i},${timing}" >> result.csv + done + + # Emit "[t1,t2,t3]," for compatibility with the existing log format. + local out="[" + local j + for j in "${!results[@]}"; do + out+="${results[$j]}" + if [ "$j" -lt $((${#results[@]} - 1)) ]; then + out+="," + fi + done + out+="]," + echo "$out" +} + +bench_main() { + bench_install + bench_start + + bench_download + bench_load + + : > result.csv + local query_num=1 + while IFS= read -r query; do + # Skip empty lines. + [ -z "$query" ] && continue + bench_run_query "$query" "$query_num" + query_num=$((query_num + 1)) + done < "$BENCH_QUERIES_FILE" + + # data-size may need the server up (e.g. ClickHouse queries system.tables, + # pandas hits the HTTP server), so report it before stopping. + echo -n "Data size: " + ./data-size + + bench_stop || true +} + +# Only run the full flow when executed directly (or via `exec`). Sourcing the +# file (e.g. for testing individual functions) won't trigger bench_main. 
+if [ "${BASH_SOURCE[0]}" = "$0" ]; then + bench_main +fi diff --git a/mariadb/benchmark.sh b/mariadb/benchmark.sh index f95cc108e..531bd6503 100755 --- a/mariadb/benchmark.sh +++ b/mariadb/benchmark.sh @@ -1,34 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y mariadb-server -sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" - -# size innodb buffer based on available RAM -# use 75% of total -sudo bash -c "awk '/MemTotal/ { printf \"innodb_buffer_pool_size=%.0fG \n\", \$2*0.75/1024/1024 }' /proc/meminfo > /etc/mysql/buffer.conf" - -sudo service mariadb restart - -# Load the data - -../download-hits-tsv - -sudo mariadb -e "CREATE DATABASE test" -sudo mariadb test < create.sql - -echo -n "Load time: " -command time -f '%e' split -l 10000 --filter="sudo mariadb test -e \"SET sql_log_bin = 0; LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE hits;\"" hits.tsv - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo mariadb test -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" | tail -n1 - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/' | - awk -F, '{ if ($1 == "null") { print } else { print $1 * 86400 + $2 * 3600 + $3 * 60 + $4 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mariadb/check b/mariadb/check new file mode 100755 index 000000000..27dd1cebb --- /dev/null +++ b/mariadb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo mariadb -e "SELECT 1" >/dev/null diff --git a/mariadb/data-size b/mariadb/data-size new file mode 100755 index 000000000..c9319de57 --- /dev/null +++ b/mariadb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo mariadb test -N -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" diff --git a/mariadb/install b/mariadb/install new file mode 100755 index 000000000..3f080f794 --- /dev/null +++ b/mariadb/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y mariadb-server + +sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" + +# Size innodb buffer based on available RAM — use 75% of total. +sudo bash -c "awk '/MemTotal/ { printf \"innodb_buffer_pool_size=%.0fG \n\", \$2*0.75/1024/1024 }' /proc/meminfo > /etc/mysql/buffer.conf" + +sudo service mariadb restart diff --git a/mariadb/load b/mariadb/load new file mode 100755 index 000000000..0f23f43c1 --- /dev/null +++ b/mariadb/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +sudo mariadb -e "DROP DATABASE IF EXISTS test" +sudo mariadb -e "CREATE DATABASE test" +sudo mariadb test < create.sql + +# Stream-load in chunks of 10000 lines (the original benchmark approach). 
+split -l 10000 --filter="sudo mariadb test -e \"SET sql_log_bin = 0; LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE hits;\"" hits.tsv + +rm -f hits.tsv +sync diff --git a/mariadb/query b/mariadb/query new file mode 100755 index 000000000..eeb841fd1 --- /dev/null +++ b/mariadb/query @@ -0,0 +1,38 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `mariadb -vvv` against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mariadb's "N rows in set (X.XX sec)" footer; days/hours/min/sec all handled). +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo mariadb test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +# mariadb may print "(2 days 3 hours 4 min 5.6 sec)" or any subset. +parsed=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) days? )?(([0-9.]+) hours? )?(([0-9.]+) min )?([0-9.]+) sec\).*?$/\2,\4,\6,\7/') + +if [ -z "$parsed" ]; then + echo "no timing in mariadb output" >&2 + exit 1 +fi + +awk -F, -v p="$parsed" 'BEGIN { + n = split(p, a, ",") + d = (a[1] == "") ? 0 : a[1] + h = (a[2] == "") ? 0 : a[2] + m = (a[3] == "") ? 0 : a[3] + s = (a[4] == "") ? 0 : a[4] + printf "%.3f\n", d * 86400 + h * 3600 + m * 60 + s +}' >&2 diff --git a/mariadb/run.sh b/mariadb/run.sh deleted file mode 100755 index 7294b2158..000000000 --- a/mariadb/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo mariadb test -vvv -e "${query}" - done; -done; diff --git a/mariadb/start b/mariadb/start new file mode 100755 index 000000000..7c7acd41c --- /dev/null +++ b/mariadb/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if sudo mariadb -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo service mariadb start diff --git a/mariadb/stop b/mariadb/stop new file mode 100755 index 000000000..bfaaeb9f8 --- /dev/null +++ b/mariadb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo service mariadb stop || true diff --git a/monetdb/benchmark.sh b/monetdb/benchmark.sh index 507434951..531bd6503 100755 --- a/monetdb/benchmark.sh +++ b/monetdb/benchmark.sh @@ -1,41 +1,5 @@ #!/bin/bash - -# Install - -echo "deb https://dev.monetdb.org/downloads/deb/ $(lsb_release -cs) monetdb" | sudo tee /etc/apt/sources.list.d/monetdb.list - -sudo wget --output-document=/etc/apt/trusted.gpg.d/monetdb.gpg https://www.monetdb.org/downloads/MonetDB-GPG-KEY.gpg -sudo apt-get update -y -sudo apt-get install -y monetdb5-sql monetdb-client dos2unix net-tools - -sudo monetdbd create /var/lib/monetdb -sudo usermod -a -G monetdb $USER - -for _ in {1..300} -do - sudo monetdb create test && break - sleep 1 -done -sudo monetdb release test - -sudo apt-get install -y expect - -./query.expect "$(cat create.sql)" - -../download-hits-tsv -chmod 777 ~ hits.tsv - -echo -n "Load time: " -command time -f '%e' ./query.expect "COPY INTO hits FROM '$(pwd)/hits.tsv' USING DELIMITERS '\t'" - -# 99997497 affected rows -# clk: 15:39 min - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo du -bcs /var/monetdb5/ | grep total - -cat log.txt | dos2unix -f | grep -P 'clk|tuple' | - awk '/tuple/ { ok = 1 } /clk/ { if (ok) { if ($3 == "ms") { print $2 / 1000 } else { print $2 } } else { print "null" }; ok = 0 }' | 
- awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/monetdb/check b/monetdb/check new file mode 100755 index 000000000..4608adca4 --- /dev/null +++ b/monetdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mclient -u monetdb -d test -s 'SELECT 1' >/dev/null 2>&1 diff --git a/monetdb/data-size b/monetdb/data-size new file mode 100755 index 000000000..715f6d39f --- /dev/null +++ b/monetdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo du -bcs /var/monetdb5/ | grep total | awk '{print $1}' diff --git a/monetdb/install b/monetdb/install new file mode 100755 index 000000000..cceaf5a9a --- /dev/null +++ b/monetdb/install @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +echo "deb https://dev.monetdb.org/downloads/deb/ $(lsb_release -cs) monetdb" \ + | sudo tee /etc/apt/sources.list.d/monetdb.list + +sudo wget --output-document=/etc/apt/trusted.gpg.d/monetdb.gpg \ + https://www.monetdb.org/downloads/MonetDB-GPG-KEY.gpg + +sudo apt-get update -y +sudo apt-get install -y monetdb5-sql monetdb-client dos2unix net-tools expect + +if [ ! -d /var/lib/monetdb ]; then + sudo monetdbd create /var/lib/monetdb +fi +sudo usermod -a -G monetdb "$USER" + +# monetdbd takes a moment to come up; retry creating the test DB. +for _ in {1..300}; do + if sudo monetdb create test 2>/dev/null; then break; fi + sleep 1 +done +sudo monetdb release test || true diff --git a/monetdb/load b/monetdb/load new file mode 100755 index 000000000..547d3d0f8 --- /dev/null +++ b/monetdb/load @@ -0,0 +1,16 @@ +#!/bin/bash +set -eu + +# Drop and recreate to make idempotent. +sudo monetdb stop test 2>/dev/null || true +sudo monetdb destroy -f test 2>/dev/null || true +sudo monetdb create test +sudo monetdb release test + +chmod 777 ~ hits.tsv + +./query.expect "$(cat create.sql)" +./query.expect "COPY INTO hits FROM '$(pwd)/hits.tsv' USING DELIMITERS '\t'" + +rm -f hits.tsv +sync diff --git a/monetdb/query b/monetdb/query new file mode 100755 index 000000000..ba61abb71 --- /dev/null +++ b/monetdb/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via the expect wrapper around mclient. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mclient `\t clock` output: "clk: ms" or "clk: s"). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(./query.expect "$query" 2>&1) && exit_code=0 || exit_code=$? + +# mclient may print errors but exit 0 via expect; sniff for them. +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^!|sql:.*error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Strip CR (mclient is in expect/PTY mode), pass result to stdout. +clean=$(printf '%s\n' "$raw" | dos2unix -f 2>/dev/null || printf '%s\n' "$raw") +printf '%s\n' "$clean" + +# Parse the LAST `clk:` line into seconds. 
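# Illustrative clk footers and what the conversion below yields (the numbers
# are made up, not measured):
#   clk: 1234.567 ms   -> 1.235
#   clk: 2.5 sec       -> 2.500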
+timing=$(printf '%s\n' "$clean" | grep -E '^clk:' | tail -n1) +if [ -z "$timing" ]; then + echo "no clk timing in monetdb output" >&2 + exit 1 +fi + +awk -v s="$timing" 'BEGIN { + n = split(s, a, " ") + val = a[2]; unit = a[3] + if (unit ~ /ms/) { printf "%.3f\n", val / 1000 } + else if (unit ~ /s/) { printf "%.3f\n", val } + else { printf "%.3f\n", val } +}' >&2 diff --git a/monetdb/run.sh b/monetdb/run.sh deleted file mode 100755 index 57a1a5cbf..000000000 --- a/monetdb/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - ./query.expect "$query" 2>&1 - done; -done; diff --git a/monetdb/start b/monetdb/start new file mode 100755 index 000000000..f20f925c9 --- /dev/null +++ b/monetdb/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# Idempotent: if mserver is already serving the test DB, do nothing. +if mclient -u monetdb -d test -s 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo monetdbd start /var/lib/monetdb || true + +# Make sure the database is released (online). +sudo monetdb release test || true diff --git a/monetdb/stop b/monetdb/stop new file mode 100755 index 000000000..61f019fc2 --- /dev/null +++ b/monetdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo monetdbd stop /var/lib/monetdb 2>/dev/null || true diff --git a/mongodb/benchmark.sh b/mongodb/benchmark.sh index 11db23dcc..ef00681cb 100755 --- a/mongodb/benchmark.sh +++ b/mongodb/benchmark.sh @@ -1,88 +1,7 @@ -#!/bin/bash -e - -# https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ - -sudo apt-get update -y -sudo apt-get install -y gnupg curl - -curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ - sudo gpg -o /usr/share/keyrings/mongodb-server-8.0.gpg \ - --dearmor - -source /etc/lsb-release -echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu ${DISTRIB_CODENAME}/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list - -sudo apt-get update -y -sudo apt-get install -y mongodb-org -sudo systemctl start mongod -sudo systemctl status mongod - -for _ in {1..300} -do - mongosh --quiet --eval "db.runCommand('ping').ok" && break - sleep 1 -done - -################################# -# set params `internalQueryPlannerGenerateCoveredWholeIndexScans` to true because we know that collscan is -# always bad. Decision about enabling should be made if collection data couldn't fit to RAM. -# NOTE: This option is reset to default on restart until it saved in mongo config file. -# Don't forget to set again if mongo restart needed or crashes happened while queries run and -# you want to continue theirs execution. 
-time mongosh --quiet --eval 'db.adminCommand({setParameter: 1,"internalQueryPlannerGenerateCoveredWholeIndexScans": true});' - - -################################# -# Create the indexes before import data because each index is reread all data -# Q6 -time mongosh --quiet --eval 'db.hits.createIndex({"EventDate": 1});' -# Q10, Q11 -time mongosh --quiet --eval 'db.hits.createIndex({"MobilePhoneModel": 1});' -# Q28 -time mongosh --quiet --eval 'db.hits.createIndex({"Referer": 1});' -# Q40 -time mongosh --quiet --eval 'db.hits.createIndex({"RefererHash": 1});' -# Q41 -time mongosh --quiet --eval 'db.hits.createIndex({"URLHash": 1});' -# Q3, Q4, Q15, Q19 -time mongosh --quiet --eval 'db.hits.createIndex({"UserID": 1});' -# Q1, Q2, Q7, Q9 -time mongosh --quiet --eval 'db.hits.createIndex({"AdvEngineID": 1, "ResolutionWidth": 1, "RegionID": 1});' -# Q8 -time mongosh --quiet --eval 'db.hits.createIndex({"RegionID": 1, "UserID": 1});' -# Q5, Q12, Q14, Q24, Q30, Q31 -time mongosh --quiet --eval 'db.hits.createIndex({"SearchPhrase": 1, "SearchEngineID": 1});' -# Q13, Q16, Q17, Q18, Q26 -time mongosh --quiet --eval 'db.hits.createIndex({"SearchPhrase": 1, "UserID": 1, "EventTime": 1});' -# Q21, Q22 -time mongosh --quiet --eval 'db.hits.createIndex({"SearchPhrase": 1, "URL": 1, "Title": 1});' -# Q38, Q39 -time mongosh --quiet --eval 'db.hits.createIndex({"CounterID": 1, "EventDate": 1, "URL": 1});' -# Q36, Q37, Q42 -time mongosh --quiet --eval 'db.hits.createIndex({"CounterID": 1, "IsRefresh": 1, "EventDate": 1});' -# Q20, Q23, Q27, Q33, Q34 -time mongosh --quiet --eval 'db.hits.createIndex({"URL": 1, "CounterID": 1 });' -# Q29, Q32, Q35 -time mongosh --quiet --eval 'db.hits.createIndex({"ClientIP": 1, "WatchID": 1, "ResolutionWidth": 1, "IsRefresh": 1});' - - -################################# -# Load data and import -../download-hits-tsv - -# Use mongo import to load data into mongo. By default numInsertionWorkers is 1 so change to half of VM where it would be run -#time mongoimport --collection hits --type tsv hits.tsv --fieldFile=create.txt --columnsHaveTypes --numInsertionWorkers=8 - -# But on the AWS c6a.4xlarge machines, parallel import is slower than single-threaded, so we choose the single-threaded import. -echo -n "Load time: " -command time -f '%e' mongoimport --collection hits --type tsv hits.tsv --fieldFile=create.txt --columnsHaveTypes - -echo -n "Data size: " -sudo du -bcs /var/lib/mongodb/ | grep total -# total size: 82937405440 (77.2 Gb) -# indexes size: 38326390784 (35.6 Gb) // heh, so much but indexes should be -# storage size: 44610863104 (41.5 Gb) - -# MongoDB does not support SQL in self-hosted option. Only with MongoDB Atlas service. - -time mongosh --quiet ./run.js > result.json +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. MongoDB uses +# aggregation pipelines (queries.txt, EJSON one-per-line) rather than SQL. 
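# Each line of queries.txt must therefore be a complete pipeline in
# single-line Extended JSON. A quick sanity check of that invariant (a
# hypothetical helper, assuming jq is installed — EJSON operators such as
# "$date" and "$numberLong" are still syntactically valid JSON) could be:
#   while IFS= read -r line; do
#     [ -z "$line" ] && continue
#     printf '%s\n' "$line" | jq -e . >/dev/null || echo "bad pipeline: $line" >&2
#   done < queries.txt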
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +export BENCH_QUERIES_FILE="queries.txt" +exec ../lib/benchmark-common.sh diff --git a/mongodb/check b/mongodb/check new file mode 100755 index 000000000..dae52ba30 --- /dev/null +++ b/mongodb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mongosh --quiet --eval "db.runCommand('ping').ok" >/dev/null diff --git a/mongodb/data-size b/mongodb/data-size new file mode 100755 index 000000000..59e468b76 --- /dev/null +++ b/mongodb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo du -bcs /var/lib/mongodb/ | grep total | awk '{print $1}' diff --git a/mongodb/formatResult.js b/mongodb/formatResult.js deleted file mode 100644 index f1f647ea0..000000000 --- a/mongodb/formatResult.js +++ /dev/null @@ -1,27 +0,0 @@ -// runs with node -const fs = require("fs"); -const inputFile = process.argv[2]; -const inputContent = fs.readFileSync(inputFile, "utf-8"); -const res = {}; -inputContent.split(/\r?\n/).forEach((line) => { - if (line.length == 0) { - return; - } - parsed = JSON.parse(line); - res[parsed.q + "_" + parsed.it] = parsed.ok == 1 ? parsed.t / 1000.0 : null; -}); -console.log("["); -for (let i = 0; i < 43; ++i) { - delim = i == 42 ? "" : ","; - line = - "[" + - res[i + "_0"] + - "," + - res[i + "_1"] + - "," + - res[i + "_2"] + - "]" + - delim; - console.log(line); -} -console.log("]"); diff --git a/mongodb/install b/mongodb/install new file mode 100755 index 000000000..bbe3bc7ba --- /dev/null +++ b/mongodb/install @@ -0,0 +1,21 @@ +#!/bin/bash +# Install MongoDB and the mongosh shell. +# https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ +set -e + +if command -v mongod >/dev/null 2>&1 && command -v mongosh >/dev/null 2>&1; then + exit 0 +fi + +sudo apt-get update -y +sudo apt-get install -y gnupg curl + +curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ + sudo gpg -o /usr/share/keyrings/mongodb-server-8.0.gpg --dearmor + +# shellcheck disable=SC1091 +source /etc/lsb-release +echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu ${DISTRIB_CODENAME}/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list + +sudo apt-get update -y +sudo apt-get install -y mongodb-org diff --git a/mongodb/load b/mongodb/load new file mode 100755 index 000000000..f63714e2c --- /dev/null +++ b/mongodb/load @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +# Indexes are created BEFORE import so each document is indexed once during +# load (re-indexing after the fact re-reads everything). The set of indexes +# was selected per-query in the original benchmark.sh. 
+mongosh --quiet test <<'EOF' +db.hits.createIndex({"EventDate": 1}); +db.hits.createIndex({"MobilePhoneModel": 1}); +db.hits.createIndex({"Referer": 1}); +db.hits.createIndex({"RefererHash": 1}); +db.hits.createIndex({"URLHash": 1}); +db.hits.createIndex({"UserID": 1}); +db.hits.createIndex({"AdvEngineID": 1, "ResolutionWidth": 1, "RegionID": 1}); +db.hits.createIndex({"RegionID": 1, "UserID": 1}); +db.hits.createIndex({"SearchPhrase": 1, "SearchEngineID": 1}); +db.hits.createIndex({"SearchPhrase": 1, "UserID": 1, "EventTime": 1}); +db.hits.createIndex({"SearchPhrase": 1, "URL": 1, "Title": 1}); +db.hits.createIndex({"CounterID": 1, "EventDate": 1, "URL": 1}); +db.hits.createIndex({"CounterID": 1, "IsRefresh": 1, "EventDate": 1}); +db.hits.createIndex({"URL": 1, "CounterID": 1}); +db.hits.createIndex({"ClientIP": 1, "WatchID": 1, "ResolutionWidth": 1, "IsRefresh": 1}); +EOF + +# Single-threaded import is faster on c6a.4xlarge per the original benchmark.sh. +mongoimport --db test --collection hits --type tsv hits.tsv \ + --fieldFile=create.txt --columnsHaveTypes + +rm -f hits.tsv +sync diff --git a/mongodb/queries.txt b/mongodb/queries.txt new file mode 100644 index 000000000..915f13421 --- /dev/null +++ b/mongodb/queries.txt @@ -0,0 +1,43 @@ +[{"$project":{"_id":1}},{"$count":"c"}] +[{"$match":{"AdvEngineID":{"$ne":0}}},{"$count":"c"}] +[{"$group":{"_id":null,"sum_AdvEngineID":{"$sum":"$AdvEngineID"},"c":{"$sum":1},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"}}}] +[{"$group":{"_id":null,"a":{"$avg":{"$toDecimal":"$UserID"}}}}] +[{"$group":{"_id":"$UserID"}},{"$count":"c"}] +[{"$group":{"_id":"$SearchPhrase"}},{"$count":"c"}] +[{"$sort":{"EventDate":1}},{"$limit":1},{"$unionWith":{"coll":"hits","pipeline":[{"$sort":{"EventDate":-1}},{"$limit":1}]}},{"$group":{"_id":null,"tmpArray":{"$push":"$EventDate"}}},{"$project":{"min":{"$arrayElemAt":["$tmpArray",0]},"max":{"$arrayElemAt":["$tmpArray",1]}}}] +[{"$match":{"AdvEngineID":{"$ne":0}}},{"$group":{"_id":"$AdvEngineID","c":{"$sum":1}}},{"$sort":{"c":-1}}] +[{"$group":{"_id":{"RegionID":"$RegionID","UserID":"$UserID"}}},{"$group":{"_id":"$_id.RegionID","u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] +[{"$group":{"_id":"$RegionID","sum_AdvEngineID":{"$sum":"$AdvEngineID"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10},{"$lookup":{"from":"hits","let":{"regionIdVar":"$_id"},"pipeline":[{"$match":{"$expr":{"$eq":["$RegionID","$$regionIdVar"]}}},{"$group":{"_id":"$UserID"}},{"$count":"c"}],"as":"count_distinct_UserID"}},{"$set":{"count_distinct_UserID":{"$arrayElemAt":["$count_distinct_UserID.c",0]}}}] +[{"$match":{"MobilePhoneModel":{"$ne":""}}},{"$group":{"_id":{"MobilePhoneModel":"$MobilePhoneModel","UserID":"$UserID"}}},{"$group":{"_id":"$_id.MobilePhoneModel","u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] +[{"$match":{"MobilePhoneModel":{"$ne":""}}},{"$group":{"_id":{"MobilePhone":"$MobilePhone","MobilePhoneModel":"$MobilePhoneModel","UserID":"$UserID"}}},{"$group":{"_id":{"MobilePhone":"$_id.MobilePhone","MobilePhoneModel":"$_id.MobilePhoneModel"},"u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":"$SearchPhrase","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"SearchPhrase":"$SearchPhrase","UserID":"$UserID"}}},{"$group":{"_id":"$_id.SearchPhrase","u":{"$sum":1}}},{"$sort":{"u":-1}},{"$limit":10}] 
+[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"$concat":["$SearchPhrase","|",{"$toString":"$SearchEngineID"}]},"SearchPhrase":{"$first":"$SearchPhrase"},"SearchEngineID":{"$first":"$SearchEngineID"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":"$UserID","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":{"$concat":["$SearchPhrase","|",{"$toString":"$UserID"}]},"SearchPhrase":{"$first":"$SearchPhrase"},"UserID":{"$first":"$UserID"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":{"$concat":["$SearchPhrase","|",{"$toString":"$UserID"}]},"SearchPhrase":{"$first":"$SearchPhrase"},"UserID":{"$first":"$UserID"},"c":{"$sum":1}}},{"$limit":10}] +[{"$group":{"_id":{"UserID":"$UserID","SearchPhrase":"$SearchPhrase","m":{"$minute":"$EventTime"}},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"UserID":{"$numberLong":"435090932899640449"}}},{"$project":{"UserID":1}}] +[{"$match":{"URL":{}}},{"$count":"c"}] +[{"$match":{"URL":{},"SearchPhrase":{"$ne":""}}},{"$group":{"_id":"$SearchPhrase","min_URL":{"$min":"$URL"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"Title":{},"URL":{"$not":{}},"SearchPhrase":{"$ne":""}}},{"$group":{"_id":"$SearchPhrase","count_distinct_UserID":{"$addToSet":"$UserID"},"min_Title":{"$min":"$Title"},"min_URL":{"$min":"$URL"},"c":{"$sum":1}}},{"$set":{"count_distinct_UserID":{"$size":"$count_distinct_UserID"}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"URL":{}}},{"$sort":{"EventTime":1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$project":{"_id":0,"SearchPhrase":1}},{"$sort":{"EventTime":1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$sort":{"SearchPhrase":1}},{"$project":{"SearchPhrase":1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$project":{"_id":0,"EventTime":1,"SearchPhrase":1}},{"$sort":{"EventTime":1,"SearchPhrase":1}},{"$limit":10}] +[{"$match":{"URL":{"$ne":""}}},{"$group":{"_id":"$CounterID","l":{"$avg":{"$strLenBytes":"$URL"}},"c":{"$sum":1}}},{"$match":{"c":{"$gt":100000}}},{"$sort":{"l":-1}},{"$limit":25}] +[{"$match":{"Referer":{"$ne":""}}},{"$project":{"_id":0,"Referer":1}},{"$set":{"k":{"$regexFind":{"input":"$Referer","regex":"^https?://(?:www.)?([^/]+)/.*$"}}}},{"$group":{"_id":{"$ifNull":[{"$first":"$k.captures"},"$Referer"]},"l":{"$avg":{"$strLenBytes":"$Referer"}},"c":{"$sum":1}}},{"$match":{"c":{"$gt":100000}}},{"$sort":{"l":-1}},{"$limit":25}] 
+[{"$project":{"_id":0,"ResolutionWidth":{"$toLong":"$ResolutionWidth"}}},{"$group":{"_id":null,"srw_plus_0":{"$sum":{"$add":["$ResolutionWidth",0]}},"srw_plus_1":{"$sum":{"$add":["$ResolutionWidth",1]}},"srw_plus_2":{"$sum":{"$add":["$ResolutionWidth",2]}},"srw_plus_3":{"$sum":{"$add":["$ResolutionWidth",3]}},"srw_plus_4":{"$sum":{"$add":["$ResolutionWidth",4]}},"srw_plus_5":{"$sum":{"$add":["$ResolutionWidth",5]}},"srw_plus_6":{"$sum":{"$add":["$ResolutionWidth",6]}},"srw_plus_7":{"$sum":{"$add":["$ResolutionWidth",7]}},"srw_plus_8":{"$sum":{"$add":["$ResolutionWidth",8]}},"srw_plus_9":{"$sum":{"$add":["$ResolutionWidth",9]}},"srw_plus_10":{"$sum":{"$add":["$ResolutionWidth",10]}},"srw_plus_11":{"$sum":{"$add":["$ResolutionWidth",11]}},"srw_plus_12":{"$sum":{"$add":["$ResolutionWidth",12]}},"srw_plus_13":{"$sum":{"$add":["$ResolutionWidth",13]}},"srw_plus_14":{"$sum":{"$add":["$ResolutionWidth",14]}},"srw_plus_15":{"$sum":{"$add":["$ResolutionWidth",15]}},"srw_plus_16":{"$sum":{"$add":["$ResolutionWidth",16]}},"srw_plus_17":{"$sum":{"$add":["$ResolutionWidth",17]}},"srw_plus_18":{"$sum":{"$add":["$ResolutionWidth",18]}},"srw_plus_19":{"$sum":{"$add":["$ResolutionWidth",19]}},"srw_plus_20":{"$sum":{"$add":["$ResolutionWidth",20]}},"srw_plus_21":{"$sum":{"$add":["$ResolutionWidth",21]}},"srw_plus_22":{"$sum":{"$add":["$ResolutionWidth",22]}},"srw_plus_23":{"$sum":{"$add":["$ResolutionWidth",23]}},"srw_plus_24":{"$sum":{"$add":["$ResolutionWidth",24]}},"srw_plus_25":{"$sum":{"$add":["$ResolutionWidth",25]}},"srw_plus_26":{"$sum":{"$add":["$ResolutionWidth",26]}},"srw_plus_27":{"$sum":{"$add":["$ResolutionWidth",27]}},"srw_plus_28":{"$sum":{"$add":["$ResolutionWidth",28]}},"srw_plus_29":{"$sum":{"$add":["$ResolutionWidth",29]}},"srw_plus_30":{"$sum":{"$add":["$ResolutionWidth",30]}},"srw_plus_31":{"$sum":{"$add":["$ResolutionWidth",31]}},"srw_plus_32":{"$sum":{"$add":["$ResolutionWidth",32]}},"srw_plus_33":{"$sum":{"$add":["$ResolutionWidth",33]}},"srw_plus_34":{"$sum":{"$add":["$ResolutionWidth",34]}},"srw_plus_35":{"$sum":{"$add":["$ResolutionWidth",35]}},"srw_plus_36":{"$sum":{"$add":["$ResolutionWidth",36]}},"srw_plus_37":{"$sum":{"$add":["$ResolutionWidth",37]}},"srw_plus_38":{"$sum":{"$add":["$ResolutionWidth",38]}},"srw_plus_39":{"$sum":{"$add":["$ResolutionWidth",39]}},"srw_plus_40":{"$sum":{"$add":["$ResolutionWidth",40]}},"srw_plus_41":{"$sum":{"$add":["$ResolutionWidth",41]}},"srw_plus_42":{"$sum":{"$add":["$ResolutionWidth",42]}},"srw_plus_43":{"$sum":{"$add":["$ResolutionWidth",43]}},"srw_plus_44":{"$sum":{"$add":["$ResolutionWidth",44]}},"srw_plus_45":{"$sum":{"$add":["$ResolutionWidth",45]}},"srw_plus_46":{"$sum":{"$add":["$ResolutionWidth",46]}},"srw_plus_47":{"$sum":{"$add":["$ResolutionWidth",47]}},"srw_plus_48":{"$sum":{"$add":["$ResolutionWidth",48]}},"srw_plus_49":{"$sum":{"$add":["$ResolutionWidth",49]}},"srw_plus_50":{"$sum":{"$add":["$ResolutionWidth",50]}},"srw_plus_51":{"$sum":{"$add":["$ResolutionWidth",51]}},"srw_plus_52":{"$sum":{"$add":["$ResolutionWidth",52]}},"srw_plus_53":{"$sum":{"$add":["$ResolutionWidth",53]}},"srw_plus_54":{"$sum":{"$add":["$ResolutionWidth",54]}},"srw_plus_55":{"$sum":{"$add":["$ResolutionWidth",55]}},"srw_plus_56":{"$sum":{"$add":["$ResolutionWidth",56]}},"srw_plus_57":{"$sum":{"$add":["$ResolutionWidth",57]}},"srw_plus_58":{"$sum":{"$add":["$ResolutionWidth",58]}},"srw_plus_59":{"$sum":{"$add":["$ResolutionWidth",59]}},"srw_plus_60":{"$sum":{"$add":["$ResolutionWidth",60]}},"srw_plus_61":{"$sum":{"$add":["$ResolutionWidth",61]}},"srw_pl
us_62":{"$sum":{"$add":["$ResolutionWidth",62]}},"srw_plus_63":{"$sum":{"$add":["$ResolutionWidth",63]}},"srw_plus_64":{"$sum":{"$add":["$ResolutionWidth",64]}},"srw_plus_65":{"$sum":{"$add":["$ResolutionWidth",65]}},"srw_plus_66":{"$sum":{"$add":["$ResolutionWidth",66]}},"srw_plus_67":{"$sum":{"$add":["$ResolutionWidth",67]}},"srw_plus_68":{"$sum":{"$add":["$ResolutionWidth",68]}},"srw_plus_69":{"$sum":{"$add":["$ResolutionWidth",69]}},"srw_plus_70":{"$sum":{"$add":["$ResolutionWidth",70]}},"srw_plus_71":{"$sum":{"$add":["$ResolutionWidth",71]}},"srw_plus_72":{"$sum":{"$add":["$ResolutionWidth",72]}},"srw_plus_73":{"$sum":{"$add":["$ResolutionWidth",73]}},"srw_plus_74":{"$sum":{"$add":["$ResolutionWidth",74]}},"srw_plus_75":{"$sum":{"$add":["$ResolutionWidth",75]}},"srw_plus_76":{"$sum":{"$add":["$ResolutionWidth",76]}},"srw_plus_77":{"$sum":{"$add":["$ResolutionWidth",77]}},"srw_plus_78":{"$sum":{"$add":["$ResolutionWidth",78]}},"srw_plus_79":{"$sum":{"$add":["$ResolutionWidth",79]}},"srw_plus_80":{"$sum":{"$add":["$ResolutionWidth",80]}},"srw_plus_81":{"$sum":{"$add":["$ResolutionWidth",81]}},"srw_plus_82":{"$sum":{"$add":["$ResolutionWidth",82]}},"srw_plus_83":{"$sum":{"$add":["$ResolutionWidth",83]}},"srw_plus_84":{"$sum":{"$add":["$ResolutionWidth",84]}},"srw_plus_85":{"$sum":{"$add":["$ResolutionWidth",85]}},"srw_plus_86":{"$sum":{"$add":["$ResolutionWidth",86]}},"srw_plus_87":{"$sum":{"$add":["$ResolutionWidth",87]}},"srw_plus_88":{"$sum":{"$add":["$ResolutionWidth",88]}},"srw_plus_89":{"$sum":{"$add":["$ResolutionWidth",89]}}}}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"$concat":[{"$toString":"$SearchEngineID"},"|",{"$toString":"$ClientIP"}]},"SearchEngineID":{"$first":"$SearchEngineID"},"ClientIP":{"$first":"$ClientIP"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"sum_IsRefresh":{"$sum":"$IsRefresh"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$match":{"SearchPhrase":{"$ne":""}}},{"$group":{"_id":{"$concat":[{"$toString":"$WatchID"},"|",{"$toString":"$ClientIP"}]},"WatchID":{"$first":"$WatchID"},"ClientIP":{"$first":"$ClientIP"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"sum_IsRefresh":{"$sum":"$IsRefresh"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":{"$concat":[{"$toString":"$ClientIP"},"|",{"$toString":"$WatchID"}]},"WatchID":{"$first":"$WatchID"},"ClientIP":{"$first":"$ClientIP"},"avg_ResolutionWidth":{"$avg":"$ResolutionWidth"},"sum_IsRefresh":{"$sum":"$IsRefresh"},"c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":"$URL","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10}] +[{"$group":{"_id":"$URL","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10},{"$set":{"one":1}}] +[{"$group":{"_id":"$ClientIP","c":{"$sum":1}}},{"$sort":{"c":-1}},{"$limit":10},{"$set":{"ClientIP_0":"$_id","ClientIP_1":{"$add":["$_id",-1]},"ClientIP_2":{"$add":["$_id",-2]},"ClientIP_3":{"$add":["$_id",-3]}}}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"DontCountHits":0,"IsRefresh":0,"URL":{"$ne":""}}},{"$group":{"_id":"$URL","pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"DontCountHits":0,"IsRefresh":0,"URL":{"$ne":""}}},{"$group":{"_id":"$Title","pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$limit":10}] 
+[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0,"IsLink":{"$ne":0},"IsDownload":0,"URL":{"$ne":""}}},{"$group":{"_id":"$Title","pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":1000},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0}},{"$set":{"Src":{"$cond":{"if":{"$and":[{"$eq":["$SearchEngineID",0]},{"$eq":["$AdvEngineID",0]}]},"then":"$Referer","else":""}},"Dst":"$URL"}},{"$group":{"_id":{"TraficSourceID":"$TraficSourceID","SearchEngineID":"$SearchEngineID","AdvEngineID":"$AdvEngineID","Src":"$Src","Dst":"$Dst"},"pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":1000},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0,"TraficSourceID":{"$in":[-1,6]},"RefererHash":{"$numberLong":"3594120000172545465"}}},{"$group":{"_id":{"URLHash":"$URLHash","EventDate":"$EventDate"},"pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":100},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-01"},"$lte":{"$date":"2013-07-31"}},"IsRefresh":0,"DontCountHits":0,"URLHash":{"$numberLong":"2868770270353813622"}}},{"$group":{"_id":{"WindowClientWidth":"$WindowClientWidth","WindowClientHeight":"$WindowClientHeight"},"pageViews":{"$sum":1}}},{"$sort":{"pageViews":-1}},{"$skip":10000},{"$limit":10}] +[{"$match":{"CounterID":62,"EventDate":{"$gte":{"$date":"2013-07-14"},"$lte":{"$date":"2013-07-15"}},"IsRefresh":0,"DontCountHits":0}},{"$group":{"_id":{"$dateTrunc":{"date":"$EventTime","unit":"minute"}},"pageViews":{"$sum":1}}},{"$sort":{"_id":1}},{"$skip":1000},{"$limit":10}] diff --git a/mongodb/query b/mongodb/query new file mode 100755 index 000000000..4c3f7e694 --- /dev/null +++ b/mongodb/query @@ -0,0 +1,22 @@ +#!/bin/bash +# Reads a MongoDB aggregation pipeline (Extended JSON, single line) from +# stdin and runs it against the `hits` collection in the `test` DB. +# Stdout: query result (as printed by mongosh). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +# +# This system uses MongoDB aggregation pipelines instead of SQL. The +# canonical pipelines (one per line) live in queries.txt; the shared driver +# is configured to read that file via BENCH_QUERIES_FILE. 
+set -e + +pipeline=$(cat) + +PIPELINE_JSON="$pipeline" mongosh --quiet test --eval ' +const start = new Date(); +const pipeline = EJSON.parse(process.env.PIPELINE_JSON); +const result = db.hits.aggregate(pipeline, {allowDiskUse: true}).toArray(); +const elapsed = (new Date() - start) / 1000; +print(EJSON.stringify(result)); +console.error(elapsed.toFixed(3)); +' diff --git a/mongodb/run.js b/mongodb/run.js deleted file mode 100644 index 1bcd802fc..000000000 --- a/mongodb/run.js +++ /dev/null @@ -1,38 +0,0 @@ -const iterations = 3; - -// `col` need in queries to make lookups so define before load -let collectionName = "hits" -let col = db.getCollection(collectionName); - -load("./queries.js"); - -// If someone knows how to clear the OS page cache from javascript, -// please do (this is technically required by the benchmark rules) - -for (let i = 0; i < queries.length; i++) { - for (let j = 0; j < iterations; ++j) { - start = new Date(); - try { - res = col.aggregate(queries[i], { allowDiskUse: true }).toArray(); - print( - EJSON.stringify({ - q: i, - it: j, - ok: 1, - t: new Date().getTime() - start.getTime(), - res: res, - }) - ); - } catch (e) { - print( - EJSON.stringify({ - q: i, - it: j, - ok: 0, - t: new Date().getTime() - start.getTime(), - res: e, - }) - ); - } - } -} diff --git a/mongodb/start b/mongodb/start new file mode 100755 index 000000000..9e8bafc10 --- /dev/null +++ b/mongodb/start @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo systemctl start mongod + +# Enable the planner option used by ClickBench (covered whole-index scans). +# This is a runtime parameter that resets on restart, so we re-apply on every +# start. Wait briefly for the server to accept connections first. +for _ in $(seq 1 60); do + if mongosh --quiet --eval "db.runCommand('ping').ok" >/dev/null 2>&1; then + break + fi + sleep 1 +done +mongosh --quiet --eval 'db.adminCommand({setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true});' >/dev/null diff --git a/mongodb/stop b/mongodb/stop new file mode 100755 index 000000000..0c408822e --- /dev/null +++ b/mongodb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo systemctl stop mongod || true diff --git a/mysql-myisam/benchmark.sh b/mysql-myisam/benchmark.sh index bdb34a4c5..531bd6503 100755 --- a/mysql-myisam/benchmark.sh +++ b/mysql-myisam/benchmark.sh @@ -1,30 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y mysql-server-8.0 -sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" -sudo service mysql restart - -# Load the data - -../download-hits-tsv - -sudo mysql -e "CREATE DATABASE test" -sudo mysql test < create.sql -echo -n "Load time: " -command time -f '%e' sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits;" - -# 41m8.979s - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo mysql test -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" | tail -n1 - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mysql-myisam/check b/mysql-myisam/check new file mode 100755 index 000000000..b1e36dced --- /dev/null +++ b/mysql-myisam/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo mysql -e "SELECT 1" >/dev/null diff --git a/mysql-myisam/data-size b/mysql-myisam/data-size new file mode 100755 index 000000000..5015ae866 --- /dev/null +++ b/mysql-myisam/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo mysql test -N -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" diff --git a/mysql-myisam/install b/mysql-myisam/install new file mode 100755 index 000000000..dcace5ba4 --- /dev/null +++ b/mysql-myisam/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y mysql-server-8.0 + +sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" +sudo service mysql restart diff --git a/mysql-myisam/load b/mysql-myisam/load new file mode 100755 index 000000000..b39827cdb --- /dev/null +++ b/mysql-myisam/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +sudo mysql -e "DROP DATABASE IF EXISTS test" +sudo mysql -e "CREATE DATABASE test" +sudo mysql test < create.sql + +sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits;" + +rm -f hits.tsv +sync diff --git a/mysql-myisam/query b/mysql-myisam/query new file mode 100755 index 000000000..9d9168268 --- /dev/null +++ b/mysql-myisam/query @@ -0,0 +1,34 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `mysql -vvv` against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mysql's "N rows in set (X.XX sec)" footer). +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo mysql test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +timing=$(printf '%s\n' "$out" \ + | grep -P 'rows? 
in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/') + +if [ -z "$timing" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -v t="$timing" 'BEGIN { + n = split(t, a, " ") + if (n == 2 && a[1] != "") { printf "%.3f\n", a[1] * 60 + a[2] } + else { printf "%.3f\n", a[n] } +}' >&2 diff --git a/mysql-myisam/run.sh b/mysql-myisam/run.sh deleted file mode 100755 index faf06250e..000000000 --- a/mysql-myisam/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo mysql test -vvv -e "${query}" - done; -done; diff --git a/mysql-myisam/start b/mysql-myisam/start new file mode 100755 index 000000000..1eda91080 --- /dev/null +++ b/mysql-myisam/start @@ -0,0 +1,7 @@ +#!/bin/bash +set -eu + +if sudo mysql -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo service mysql start diff --git a/mysql-myisam/stop b/mysql-myisam/stop new file mode 100755 index 000000000..f887aafbf --- /dev/null +++ b/mysql-myisam/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo service mysql stop || true diff --git a/mysql/benchmark.sh b/mysql/benchmark.sh index 465f959ae..531bd6503 100755 --- a/mysql/benchmark.sh +++ b/mysql/benchmark.sh @@ -1,30 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y mysql-server-8.0 -sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" -sudo service mysql restart - -# Load the data - -../download-hits-tsv - -sudo mysql -e "CREATE DATABASE test" -sudo mysql test < create.sql -echo -n "Load time: " -command time -f '%e' sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits" - -# 2:37:52 elapsed - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo mysql test -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" | tail -n1 - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/mysql/check b/mysql/check new file mode 100755 index 000000000..b1e36dced --- /dev/null +++ b/mysql/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo mysql -e "SELECT 1" >/dev/null diff --git a/mysql/data-size b/mysql/data-size new file mode 100755 index 000000000..5015ae866 --- /dev/null +++ b/mysql/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo mysql test -N -e "SELECT data_length + index_length FROM information_schema.TABLES WHERE table_schema = 'test' AND table_name = 'hits';" diff --git a/mysql/install b/mysql/install new file mode 100755 index 000000000..dcace5ba4 --- /dev/null +++ b/mysql/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y mysql-server-8.0 + +sudo bash -c "echo -e '[mysql]\nlocal-infile=1\n\n[mysqld]\nlocal-infile=1\n' > /etc/mysql/conf.d/local_infile.cnf" +sudo service mysql restart diff --git a/mysql/load b/mysql/load new file mode 100755 index 000000000..69e75e085 --- /dev/null +++ b/mysql/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +sudo mysql -e "DROP DATABASE IF EXISTS test" +sudo mysql -e "CREATE DATABASE test" +sudo mysql test < create.sql + +sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL INFILE 'hits.tsv' INTO TABLE hits" + +rm -f hits.tsv +sync diff --git a/mysql/query b/mysql/query new file mode 100755 index 000000000..14887e998 --- /dev/null +++ b/mysql/query @@ -0,0 +1,35 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via `mysql -vvv` against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# mysql's "N rows in set (X.XX sec)" footer). +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(sudo mysql test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +# Parse "(X.XX sec)" or "(N min Y.YY sec)" from the footer line. +timing=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/') + +if [ -z "$timing" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -v t="$timing" 'BEGIN { + n = split(t, a, " ") + if (n == 2 && a[1] != "") { printf "%.3f\n", a[1] * 60 + a[2] } + else { printf "%.3f\n", a[n] } +}' >&2 diff --git a/mysql/run.sh b/mysql/run.sh deleted file mode 100755 index faf06250e..000000000 --- a/mysql/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - sudo mysql test -vvv -e "${query}" - done; -done; diff --git a/mysql/start b/mysql/start new file mode 100755 index 000000000..d6763dbd2 --- /dev/null +++ b/mysql/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +# Idempotent: if already up, do nothing. 
+if sudo mysql -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi +sudo service mysql start diff --git a/mysql/stop b/mysql/stop new file mode 100755 index 000000000..f887aafbf --- /dev/null +++ b/mysql/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo service mysql stop || true diff --git a/octosql/benchmark.sh b/octosql/benchmark.sh index c20a09d46..fc4bacc8f 100755 --- a/octosql/benchmark.sh +++ b/octosql/benchmark.sh @@ -1,17 +1,5 @@ #!/bin/bash - -wget --continue --progress=dot:giga https://github.com/cube2222/octosql/releases/download/v0.13.0/octosql_0.13.0_linux_amd64.tar.gz -tar xf octosql_0.13.0_linux_amd64.tar.gz - -../download-hits-parquet-single - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error|^Killed|^fatal error|^panic' | - sed -r -e 's/^(Error|Killed|fatal|panic).*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/octosql/check b/octosql/check new file mode 100755 index 000000000..2b362179e --- /dev/null +++ b/octosql/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +./octosql --help >/dev/null diff --git a/octosql/data-size b/octosql/data-size new file mode 100755 index 000000000..708c0b72e --- /dev/null +++ b/octosql/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/octosql/install b/octosql/install new file mode 100755 index 000000000..d25716209 --- /dev/null +++ b/octosql/install @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if [ ! -x ./octosql ]; then + wget --continue --progress=dot:giga \ + https://github.com/cube2222/octosql/releases/download/v0.13.0/octosql_0.13.0_linux_amd64.tar.gz + tar xf octosql_0.13.0_linux_amd64.tar.gz octosql +fi diff --git a/octosql/load b/octosql/load new file mode 100755 index 000000000..1b395b9dd --- /dev/null +++ b/octosql/load @@ -0,0 +1,4 @@ +#!/bin/bash +# octosql queries hits.parquet directly. No persistent DB to load. +set -e +sync diff --git a/octosql/query b/octosql/query new file mode 100755 index 000000000..984c1a66d --- /dev/null +++ b/octosql/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via octosql against hits.parquet. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (from `time`). +set -e + +query=$(cat) +# octosql wants the file path inline, not a table name. +query=${query//hits/hits.parquet} + +# Cap RSS to ~90% of host memory like the original benchmark. +max_rss=$(( $(grep MemTotal /proc/meminfo | grep -o -P '\d+') * 900 )) + +TIMEFORMAT='%R' +{ time prlimit --data="${max_rss}" ./octosql "$query" 1>/tmp/octosql.out.$$ 2>/tmp/octosql.err.$$; } 2>/tmp/octosql.time.$$ || status=$? 
+status=${status:-0} + +cat /tmp/octosql.out.$$ +if [ "$status" -ne 0 ]; then + cat /tmp/octosql.err.$$ >&2 + rm -f /tmp/octosql.out.$$ /tmp/octosql.err.$$ /tmp/octosql.time.$$ + exit "$status" +fi + +cat /tmp/octosql.err.$$ >&2 +cat /tmp/octosql.time.$$ >&2 + +rm -f /tmp/octosql.out.$$ /tmp/octosql.err.$$ /tmp/octosql.time.$$ diff --git a/octosql/run.sh b/octosql/run.sh deleted file mode 100755 index 61a34ec78..000000000 --- a/octosql/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -max_rss=$(( $(cat /proc/meminfo | grep MemTotal | grep -o -P '\d+') * 900 )) - -cat queries.sql | sed -r -e 's@hits@hits.parquet@' | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for _ in {1..3} - do - time prlimit --data="${max_rss}" ./octosql "${query}" - done -done diff --git a/octosql/start b/octosql/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/octosql/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/octosql/stop b/octosql/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/octosql/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/opteryx/benchmark.sh b/opteryx/benchmark.sh index 65fe910e8..3b63e772a 100755 --- a/opteryx/benchmark.sh +++ b/opteryx/benchmark.sh @@ -1,41 +1,5 @@ #!/bin/bash - -# Update package lists -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository -y ppa:deadsnakes/ppa -sudo apt-get update -y - -# Install required packages -sudo apt-get install -y python3.11 python3.11-venv git wget build-essential python3.11-dev - -# Create and activate a virtual environment using Python 3.11 -python3.11 -m venv ~/opteryx_venv -source ~/opteryx_venv/bin/activate - -# Upgrade pip in the virtual environment -~/opteryx_venv/bin/python -m pip install --upgrade pip -~/opteryx_venv/bin/python -m pip install --upgrade opteryx==0.26.1 - -# Download benchmark target data, partitioned -../download-hits-parquet-partitioned hits - -# Run a simple query to check the installation -~/opteryx_venv/bin/python -m opteryx "SELECT version()" 2>&1 - -# Run benchmarks for partitioned data using queries from queries.sql -if [[ -f ./queries.sql ]]; then - while read -r query; do - sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - (~/opteryx_venv/bin/python -m opteryx "$query" --cycles 3 2>&1 | grep -v -P '^3$') || echo '[null,null,null]' - done < ./queries.sql -else - echo "queries.sql not found." -fi - -# Deactivate the virtual environment -deactivate - -echo "Data size: $(du -bcs hits | grep total)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/opteryx/check b/opteryx/check new file mode 100755 index 000000000..4d4c12fd7 --- /dev/null +++ b/opteryx/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +"$HOME/opteryx_venv/bin/python" -m opteryx "SELECT version()" >/dev/null diff --git a/opteryx/data-size b/opteryx/data-size new file mode 100755 index 000000000..8e65ea4b3 --- /dev/null +++ b/opteryx/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits | awk '/total$/ { print $1 }' diff --git a/opteryx/install b/opteryx/install new file mode 100755 index 000000000..ea31f110b --- /dev/null +++ b/opteryx/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository -y ppa:deadsnakes/ppa +sudo apt-get update -y +sudo apt-get install -y python3.11 python3.11-venv git wget build-essential python3.11-dev + +if [ ! -d "$HOME/opteryx_venv" ]; then + python3.11 -m venv "$HOME/opteryx_venv" +fi + +"$HOME/opteryx_venv/bin/python" -m pip install --upgrade pip +"$HOME/opteryx_venv/bin/python" -m pip install --upgrade opteryx==0.26.1 diff --git a/opteryx/load b/opteryx/load new file mode 100755 index 000000000..fafa76868 --- /dev/null +++ b/opteryx/load @@ -0,0 +1,8 @@ +#!/bin/bash +# opteryx queries `FROM hits` and resolves it to ./hits/*.parquet, so move +# the partitioned files into the expected subdir. +set -e + +mkdir -p hits +mv hits_*.parquet hits/ 2>/dev/null || true +sync diff --git a/opteryx/query b/opteryx/query new file mode 100755 index 000000000..39ed0b69c --- /dev/null +++ b/opteryx/query @@ -0,0 +1,29 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via opteryx (Python in-process) +# against the partitioned parquet under ./hits/. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e + +query=$(cat) + +"$HOME/opteryx_venv/bin/python" - "$query" <<'PY' +import sys +import timeit +import opteryx + +query = sys.argv[1] + +start = timeit.default_timer() +try: + res = opteryx.query(query) + rows = list(res) + end = timeit.default_timer() +finally: + pass + +for r in rows: + print(r) + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/opteryx/start b/opteryx/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/opteryx/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/opteryx/stop b/opteryx/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/opteryx/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/oxla/benchmark.sh b/oxla/benchmark.sh index 0a9be8cca..1aa9264b9 100755 --- a/oxla/benchmark.sh +++ b/oxla/benchmark.sh @@ -1,39 +1,5 @@ -#!/bin/bash -e - -# docker -sudo apt-get install -y docker.io - -# base -sudo apt-get install -y postgresql-client curl wget apt-transport-https ca-certificates software-properties-common gnupg2 parallel -sudo DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential - -# download dataset -../download-hits-csv -sudo mkdir data -sudo mv hits.csv data - -# get and configure Oxla image -echo "Install and run Oxla." - -docker run --rm -p 5432:5432 -v "$(pwd)/data:/data" --name oxlacontainer public.ecr.aws/oxla/release:1.53.0-beta > /dev/null 2>&1 & - -# create table and ingest data -export PGCLIENTENCODING=UTF8 - -for _ in {1..600} -do - PGPASSWORD=oxla psql -h localhost -U oxla -t < create.sql && break - sleep 1 -done - -echo "Insert data." 
-echo -n "Load time: " -PGPASSWORD=oxla command time -f '%e' psql -h localhost -U oxla -q -t -c "COPY hits FROM '/data/hits.csv';" - -# get ingested data size -echo -n "Data size: " -PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "SELECT pg_total_relation_size('hits');" - -# run benchmark -echo "running benchmark..." -./run.sh +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/oxla/check b/oxla/check new file mode 100755 index 000000000..93848a1fb --- /dev/null +++ b/oxla/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=oxla psql -h localhost -U oxla -t -c 'SELECT 1' >/dev/null diff --git a/oxla/data-size b/oxla/data-size new file mode 100755 index 000000000..4a98fca07 --- /dev/null +++ b/oxla/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +PGPASSWORD=oxla psql -h localhost -U oxla -q -t -A -c "SELECT pg_total_relation_size('hits');" diff --git a/oxla/install b/oxla/install new file mode 100755 index 000000000..ac8fc1dcb --- /dev/null +++ b/oxla/install @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client curl wget \ + apt-transport-https ca-certificates software-properties-common gnupg2 parallel +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential + +sudo docker pull public.ecr.aws/oxla/release:1.53.0-beta diff --git a/oxla/load b/oxla/load new file mode 100755 index 000000000..e1f99c03b --- /dev/null +++ b/oxla/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +export PGCLIENTENCODING=UTF8 + +mkdir -p data +sudo mv hits.csv data/ + +PGPASSWORD=oxla psql -h localhost -U oxla -q -t < create.sql +PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "COPY hits FROM '/data/hits.csv';" + +sudo rm -f data/hits.csv +sync diff --git a/oxla/query b/oxla/query new file mode 100755 index 000000000..a9551059b --- /dev/null +++ b/oxla/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against Oxla's pg-protocol port. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(PGPASSWORD=oxla psql -h localhost -U oxla -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Pass-through result, strip Time: lines from stdout. +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/oxla/run.sh b/oxla/run.sh deleted file mode 100755 index 7d20f1215..000000000 --- a/oxla/run.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -TRIES=3 -rm result.txt 2>/dev/null -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - # Oxla seems to cache major parts of the dataset without a documented way to clear the cache between the runs. - # It seems fairer to restart the database between the runs. 
- docker restart oxlacontainer - sleep 30 - - echo "$query"; - results="" - if [[ "$query" == "SELECT NULL;" ]]; then - results+="[null,null,null]" - else - results+="[" - for i in $(seq 1 $TRIES); do - time=$(PGPASSWORD=oxla psql -h localhost -U oxla -t -c '\timing' -c "$query" | grep 'Time' | perl -nle 'm/Time: ([^ ]*) ms/; print $1 / 1000') - echo "$time s" - results+="$time," - done - results=${results::-1} - results+="]" - fi - echo "$results," >> result.txt -done -result=$(cat result.txt) -result=${result::-1} -echo "$result" -rm result.txt 2>/dev/null diff --git a/oxla/start b/oxla/start new file mode 100755 index 000000000..e2f0185f5 --- /dev/null +++ b/oxla/start @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +# Idempotent: if already serving, do nothing. +if PGPASSWORD=oxla psql -h localhost -U oxla -t -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# Start (or restart) the container in the background. +sudo docker stop oxlacontainer >/dev/null 2>&1 || true +sudo docker rm oxlacontainer >/dev/null 2>&1 || true + +mkdir -p data +sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + --name oxlacontainer \ + public.ecr.aws/oxla/release:1.53.0-beta >/dev/null + +# Wait briefly for protocol port (the lib's check loop will keep waiting). +for _ in $(seq 1 60); do + PGPASSWORD=oxla psql -h localhost -U oxla -t -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/oxla/stop b/oxla/stop new file mode 100755 index 000000000..b673a6d16 --- /dev/null +++ b/oxla/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop oxlacontainer >/dev/null 2>&1 || true diff --git a/pandas/benchmark.sh b/pandas/benchmark.sh index fee224a32..fc4bacc8f 100755 --- a/pandas/benchmark.sh +++ b/pandas/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install pandas pyarrow - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/pandas/check b/pandas/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/pandas/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/pandas/data-size b/pandas/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/pandas/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/pandas/install b/pandas/install new file mode 100755 index 000000000..960545276 --- /dev/null +++ b/pandas/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet pandas pyarrow fastapi uvicorn diff --git a/pandas/load b/pandas/load new file mode 100755 index 000000000..ceba6beca --- /dev/null +++ b/pandas/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. 
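+# POST /load makes server.py parse the parquet, fix the dtypes, and keep the DataFrame resident;
+# its self-reported elapsed seconds are echoed here so the harness log records the load time.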
+elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/pandas/queries.sql b/pandas/queries.sql new file mode 100644 index 000000000..b4115ee3a --- /dev/null +++ b/pandas/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth 
+ 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 
10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/pandas/query b/pandas/query new file mode 100755 index 000000000..0bc448d9c --- /dev/null +++ b/pandas/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running pandas server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Capture HTTP status and body separately to detect errors cleanly. +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/pandas/query.py b/pandas/server.py similarity index 73% rename from pandas/query.py rename to pandas/server.py index 9d6ba7136..5bc676173 100755 --- a/pandas/query.py +++ b/pandas/server.py @@ -1,65 +1,57 @@ #!/usr/bin/env python3 +"""FastAPI wrapper around pandas so it conforms to the ClickBench +install/start/check/stop/load/query interface. -import pandas as pd -import timeit -import datetime -import json -import subprocess +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the DataFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching lambda against the loaded DataFrame, and + returns {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (memory_usage) -start = timeit.default_timer() -hits = pd.read_parquet("hits.parquet") -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") +The (sql, lambda) list is the same as the previous standalone query.py — just +exposed over HTTP. 
queries.sql in this directory holds the SQL strings in the +same order. +""" -dataframe_size = hits.memory_usage().sum() +import os +import timeit -# print("Dataframe(numpy) size:", dataframe_size, "bytes") +import pandas as pd +import uvicorn +from fastapi import FastAPI, HTTPException, Request -# fix some types -hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") -hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") +app = FastAPI() +hits: pd.DataFrame | None = None -# fix all object columns to string -for col in hits.columns: - if hits[col].dtype == "O": - hits[col] = hits[col].astype(str) -# 0: No., 1: SQL, 2: Pandas -queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count()), +# 43 ClickBench queries. Each is (sql, callable). sql strings must match the +# corresponding line in queries.sql. The lambdas come straight from the prior +# pandas/query.py and have not been modified. +QUERIES: list[tuple[str, callable]] = [ + ("SELECT COUNT(*) FROM hits;", lambda x: x.count()), ( - "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x[x["AdvEngineID"] != 0].count(), ), ( - "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()), ), + ("SELECT AVG(UserID) FROM hits;", lambda x: x["UserID"].mean()), + ("SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x["UserID"].nunique()), ( - "Q3", - "SELECT AVG(UserID) FROM hits;", - lambda x: x["UserID"].mean(), - ), - ( - "Q4", - "SELECT COUNT(DISTINCT UserID) FROM hits;", - lambda x: x["UserID"].nunique(), - ), - ( - "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x["SearchPhrase"].nunique(), ), ( - "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", lambda x: (x["EventDate"].min(), x["EventDate"].max()), ), ( - "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", lambda x: x[x["AdvEngineID"] != 0] .groupby("AdvEngineID") @@ -67,19 +59,16 @@ .sort_values(ascending=False), ), ( - "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", lambda x: x.groupby("RegionID")["UserID"].nunique().nlargest(10), ), ( - "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", lambda x: x.groupby("RegionID") .agg({"AdvEngineID": "sum", "ResolutionWidth": "mean", "UserID": "nunique"}) .nlargest(10, "AdvEngineID"), ), ( - "Q10", "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x[x["MobilePhoneModel"] != ""] .groupby("MobilePhoneModel")["UserID"] @@ -87,7 +76,6 @@ .nlargest(10), ), ( - "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x[x["MobilePhoneModel"] != ""] .groupby(["MobilePhone", "MobilePhoneModel"])["UserID"] @@ -95,7 +83,6 @@ .nlargest(10), ), ( - "Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby("SearchPhrase") @@ -103,7 +90,6 @@ .nlargest(10), ), ( - "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", lambda x: 
x[x["SearchPhrase"] != ""] .groupby("SearchPhrase")["UserID"] @@ -111,7 +97,6 @@ .nlargest(10), ), ( - "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby(["SearchEngineID", "SearchPhrase"]) @@ -119,39 +104,32 @@ .nlargest(10), ), ( - "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.groupby("UserID").size().nlargest(10), ), ( - "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.groupby(["UserID", "SearchPhrase"]).size().nlargest(10), ), ( - "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10), ), ( - "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.groupby([x["UserID"], x["EventTime"].dt.minute, "SearchPhrase"]) .size() .nlargest(10), ), ( - "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x[x["UserID"] == 435090932899640449], ), ( - "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x[x["URL"].str.contains("google")].shape[0], ), ( - "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[(x["URL"].str.contains("google")) & (x["SearchPhrase"] != "")] .groupby("SearchPhrase") @@ -159,7 +137,6 @@ .nlargest(10, "SearchPhrase"), ), ( - "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x[ (x["Title"].str.contains("Google")) @@ -173,35 +150,30 @@ .nlargest(10, "SearchPhrase"), ), ( - "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", lambda x: x[x["URL"].str.contains("google")] .sort_values(by="EventTime") .head(10), ), ( - "Q24", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .sort_values(by="EventTime")[["SearchPhrase"]] .head(10), ), ( - "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .sort_values(by="SearchPhrase")[["SearchPhrase"]] .head(10), ), ( - "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .sort_values(by=["EventTime", "SearchPhrase"])[["SearchPhrase"]] .head(10), ), ( - "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: x[x["URL"] != ""] .groupby("CounterID") @@ -211,7 +183,6 @@ .head(25), ), ( - "Q28", "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x[x["Referer"] != ""] @@ -226,101 +197,10 @@ ), ), ( - "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), 
SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", - lambda x: x["ResolutionWidth"].sum() - + x["ResolutionWidth"].shift(1).sum() - + x["ResolutionWidth"].shift(2).sum() - + x["ResolutionWidth"].shift(3).sum() - + x["ResolutionWidth"].shift(4).sum() - + x["ResolutionWidth"].shift(5).sum() - + x["ResolutionWidth"].shift(6).sum() - + x["ResolutionWidth"].shift(7).sum() - + x["ResolutionWidth"].shift(8).sum() - + x["ResolutionWidth"].shift(9).sum() - + x["ResolutionWidth"].shift(10).sum() - + x["ResolutionWidth"].shift(11).sum() - + x["ResolutionWidth"].shift(12).sum() - + x["ResolutionWidth"].shift(13).sum() - + x["ResolutionWidth"].shift(14).sum() - + x["ResolutionWidth"].shift(15).sum() - + x["ResolutionWidth"].shift(16).sum() - + x["ResolutionWidth"].shift(17).sum() - + x["ResolutionWidth"].shift(18).sum() - + x["ResolutionWidth"].shift(19).sum() - + x["ResolutionWidth"].shift(20).sum() - + x["ResolutionWidth"].shift(21).sum() - + x["ResolutionWidth"].shift(22).sum() - + x["ResolutionWidth"].shift(23).sum() - + x["ResolutionWidth"].shift(24).sum() - + x["ResolutionWidth"].shift(25).sum() - + x["ResolutionWidth"].shift(26).sum() - + x["ResolutionWidth"].shift(27).sum() - + x["ResolutionWidth"].shift(28).sum() - + x["ResolutionWidth"].shift(29).sum() - + 
x["ResolutionWidth"].shift(30).sum() - + x["ResolutionWidth"].shift(31).sum() - + x["ResolutionWidth"].shift(32).sum() - + x["ResolutionWidth"].shift(33).sum() - + x["ResolutionWidth"].shift(34).sum() - + x["ResolutionWidth"].shift(35).sum() - + x["ResolutionWidth"].shift(36).sum() - + x["ResolutionWidth"].shift(37).sum() - + x["ResolutionWidth"].shift(38).sum() - + x["ResolutionWidth"].shift(39).sum() - + x["ResolutionWidth"].shift(40).sum() - + x["ResolutionWidth"].shift(41).sum() - + x["ResolutionWidth"].shift(42).sum() - + x["ResolutionWidth"].shift(43).sum() - + x["ResolutionWidth"].shift(44).sum() - + x["ResolutionWidth"].shift(45).sum() - + x["ResolutionWidth"].shift(46).sum() - + x["ResolutionWidth"].shift(47).sum() - + x["ResolutionWidth"].shift(48).sum() - + x["ResolutionWidth"].shift(49).sum() - + x["ResolutionWidth"].shift(50).sum() - + x["ResolutionWidth"].shift(51).sum() - + x["ResolutionWidth"].shift(52).sum() - + x["ResolutionWidth"].shift(53).sum() - + x["ResolutionWidth"].shift(54).sum() - + x["ResolutionWidth"].shift(55).sum() - + x["ResolutionWidth"].shift(56).sum() - + x["ResolutionWidth"].shift(57).sum() - + x["ResolutionWidth"].shift(58).sum() - + x["ResolutionWidth"].shift(59).sum() - + x["ResolutionWidth"].shift(60).sum() - + x["ResolutionWidth"].shift(61).sum() - + x["ResolutionWidth"].shift(62).sum() - + x["ResolutionWidth"].shift(63).sum() - + x["ResolutionWidth"].shift(64).sum() - + x["ResolutionWidth"].shift(65).sum() - + x["ResolutionWidth"].shift(66).sum() - + x["ResolutionWidth"].shift(67).sum() - + x["ResolutionWidth"].shift(68).sum() - + x["ResolutionWidth"].shift(69).sum() - + x["ResolutionWidth"].shift(70).sum() - + x["ResolutionWidth"].shift(71).sum() - + x["ResolutionWidth"].shift(72).sum() - + x["ResolutionWidth"].shift(73).sum() - + x["ResolutionWidth"].shift(74).sum() - + x["ResolutionWidth"].shift(75).sum() - + x["ResolutionWidth"].shift(76).sum() - + x["ResolutionWidth"].shift(77).sum() - + x["ResolutionWidth"].shift(78).sum() - + x["ResolutionWidth"].shift(79).sum() - + x["ResolutionWidth"].shift(80).sum() - + x["ResolutionWidth"].shift(81).sum() - + x["ResolutionWidth"].shift(82).sum() - + x["ResolutionWidth"].shift(83).sum() - + x["ResolutionWidth"].shift(84).sum() - + x["ResolutionWidth"].shift(85).sum() - + x["ResolutionWidth"].shift(86).sum() - + x["ResolutionWidth"].shift(87).sum() - + x["ResolutionWidth"].shift(88).sum() - + x["ResolutionWidth"].shift(89).sum(), - ), - ( - "Q30", + lambda x: sum(x["ResolutionWidth"].shift(i).sum() for i in range(90)), + ), + ( "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby(["SearchEngineID", "ClientIP"]) @@ -332,7 +212,6 @@ .nlargest(10, "c"), ), ( - "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x[x["SearchPhrase"] != ""] .groupby(["WatchID", "ClientIP"]) @@ -344,7 +223,6 @@ .nlargest(10, "c"), ), ( - "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.groupby(["WatchID", "ClientIP"]) .agg( @@ -355,17 +233,14 @@ .nlargest(10, "c"), ), ( - "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", lambda x: 
x.groupby("URL").size().nlargest(10).reset_index(name="c"), ), ( - "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", lambda x: x.groupby(["URL"]).size().nlargest(10).reset_index(name="c"), ), ( - "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", lambda x: x.assign( **{f"ClientIP_minus_{i}": x["ClientIP"] - i for i in range(1, 4)} @@ -378,7 +253,6 @@ .reset_index(name="c"), ), ( - "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x[ (x["CounterID"] == 62) @@ -393,7 +267,6 @@ .nlargest(10), ), ( - "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x[ (x["CounterID"] == 62) @@ -408,7 +281,6 @@ .nlargest(10), ), ( - "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x[ (x["CounterID"] == 62) @@ -425,7 +297,6 @@ .iloc[1000:1010], ), ( - "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x[ (x["CounterID"] == 62) @@ -440,7 +311,6 @@ .iloc[1000:1010], ), ( - "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", lambda x: x[ (x["CounterID"] == 62) @@ -457,7 +327,6 @@ .iloc[100:110], ), ( - "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: x[ (x["CounterID"] == 62) @@ -474,7 +343,6 @@ .iloc[10000:10010], ), ( - "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", lambda x: x[ (x["CounterID"] == 62) @@ -490,15 +358,51 @@ ), ] -for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") 
+def load(): + global hits + start = timeit.default_timer() + df = pd.read_parquet("hits.parquet") + df["EventTime"] = pd.to_datetime(df["EventTime"], unit="s") + df["EventDate"] = pd.to_datetime(df["EventDate"], unit="D") + for col in df.columns: + if df[col].dtype == "O": + df[col] = df[col].astype(str) + hits = df + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + if hits is None: + return {"bytes": 0} + return {"bytes": int(hits.memory_usage().sum())} + - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](hits) - end = timeit.default_timer() - times.append(round(end - start, 3)) - print(times) +if __name__ == "__main__": + port = int(os.environ.get("BENCH_PANDAS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/pandas/start b/pandas/start new file mode 100755 index 000000000..e3fab7273 --- /dev/null +++ b/pandas/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/pandas/stop b/pandas/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/pandas/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/paradedb-partitioned/benchmark.sh b/paradedb-partitioned/benchmark.sh index 85e7e94fb..6a7f45d3a 100755 --- a/paradedb-partitioned/benchmark.sh +++ b/paradedb-partitioned/benchmark.sh @@ -1,62 +1,5 @@ #!/bin/bash - -PARADEDB_VERSION=latest - -cleanup() { - echo "Done, goodbye!" -} - -trap cleanup EXIT - -echo "" -echo "Installing dependencies..." -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -echo "" -echo "Pulling ParadeDB image..." -sudo docker run \ - --name paradedb \ - -e POSTGRESQL_USERNAME=myuser \ - -e POSTGRESQL_PASSWORD=mypassword \ - -e POSTGRESQL_DATABASE=mydb \ - -e POSTGRES_PASSWORD=postgres \ - -p 5432:5432 \ - -d \ - paradedb/paradedb:$PARADEDB_VERSION - -echo "" -echo "Downloading ClickBench dataset..." -if [ ! -e /tmp/partitioned/ ]; then - ../download-hits-parquet-partitioned /tmp/partitioned -fi -if ! sudo docker exec paradedb sh -c '[ -f /tmp/partitioned ]'; then - sudo docker cp /tmp/partitioned paradedb:tmp -fi - -echo "" -echo "Creating database..." 
-export PGPASSWORD='postgres' -psql -h localhost -U postgres -p 5432 -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# load_time is zero, since the data is directly read from the Parquet file(s) -# Time: 0000000.000 ms (00:00.000) - -echo "" -echo "Running queries..." -./run.sh 2>&1 | tee log.txt - -# data_size is the Parquet file(s) total size -# 14779976446 - -echo "Data size: $(du -b /tmp/hits*.parquet)" -echo "Load time: 0" - -echo "" -echo "Parsing results..." -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/paradedb-partitioned/check b/paradedb-partitioned/check new file mode 100755 index 000000000..07fe18256 --- /dev/null +++ b/paradedb-partitioned/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD='postgres' +psql -h localhost -U postgres -p 5432 -t -c 'SELECT 1' >/dev/null diff --git a/paradedb-partitioned/data-size b/paradedb-partitioned/data-size new file mode 100755 index 000000000..e2e7183e5 --- /dev/null +++ b/paradedb-partitioned/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +sudo docker exec -i "$CONTAINER_NAME" sh -c "du -bcs /tmp/partitioned 2>/dev/null | tail -n1 | awk '{print \$1}'" diff --git a/paradedb-partitioned/install b/paradedb-partitioned/install new file mode 100755 index 000000000..d84232304 --- /dev/null +++ b/paradedb-partitioned/install @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +PARADEDB_VERSION=${PARADEDB_VERSION:-latest} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "paradedb/paradedb:$PARADEDB_VERSION" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -e POSTGRESQL_USERNAME=myuser \ + -e POSTGRESQL_PASSWORD=mypassword \ + -e POSTGRESQL_DATABASE=mydb \ + -e POSTGRES_PASSWORD=postgres \ + -p 5432:5432 \ + "paradedb/paradedb:$PARADEDB_VERSION" diff --git a/paradedb-partitioned/load b/paradedb-partitioned/load new file mode 100755 index 000000000..867f14642 --- /dev/null +++ b/paradedb-partitioned/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +export PGPASSWORD='postgres' + +# Move all hits_*.parquet files into the container at /tmp/partitioned/. 
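+# docker cp copies each partition into the container; the local copies are deleted afterwards,
+# so data-size reports only the in-container /tmp/partitioned total.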
+sudo docker exec -i "$CONTAINER_NAME" mkdir -p /tmp/partitioned +for f in hits_*.parquet; do + sudo docker cp "$f" "$CONTAINER_NAME":/tmp/partitioned/ +done + +psql -h localhost -U postgres -p 5432 -t -c "DROP FOREIGN TABLE IF EXISTS hits;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP SERVER IF EXISTS parquet_server CASCADE;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP FOREIGN DATA WRAPPER IF EXISTS parquet_wrapper CASCADE;" || true + +psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t < create.sql + +rm -f hits_*.parquet +sync diff --git a/paradedb-partitioned/query b/paradedb-partitioned/query new file mode 100755 index 000000000..ba9577725 --- /dev/null +++ b/paradedb-partitioned/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the default DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD='postgres' +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h localhost -U postgres -p 5432 -t 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/paradedb-partitioned/run.sh b/paradedb-partitioned/run.sh deleted file mode 100755 index 5f276eb74..000000000 --- a/paradedb-partitioned/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 -export PGPASSWORD='postgres' - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -h localhost -U postgres -p 5432 -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/paradedb-partitioned/start b/paradedb-partitioned/start new file mode 100755 index 000000000..5db7dd44c --- /dev/null +++ b/paradedb-partitioned/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/paradedb-partitioned/stop b/paradedb-partitioned/stop new file mode 100755 index 000000000..209823b90 --- /dev/null +++ b/paradedb-partitioned/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/paradedb/benchmark.sh b/paradedb/benchmark.sh index 0119121d0..b85187617 100755 --- a/paradedb/benchmark.sh +++ b/paradedb/benchmark.sh @@ -1,50 +1,5 @@ #!/bin/bash - -PARADEDB_VERSION=0.10.0 - -echo "Installing dependencies..." -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -echo "Pulling ParadeDB image..." -sudo docker run \ - --name paradedb \ - -e POSTGRESQL_USERNAME=myuser \ - -e POSTGRESQL_PASSWORD=mypassword \ - -e POSTGRESQL_DATABASE=mydb \ - -e POSTGRES_PASSWORD=postgres \ - -p 5432:5432 \ - -d \ - paradedb/paradedb:$PARADEDB_VERSION - -echo "Downloading ClickBench dataset..." -if [ ! -e /tmp/hits.parquet ]; then - ../download-hits-parquet-single /tmp -fi -if ! 
sudo docker exec paradedb sh -c '[ -f /tmp/hits.parquet ]'; then - sudo docker cp /tmp/hits.parquet paradedb:/tmp/hits.parquet -fi - -echo "Creating database..." -export PGPASSWORD='postgres' -psql -h localhost -U postgres -p 5432 -t < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# load_time is zero, since the data is directly read from the Parquet file(s) -# Time: 0000000.000 ms (00:00.000) -echo "Load time: 0" - -echo "Running queries..." -./run.sh 2>&1 | tee log.txt - -# data_size is the Parquet file(s) total size -# 14779976446 - -echo "Data size: $(du -b /tmp/hits.parquet)" - -echo "Parsing results..." -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/paradedb/check b/paradedb/check new file mode 100755 index 000000000..07fe18256 --- /dev/null +++ b/paradedb/check @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +export PGPASSWORD='postgres' +psql -h localhost -U postgres -p 5432 -t -c 'SELECT 1' >/dev/null diff --git a/paradedb/data-size b/paradedb/data-size new file mode 100755 index 000000000..e579255a7 --- /dev/null +++ b/paradedb/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +# Data is the parquet file inside the container. +sudo docker exec -i "$CONTAINER_NAME" sh -c "du -bcs /tmp/hits.parquet 2>/dev/null | tail -n1 | awk '{print \$1}'" diff --git a/paradedb/install b/paradedb/install new file mode 100755 index 000000000..529b8b1d9 --- /dev/null +++ b/paradedb/install @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +PARADEDB_VERSION=${PARADEDB_VERSION:-0.10.0} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "paradedb/paradedb:$PARADEDB_VERSION" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -e POSTGRESQL_USERNAME=myuser \ + -e POSTGRESQL_PASSWORD=mypassword \ + -e POSTGRESQL_DATABASE=mydb \ + -e POSTGRES_PASSWORD=postgres \ + -p 5432:5432 \ + "paradedb/paradedb:$PARADEDB_VERSION" diff --git a/paradedb/load b/paradedb/load new file mode 100755 index 000000000..c5ede7e90 --- /dev/null +++ b/paradedb/load @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +export PGPASSWORD='postgres' + +# Move the downloaded parquet into the container so create.sql's +# OPTIONS (files '/tmp/hits.parquet') resolves. +sudo docker cp hits.parquet "$CONTAINER_NAME":/tmp/hits.parquet + +# Drop+recreate the foreign table so this is idempotent across re-runs. 
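+# The DROPs tolerate a first run where none of these objects exist yet (hence || true);
+# create.sql then recreates the wrapper, server, and foreign table, and ON_ERROR_STOP=1 makes a broken definition fail the load.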
+psql -h localhost -U postgres -p 5432 -t -c "DROP FOREIGN TABLE IF EXISTS hits;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP SERVER IF EXISTS parquet_server CASCADE;" || true +psql -h localhost -U postgres -p 5432 -t -c "DROP FOREIGN DATA WRAPPER IF EXISTS parquet_wrapper CASCADE;" || true + +psql -h localhost -U postgres -p 5432 -v ON_ERROR_STOP=1 -t < create.sql + +rm -f hits.parquet +sync diff --git a/paradedb/query b/paradedb/query new file mode 100755 index 000000000..ba9577725 --- /dev/null +++ b/paradedb/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the default DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +export PGPASSWORD='postgres' +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h localhost -U postgres -p 5432 -t 2>&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/paradedb/run.sh b/paradedb/run.sh deleted file mode 100755 index 5f276eb74..000000000 --- a/paradedb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 -export PGPASSWORD='postgres' - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - psql -h localhost -U postgres -p 5432 -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/paradedb/start b/paradedb/start new file mode 100755 index 000000000..5db7dd44c --- /dev/null +++ b/paradedb/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/paradedb/stop b/paradedb/stop new file mode 100755 index 000000000..209823b90 --- /dev/null +++ b/paradedb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-paradedb} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/parseable/benchmark.sh b/parseable/benchmark.sh index aee27a28d..bbfe34d76 100755 --- a/parseable/benchmark.sh +++ b/parseable/benchmark.sh @@ -1,53 +1,6 @@ -# Install Dependencies -sudo apt-get update -y -sudo apt-get install -y parallel -# Determine instance type based on CPU and memory -CPU_COUNT=$(nproc) - -if [ $CPU_COUNT -ge 190 ]; then - export P_EXECUTION_BATCH_SIZE=75000 - echo "Setting P_EXECUTION_BATCH_SIZE=75000 (detected c6a.metal equivalent)" -elif [ $CPU_COUNT -ge 15 ]; then - export P_EXECUTION_BATCH_SIZE=40000 - echo "Setting P_EXECUTION_BATCH_SIZE=40000 (detected c6a.4xlarge equivalent)" -else - # Default for other configurations - export P_EXECUTION_BATCH_SIZE=1000000 - echo "Using default P_EXECUTION_BATCH_SIZE=1000000 for default configuration" -fi - -# Download Parseable v2.5.12 binary -wget --continue --progress=dot:giga https://github.com/parseablehq/parseable/releases/download/v2.5.12/Parseable_OSS_x86_64-unknown-linux-gnu -mv Parseable_OSS_x86_64-unknown-linux-gnu parseable -chmod +x parseable - -# Run Parseable -export RUST_LOG=warn - -./parseable local-store > 
parseable.log 2>&1 & PARSEABLE_PID=$! -# Verify Parseable is running -if ps -p $PARSEABLE_PID > /dev/null; then - echo "Parseable is running with PID: $PARSEABLE_PID" -else - echo "Error: Parseable failed to start. Check parseable.log for details." - exit 1 -fi - -chmod +x ingestion.sh -chmod +x run_query.sh - -#run ingestion script -echo -n "Load time: " -command time -f '%e' ./ingestion.sh - -#run query script -./run_query.sh - -#view results -cat result.csv | sed -r -e 's/^([0-9\.]+) ([0-9\.]+) ([0-9\.]+)$/[\1, \2, \3]/' - -echo -n "Data size: " -du -bcs local-store | grep total - -#kill parseable -kill $PARSEABLE_PID +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# parseable ingests gzipped NDJSON; ./load fetches it directly. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/parseable/check b/parseable/check new file mode 100755 index 000000000..8de868e04 --- /dev/null +++ b/parseable/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf -u admin:admin 'http://localhost:8000/api/v1/about' >/dev/null diff --git a/parseable/data-size b/parseable/data-size new file mode 100755 index 000000000..559b25d67 --- /dev/null +++ b/parseable/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs local-store | grep total | awk '{print $1}' diff --git a/parseable/ingestion.sh b/parseable/ingestion.sh deleted file mode 100755 index ca782477f..000000000 --- a/parseable/ingestion.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -echo "Installing Parallel pigz pv..." -sudo apt-get update -y && sudo apt-get install -y parallel pigz pv - -# Set number of cores for parallel processing -NUM_CORES=$(nproc) - -echo "Downloading dataset..." -wget --progress=bar:force --show-progress https://datasets.clickhouse.com/hits_compatible/hits.json.gz - -echo "Decompressing dataset..." -# Get file size for progress reporting -FILE_SIZE=$(stat -c %s hits.json.gz) -pv -s $FILE_SIZE hits.json.gz | pigz -d > hits.json - -# Split file into chunks of 2500 lines and process them -echo "Splitting file and processing chunks in parallel..." - -# Create partitioned directory if it doesn't exist -mkdir -p partitioned - -# Define processing function that will be applied immediately after splitting -split_and_process() { - local chunk_num=$1 - local content=$(cat) - local output_file="./partitioned/hits_${chunk_num}.json" - - # Format with brackets and commas in one step - ( - echo "[" - echo "$content" | sed '$!s/$/,/' - echo "]" - ) > "$output_file" -} -export -f split_and_process - -LINES_PER_CHUNK=2500 - -pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \ - --jobs $NUM_CORES split_and_process {#} - -echo "Split and process complete" - -# Remove original file -rm hits.json - -# Create stream -echo "Creating stream..." -SCHEMA_FILE="static_schema.json" -curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \ - -H 'X-P-Static-Schema-Flag: true' \ - -H 'Content-Type: application/json' \ - -u "admin:admin" \ - --data-binary @"${SCHEMA_FILE}" - -# Ingest files in parallel with progress monitoring -echo "Ingesting files..." - -INGEST_JOBS=6 -start_time=$(date +%s) -find . 
-name "hits_*" -type f | parallel --progress --jobs $INGEST_JOBS \ - 'curl --silent -H "Content-Type: application/json" -H "X-P-Stream: hits" -k -XPOST -u "admin:admin" "http://localhost:8000/api/v1/ingest" --data-binary @"{}"' - -#sleep for 3 minutes to allow sync to complete -sleep 180 - -end_time=$(date +%s) -total_time=$((end_time - start_time)) - -echo "Total load (ingestion) time: ${total_time} seconds" diff --git a/parseable/install b/parseable/install new file mode 100755 index 000000000..9fcb8ffa1 --- /dev/null +++ b/parseable/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y parallel pigz pv + +if [ ! -x ./parseable ]; then + wget --continue --progress=dot:giga \ + https://github.com/parseablehq/parseable/releases/download/v2.5.12/Parseable_OSS_x86_64-unknown-linux-gnu + mv Parseable_OSS_x86_64-unknown-linux-gnu parseable + chmod +x parseable +fi diff --git a/parseable/load b/parseable/load new file mode 100755 index 000000000..3f7415094 --- /dev/null +++ b/parseable/load @@ -0,0 +1,51 @@ +#!/bin/bash +set -eu + +NUM_CORES=$(nproc) + +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' + +# Decompress with progress. +FILE_SIZE=$(stat -c %s hits.json.gz) +pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json + +# Split into chunks wrapped in [ ... , ... ] arrays for parseable's ingest API. +mkdir -p partitioned +rm -f partitioned/hits_*.json + +split_and_process() { + local chunk_num=$1 + local content + content=$(cat) + { + echo "[" + echo "$content" | sed '$!s/$/,/' + echo "]" + } > "./partitioned/hits_${chunk_num}.json" +} +export -f split_and_process + +LINES_PER_CHUNK=2500 +pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \ + --jobs "$NUM_CORES" split_and_process {#} + +rm -f hits.json hits.json.gz + +# Create the stream. +curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \ + -H 'X-P-Static-Schema-Flag: true' \ + -H 'Content-Type: application/json' \ + -u "admin:admin" \ + --data-binary @static_schema.json >/dev/null + +# Parallel ingest of chunks. +INGEST_JOBS=6 +find partitioned -name "hits_*" -type f | parallel --jobs $INGEST_JOBS \ + 'curl --silent -H "Content-Type: application/json" -H "X-P-Stream: hits" -k -XPOST -u "admin:admin" "http://localhost:8000/api/v1/ingest" --data-binary @"{}"' + +# Allow sync to complete. +sleep 180 + +rm -rf partitioned +sync diff --git a/parseable/query b/parseable/query new file mode 100755 index 000000000..7603f63f0 --- /dev/null +++ b/parseable/query @@ -0,0 +1,31 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via parseable's HTTP /api/v1/query. +# Stdout: query result (JSON). +# Stderr: query runtime in fractional seconds on the last line (wall-clock). +# Exit non-zero on error. +set -e + +query=$(cat) + +CURRENT_DATE=$(date +%Y-%m-%d) +START_TIME="${CURRENT_DATE}T00:00:00.000Z" +END_TIME="${CURRENT_DATE}T23:59:00.000Z" + +# JSON-escape quotes inside the query. +escaped=$(printf '%s' "$query" | sed 's/"/\\"/g') +JSON=$(printf '{"query":"%s","startTime":"%s","endTime":"%s"}' "$escaped" "$START_TIME" "$END_TIME") + +t1=$(date +%s.%N) +out=$(curl -sS -H "Content-Type: application/json" -k -XPOST \ + -u "admin:admin" 'http://localhost:8000/api/v1/query' \ + --data "$JSON") && exit_code=0 || exit_code=$? 
+t2=$(date +%s.%N) + +if [ "$exit_code" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$exit_code" +fi + +printf '%s\n' "$out" + +awk -v a="$t1" -v b="$t2" 'BEGIN { printf "%.6f\n", b - a }' >&2 diff --git a/parseable/run_query.sh b/parseable/run_query.sh deleted file mode 100755 index 3e7c162ff..000000000 --- a/parseable/run_query.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -echo "Running queries..." -TRIES=3 -QUERY_NUM=1 -rm -f result.csv - -# Get current date in YYYY-MM-DD format -CURRENT_DATE=$(date +%Y-%m-%d) -START_TIME="${CURRENT_DATE}T00:00:00.000Z" -END_TIME="${CURRENT_DATE}T23:59:00.000Z" - -cat 'queries.sql' | while read -r QUERY; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo "$QUERY" > /tmp/query.sql - echo "Query $QUERY_NUM: $QUERY" - QUERY=$(echo "$QUERY" | sed 's/"/\\"/g') - # Create array to store results for this query - RESULTS=() - - for i in $(seq 1 $TRIES); do - echo "Iteration $i:" -JSON=$(printf '{"query":"%s","startTime":"%s","endTime":"%s"}' "$QUERY" "$START_TIME" "$END_TIME") - - start_time=$(date +%s.%N) - - # Execute the query and print the response to terminal - curl -s -H "Content-Type: application/json" -k -XPOST -u "admin:admin" "http://localhost:8000/api/v1/query" --data "${JSON}" > /dev/null - end_time=$(date +%s.%N) - - # Calculate elapsed time in seconds with millisecond precision - elapsed_time=$(echo "$end_time - $start_time" | bc) - # Convert to desired format - RES=$(printf "%.9f" $elapsed_time) - - # Store result in array - RESULTS+=("$RES") - - echo "Time: $RES seconds" - echo "----------------------------------------" - done - - # Output results to CSV with tab separation - echo -e "${RESULTS[0]} ${RESULTS[1]} ${RESULTS[2]}" >> result.csv - - echo "Query $QUERY_NUM completed. [${RESULTS[0]}, ${RESULTS[1]}, ${RESULTS[2]}]" - echo "========================================" - QUERY_NUM=$((QUERY_NUM + 1)) -done - -echo "Benchmark completed. Results saved to result.csv" \ No newline at end of file diff --git a/parseable/start b/parseable/start new file mode 100755 index 000000000..d7badddb6 --- /dev/null +++ b/parseable/start @@ -0,0 +1,21 @@ +#!/bin/bash +set -eu + +# Idempotent: if already up, do nothing. +if curl -sSf -u admin:admin 'http://localhost:8000/api/v1/about' >/dev/null 2>&1; then + exit 0 +fi + +# Tune batch size by hardware (matches original). 
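+# Thresholds carried over from the original script: >=190 cores was treated as
+# a c6a.metal-class machine, >=15 as c6a.4xlarge, and anything smaller keeps
+# the default batch size.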
+CPU_COUNT=$(nproc) +if [ "$CPU_COUNT" -ge 190 ]; then + export P_EXECUTION_BATCH_SIZE=75000 +elif [ "$CPU_COUNT" -ge 15 ]; then + export P_EXECUTION_BATCH_SIZE=40000 +else + export P_EXECUTION_BATCH_SIZE=1000000 +fi +export RUST_LOG=warn + +nohup ./parseable local-store > parseable.log 2>&1 & +disown diff --git a/parseable/stop b/parseable/stop new file mode 100755 index 000000000..004cb12f0 --- /dev/null +++ b/parseable/stop @@ -0,0 +1,11 @@ +#!/bin/bash + +pid=$(pidof parseable 2>/dev/null || true) +if [ -n "$pid" ]; then + kill $pid 2>/dev/null || true + for _ in $(seq 1 30); do + pidof parseable >/dev/null 2>&1 || exit 0 + sleep 1 + done + sudo killall -9 parseable 2>/dev/null || true +fi diff --git a/pg_clickhouse/benchmark.sh b/pg_clickhouse/benchmark.sh index 249734de8..6a7f45d3a 100755 --- a/pg_clickhouse/benchmark.sh +++ b/pg_clickhouse/benchmark.sh @@ -1,18 +1,5 @@ #!/bin/bash - -# apt-get update -y -# env DEBIAN_FRONTEND=noninteractive apt-get install -y wget curl sudo -# env TOTAL_PARTITIONS=1 EXPLAIN=1 ./benchmark.sh - -# Install and start ClickHouse and Postgres -./clickhouse.sh "$@" -./postgres.sh - -# Run the queries -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" - -cat log.txt | grep -oP '^Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_clickhouse/check b/pg_clickhouse/check new file mode 100755 index 000000000..9988f4f58 --- /dev/null +++ b/pg_clickhouse/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Both backends must respond. +clickhouse-client --query "SELECT 1" >/dev/null +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/pg_clickhouse/clickhouse.sh b/pg_clickhouse/clickhouse.sh deleted file mode 100755 index 1e8ae363c..000000000 --- a/pg_clickhouse/clickhouse.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Install - -if [ ! -x /usr/bin/clickhouse ] -then - cd /tmp || exit - curl https://clickhouse.com/ | sh - sudo ./clickhouse install --noninteractive - rm clickhouse - cd - || exit -fi - -# Optional: if you want to use higher compression: -if (( 0 )); then - echo " -compression: - case: - method: zstd - " | sudo tee /etc/clickhouse-server/config.d/compression.yaml -fi; - -sudo clickhouse start - -for _ in {1..300} -do - clickhouse-client --query "SELECT 1" && break - sleep 1 -done - -# Determine which set of files to use depending on the type of run -if [ "$1" != "" ] && [ "$1" != "tuned" ] && [ "$1" != "tuned-memory" ]; then - echo "Error: command line argument must be one of {'', 'tuned', 'tuned-memory'}" - exit 1 -elif [ ! 
-z "$1" ]; then - SUFFIX="-$1" -fi - -# Load the data - -clickhouse-client < create"$SUFFIX".sql - -TOTAL_PARTITIONS=${TOTAL_PARTITIONS:-100} - -seq 0 "$((TOTAL_PARTITIONS-1))" | xargs -P100 -I{} bash -c 'wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' -mkdir -p /var/lib/clickhouse/user_files -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet - -sync - -start=$(date +%s.%N) - -clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) -sync - -end=$(date +%s.%N) -elapsed=$(echo "$end - $start" | bc) - -echo "Load time: $elapsed s" diff --git a/pg_clickhouse/data-size b/pg_clickhouse/data-size new file mode 100755 index 000000000..1ac73ce98 --- /dev/null +++ b/pg_clickhouse/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# Data is in ClickHouse, not Postgres. Report the hits table on-disk size. +clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'" diff --git a/pg_clickhouse/install b/pg_clickhouse/install new file mode 100755 index 000000000..a5d67fcd7 --- /dev/null +++ b/pg_clickhouse/install @@ -0,0 +1,55 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +export DEBIAN_FRONTEND=noninteractive + +# --- ClickHouse --- +if [ ! -x /usr/bin/clickhouse ]; then + cd /tmp + curl https://clickhouse.com/ | sh + sudo ./clickhouse install --noninteractive + rm -f clickhouse + cd - +fi + +# --- PostgreSQL + pg_clickhouse --- +sudo apt-get update -y +sudo apt-get install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + +sudo apt-get update -y +sudo apt-get install -y \ + postgresql-$PGVERSION \ + postgresql-server-dev-$PGVERSION \ + libcurl4-openssl-dev \ + uuid-dev \ + libssl-dev \ + make \ + cmake \ + g++ \ + pgxnclient + +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(($threads / 2)) +shared_buffers=$(($memory / 4)) +effective_cache_size=$(($memory - ($memory / 4))) +max_worker_processes=$(($threads + 15)) + +sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf < /dev/null 2>&1 ; then - sudo systemctl restart "postgresql@$PGVERSION-main" -else - sudo /etc/init.d/postgresql start -fi - -sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y \ - libcurl4-openssl-dev \ - uuid-dev \ - libssl-dev \ - make \ - cmake \ - g++ \ - pgxnclient - -# Setup the database. -pgxn install pg_clickhouse - -sudo -u postgres psql -t -c 'CREATE DATABASE test' -sudo -u postgres psql test -f create-postgres.sql 2>&1 | tee pg_load_out.txt -if grep 'ERROR' pg_load_out.txt -then - exit 1 -fi diff --git a/pg_clickhouse/query b/pg_clickhouse/query new file mode 100755 index 000000000..5bbe3cff5 --- /dev/null +++ b/pg_clickhouse/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `test` DB +# (which proxies to ClickHouse via the pg_clickhouse FDW). +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | sudo -u postgres psql --no-psqlrc --tuples-only test 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP '^Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pg_clickhouse/run.sh b/pg_clickhouse/run.sh deleted file mode 100755 index 93f6fc8d1..000000000 --- a/pg_clickhouse/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -TRIES=3 -prefix="" -if [ -n "$EXPLAIN" ]; then - prefix="EXPLAIN (ANALYZE, VERBOSE) " -fi - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - ( - echo '\timing' - yes "$prefix$query" | head -n $TRIES - ) | sudo -u postgres psql -e --no-psqlrc --tuples-only test 2>&1 # | grep -P 'Time|psql: error' -done diff --git a/pg_clickhouse/start b/pg_clickhouse/start new file mode 100755 index 000000000..7a21cd575 --- /dev/null +++ b/pg_clickhouse/start @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +# Start ClickHouse server. +sudo clickhouse start || true + +# Start PostgreSQL. +sudo systemctl start postgresql@$PGVERSION-main diff --git a/pg_clickhouse/stop b/pg_clickhouse/stop new file mode 100755 index 000000000..6bc1a37db --- /dev/null +++ b/pg_clickhouse/stop @@ -0,0 +1,5 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true +sudo clickhouse stop || true diff --git a/pg_ducklake/benchmark.sh b/pg_ducklake/benchmark.sh index cd3d5855e..b85187617 100755 --- a/pg_ducklake/benchmark.sh +++ b/pg_ducklake/benchmark.sh @@ -1,22 +1,5 @@ #!/bin/bash - -set -e - -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client - -../download-hits-parquet-single -docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb -v ./hits.parquet:/tmp/hits.parquet pgducklake/pgducklake:18-main - -sleep 5 # wait for pgducklake start up - -echo -n "Load time: " -command time -f '%e' psql postgres://postgres:duckdb@localhost:5432/postgres -f create.sql 2>&1 - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -docker exec -i pgduck du -bcs /var/lib/postgresql/ | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
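+# Assumed semantics: BENCH_DOWNLOAD_SCRIPT names the dataset-fetch helper in
+# the repository root that the shared harness runs before ./load, and
+# BENCH_RESTARTABLE=yes means the container may be stopped/started between runs.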
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_ducklake/check b/pg_ducklake/check new file mode 100755 index 000000000..dfe3a7c57 --- /dev/null +++ b/pg_ducklake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql postgres://postgres:duckdb@localhost:5432/postgres -t -c 'SELECT 1' >/dev/null diff --git a/pg_ducklake/data-size b/pg_ducklake/data-size new file mode 100755 index 000000000..e15dcd55c --- /dev/null +++ b/pg_ducklake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/ | grep total | awk '{print $1}' diff --git a/pg_ducklake/install b/pg_ducklake/install new file mode 100755 index 000000000..283878276 --- /dev/null +++ b/pg_ducklake/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} +PGDUCK_IMAGE=${PGDUCK_IMAGE:-pgducklake/pgducklake:18-main} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "$PGDUCK_IMAGE" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=duckdb \ + "$PGDUCK_IMAGE" diff --git a/pg_ducklake/load b/pg_ducklake/load new file mode 100755 index 000000000..5636d2eab --- /dev/null +++ b/pg_ducklake/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} + +# Move parquet file into the container at /tmp/hits.parquet (path used by create.sql). +sudo docker cp hits.parquet "$CONTAINER_NAME":/tmp/hits.parquet + +psql postgres://postgres:duckdb@localhost:5432/postgres -v ON_ERROR_STOP=1 -t -c "DROP TABLE IF EXISTS hits;" || true +psql postgres://postgres:duckdb@localhost:5432/postgres -v ON_ERROR_STOP=1 -f create.sql + +rm -f hits.parquet +sync diff --git a/pg_ducklake/query b/pg_ducklake/query new file mode 100755 index 000000000..54d362e65 --- /dev/null +++ b/pg_ducklake/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql --no-psqlrc --tuples-only postgres://postgres:duckdb@localhost:5432/postgres 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pg_ducklake/run.sh b/pg_ducklake/run.sh deleted file mode 100755 index 7451228b9..000000000 --- a/pg_ducklake/run.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart pgduck - sleep 5 # wait for restart - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql --no-psqlrc --tuples-only postgres://postgres:duckdb@localhost:5432/postgres 2>&1 -done diff --git a/pg_ducklake/start b/pg_ducklake/start new file mode 100755 index 000000000..5ab27c287 --- /dev/null +++ b/pg_ducklake/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/pg_ducklake/stop b/pg_ducklake/stop new file mode 100755 index 000000000..838edfdbe --- /dev/null +++ b/pg_ducklake/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-pgduck} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/pg_mooncake/benchmark.sh b/pg_mooncake/benchmark.sh index a49b373a6..b85187617 100755 --- a/pg_mooncake/benchmark.sh +++ b/pg_mooncake/benchmark.sh @@ -1,33 +1,5 @@ #!/bin/bash - - -#install docker if needed. - -sudo apt-get update -y -sudo apt-get install -y docker.io -sudo usermod -aG docker $USER -newgrp docker - -sudo apt-get install -y postgresql-client - -../download-hits-parquet-single -docker run -d --name pg_mooncake -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust -v ./hits.parquet:/tmp/hits.parquet mooncakelabs/pg_mooncake:17-v0.1.0 - -sleep 5 -echo -n "Load time: " -command time -f '%e' psql postgres://postgres:pg_mooncake@localhost:5432/postgres -q -t -f create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# COPY 99997497 -# Time: 576219.151 ms (09:36.219) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -docker exec -i pg_mooncake du -bcs /var/lib/postgresql/data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pg_mooncake/check b/pg_mooncake/check new file mode 100755 index 000000000..4d9368d26 --- /dev/null +++ b/pg_mooncake/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql postgres://postgres@localhost:5432/postgres -t -c 'SELECT 1' >/dev/null diff --git a/pg_mooncake/data-size b/pg_mooncake/data-size new file mode 100755 index 000000000..c0e4b3d68 --- /dev/null +++ b/pg_mooncake/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data | grep total | awk '{print $1}' diff --git a/pg_mooncake/install b/pg_mooncake/install new file mode 100755 index 000000000..992c34eb0 --- /dev/null +++ b/pg_mooncake/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} +PG_MOONCAKE_IMAGE=${PG_MOONCAKE_IMAGE:-mooncakelabs/pg_mooncake:17-v0.1.0} + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "$PG_MOONCAKE_IMAGE" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + -p 5432:5432 \ + -e POSTGRES_HOST_AUTH_METHOD=trust \ + "$PG_MOONCAKE_IMAGE" diff --git a/pg_mooncake/load b/pg_mooncake/load new file mode 100755 index 000000000..5026725e2 --- /dev/null +++ b/pg_mooncake/load @@ -0,0 +1,13 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} + +# Move parquet file into the container at /tmp/hits.parquet (path used by create.sql). +sudo docker cp hits.parquet "$CONTAINER_NAME":/tmp/hits.parquet + +psql postgres://postgres@localhost:5432/postgres -v ON_ERROR_STOP=1 -t -c "DROP TABLE IF EXISTS hits;" || true +psql postgres://postgres@localhost:5432/postgres -v ON_ERROR_STOP=1 -q -t -f create.sql + +rm -f hits.parquet +sync diff --git a/pg_mooncake/query b/pg_mooncake/query new file mode 100755 index 000000000..1b5078ac0 --- /dev/null +++ b/pg_mooncake/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql postgres://postgres@localhost:5432/postgres 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pg_mooncake/run.sh b/pg_mooncake/run.sh deleted file mode 100755 index 6d6c25192..000000000 --- a/pg_mooncake/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 -CONNECTION=postgres://postgres:pg_mooncake@localhost:5432/postgres - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches 1>/dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql $CONNECTION 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done \ No newline at end of file diff --git a/pg_mooncake/start b/pg_mooncake/start new file mode 100755 index 000000000..6109c9658 --- /dev/null +++ b/pg_mooncake/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/pg_mooncake/stop b/pg_mooncake/stop new file mode 100755 index 000000000..ac0c76862 --- /dev/null +++ b/pg_mooncake/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-pg_mooncake} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/pgpro_tam/benchmark.sh b/pgpro_tam/benchmark.sh index 07fa462f7..531bd6503 100755 --- a/pgpro_tam/benchmark.sh +++ b/pgpro_tam/benchmark.sh @@ -1,71 +1,5 @@ #!/bin/bash - -#Usage: -#./benchmark.sh parquet_fd -#./benchmark.sh parquet_mem_fd -#./benchmark.sh parquet_fd_parall -#./benchmark.sh feather_mem_fd - -#install docker -sudo apt-get update -y -sudo apt-get install -y ca-certificates curl -sudo install -m 0755 -d /etc/apt/keyrings -sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc -sudo chmod a+r /etc/apt/keyrings/docker.asc -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null -sudo apt-get update -y -sudo apt-get install -y docker.io - -#install postgres client; postgres server is inside docker container -sudo apt-get install -y postgresql-client - -#calculate target shm size (in mb) as a half of available memory and run postgres container -MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') -SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) -sudo docker run --shm-size="$SHM_SIZE"m -p5432:5432 --name pgpro_tam -e POSTGRES_HOST_AUTH_METHOD=trust -d innerlife/pgpro_tam:0.0.1 - -#wait for postgres startup and create extension -sleep 10 -psql -h 127.0.0.1 -U postgres -t -c "create extension ppg_tam" - -#create table -if [ "$1" != "parquet_fd" ] && [ "$1" != "parquet_mem_fd" ] && [ "$1" != "parquet_fd_parall" ] && \ - [ "$1" != "feather_mem_fd" ] && [ "$1" != "" ]; then - echo "Error: command line argument must be one of {'parquet_fd', 'parquet_mem_fd', 'parquet_fd_parall', 'feather_mem_fd'}" - exit 1 -fi -if [ ! 
-z "$1" ]; then - CREATE_FILE="$1" -else - CREATE_FILE="parquet_fd" -fi -psql -h 127.0.0.1 -U postgres -t < create/"$CREATE_FILE".sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -#get and unpack hits.tsv -sudo docker exec pgpro_tam bash -c "cd /tmp && wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' && gzip -d -f hits.tsv.gz" - -#insert data to table -if [ "$1" == "parquet_fd_parall" ] ; then - #insert data in parallel; not ordered insert is much faster, but breaks query performance - sudo docker exec pgpro_tam bash -c "time cat /tmp/hits.tsv | parallel -l 2000000 -j 50 -N1 -k --spreadstdin 'psql -U postgres -t -c \"copy hits FROM STDIN\"'" -else - echo -n "Load time: " - command time -f '%e' psql -h 127.0.0.1 -U postgres -t -c "COPY hits FROM '/tmp/hits.tsv'" -fi - -#run benchmark -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec pgpro_tam du -bcs /var/lib/postgresql/data/base | grep total - -#parse logfile for query execution time -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pgpro_tam/check b/pgpro_tam/check new file mode 100755 index 000000000..2a82d4699 --- /dev/null +++ b/pgpro_tam/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql -h 127.0.0.1 -U postgres -t -c 'SELECT 1' >/dev/null diff --git a/pgpro_tam/data-size b/pgpro_tam/data-size new file mode 100755 index 000000000..138af564f --- /dev/null +++ b/pgpro_tam/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data/base | grep total | awk '{print $1}' diff --git a/pgpro_tam/install b/pgpro_tam/install new file mode 100755 index 000000000..1f647b687 --- /dev/null +++ b/pgpro_tam/install @@ -0,0 +1,34 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +PGPRO_TAM_IMAGE=${PGPRO_TAM_IMAGE:-innerlife/pgpro_tam:0.0.1} + +# Install Docker (official repo) + postgres client. +sudo apt-get update -y +sudo apt-get install -y ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. 
/etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client + +sudo docker pull "$PGPRO_TAM_IMAGE" + +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') +SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) + +sudo docker run -d \ + --name "$CONTAINER_NAME" \ + --shm-size="${SHM_SIZE}m" \ + -p 5432:5432 \ + -e POSTGRES_HOST_AUTH_METHOD=trust \ + "$PGPRO_TAM_IMAGE" diff --git a/pgpro_tam/load b/pgpro_tam/load new file mode 100755 index 000000000..89d255ff7 --- /dev/null +++ b/pgpro_tam/load @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +# Variant of create.sql to use; see create/*.sql. +CREATE_FILE=${PGPRO_TAM_VARIANT:-parquet_fd} + +# Move hits.tsv into the container at /tmp/hits.tsv (the path used by the +# original benchmark.sh's COPY command). +sudo docker cp hits.tsv "$CONTAINER_NAME":/tmp/hits.tsv + +# Ensure the table-access-method extension is loaded. +psql -h 127.0.0.1 -U postgres -t -c "CREATE EXTENSION IF NOT EXISTS ppg_tam" + +# Drop+create table per the chosen variant. +psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t < "create/${CREATE_FILE}.sql" + +if [ "$CREATE_FILE" = "parquet_fd_parall" ]; then + sudo docker exec "$CONTAINER_NAME" bash -c \ + "cat /tmp/hits.tsv | parallel -l 2000000 -j 50 -N1 -k --spreadstdin 'psql -U postgres -t -c \"copy hits FROM STDIN\"'" +else + psql -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t -c "COPY hits FROM '/tmp/hits.tsv'" +fi + +# Cleanup source data both inside the container and outside. +sudo docker exec "$CONTAINER_NAME" rm -f /tmp/hits.tsv || true +rm -f hits.tsv +sync diff --git a/pgpro_tam/query b/pgpro_tam/query new file mode 100755 index 000000000..ceb833618 --- /dev/null +++ b/pgpro_tam/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the postgres DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | psql -h 127.0.0.1 -U postgres -t 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/pgpro_tam/run.sh b/pgpro_tam/run.sh deleted file mode 100755 index 6104a64d1..000000000 --- a/pgpro_tam/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql -h 127.0.0.1 -U postgres -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done; diff --git a/pgpro_tam/start b/pgpro_tam/start new file mode 100755 index 000000000..b71b022dc --- /dev/null +++ b/pgpro_tam/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} + +if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/pgpro_tam/stop b/pgpro_tam/stop new file mode 100755 index 000000000..fa6533efb --- /dev/null +++ b/pgpro_tam/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-pgpro_tam} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/pinot/benchmark.sh b/pinot/benchmark.sh index 525dd572d..531bd6503 100755 --- a/pinot/benchmark.sh +++ b/pinot/benchmark.sh @@ -1,47 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y openjdk-11-jdk jq -sudo update-alternatives --config java - -# Install - -PINOT_VERSION=1.3.0 - -wget --continue --progress=dot:giga https://downloads.apache.org/pinot/apache-pinot-$PINOT_VERSION/apache-pinot-$PINOT_VERSION-bin.tar.gz -tar -zxvf apache-pinot-$PINOT_VERSION-bin.tar.gz - -./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh QuickStart -type batch & -sleep 30 -./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh AddTable -tableConfigFile offline_table.json -schemaFile schema.json -exec - -# Load the data - -../download-hits-tsv - -# Pinot was unable to load data as a single file wihout any errors returned. We have to split the data -echo -n "Load time: " -command time -f '%e' split -d --additional-suffix .tsv -n l/100 hits.tsv parts - -# Pinot can't load value '"tatuirovarki_redmond' so we need to fix this row to make it work -echo -n "Load time: " -command time -f '%e' sed parts93.tsv -e 's/"tatuirovarki_redmond/tatuirovarki_redmond/g' -i - -# Fix path to local directory -sed splitted.yaml 's/PWD_DIR_PLACEHOLDER/'$PWD'/g' -i -sed local.yaml 's/PWD_DIR_PLACEHOLDER/'$PWD'/g' -i - -# Load data -echo -n "Load time: " -command time -f '%e' ./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh LaunchDataIngestionJob -jobSpecFile splitted.yaml - -# After upload it shows 94465149 rows instead of 99997497 in the dataset - -# Run the queries -./run.sh - -# stop Pinot services -kill %1 - -echo -n "Data size: " -du -bcs ./batch | grep total +# Thin shim — actual flow is in lib/benchmark-common.sh. 
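+# ./load splits hits.tsv into 100 parts before ingestion; Pinot could not
+# ingest the dataset as a single file.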
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/pinot/check b/pinot/check new file mode 100755 index 000000000..3bfe104c3 --- /dev/null +++ b/pinot/check @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# Pinot is responsive once both the controller and the broker accept queries. +RES=$(curl -sf -o /dev/null -w '%{http_code}' \ + -XPOST -H'Content-Type: application/json' \ + http://localhost:8000/query/sql/ \ + -d '{"sql":"SELECT 1"}') + +[ "$RES" = "200" ] diff --git a/pinot/data-size b/pinot/data-size new file mode 100755 index 000000000..80fba7748 --- /dev/null +++ b/pinot/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs ./batch | awk '/total$/ {print $1}' diff --git a/pinot/install b/pinot/install new file mode 100755 index 000000000..4a6449562 --- /dev/null +++ b/pinot/install @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +PINOT_VERSION=1.3.0 +PINOT_DIR="apache-pinot-$PINOT_VERSION-bin" + +if [ ! -d "$PINOT_DIR" ]; then + sudo apt-get update -y + sudo apt-get install -y openjdk-11-jdk jq + + if [ ! -f "$PINOT_DIR.tar.gz" ]; then + wget --continue --progress=dot:giga \ + "https://downloads.apache.org/pinot/apache-pinot-$PINOT_VERSION/$PINOT_DIR.tar.gz" + fi + tar -zxf "$PINOT_DIR.tar.gz" +fi diff --git a/pinot/load b/pinot/load new file mode 100755 index 000000000..585964d98 --- /dev/null +++ b/pinot/load @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +PINOT_VERSION=1.3.0 +PINOT_DIR="apache-pinot-$PINOT_VERSION-bin" + +"./${PINOT_DIR}/bin/pinot-admin.sh" AddTable \ + -tableConfigFile offline_table.json \ + -schemaFile schema.json -exec || true + +# Pinot was unable to load data as a single file without errors. Split. +split -d --additional-suffix .tsv -n l/100 hits.tsv parts + +# Pinot can't load value '"tatuirovarki_redmond' so we need to fix this row. +sed parts93.tsv -e 's/"tatuirovarki_redmond/tatuirovarki_redmond/g' -i + +# Fix path in YAML to local directory (idempotent — only replace placeholder). +sed splitted.yaml -e "s|PWD_DIR_PLACEHOLDER|$PWD|g" -i +sed local.yaml -e "s|PWD_DIR_PLACEHOLDER|$PWD|g" -i + +"./${PINOT_DIR}/bin/pinot-admin.sh" LaunchDataIngestionJob -jobSpecFile splitted.yaml + +rm -f hits.tsv parts*.tsv +sync diff --git a/pinot/query b/pinot/query new file mode 100755 index 000000000..337792437 --- /dev/null +++ b/pinot/query @@ -0,0 +1,37 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via Pinot broker HTTP API. +# Stdout: query result JSON. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) +# Pinot doesn't accept trailing semicolons. +query=$(printf '%s' "$query" | tr -d ';') + +req=$(printf '%s' "$query" | python3 -c ' +import json, sys +q = sys.stdin.read() +print(json.dumps({"sql": q + " option(timeoutMs=300000)"})) +') + +resp=$(curl -sS -XPOST -H'Content-Type: application/json' \ + http://localhost:8000/query/sql/ \ + --data-binary "$req") + +echo "$resp" + +# Detect failure: Pinot returns a JSON object always; non-empty exceptions +# array means failure. +if echo "$resp" | jq -e '.exceptions | length > 0' >/dev/null 2>&1; then + echo "pinot query failed" >&2 + exit 1 +fi + +# timeUsedMs in JSON; convert to seconds. 
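+# timeUsedMs is the broker-reported query time, so client-side HTTP overhead
+# is excluded from the value written to stderr.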
+secs=$(echo "$resp" | jq -r '.timeUsedMs / 1000') +if [ -z "$secs" ] || [ "$secs" = "null" ]; then + echo "no timing in pinot response" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/pinot/run.sh b/pinot/run.sh deleted file mode 100755 index 5f5ea4976..000000000 --- a/pinot/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TRIES=3 -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo -n "[" - for i in $(seq 1 $TRIES); do - echo "{\"sql\":\"$query option(timeoutMs=300000)\"}"| tr -d ';' > query.json - RES=$(curl -s -XPOST -H'Content-Type: application/json' http://localhost:8000/query/sql/ -d @query.json | jq 'if .exceptions == [] then .timeUsedMs/1000 else "-" end' ) - [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - done - echo "]," -done diff --git a/pinot/start b/pinot/start new file mode 100755 index 000000000..c51ea1e88 --- /dev/null +++ b/pinot/start @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +PINOT_VERSION=1.3.0 +PINOT_DIR="apache-pinot-$PINOT_VERSION-bin" + +# Idempotent: if broker query endpoint is up, do nothing. +if curl -sf -o /dev/null -w '%{http_code}' \ + -XPOST -H'Content-Type: application/json' \ + http://localhost:8000/query/sql/ \ + -d '{"sql":"SELECT 1"}' 2>/dev/null | grep -q '^200'; then + exit 0 +fi + +nohup "./${PINOT_DIR}/bin/pinot-admin.sh" QuickStart -type batch \ + >> pinot.log 2>&1 < /dev/null & +disown diff --git a/pinot/stop b/pinot/stop new file mode 100755 index 000000000..140a58969 --- /dev/null +++ b/pinot/stop @@ -0,0 +1,7 @@ +#!/bin/bash + +pkill -f 'pinot-admin' 2>/dev/null || true +pkill -f 'pinot.tools.admin' 2>/dev/null || true +pkill -f 'org.apache.pinot' 2>/dev/null || true +sleep 2 +exit 0 diff --git a/polars-dataframe/benchmark.sh b/polars-dataframe/benchmark.sh index b7cf32a63..fc4bacc8f 100755 --- a/polars-dataframe/benchmark.sh +++ b/polars-dataframe/benchmark.sh @@ -1,19 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install polars - -# Download the data -../download-hits-parquet-single - -# Run the queries - -/usr/bin/time -f "Memory usage: %M KB" ./query.py 2>&1 | tee log.txt - -echo -n "Data size: " -grep -F "Memory usage" log.txt | grep -o -P '\d+ KB' | sed 's/KB/*1024/' | bc -l +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/polars-dataframe/check b/polars-dataframe/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/polars-dataframe/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/polars-dataframe/data-size b/polars-dataframe/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/polars-dataframe/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/polars-dataframe/install b/polars-dataframe/install new file mode 100755 index 000000000..e8eaaea9f --- /dev/null +++ b/polars-dataframe/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! 
-d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet polars pyarrow fastapi uvicorn diff --git a/polars-dataframe/load b/polars-dataframe/load new file mode 100755 index 000000000..ceba6beca --- /dev/null +++ b/polars-dataframe/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Server reads hits.parquet from CWD into memory. +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/polars-dataframe/queries.sql b/polars-dataframe/queries.sql new file mode 100644 index 000000000..717ebd926 --- /dev/null +++ b/polars-dataframe/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING 
COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS 
PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/polars-dataframe/query b/polars-dataframe/query new file mode 100755 index 000000000..8f1c38e8c --- /dev/null +++ b/polars-dataframe/query @@ -0,0 +1,24 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running polars server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Capture HTTP status and body separately to detect errors cleanly. +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/polars-dataframe/query.py b/polars-dataframe/server.py old mode 100755 new mode 100644 similarity index 86% rename from polars-dataframe/query.py rename to polars-dataframe/server.py index cdda28df2..773324dd0 --- a/polars-dataframe/query.py +++ b/polars-dataframe/server.py @@ -1,51 +1,66 @@ #!/usr/bin/env python3 +"""FastAPI wrapper around polars so it conforms to the ClickBench +install/start/check/stop/load/query interface. -import polars as pl +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the LazyFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. 
Looks it up in QUERIES, runs the + matching lambda against the loaded LazyFrame, and + returns {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (estimated_size) + +The (sql, lambda) list is the same as the previous standalone query.py. +""" + +import os import timeit from datetime import date -import json -import subprocess -import os -# The streaming engine will be the default soon -# https://pola.rs/posts/polars-in-aggregate-dec25/ +import polars as pl +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +# Streaming engine will be the default soon. pl.Config.set_engine_affinity("streaming") -# 0: No., 1: SQL, 2: Polars -queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), +app = FastAPI() +hits_df: pl.DataFrame | None = None +hits: pl.LazyFrame | None = None + + +# 43 ClickBench queries. Each is (sql, callable). sql strings must match the +# corresponding line in queries.sql. The lambdas come straight from the prior +# polars-dataframe/query.py and have not been modified. +QUERIES: list[tuple[str, callable]] = [ + ("SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), ( - "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x.filter(pl.col("AdvEngineID") != 0).select(pl.len()).collect().item(), ), ( - "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: x.select(a_sum=pl.col("AdvEngineID").sum(), count=pl.len(), a_mean=pl.col("ResolutionWidth").mean()).collect().rows()[0], ), ( - "Q3", "SELECT AVG(UserID) FROM hits;", lambda x: x.select(pl.col("UserID").mean()).collect().item(), ), ( - "Q4", "SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x.select(pl.col("UserID").n_unique()).collect().item(), ), ( - "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(), ), ( - "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", - lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0] + lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0], ), ( - "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", lambda x: x.filter(pl.col("AdvEngineID") != 0) .group_by("AdvEngineID") @@ -53,7 +68,6 @@ .sort("count", descending=True).collect(), ), ( - "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg(pl.col("UserID").n_unique().alias("u")) @@ -61,7 +75,6 @@ .head(10).collect(), ), ( - "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg( @@ -76,7 +89,6 @@ .head(10).collect(), ), ( - "Q10", "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by("MobilePhoneModel") @@ -85,7 +97,6 @@ .head(10).collect(), ), ( - "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by(["MobilePhone", "MobilePhoneModel"]) @@ -94,7 +105,6 @@ .head(10).collect(), ), ( - 
"Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -103,7 +113,6 @@ .head(10).collect(), ), ( - "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -112,7 +121,6 @@ .head(10).collect(), ), ( - "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "SearchPhrase"]) @@ -121,7 +129,6 @@ .head(10).collect(), ), ( - "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by("UserID") .agg(pl.len().alias("count")) @@ -129,7 +136,6 @@ .head(10).collect(), ), ( - "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]) .agg(pl.len().alias("count")) @@ -137,12 +143,10 @@ .head(10).collect(), ), ( - "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(), ), ( - "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by( [pl.col("UserID"), pl.col("EventTime").dt.minute(), "SearchPhrase"] @@ -152,17 +156,14 @@ .head(10).collect(), ), ( - "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(), ), ( - "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(), ), ( - "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "") @@ -173,7 +174,6 @@ .head(10).collect(), ), ( - "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("Title").str.contains("Google")) @@ -193,14 +193,12 @@ .head(10).collect(), ), ( - "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("URL").str.contains("google")) .sort("EventTime") .head(10).collect(), ), ( - "Q24", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("EventTime") @@ -208,7 +206,6 @@ .head(10).collect(), ), ( - "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("SearchPhrase") @@ -216,7 +213,6 @@ .head(10).collect(), ), ( - "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort(["EventTime", "SearchPhrase"]) @@ -224,22 +220,20 
@@ .head(10).collect(), ), ( - "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", - lambda x: x.filter(pl.col("URL") != "") # WHERE URL <> '' - .group_by("CounterID") # GROUP BY CounterID + lambda x: x.filter(pl.col("URL") != "") + .group_by("CounterID") .agg( [ - pl.col("URL").str.len_chars().mean().alias("l"), # AVG(STRLEN(URL)) - pl.len().alias("c"), # COUNT(*) + pl.col("URL").str.len_chars().mean().alias("l"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect(), # LIMIT 25, + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect(), ), ( - "Q28", "SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x.filter(pl.col("Referer") != "") @@ -251,23 +245,21 @@ .group_by("k") .agg( [ - pl.col("Referer").str.len_chars().mean().alias("l"), # AVG(STRLEN(Referer)) - pl.col("Referer").min().alias("min_referer"), # MIN(Referer) - pl.len().alias("c"), # COUNT(*) + pl.col("Referer").str.len_chars().mean().alias("l"), + pl.col("Referer").min().alias("min_referer"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect() # LIMIT 25 + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect() ), ), ( - "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 
69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", lambda x: x.select([(pl.col("ResolutionWidth") + i).sum().alias(f"c_{i}") for i in range(90)]).collect(), ), ( - "Q30", "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "ClientIP"]) @@ -282,7 +274,6 @@ .head(10).collect(), ), ( - "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["WatchID", "ClientIP"]) @@ -297,7 +288,6 @@ .head(10).collect(), ), ( - "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.group_by(["WatchID", "ClientIP"]) .agg( @@ -311,7 +301,6 @@ .head(10).collect(), ), ( - "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -319,7 +308,6 @@ .head(10).collect(), ), ( - "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -328,7 +316,6 @@ .head(10).collect(), ), ( - "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("ClientIP") .agg(pl.len().alias("c")) @@ -338,10 +325,9 @@ (pl.col("ClientIP") - 3).alias("ClientIP_minus_3") ]) .sort("c", descending=True) - .head(10).collect() + .head(10).collect(), ), ( - "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -357,7 +343,6 @@ .head(10).collect(), ), ( - "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -373,7 +358,6 @@ .head(10).collect(), ), ( - "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -389,7 +373,6 @@ .slice(1000, 10).collect(), ), ( - "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID 
= 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -417,7 +400,6 @@ .slice(1000, 10).collect(), ), ( - "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -433,7 +415,6 @@ .slice(100, 10).collect(), ), ( - "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -449,7 +430,6 @@ .slice(10000, 10).collect(), ), ( - "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -465,41 +445,52 @@ ), ] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + -def run_timings(lf: pl.LazyFrame) -> None: - for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) +@app.post("/load") +def load(): + global hits, hits_df + start = timeit.default_timer() + df = pl.scan_parquet("hits.parquet").collect() + df = df.with_columns( + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), + pl.col("EventDate").cast(pl.Date), + ) + df = df.rechunk() + hits_df = df + hits = df.lazy() + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](lf) - end = timeit.default_timer() - if result is None: - times.append(None) - else: - times.append(round(end - start, 3)) - print(f"{times},") +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} -data_size = os.path.getsize("hits.parquet") -print("run DataFrame (in-memory) queries, this loads all data in memory!") -start = timeit.default_timer() -df = pl.scan_parquet("hits.parquet").collect() -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") +@app.get("/data-size") +def data_size(): + if hits_df is None: + return {"bytes": 0} + return {"bytes": int(hits_df.estimated_size())} -# fix some types -df = 
df.with_columns( - (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), - pl.col("EventDate").cast(pl.Date), -) -assert df["EventTime"][0].year == 2013 -df = df.rechunk() -lf = df.lazy() -run_timings(lf) +if __name__ == "__main__": + port = int(os.environ.get("BENCH_POLARS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/polars-dataframe/start b/polars-dataframe/start new file mode 100755 index 000000000..e3fab7273 --- /dev/null +++ b/polars-dataframe/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. +if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/polars-dataframe/stop b/polars-dataframe/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/polars-dataframe/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/polars/benchmark.sh b/polars/benchmark.sh index bf81cf5f3..fc4bacc8f 100755 --- a/polars/benchmark.sh +++ b/polars/benchmark.sh @@ -1,18 +1,5 @@ #!/bin/bash - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv -python3 -m venv myenv -source myenv/bin/activate -pip install polars - -# Download the data -../download-hits-parquet-single - -# Run the queries - -./query.py 2>&1 | tee log.txt - -echo "Data size: $(du -bcs hits.parquet)" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/polars/check b/polars/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/polars/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/polars/data-size b/polars/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/polars/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/polars/install b/polars/install new file mode 100755 index 000000000..e8eaaea9f --- /dev/null +++ b/polars/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet polars pyarrow fastapi uvicorn diff --git a/polars/load b/polars/load new file mode 100755 index 000000000..4c98a2da1 --- /dev/null +++ b/polars/load @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +# Polars uses LazyFrame (scan_parquet), so the parquet file must remain +# available for queries — we only build the plan here and DO NOT delete the +# input. /load is essentially "register the source". 
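(Aside, not part of the patch: a minimal Python sketch of the distinction this comment relies on. With the lazy variant, /load only builds a plan; the parquet file is scanned again whenever a query calls collect(), which is why hits.parquet must stay in place. Everything below is illustrative only.)

    import polars as pl

    lf = pl.scan_parquet("hits.parquet")            # LazyFrame: just a query plan, nothing read yet
    n = lf.select(pl.len()).collect().item()        # the file is actually scanned here, at collect() time
    print(n)

    # The polars-dataframe variant instead materializes everything up front:
    df = pl.scan_parquet("hits.parquet").collect()  # full DataFrame held in RAM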
+elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported): ${elapsed}s" + +sync diff --git a/polars/queries.sql b/polars/queries.sql new file mode 100644 index 000000000..717ebd926 --- /dev/null +++ b/polars/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), 
SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; 
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/polars/query b/polars/query new file mode 100755 index 000000000..9129884cf --- /dev/null +++ b/polars/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running polars server. +# Stdout: server response JSON. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/polars/query.py b/polars/server.py similarity index 86% rename from polars/query.py rename to polars/server.py index 1fbd34f2b..f788df8a9 100755 --- a/polars/query.py +++ b/polars/server.py @@ -1,51 +1,66 @@ #!/usr/bin/env python3 +"""FastAPI wrapper around polars so it conforms to the ClickBench +install/start/check/stop/load/query interface. -import polars as pl +Routes: + GET /health -> 200 OK once the server is up + POST /load -> reads hits.parquet from the working directory, fixes + column types, holds the LazyFrame in memory, and + returns {"elapsed": } + POST /query -> body: SQL text. Looks it up in QUERIES, runs the + matching lambda against the loaded LazyFrame, and + returns {"elapsed": }. + GET /data-size -> bytes the DataFrame currently occupies (estimated_size) + +The (sql, lambda) list is the same as the previous standalone query.py. +""" + +import os import timeit from datetime import date -import json -import subprocess -import os -# The streaming engine will be the default soon -# https://pola.rs/posts/polars-in-aggregate-dec25/ +import polars as pl +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +# Streaming engine will be the default soon. 
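(Aside: the routes documented in the module docstring above can be exercised end to end with curl. A hedged smoke test, assuming the default port 8000 and a hits.parquet in the server's working directory, as in the surrounding scripts:)

    ./start                                              # launches server.py inside the venv
    curl -sf http://127.0.0.1:8000/health                # {"ok": true}
    curl -sS -X POST http://127.0.0.1:8000/load          # {"elapsed": ...} once the source is registered
    head -n 1 queries.sql | curl -sS -X POST --data-binary @- http://127.0.0.1:8000/query
                                                         # {"elapsed": ..., "index": 0}
    curl -sS http://127.0.0.1:8000/data-size             # {"bytes": ...}
    ./stop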
pl.Config.set_engine_affinity("streaming") -# 0: No., 1: SQL, 2: Polars -queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), +app = FastAPI() +hits: pl.LazyFrame | None = None +parquet_path: str = "hits.parquet" + + +# 43 ClickBench queries. Each is (sql, callable). sql strings must match the +# corresponding line in queries.sql. The lambdas come straight from the prior +# polars/query.py and have not been modified. +QUERIES: list[tuple[str, callable]] = [ + ("SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().item()), ( - "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x.filter(pl.col("AdvEngineID") != 0).select(pl.len()).collect().item(), ), ( - "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: x.select(a_sum=pl.col("AdvEngineID").sum(), count=pl.len(), a_mean=pl.col("ResolutionWidth").mean()).collect().rows()[0], ), ( - "Q3", "SELECT AVG(UserID) FROM hits;", lambda x: x.select(pl.col("UserID").mean()).collect().item(), ), ( - "Q4", "SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x.select(pl.col("UserID").n_unique()).collect().item(), ), ( - "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(), ), ( - "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", - lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0] + lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0], ), ( - "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", lambda x: x.filter(pl.col("AdvEngineID") != 0) .group_by("AdvEngineID") @@ -53,7 +68,6 @@ .sort("count", descending=True).collect(), ), ( - "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg(pl.col("UserID").n_unique().alias("u")) @@ -61,7 +75,6 @@ .head(10).collect(), ), ( - "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("RegionID") .agg( @@ -76,7 +89,6 @@ .head(10).collect(), ), ( - "Q10", "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by("MobilePhoneModel") @@ -85,7 +97,6 @@ .head(10).collect(), ), ( - "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by(["MobilePhone", "MobilePhoneModel"]) @@ -94,7 +105,6 @@ .head(10).collect(), ), ( - "Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -103,7 +113,6 @@ .head(10).collect(), ), ( - "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") @@ -112,7 +121,6 @@ .head(10).collect(), ), ( - "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' 
GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "SearchPhrase"]) @@ -121,7 +129,6 @@ .head(10).collect(), ), ( - "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by("UserID") .agg(pl.len().alias("count")) @@ -129,7 +136,6 @@ .head(10).collect(), ), ( - "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]) .agg(pl.len().alias("count")) @@ -137,12 +143,10 @@ .head(10).collect(), ), ( - "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(), ), ( - "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", lambda x: x.group_by( [pl.col("UserID"), pl.col("EventTime").dt.minute(), "SearchPhrase"] @@ -152,17 +156,14 @@ .head(10).collect(), ), ( - "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(), ), ( - "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(), ), ( - "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "") @@ -173,7 +174,6 @@ .head(10).collect(), ), ( - "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", lambda x: x.filter( (pl.col("Title").str.contains("Google")) @@ -193,14 +193,12 @@ .head(10).collect(), ), ( - "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("URL").str.contains("google")) .sort("EventTime") .head(10).collect(), ), ( - "Q24", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("EventTime") @@ -208,7 +206,6 @@ .head(10).collect(), ), ( - "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("SearchPhrase") @@ -216,7 +213,6 @@ .head(10).collect(), ), ( - "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .sort(["EventTime", "SearchPhrase"]) @@ -224,22 +220,20 @@ .head(10).collect(), ), ( - "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", - lambda x: x.filter(pl.col("URL") != "") # WHERE URL <> '' - .group_by("CounterID") # GROUP BY CounterID + lambda x: x.filter(pl.col("URL") != "") + .group_by("CounterID") .agg( [ - pl.col("URL").str.len_chars().mean().alias("l"), # AVG(STRLEN(URL)) - pl.len().alias("c"), # COUNT(*) + pl.col("URL").str.len_chars().mean().alias("l"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 
100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect(), # LIMIT 25, + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect(), ), ( - "Q28", "SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x.filter(pl.col("Referer") != "") @@ -251,23 +245,21 @@ .group_by("k") .agg( [ - pl.col("Referer").str.len_chars().mean().alias("l"), # AVG(STRLEN(Referer)) - pl.col("Referer").min().alias("min_referer"), # MIN(Referer) - pl.len().alias("c"), # COUNT(*) + pl.col("Referer").str.len_chars().mean().alias("l"), + pl.col("Referer").min().alias("min_referer"), + pl.len().alias("c"), ] ) - .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 - .sort("l", descending=True) # ORDER BY l DESC - .limit(25).collect() # LIMIT 25 + .filter(pl.col("c") > 100000) + .sort("l", descending=True) + .limit(25).collect() ), ), ( - "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", lambda x: x.select([(pl.col("ResolutionWidth") + 
i).sum().alias(f"c_{i}") for i in range(90)]).collect(), ), ( - "Q30", "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "ClientIP"]) @@ -282,7 +274,6 @@ .head(10).collect(), ), ( - "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["WatchID", "ClientIP"]) @@ -297,7 +288,6 @@ .head(10).collect(), ), ( - "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", lambda x: x.group_by(["WatchID", "ClientIP"]) .agg( @@ -311,7 +301,6 @@ .head(10).collect(), ), ( - "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -319,7 +308,6 @@ .head(10).collect(), ), ( - "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("URL") .agg(pl.len().alias("c")) @@ -328,7 +316,6 @@ .head(10).collect(), ), ( - "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", lambda x: x.group_by("ClientIP") .agg(pl.len().alias("c")) @@ -338,10 +325,9 @@ (pl.col("ClientIP") - 3).alias("ClientIP_minus_3") ]) .sort("c", descending=True) - .head(10).collect() + .head(10).collect(), ), ( - "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -357,7 +343,6 @@ .head(10).collect(), ), ( - "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -373,7 +358,6 @@ .head(10).collect(), ), ( - "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -389,7 +373,6 @@ .slice(1000, 10).collect(), ), ( - "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -417,7 +400,6 @@ .slice(1000, 10).collect(), ), ( - "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", 
lambda x: x.filter( (pl.col("CounterID") == 62) @@ -433,7 +415,6 @@ .slice(100, 10).collect(), ), ( - "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -449,7 +430,6 @@ .slice(10000, 10).collect(), ), ( - "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) @@ -465,35 +445,51 @@ ), ] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +@app.get("/health") +def health(): + return {"ok": True} + + +@app.post("/load") +def load(): + global hits + start = timeit.default_timer() + # Lazy: just builds the plan. Data is read on each query collect(). + hits = pl.scan_parquet(parquet_path).with_columns( + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), + pl.col("EventDate").cast(pl.Date), + ) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + -def run_timings(lf: pl.LazyFrame) -> None: - for q in queries: - # Flush OS page cache before first run of each query - subprocess.run(['sync'], check=True) - subprocess.run(['sudo', 'tee', '/proc/sys/vm/drop_caches'], input=b'3', check=True, stdout=subprocess.DEVNULL) +@app.post("/query") +async def query(request: Request): + if hits is None: + raise HTTPException(status_code=409, detail="DataFrame not loaded; POST /load first") + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + fn = QUERIES[idx][1] + start = timeit.default_timer() + fn(hits) + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed, "index": idx} - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](lf) - end = timeit.default_timer() - if result is None: - times.append(None) - else: - times.append(round(end - start, 3)) - print(f"{times},") -data_size = os.path.getsize("hits.parquet") +@app.get("/data-size") +def data_size(): + # LazyFrame doesn't materialize, so report the on-disk parquet size. + try: + return {"bytes": os.path.getsize(parquet_path)} + except OSError: + return {"bytes": 0} -# Run from Parquet -start = timeit.default_timer() -lf = pl.scan_parquet("hits.parquet").with_columns( - (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), - pl.col("EventDate").cast(pl.Date), -) -end = timeit.default_timer() -load_time = round(end - start, 3) -print(f"Load time: {load_time}") -print("run parquet queries") -run_timings(lf) +if __name__ == "__main__": + port = int(os.environ.get("BENCH_POLARS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/polars/start b/polars/start new file mode 100755 index 000000000..7fee34fc1 --- /dev/null +++ b/polars/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, do nothing. 
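(Aside: /query resolves the request body by exact-match lookup in QUERY_INDEX, so any drift between queries.sql and the SQL strings in QUERIES produces a 404. A small illustrative consistency check, run from the polars directory with the venv active; it assumes server.py sits next to queries.sql as in this patch, and importing the module does not start uvicorn because of the __main__ guard:)

    import server  # the module defined above

    with open("queries.sql") as f:
        file_queries = [line.strip() for line in f if line.strip()]

    server_queries = [sql for sql, _ in server.QUERIES]
    assert len(file_queries) == len(server_queries) == 43
    for i, (a, b) in enumerate(zip(file_queries, server_queries)):
        if a != b:
            print(f"Q{i}: queries.sql and server.QUERIES disagree")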
+if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/polars/stop b/polars/stop new file mode 100755 index 000000000..00a85c15e --- /dev/null +++ b/polars/stop @@ -0,0 +1,16 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/postgresql-indexed/benchmark.sh b/postgresql-indexed/benchmark.sh index c6f06df30..531bd6503 100755 --- a/postgresql-indexed/benchmark.sh +++ b/postgresql-indexed/benchmark.sh @@ -1,74 +1,5 @@ #!/bin/bash - -set -eu - -PGVERSION=17 - -# Source: https://wiki.postgresql.org/wiki/Apt -sudo apt-get update -y -sudo apt-get install -y postgresql-common -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y - -sudo apt-get update -y -sudo apt-get install -y postgresql-common postgresql-$PGVERSION - -memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) -threads=$(nproc) -cpus=$(($threads / 2)) -# Shared buffers is set to 25% of memory in AWS RDS by default. We do the same. -# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html -shared_buffers=$(($memory / 4)) -# Effective cache size does not need to be perfect, but it should be somewhat -# close to the total memory minus what is expected to be used for queries. -# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/ -effective_cache_size=$(($memory - ($memory / 4))) -# By default, max_worker_processes is set to in postgres. We want to be able to -# use all the threads for parallel workers so we increase it. We also add a -# small buffer of 15 for any other background workers that might be created. -max_worker_processes=$(($threads + 15)) -# Below we make sure to configure the rest of the parallel worker settings to -# match the number of cpu cores: -# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data -# -# We also increase work_mem because we are doing an analytics workload to allow -# some more memory for sorting, aggregations, etc. -# -# It's necessary to increase max_wal_size to make the dataload not take very -# long. With the default value it's constantly checkpointing, and the PG logs -# warn you about that and tell you to increase max_wal_size. - -sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -echo -n "Load time: " -command time -f '%e' ./load.sh - -# COPY 99997497 -# Time: 2341543.463 ms (39:01.543) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
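(Aside: lib/benchmark-common.sh itself is not shown in this hunk. Judging from the per-system scripts it drives (install, start, check, load, query, data-size) and the BENCH_DOWNLOAD_SCRIPT / BENCH_RESTARTABLE variables exported by these shims, its core loop presumably looks roughly like the hypothetical Bash outline below; treat every detail as an assumption, not the actual file:)

    #!/bin/bash
    # Hypothetical sketch of lib/benchmark-common.sh -- the real implementation is not in this diff.
    set -eu

    ./install
    "../$BENCH_DOWNLOAD_SCRIPT"            # e.g. download-hits-tsv
    ./start && ./check
    ./load

    TRIES=3
    while read -r query; do
        # For restartable systems the server can be bounced and OS caches dropped
        # to get a cold first run; the exact policy lives in the real script.
        if [ "${BENCH_RESTARTABLE:-no}" = "yes" ]; then
            ./stop; sync; echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null; ./start; ./check
        fi
        for _ in $(seq 1 "$TRIES"); do
            echo "$query" | ./query >/dev/null   # per-query seconds arrive on stderr
        done
    done < queries.sql

    ./data-size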
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/postgresql-indexed/check b/postgresql-indexed/check new file mode 100755 index 000000000..5c6f71123 --- /dev/null +++ b/postgresql-indexed/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/postgresql-indexed/data-size b/postgresql-indexed/data-size new file mode 100755 index 000000000..14b724ff1 --- /dev/null +++ b/postgresql-indexed/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total | awk '{print $1}' diff --git a/postgresql-indexed/install b/postgresql-indexed/install new file mode 100755 index 000000000..05b3082ae --- /dev/null +++ b/postgresql-indexed/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +# Source: https://wiki.postgresql.org/wiki/Apt +sudo apt-get update -y +sudo apt-get install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + +sudo apt-get update -y +sudo apt-get install -y postgresql-$PGVERSION + +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(($threads / 2)) +shared_buffers=$(($memory / 4)) +effective_cache_size=$(($memory - ($memory / 4))) +max_worker_processes=$(($threads + 15)) + +sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/postgresql-indexed/run.sh b/postgresql-indexed/run.sh deleted file mode 100755 index 5fa550e40..000000000 --- a/postgresql-indexed/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | sudo -u postgres psql test -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/postgresql-indexed/start b/postgresql-indexed/start new file mode 100755 index 000000000..941f213c5 --- /dev/null +++ b/postgresql-indexed/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/postgresql-indexed/stop b/postgresql-indexed/stop new file mode 100755 index 000000000..47969378d --- /dev/null +++ b/postgresql-indexed/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/postgresql-orioledb/benchmark.sh b/postgresql-orioledb/benchmark.sh index e78865660..531bd6503 100755 --- a/postgresql-orioledb/benchmark.sh +++ b/postgresql-orioledb/benchmark.sh @@ -1,75 +1,5 @@ #!/bin/bash - -# digest: sha256:3304142dbe8de8d5bbaa0e398cca58683ed603add4524c3582debf9c119994f1 -VERSION=beta12-pg17 -CONTAINER_NAME=orioledb-clickbench - -echo "Installing dependencies..." 
-sudo apt-get update -y -sudo apt-get install -y docker.io pigz postgresql-client - -# Using Docker due to pending patches in upstream PostgreSQL, see https://web.archive.org/web/20250722125912/https://www.orioledb.com/docs/usage/getting-started#start-postgresql -echo "Starting OrioleDB Docker container with name $CONTAINER_NAME. Using tag $VERSION..." -# Increase shared memory size, because Docker default will hit the limit ("ERROR: could not resize shared memory segment") -MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') -SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) -mkdir -p /tmp/data -sudo docker run --name $CONTAINER_NAME -v /tmp/data:/tmp/data --shm-size="$SHM_SIZE"m -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust -d orioledb/orioledb:$VERSION - -# Similar (but not identical) to PostgreSQL configuration -echo "Updating configuration" -THREADS=$(nproc) -CPUS=$(($THREADS / 2)) - -# Since we are only using OrioleDB tables, set to 1/4 of RAM and keep default value for shared buffers -# See https://www.orioledb.com/docs/usage/configuration#orioledbmain_buffers -MAIN_BUFFERS=$(($MEM_SIZE / 4)) -EFFECTIVE_CACHE_SIZE=$(($MEM_SIZE - ($MEM_SIZE / 4))) -MAX_WORKER_PROCESSES=$(($THREADS + 15)) - -envsubst < "$CONTAINER_NAME.log" & -while ! tail -n 1 "$CONTAINER_NAME.log" | grep -q 'database system is ready to accept connections'; do - echo "OrioleDB is not running yet. Checking again in 1 second..." - sleep 1 -done - -echo "Downloading dataset..." -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -O /tmp/data/hits.tsv.gz -pigz -d -f /tmp/data/hits.tsv.gz - -echo "Creating database and table..." -psql -h localhost -p 5432 -U postgres -c "CREATE DATABASE test;" -psql -h localhost -p 5432 -U postgres -c "CREATE EXTENSION IF NOT EXISTS orioledb;" -psql -h localhost -p 5432 -U postgres -d test < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -# Expected: 'Access method: orioledb' -psql -h localhost -p 5432 -U postgres -d test -c "\d+ hits" | grep 'Access method:' - -echo "Loading data..." -command time -f '%e' ./load.sh - -echo "Running queries..." -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo docker exec -i $CONTAINER_NAME du -bcs /var/lib/postgresql/data/orioledb_data | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/postgresql-orioledb/check b/postgresql-orioledb/check new file mode 100755 index 000000000..a9019ec65 --- /dev/null +++ b/postgresql-orioledb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +psql -h localhost -p 5432 -U postgres -t -c 'SELECT 1' >/dev/null diff --git a/postgresql-orioledb/data-size b/postgresql-orioledb/data-size new file mode 100755 index 000000000..c65100637 --- /dev/null +++ b/postgresql-orioledb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} +sudo docker exec -i "$CONTAINER_NAME" du -bcs /var/lib/postgresql/data/orioledb_data | grep total | awk '{print $1}' diff --git a/postgresql-orioledb/install b/postgresql-orioledb/install new file mode 100755 index 000000000..88d2cca68 --- /dev/null +++ b/postgresql-orioledb/install @@ -0,0 +1,47 @@ +#!/bin/bash +set -eu + +VERSION=${VERSION:-beta12-pg17} +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} + +sudo apt-get update -y +sudo apt-get install -y docker.io pigz postgresql-client + +# Pull image up front so subsequent ./start is fast and idempotent. +sudo docker pull "orioledb/orioledb:$VERSION" + +mkdir -p /tmp/data + +# (Re)create container with our config. Remove any existing one first. +if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + sudo docker rm -f "$CONTAINER_NAME" >/dev/null +fi + +MEM_SIZE=$(grep MemTotal /proc/meminfo | awk '{print $2}') +SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) + +sudo docker run --name "$CONTAINER_NAME" \ + -v /tmp/data:/tmp/data \ + --shm-size="${SHM_SIZE}m" \ + -p 5432:5432 \ + -e POSTGRES_HOST_AUTH_METHOD=trust \ + -d "orioledb/orioledb:$VERSION" + +THREADS=$(nproc) +CPUS=$(($THREADS / 2)) +MAIN_BUFFERS=$(($MEM_SIZE / 4)) +EFFECTIVE_CACHE_SIZE=$(($MEM_SIZE - ($MEM_SIZE / 4))) +MAX_WORKER_PROCESSES=$(($THREADS + 15)) + +cat <&1) +status=$? + +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/postgresql-orioledb/run.sh b/postgresql-orioledb/run.sh deleted file mode 100755 index 71a0d6867..000000000 --- a/postgresql-orioledb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | psql -h localhost -p 5432 -U postgres -d test -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/postgresql-orioledb/start b/postgresql-orioledb/start new file mode 100755 index 000000000..f45f281e5 --- /dev/null +++ b/postgresql-orioledb/start @@ -0,0 +1,9 @@ +#!/bin/bash +set -eu + +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} + +# Idempotent: start if not already running. 
+if [ "$(sudo docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null || echo false)" != "true" ]; then + sudo docker start "$CONTAINER_NAME" +fi diff --git a/postgresql-orioledb/stop b/postgresql-orioledb/stop new file mode 100755 index 000000000..4c10d953f --- /dev/null +++ b/postgresql-orioledb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +CONTAINER_NAME=${CONTAINER_NAME:-orioledb-clickbench} +sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true diff --git a/postgresql/benchmark.sh b/postgresql/benchmark.sh index 4c1a3a4e6..531bd6503 100755 --- a/postgresql/benchmark.sh +++ b/postgresql/benchmark.sh @@ -1,74 +1,5 @@ #!/bin/bash - -set -eu - -PGVERSION=17 - -# Source: https://wiki.postgresql.org/wiki/Apt -sudo apt-get update -y -sudo apt-get install -y postgresql-common -y -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y - -sudo apt-get update -y -sudo apt-get install -y postgresql-$PGVERSION - -memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) -threads=$(nproc) -cpus=$(($threads / 2)) -# Shared buffers is set to 25% of memory in AWS RDS by default. We do the same. -# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html -shared_buffers=$(($memory / 4)) -# Effective cache size does not need to be perfect, but it should be somewhat -# close to the total memory minus what is expected to be used for queries. -# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/ -effective_cache_size=$(($memory - ($memory / 4))) -# By default, max_worker_processes is set to in postgres. We want to be able to -# use all the threads for parallel workers so we increase it. We also add a -# small buffer of 15 for any other background workers that might be created. -max_worker_processes=$(($threads + 15)) -# Below we make sure to configure the rest of the parallel worker settings to -# match the number of cpu cores: -# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data -# -# We also increase work_mem because we are doing an analytics workload to allow -# some more memory for sorting, aggregations, etc. -# -# It's necessary to increase max_wal_size to make the dataload not take very -# long. With the default value it's constantly checkpointing, and the PG logs -# warn you about that and tell you to increase max_wal_size. - -sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi - -echo -n "Load time: " -command time -f '%e' ./load.sh - -# COPY 99997497 -# Time: 2341543.463 ms (39:01.543) - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/postgresql/check b/postgresql/check new file mode 100755 index 000000000..5c6f71123 --- /dev/null +++ b/postgresql/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/postgresql/data-size b/postgresql/data-size new file mode 100755 index 000000000..14b724ff1 --- /dev/null +++ b/postgresql/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo du -bcs /var/lib/postgresql/$PGVERSION/main/ | grep total | awk '{print $1}' diff --git a/postgresql/install b/postgresql/install new file mode 100755 index 000000000..05b3082ae --- /dev/null +++ b/postgresql/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +# Source: https://wiki.postgresql.org/wiki/Apt +sudo apt-get update -y +sudo apt-get install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + +sudo apt-get update -y +sudo apt-get install -y postgresql-$PGVERSION + +memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo) +threads=$(nproc) +cpus=$(($threads / 2)) +shared_buffers=$(($memory / 4)) +effective_cache_size=$(($memory - ($memory / 4))) +max_worker_processes=$(($threads + 15)) + +sudo tee /etc/postgresql/$PGVERSION/main/conf.d/clickbench.conf <&1) +status=$? + +# psql may print "ERROR:" on a failed query but exit 0 with -t. Detect. +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +# Print everything except the Time: line to stdout. +printf '%s\n' "$out" | grep -v '^Time:' + +# Extract last "Time: NNN.NNN ms" line and emit seconds on stderr. +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/postgresql/run.sh b/postgresql/run.sh deleted file mode 100755 index 96a8161ec..000000000 --- a/postgresql/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query" - ( - echo '\timing' - yes "$query" | head -n $TRIES - ) | sudo -u postgres psql test -t 2>&1 | grep -P 'Time|psql: error' | tail -n1 -done diff --git a/postgresql/start b/postgresql/start new file mode 100755 index 000000000..941f213c5 --- /dev/null +++ b/postgresql/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/postgresql/stop b/postgresql/stop new file mode 100755 index 000000000..47969378d --- /dev/null +++ b/postgresql/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/questdb/benchmark.sh b/questdb/benchmark.sh index b33a8728a..1aa9264b9 100755 --- a/questdb/benchmark.sh +++ b/questdb/benchmark.sh @@ -1,82 +1,5 @@ #!/bin/bash - -# Install - -qdb_version="9.3.1" -if [[ $(arch) == "aarch64" ]] || [[ $(arch) == "arm"* ]]; then - # ARM uses no-JRE binary, so we need to install JDK - wget --continue --progress=dot:giga https://github.com/graalvm/graalvm-ce-builds/releases/download/jdk-17.0.9/graalvm-community-jdk-17.0.9_linux-aarch64_bin.tar.gz - tar xf graalvm-community-*.tar.gz --one-top-level=graalvm --strip-components 1 - export JAVA_HOME=$PWD/graalvm - - wget --continue 
--progress=dot:giga https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-no-jre-bin.tar.gz - tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 - mkdir questdb/bin - mv questdb/* questdb/bin -else - wget --continue --progress=dot:giga https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-rt-linux-x86-64.tar.gz - tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 -fi - -questdb/bin/questdb.sh start - -while ! nc -z localhost 9000; do - sleep 0.1 -done - -sed -i 's/query.timeout.sec=60/query.timeout.sec=500/' ~/.questdb/conf/server.conf -sed -i "s|cairo.sql.copy.root=import|cairo.sql.copy.root=$PWD|" ~/.questdb/conf/server.conf -questdb/bin/questdb.sh stop -questdb/bin/questdb.sh start - -# Import the data - -../download-hits-csv - -curl -G --data-urlencode "query=$(cat create.sql)" 'http://localhost:9000/exec' - -if [[ "$(nproc)" -ge 96 ]] -then - # SQL COPY works best on metal instances: - start=$(date +%s) - - curl -G --data-urlencode "query=copy hits from 'hits.csv' with timestamp 'EventTime' format 'yyyy-MM-dd HH:mm:ss';" 'http://localhost:9000/exec' - - echo 'waiting for import to finish...' - until [ "$(curl -s -G --data-urlencode "query=select * from sys.text_import_log where phase is null and status='finished';" 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do - echo '.' - sleep 5 - done - - end=$(date +%s) - echo "Load time: $((end - start))" -else - # On smaller instances use this: - start=$(date +%s) - - curl -F data=@hits.csv 'http://localhost:9000/imp?name=hits&maxUncommittedRows=5000000' - - echo 'waiting for rows to become readable...' - until [ "$(curl -s -G --data-urlencode "query=select 1 from (select count() c from hits) where c = 99997497;" 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do - echo '.' - sleep 5 - done - - end=$(date +%s) - echo "Load time: $((end - start))" -fi - -# Run queries - -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -du -bcs ~/.questdb/db/hits* | grep total - -cat log.txt | \ - grep -P '"timings"|"error"|null' | \ - sed -r -e 's/^.*"error".*$/null/; s/^.*"execute":([0-9]*),.*$/\1/' | \ - awk '{ print ($1) / 1000000000 }' | \ - awk '{ printf "%.3f\n", $1 }' | \ - sed -r -e 's/^0$/null/' | \ - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/questdb/check b/questdb/check new file mode 100755 index 000000000..3f929fb25 --- /dev/null +++ b/questdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sSf -G --data-urlencode 'query=SELECT 1' 'http://localhost:9000/exec' >/dev/null diff --git a/questdb/data-size b/questdb/data-size new file mode 100755 index 000000000..23636a0e7 --- /dev/null +++ b/questdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -bcs ~/.questdb/db/hits* | grep total | awk '{print $1}' diff --git a/questdb/install b/questdb/install new file mode 100755 index 000000000..9bb3cbe07 --- /dev/null +++ b/questdb/install @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +qdb_version="${QDB_VERSION:-9.3.1}" + +if [ -d questdb/bin ]; then + exit 0 +fi + +if [[ $(arch) == "aarch64" ]] || [[ $(arch) == arm* ]]; then + # ARM uses no-JRE binary, so we install GraalVM JDK alongside. 
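+    # (Both branches end up with the launcher at questdb/bin/questdb.sh, which
+    # is the path the start/stop scripts rely on.)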
+ wget --continue --progress=dot:giga \ + https://github.com/graalvm/graalvm-ce-builds/releases/download/jdk-17.0.9/graalvm-community-jdk-17.0.9_linux-aarch64_bin.tar.gz + tar xf graalvm-community-*.tar.gz --one-top-level=graalvm --strip-components 1 + export JAVA_HOME=$PWD/graalvm + + wget --continue --progress=dot:giga \ + "https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-no-jre-bin.tar.gz" + tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 + mkdir questdb/bin + mv questdb/* questdb/bin +else + wget --continue --progress=dot:giga \ + "https://github.com/questdb/questdb/releases/download/${qdb_version}/questdb-${qdb_version}-rt-linux-x86-64.tar.gz" + tar xf questdb*.tar.gz --one-top-level=questdb --strip-components 1 +fi diff --git a/questdb/load b/questdb/load new file mode 100755 index 000000000..aba840e14 --- /dev/null +++ b/questdb/load @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +curl -sS -G --data-urlencode "query=$(cat create.sql)" 'http://localhost:9000/exec' >/dev/null + +if [[ "$(nproc)" -ge 96 ]]; then + # SQL COPY works best on metal instances. + curl -sS -G --data-urlencode "query=copy hits from 'hits.csv' with timestamp 'EventTime' format 'yyyy-MM-dd HH:mm:ss';" \ + 'http://localhost:9000/exec' >/dev/null + + until [ "$(curl -sS -G --data-urlencode "query=select * from sys.text_import_log where phase is null and status='finished';" \ + 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do + sleep 5 + done +else + # Smaller instances: HTTP /imp endpoint. + curl -sS -F data=@hits.csv 'http://localhost:9000/imp?name=hits&maxUncommittedRows=5000000' >/dev/null + + until [ "$(curl -sS -G --data-urlencode "query=select 1 from (select count() c from hits) where c = 99997497;" \ + 'http://localhost:9000/exec' | grep -c '"count":1')" -ge 1 ]; do + sleep 5 + done +fi + +rm -f hits.csv +sync diff --git a/questdb/query b/questdb/query new file mode 100755 index 000000000..1081c2d04 --- /dev/null +++ b/questdb/query @@ -0,0 +1,29 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via QuestDB HTTP /exec?timings=true. +# Stdout: query result (JSON). +# Stderr: query runtime in fractional seconds on the last line (parsed from +# the "timings.execute" field, in nanoseconds). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(curl -sS --max-time 600 -G --data-urlencode "query=${query}" \ + 'http://localhost:9000/exec?timings=true' 2>&1) + +if printf '%s\n' "$raw" | grep -q '"error"'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" + +# Parse "execute": from the timings JSON object. 
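+# The timings object in the /exec?timings=true response is nanosecond-based,
+# roughly of this shape (illustrative): {"timings":{"execute":1234567890, ...}, ...}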
+ns=$(printf '%s\n' "$raw" | grep -oP '"execute":\s*\K[0-9]+' | tail -n1) + +if [ -z "$ns" ]; then + echo "no timings.execute in questdb response" >&2 + exit 1 +fi + +awk -v n="$ns" 'BEGIN { printf "%.3f\n", n / 1000000000 }' >&2 diff --git a/questdb/run.sh b/questdb/run.sh deleted file mode 100755 index 0159343fd..000000000 --- a/questdb/run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -TRIES=3 - -questdb/bin/questdb.sh stop -questdb/bin/questdb.sh start -sleep 5 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - echo "$query"; - for i in $(seq 1 $TRIES); do - curl -sS --max-time 600 -G --data-urlencode "query=${query}" 'http://localhost:9000/exec?timings=true' 2>&1 | grep '"timings"' - echo - done; -done; - -questdb/bin/questdb.sh stop diff --git a/questdb/start b/questdb/start new file mode 100755 index 000000000..5958a3005 --- /dev/null +++ b/questdb/start @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# Idempotent: if HTTP API on :9000 already responds, do nothing. +if curl -sSf -G --data-urlencode 'query=SELECT 1' 'http://localhost:9000/exec' >/dev/null 2>&1; then + exit 0 +fi + +questdb/bin/questdb.sh start + +# Wait for HTTP port. +for _ in $(seq 1 60); do + if nc -z localhost 9000 2>/dev/null; then break; fi + sleep 1 +done + +# Tweak config (idempotent — sed -i with same value is safe). +mkdir -p ~/.questdb/conf +if [ -f ~/.questdb/conf/server.conf ]; then + sed -i 's/query.timeout.sec=60/query.timeout.sec=500/' ~/.questdb/conf/server.conf + sed -i "s|cairo.sql.copy.root=import|cairo.sql.copy.root=$PWD|" ~/.questdb/conf/server.conf + questdb/bin/questdb.sh stop + sleep 2 + questdb/bin/questdb.sh start +fi diff --git a/questdb/stop b/questdb/stop new file mode 100755 index 000000000..6f914fe75 --- /dev/null +++ b/questdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +questdb/bin/questdb.sh stop 2>/dev/null || true diff --git a/sail-partitioned/benchmark.sh b/sail-partitioned/benchmark.sh index 3909a58f1..3b63e772a 100755 --- a/sail-partitioned/benchmark.sh +++ b/sail-partitioned/benchmark.sh @@ -1,66 +1,5 @@ #!/bin/bash - -# https://github.com/rust-lang/rust/issues/97234#issuecomment-1133564556 -ulimit -n 65536 - -# Install - -export DEBIAN_FRONTEND=noninteractive - -# When you run Sail on Amazon Linux, you may encounter the following error: -# failed to get system time zone: No such file or directory (os error 2) -# The reason is that /etc/localtime is supposed to be a symlink when retrieving the system time zone, but on Amazon Linux it is a regular file. 
-# There is a GitHub issue for this problem, but it has not been resolved yet: https://github.com/amazonlinux/amazon-linux-2023/issues/526 -echo "Set Timezone" -export TZ=Etc/UTC -sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository ppa:deadsnakes/ppa -y -sudo apt-get update -y -sudo apt-get install -y \ - gcc protobuf-compiler \ - libprotobuf-dev \ - pkg-config \ - libssl-dev \ - python3.11 \ - python3.11-dev \ - python3.11-venv \ - python3.11-distutils - -echo "Set Python alternatives" -sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ - sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - -echo "Install Python packages" -python3 -m venv myenv -source myenv/bin/activate -pip install --upgrade setuptools wheel -pip install --no-cache-dir "pysail==0.5.2" -pip install "pyspark-client==4.1.1" \ - pandas \ - psutil - -# Load the data - -echo "Download benchmark target data, partitioned" -../download-hits-parquet-partitioned partitioned - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -bcs partitioned/hits*.parquet | grep total)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sail-partitioned/check b/sail-partitioned/check new file mode 100755 index 000000000..140fda4c1 --- /dev/null +++ b/sail-partitioned/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import pysail" >/dev/null diff --git a/sail-partitioned/data-size b/sail-partitioned/data-size new file mode 100755 index 000000000..503090478 --- /dev/null +++ b/sail-partitioned/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs partitioned/hits_*.parquet | awk '/total$/ { print $1 }' diff --git a/sail-partitioned/install b/sail-partitioned/install new file mode 100755 index 000000000..7f5fe3039 --- /dev/null +++ b/sail-partitioned/install @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +ulimit -n 65536 + +export DEBIAN_FRONTEND=noninteractive + +export TZ=Etc/UTC +sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +echo $TZ | sudo tee /etc/timezone >/dev/null + +if ! 
command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.cargo/env" + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository ppa:deadsnakes/ppa -y +sudo apt-get update -y +sudo apt-get install -y \ + gcc protobuf-compiler \ + libprotobuf-dev \ + pkg-config \ + libssl-dev \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + python3.11-distutils + +sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 || true +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 || true +if ! python3.11 -m pip --version >/dev/null 2>&1; then + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 +fi + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade setuptools wheel +pip install --no-cache-dir "pysail==0.5.2" +pip install "pyspark-client==4.1.1" pandas psutil diff --git a/sail-partitioned/load b/sail-partitioned/load new file mode 100755 index 000000000..c110e4372 --- /dev/null +++ b/sail-partitioned/load @@ -0,0 +1,8 @@ +#!/bin/bash +# sail-partitioned reads partitioned/*.parquet via Spark. Move the +# downloaded files into the expected subdir. +set -e + +mkdir -p partitioned +mv hits_*.parquet partitioned/ 2>/dev/null || true +sync diff --git a/sail-partitioned/query.py b/sail-partitioned/query similarity index 56% rename from sail-partitioned/query.py rename to sail-partitioned/query index 705550130..9b935bae7 100755 --- a/sail-partitioned/query.py +++ b/sail-partitioned/query @@ -1,44 +1,46 @@ -#!/usr/bin/env python3 +#!/bin/bash +# Reads a SQL query from stdin, runs it via pysail (Spark Connect server) +# against ./partitioned/*.parquet. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. 
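+# Hypothetical invocation: echo 'SELECT COUNT(*) FROM hits' | ./query
+# Each run spins up a fresh Spark Connect server; the timer starts only after
+# the session and the "hits" temp view exist, so startup cost is not measured.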
+set -e -from pysail.spark import SparkConnectServer -from pyspark.sql import SparkSession -import pyspark.sql.functions as F +ulimit -n 65536 -import timeit -import psutil -import sys -import re - -query = sys.stdin.read() -# Replace \1 to $1 because spark recognizes only this pattern style (in query 28) -query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) -print(query) +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 - <<'PY' import os +import re +import sys +import timeit + os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true" os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true" os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" +from pysail.spark import SparkConnectServer +from pyspark.sql import SparkSession + +query = sys.stdin.read() +query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) + server = SparkConnectServer() server.start() _, port = server.listening_address - spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() df = spark.read.parquet("partitioned") df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - res = result.toPandas() - end = timeit.default_timer() - if try_num == 0: - print(res) - print("Time: ", round(end - start, 3)) - except Exception as e: - print(e) - print("Failure!") +start = timeit.default_timer() +res = spark.sql(query).toPandas() +end = timeit.default_timer() + +print(res) spark.stop() + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/sail-partitioned/run.sh b/sail-partitioned/run.sh deleted file mode 100755 index 64df8c608..000000000 --- a/sail-partitioned/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/sail-partitioned/start b/sail-partitioned/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/sail-partitioned/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/sail-partitioned/stop b/sail-partitioned/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/sail-partitioned/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/sail/benchmark.sh b/sail/benchmark.sh index aef392d60..fc4bacc8f 100755 --- a/sail/benchmark.sh +++ b/sail/benchmark.sh @@ -1,66 +1,5 @@ #!/bin/bash - -# https://github.com/rust-lang/rust/issues/97234#issuecomment-1133564556 -ulimit -n 65536 - -# Install - -export DEBIAN_FRONTEND=noninteractive - -# When you run Sail on Amazon Linux, you may encounter the following error: -# failed to get system time zone: No such file or directory (os error 2) -# The reason is that /etc/localtime is supposed to be a symlink when retrieving the system time zone, but on Amazon Linux it is a regular file. 
-# There is a GitHub issue for this problem, but it has not been resolved yet: https://github.com/amazonlinux/amazon-linux-2023/issues/526 -echo "Set Timezone" -export TZ=Etc/UTC -sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -echo "Install Rust" -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh -bash rust-init.sh -y -export HOME=${HOME:=~} -source ~/.cargo/env - -echo "Install Dependencies" -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository ppa:deadsnakes/ppa -y -sudo apt-get update -y -sudo apt-get install -y \ - gcc protobuf-compiler \ - libprotobuf-dev \ - pkg-config \ - libssl-dev \ - python3.11 \ - python3.11-dev \ - python3.11-venv \ - python3.11-distutils - -echo "Set Python alternatives" -sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ - sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - -echo "Install Python packages" -python3 -m venv myenv -source myenv/bin/activate -pip install --upgrade setuptools wheel -pip install --no-cache-dir "pysail==0.5.2" -pip install "pyspark-client==4.1.1" \ - pandas \ - psutil - -# Load the data - -echo "Download benchmark target data, single file" -../download-hits-parquet-single - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sail/check b/sail/check new file mode 100755 index 000000000..140fda4c1 --- /dev/null +++ b/sail/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c "import pysail" >/dev/null diff --git a/sail/data-size b/sail/data-size new file mode 100755 index 000000000..708c0b72e --- /dev/null +++ b/sail/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.parquet diff --git a/sail/install b/sail/install new file mode 100755 index 000000000..95be3ca1e --- /dev/null +++ b/sail/install @@ -0,0 +1,50 @@ +#!/bin/bash +set -e + +ulimit -n 65536 + +export DEBIAN_FRONTEND=noninteractive + +# Sail needs a real /etc/localtime symlink; make sure it exists. +export TZ=Etc/UTC +sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +echo $TZ | sudo tee /etc/timezone >/dev/null + +# Rust toolchain (used by pysail's native build, if needed). +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.cargo/env" + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository ppa:deadsnakes/ppa -y +sudo apt-get update -y +sudo apt-get install -y \ + gcc protobuf-compiler \ + libprotobuf-dev \ + pkg-config \ + libssl-dev \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + python3.11-distutils + +sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 || true +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 || true +if ! 
python3.11 -m pip --version >/dev/null 2>&1; then + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 +fi + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate + +pip install --upgrade setuptools wheel +pip install --no-cache-dir "pysail==0.5.2" +pip install "pyspark-client==4.1.1" pandas psutil diff --git a/sail/load b/sail/load new file mode 100755 index 000000000..0618f93f6 --- /dev/null +++ b/sail/load @@ -0,0 +1,5 @@ +#!/bin/bash +# sail reads hits.parquet directly via Spark's parquet reader. No persistent +# DB to load. +set -e +sync diff --git a/sail/query.py b/sail/query similarity index 56% rename from sail/query.py rename to sail/query index 78ce8cf00..3c95d022e 100755 --- a/sail/query.py +++ b/sail/query @@ -1,44 +1,47 @@ -#!/usr/bin/env python3 +#!/bin/bash +# Reads a SQL query from stdin, runs it via pysail (Spark Connect server) +# against hits.parquet. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +set -e -from pysail.spark import SparkConnectServer -from pyspark.sql import SparkSession -import pyspark.sql.functions as F +ulimit -n 65536 -import timeit -import psutil -import sys -import re - -query = sys.stdin.read() -# Replace \1 to $1 because spark recognizes only this pattern style (in query 28) -query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) -print(query) +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 - <<'PY' import os +import re +import sys +import timeit + os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true" os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true" os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" +from pysail.spark import SparkConnectServer +from pyspark.sql import SparkSession + +query = sys.stdin.read() +# Spark expects $1 instead of \1 in REGEXP_REPLACE. 
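+# (Only the replacement argument of REGEXP_REPLACE is rewritten to the
+# '$1' group-reference style; the search pattern itself is left untouched.)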
+query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('1')""", r"\1'$1'", query) + server = SparkConnectServer() server.start() _, port = server.listening_address - spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() df = spark.read.parquet("hits.parquet") df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - res = result.toPandas() - end = timeit.default_timer() - if try_num == 0: - print(res) - print("Time: ", round(end - start, 3)) - except Exception as e: - print(e) - print("Failure!") +start = timeit.default_timer() +res = spark.sql(query).toPandas() +end = timeit.default_timer() + +print(res) spark.stop() + +print(f"{end - start:.3f}", file=sys.stderr) +PY diff --git a/sail/run.sh b/sail/run.sh deleted file mode 100755 index 64df8c608..000000000 --- a/sail/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/sail/start b/sail/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/sail/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/sail/stop b/sail/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/sail/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/selectdb/benchmark.sh b/selectdb/benchmark.sh index e92f667aa..531bd6503 100755 --- a/selectdb/benchmark.sh +++ b/selectdb/benchmark.sh @@ -1,127 +1,5 @@ #!/bin/bash -set -e - -# This benchmark should run on Ubuntu 20.04 - -# Install -ROOT=$(pwd) - -if [[ -n "$1" ]]; then - url="$1" -else - url='https://qa-build.oss-cn-beijing.aliyuncs.com/enterprise-doris-release-output/selectdb-doris-2.1.7-rc01-bin-x64.tar.gz' -fi -# Download -file_name="$(basename ${url})" -if [[ "$url" == "http"* ]]; then - if [[ ! -f $file_name ]]; then - wget --continue --progress=dot:giga ${url} - else - echo "$file_name already exists, no need to download." - fi -fi -dir_name="${file_name/.tar.gz/}" - -# Try to stop SelectDB and remove it first if execute this script multiple times -set +e -"$dir_name"/selectdb-doris-2.1.7-rc01-bin-x64/fe/bin/stop_fe.sh -"$dir_name"/selectdb-doris-2.1.7-rc01-bin-x64/be/bin/stop_be.sh -rm -rf "$dir_name" -set -e - -# Uncompress -mkdir "$dir_name" -tar zxf "$file_name" -C "$dir_name" -DORIS_HOME="$ROOT/$dir_name/selectdb-doris-2.1.7-rc01-bin-x64" -export DORIS_HOME - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jdk -sudo apt-get install -y mysql-client -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -set +e -sudo systemctl disable unattended-upgrades -sudo systemctl stop unattended-upgrades -sudo systemctl stop mysql-server -set -e - -"$DORIS_HOME"/fe/bin/start_fe.sh --daemon - -# Start Backend -sudo sysctl -w vm.max_map_count=2000000 -ulimit -n 65535 -"$DORIS_HOME"/be/bin/start_be.sh --daemon - -# Wait for Frontend ready -for _ in {1..300} -do - fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' | cut -f16 | sed -n '2,$p') - if [[ -n "${fe_version}" ]] && [[ "${fe_version}" != "NULL" ]]; then - echo "Frontend version: ${fe_version}" - break - else - echo 'Wait for Frontend ready ...' 
- sleep 2 - fi -done - -# Setup cluster, add Backend to cluster -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050' " - -# Wait for Backend ready -for _ in {1..300} -do - be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' | cut -f22 | sed -n '2,$p') - if [[ -n "${be_version}" ]]; then - echo "Backend version: ${be_version}" - break - else - echo 'Wait for Backend ready ...' - sleep 2 - fi -done - -echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - -# Create Database and table -mysql -h 127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" -sleep 5 -mysql -h 127.0.0.1 -P9030 -uroot hits <"$ROOT"/create.sql - -# Download data -if [[ ! -f hits.tsv.gz ]] && [[ ! -f hits.tsv ]]; then - ../download-hits-tsv -fi - -# Load data -echo "start loading hits.tsv, estimated to take about 9 minutes ..." -date -START=$(date +%s) -curl --location-trusted \ - -u root: \ - -T "hits.tsv" \ - -H "label:hits" \ - -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ - http://localhost:8030/api/hits/hits/_stream_load -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" -echo "$LOADTIME" > loadtime - -# Dataset contains 99997497 rows, storage size is about 17319588503 bytes -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" -du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size - -echo "Data size: $(cat storage_size)" - -# Run queries -./run.sh 2>&1 | tee -a log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/selectdb/check b/selectdb/check new file mode 100755 index 000000000..c6e836c8c --- /dev/null +++ b/selectdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null diff --git a/selectdb/data-size b/selectdb/data-size new file mode 100755 index 000000000..2b26d0f6b --- /dev/null +++ b/selectdb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +du -bs "$DORIS_HOME/be/storage/" | cut -f1 diff --git a/selectdb/install b/selectdb/install new file mode 100755 index 000000000..56b999c9b --- /dev/null +++ b/selectdb/install @@ -0,0 +1,31 @@ +#!/bin/bash +set -e + +# This benchmark runs on Ubuntu 20.04+ +ROOT=$(pwd) +URL='https://qa-build.oss-cn-beijing.aliyuncs.com/enterprise-doris-release-output/selectdb-doris-2.1.7-rc01-bin-x64.tar.gz' + +file_name="$(basename "$URL")" +dir_name="${file_name/.tar.gz/}" +DORIS_HOME="$ROOT/$dir_name/selectdb-doris-2.1.7-rc01-bin-x64" + +if [ ! -d "$DORIS_HOME" ]; then + if [ ! -f "$file_name" ]; then + wget --continue --progress=dot:giga "$URL" + fi + mkdir -p "$dir_name" + tar zxf "$file_name" -C "$dir_name" +fi + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jdk mysql-client bc + +set +e +sudo systemctl disable unattended-upgrades 2>/dev/null +sudo systemctl stop unattended-upgrades 2>/dev/null +sudo systemctl stop mysql-server 2>/dev/null +set -e + +sudo sysctl -w vm.max_map_count=2000000 + +echo "$DORIS_HOME" > .doris_home diff --git a/selectdb/load b/selectdb/load new file mode 100755 index 000000000..57ded740d --- /dev/null +++ b/selectdb/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +ROOT=$(pwd) + +# Idempotent: drop+create database. 
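+# Stream Load labels must be unique, so the curl below tags each attempt with
+# label:hits_<epoch>; re-running load after a failure therefore does not
+# collide with an earlier "hits" label.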
+mysql -h127.0.0.1 -P9030 -uroot -e "DROP DATABASE IF EXISTS hits" +mysql -h127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" +sleep 5 +mysql -h127.0.0.1 -P9030 -uroot hits < "$ROOT/create.sql" + +curl --location-trusted \ + -u root: \ + -T "hits.tsv" \ + -H "label:hits_$(date +%s)" \ + -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ + http://localhost:8030/api/hits/hits/_stream_load + +rm -f hits.tsv hits.tsv.gz +sync diff --git a/selectdb/query b/selectdb/query new file mode 100755 index 000000000..289dd078e --- /dev/null +++ b/selectdb/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against SelectDB's `hits` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Per-query BE cache flush (matches original run.sh behavior). +curl -sS http://127.0.0.1:8040/api/clear_cache/all >/dev/null 2>&1 || true + +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? 
in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/selectdb/run.sh b/selectdb/run.sh deleted file mode 100755 index 57408f9dd..000000000 --- a/selectdb/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TRIES=3 - -while read -r query; do - curl -sS http://127.0.0.1:8040/api/clear_cache/all - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" - done - -done < queries.sql diff --git a/selectdb/start b/selectdb/start new file mode 100755 index 000000000..0252bc48f --- /dev/null +++ b/selectdb/start @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +DORIS_HOME=$(cat .doris_home) +export DORIS_HOME +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH=$JAVA_HOME/bin:$PATH + +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +ulimit -n 65535 + +"$DORIS_HOME/fe/bin/start_fe.sh" --daemon +"$DORIS_HOME/be/bin/start_be.sh" --daemon + +for _ in $(seq 1 300); do + fe_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show frontends' 2>/dev/null | cut -f16 | sed -n '2,$p') + if [ -n "$fe_version" ] && [ "$fe_version" != "NULL" ]; then + break + fi + sleep 2 +done + +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'" 2>/dev/null || true + +for _ in $(seq 1 300); do + be_version=$(mysql -h127.0.0.1 -P9030 -uroot -e 'show backends' 2>/dev/null | cut -f22 | sed -n '2,$p') + if [ -n "$be_version" ]; then + break + fi + sleep 2 +done diff --git a/selectdb/stop b/selectdb/stop new file mode 100755 index 000000000..4d724b5d3 --- /dev/null +++ b/selectdb/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +DORIS_HOME=$(cat .doris_home 2>/dev/null) || exit 0 +"$DORIS_HOME/fe/bin/stop_fe.sh" 2>/dev/null || true +"$DORIS_HOME/be/bin/stop_be.sh" 2>/dev/null || true +exit 0 diff --git a/siglens/benchmark.sh b/siglens/benchmark.sh index eeff99bfa..13a9c1c84 100755 --- a/siglens/benchmark.sh +++ b/siglens/benchmark.sh @@ -1,26 +1,8 @@ #!/bin/bash - -# Requires at least 300GB of free disk space on the main partition for the dataset, intermediate files, and SigLens data. - -echo "Install prerequisites" -sudo apt-get install -y git golang - -echo "Get and build SigLens" -git clone https://github.com/siglens/siglens.git --branch 1.0.54 -cd siglens -go mod tidy -go build -o siglens cmd/siglens/main.go -./siglens &> siglens.out & -cd .. - -echo "Download and unzip dataset" -sudo apt-get install -y pigz -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' -pigz -d -f hits.json.gz - -echo "Load data into SigLens, this can take a few hours" -echo -n "Load time: " -command time -f '%e' python3 send_datawithactionline.py - -echo "Run queries" -./run.sh +# Thin shim — actual flow is in lib/benchmark-common.sh. +# siglens ingests its own gzipped NDJSON; ./load fetches it directly. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +# queries are SPL/Splunk QL, not SQL. 
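+# (BENCH_QUERIES_FILE is assumed to default to queries.sql in the harness;
+# overriding it here points the shared query loop at the SPL file instead.)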
+export BENCH_QUERIES_FILE="queries.spl" +exec ../lib/benchmark-common.sh diff --git a/siglens/check b/siglens/check new file mode 100755 index 000000000..242bd8667 --- /dev/null +++ b/siglens/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# A search-API hit is sufficient — `/api/health` may not exist on this version. +curl -sSf -o /dev/null --max-time 5 'http://localhost:5122/' \ + || curl -sSf -o /dev/null --max-time 5 'http://localhost:8081/' diff --git a/siglens/data-size b/siglens/data-size new file mode 100755 index 000000000..457b6d5b7 --- /dev/null +++ b/siglens/data-size @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +# siglens stores ingested data under the data/ directory inside the repo. +if [ -d siglens/data ]; then + du -bcs siglens/data | grep total | awk '{print $1}' +elif [ -d siglens/ingestnodes ]; then + du -bcs siglens/ingestnodes siglens/querynodes 2>/dev/null | grep total | awk '{print $1}' +else + du -bcs siglens | grep total | awk '{print $1}' +fi diff --git a/siglens/install b/siglens/install new file mode 100755 index 000000000..d2e0ea87c --- /dev/null +++ b/siglens/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y git golang pigz python3 python3-pip jq bc + +if [ ! -d siglens ]; then + git clone https://github.com/siglens/siglens.git --branch 1.0.54 +fi + +cd siglens +go mod tidy +if [ ! -x ./siglens ]; then + go build -o siglens cmd/siglens/main.go +fi + +# load script uses requests. +pip3 install --quiet --break-system-packages requests || pip3 install --quiet requests diff --git a/siglens/load b/siglens/load new file mode 100755 index 000000000..dde5982cf --- /dev/null +++ b/siglens/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +pigz -d -f hits.json.gz + +python3 send_datawithactionline.py + +rm -f hits.json +sync diff --git a/siglens/query b/siglens/query new file mode 100755 index 000000000..1ca8b5a91 --- /dev/null +++ b/siglens/query @@ -0,0 +1,39 @@ +#!/bin/bash +# Reads a SigLens SPL/Splunk QL query from stdin, runs it via the search API. +# Stdout: query response (JSON). +# Stderr: query runtime in fractional seconds on the last line (wall-clock). +# Exit non-zero on error. +set -e + +querytxt=$(cat) + +# A "null" query in queries.spl means "not supported"; emit null timing. 
+if [ "$querytxt" = "null" ]; then + echo "{}" + echo "null" >&2 + exit 0 +fi + +JSON=$(jq -nc --arg q "$querytxt" '{ + state: "query", + searchText: $q, + startEpoch: "now-9000d", + endEpoch: "now", + indexName: "hits", + from: 0, + queryLanguage: "Splunk QL" +}') + +t1=$(date +%s.%N) +resp=$(curl -sS -k -X POST 'http://localhost:5122/api/search' \ + -H 'Content-Type: application/json' -d "$JSON") +t2=$(date +%s.%N) + +if [ "$(jq 'has("error")' <<<"$resp")" = "true" ]; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" + +awk -v a="$t1" -v b="$t2" 'BEGIN { printf "%.3f\n", b - a }' >&2 diff --git a/siglens/run.sh b/siglens/run.sh deleted file mode 100755 index bf378db72..000000000 --- a/siglens/run.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -TRIES=3 - -QUERY_NUM=0 - -cat 'queries.spl' | while read -r QUERYTXT; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo -n "[" - - for i in $(seq 1 $TRIES); do - if [[ $QUERYTXT != "null" ]]; then - JSON="{ - \"state\": \"query\", - \"searchText\": \"$QUERYTXT\", - \"startEpoch\": \"now-9000d\", - \"endEpoch\": \"now\", - \"indexName\": \"hits\", - \"from\": 0, - \"queryLanguage\": \"Splunk QL\" - }" - - # start external timer - START=$(date +%s.%N) - - # Run Query directly through search API - SIG_RSP=$(curl -s -k -X POST "http://localhost:5122/api/search" -H 'Content-Type: application/json' -d"$JSON") - - # calculate timing outside of SigLens - END=$(date +%s.%N) - RES=$(echo "$END - $START" | bc -l | xargs printf "%.3f") - - # if SigLens returned an error, print null - [[ "$(jq 'has("error")' <<<$SIG_RSP)" == "true" ]] && echo -n "null" || echo -n "$RES" - [[ "$i" != $TRIES ]] && echo -n ", " - - if [[ "$(jq 'has("error")' <<<"$SIG_RSP")" == "true" ]]; then - echo -e "\n\nSigLens got error for query: $QUERYTXT" - echo $SIG_RSP - echo -e "\n" - FINAL_TIME="null" - else - FINAL_TIME="$RES" - fi - # output to result file - echo "${QUERY_NUM},${i},${FINAL_TIME}" >>result.csv - else - # Queries that are not supported write null for them - echo -n "null, " - echo "${QUERY_NUM},${i},null" >>result.csv - fi - done - - echo "]," - QUERY_NUM=$((QUERY_NUM + 1)) - -done diff --git a/siglens/start b/siglens/start new file mode 100755 index 000000000..30ccbdfc5 --- /dev/null +++ b/siglens/start @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# Idempotent: ports 5122 (search API) and 8081 (ingest API) are siglens'. +if curl -sSf 'http://localhost:5122/api/health' >/dev/null 2>&1 \ + || curl -sSf 'http://localhost:8081' >/dev/null 2>&1; then + exit 0 +fi + +cd siglens +nohup ./siglens > siglens.out 2>&1 & +disown diff --git a/siglens/stop b/siglens/stop new file mode 100755 index 000000000..e2a18feaa --- /dev/null +++ b/siglens/stop @@ -0,0 +1,8 @@ +#!/bin/bash + +pkill -x siglens 2>/dev/null || true +for _ in $(seq 1 15); do + pgrep -x siglens >/dev/null 2>&1 || exit 0 + sleep 1 +done +pkill -9 -x siglens 2>/dev/null || true diff --git a/sirius/benchmark.sh b/sirius/benchmark.sh index d1d1304e7..fc4bacc8f 100755 --- a/sirius/benchmark.sh +++ b/sirius/benchmark.sh @@ -1,63 +1,5 @@ #!/bin/bash -# ClickBench benchmark for Sirius (GPU-accelerated DuckDB extension) -# -# Usage: ./benchmark.sh -# Prerequisites: NVIDIA GPU with CUDA driver, internet access - -source dependencies.sh - -# Verify pixi is available -if ! command -v pixi &> /dev/null; then - echo "Error: pixi not found. Check dependencies.sh output." - exit 1 -fi - -# --------------------------------------------------------------------------- -# 1. 
Build Sirius -# --------------------------------------------------------------------------- -rm -rf sirius -git clone --recurse-submodules https://github.com/sirius-db/sirius.git -cd sirius - -set -e - -pixi install -export LIBCUDF_ENV_PREFIX="$(pwd)/.pixi/envs/default" -pixi run make -j"$(nproc)" - -# Make the build artifacts available -eval "$(pixi shell-hook)" -export PATH="$(pwd)/build/release:$PATH" -cd .. - -set +e - -# --------------------------------------------------------------------------- -# 2. Load data -# --------------------------------------------------------------------------- -../download-hits-parquet-single - -echo -n "Load time: " -command time -f '%e' duckdb hits.db -f create.sql -f load.sql - -# --------------------------------------------------------------------------- -# 3. Run benchmark -# --------------------------------------------------------------------------- -./run.sh 2>&1 | tee log.txt - -echo -n "Data size: " -wc -c hits.db - -# --------------------------------------------------------------------------- -# 4. Format results -# --------------------------------------------------------------------------- -cat log.txt | \ - grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | \ - sed -r -e 's/^.(Killed|Segmentation).$/null\nnull\nnull/; s/^Run Time \(s\): real\s*([0-9.]+).*$/\1/' | \ - awk '{ - buf[i++] = $1 - if (i == 4) { - printf "[%s,%s,%s],\n", buf[1], buf[2], buf[3] - i = 0 - } - }' \ No newline at end of file +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sirius/check b/sirius/check new file mode 100755 index 000000000..0c4b301a2 --- /dev/null +++ b/sirius/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sf http://127.0.0.1:8000/health >/dev/null diff --git a/sirius/data-size b/sirius/data-size new file mode 100755 index 000000000..365ad4ecc --- /dev/null +++ b/sirius/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +curl -sS http://127.0.0.1:8000/data-size | python3 -c 'import json,sys; print(json.load(sys.stdin)["bytes"])' diff --git a/sirius/dependencies.sh b/sirius/dependencies.sh deleted file mode 100755 index bf8c22513..000000000 --- a/sirius/dependencies.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -#!/bin/bash -# Install system dependencies required to build Sirius and run ClickBench. - -echo "Installing system dependencies..." -sudo apt-get update -y -sudo apt-get install -y git wget curl build-essential time - -# Install Pixi package manager (used to manage CUDA/cuDF toolchain) -if ! command -v pixi &> /dev/null; then - echo "Pixi not found. Installing..." - curl -fsSL https://pixi.sh/install.sh | sudo PIXI_BIN_DIR=/usr/local/bin PIXI_NO_PATH_UPDATE=1 bash -fi - -echo "All dependencies installed." diff --git a/sirius/install b/sirius/install new file mode 100755 index 000000000..7a0206f65 --- /dev/null +++ b/sirius/install @@ -0,0 +1,41 @@ +#!/bin/bash +# Install Sirius (GPU-accelerated DuckDB extension) and the Python deps for +# this benchmark wrapper. +set -e + +# 1. System deps + pixi (CUDA/cuDF toolchain manager). +sudo apt-get update -y +sudo apt-get install -y git wget curl build-essential time python3-pip python3-venv + +if ! command -v pixi &>/dev/null; then + curl -fsSL https://pixi.sh/install.sh | sudo PIXI_BIN_DIR=/usr/local/bin PIXI_NO_PATH_UPDATE=1 bash +fi + +# 2. Build Sirius. +if [ ! 
-d sirius ]; then + git clone --recurse-submodules https://github.com/sirius-db/sirius.git +fi + +( + cd sirius + pixi install + export LIBCUDF_ENV_PREFIX="$(pwd)/.pixi/envs/default" + pixi run make -j"$(nproc)" +) + +# 3. Python venv for the FastAPI wrapper. +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install --quiet fastapi uvicorn + +# 4. Pre-baked env file used by start to put the duckdb+sirius binary on PATH +# and pull in the pixi shell-hook variables. +cat > .sirius_env <<'EOF' +SIRIUS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)/sirius" +eval "$(cd "$SIRIUS_DIR" && pixi shell-hook)" +export PATH="$SIRIUS_DIR/build/release:$PATH" +export LIBCUDF_ENV_PREFIX="$SIRIUS_DIR/.pixi/envs/default" +EOF diff --git a/sirius/load b/sirius/load new file mode 100755 index 000000000..0fe744054 --- /dev/null +++ b/sirius/load @@ -0,0 +1,29 @@ +#!/bin/bash +# Build hits.db from hits.parquet using the duckdb CLI (create.sql + load.sql), +# then ask the running server to initialise GPU buffers. +set -e + +if [ -f .sirius_env ]; then + # shellcheck disable=SC1091 + source .sirius_env +fi + +# Ingest into a local duckdb file. Done via CLI (not the server) because the +# server already holds the file open; we want this to be a one-shot job. +./stop || true +duckdb hits.db -f create.sql -f load.sql + +./start +# Wait for the freshly-restarted server to be healthy. +for _ in $(seq 1 60); do + if ./check >/dev/null 2>&1; then + break + fi + sleep 1 +done + +elapsed=$(curl -sS -X POST http://127.0.0.1:8000/load | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])') +echo "Load (server-reported, gpu_buffer_init): ${elapsed}s" + +rm -f hits.parquet +sync diff --git a/sirius/query b/sirius/query new file mode 100755 index 000000000..7fbdef441 --- /dev/null +++ b/sirius/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a SQL query from stdin, dispatches to the running sirius server. +# Stdout: server response JSON (small). +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. 
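+# The response JSON looks like {"elapsed": <seconds>, "index": <query #>}; note
+# the server only accepts queries that appear verbatim in queries.sql (anything
+# else returns HTTP 404).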
+set -e + +query=$(cat) + +tmp=$(mktemp) +status=$(curl -sS -o "$tmp" -w '%{http_code}' \ + -X POST --data-binary @- http://127.0.0.1:8000/query <<<"$query") + +body=$(cat "$tmp") +rm -f "$tmp" + +if [ "$status" != "200" ]; then + echo "query failed: HTTP $status: $body" >&2 + exit 1 +fi + +echo "$body" +echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 diff --git a/sirius/run.sh b/sirius/run.sh deleted file mode 100755 index ac5dd8243..000000000 --- a/sirius/run.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -TRIES=3 -GPU_CACHING_SIZE='80 GB' -GPU_PROCESSING_SIZE='40 GB' -CPU_PROCESSING_SIZE="100 GB" - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - cli_params=() - cli_params+=("-c") - cli_params+=(".timer on") - cli_params+=("-c") - cli_params+=("call gpu_buffer_init(\"${GPU_CACHING_SIZE}\", \"${GPU_PROCESSING_SIZE}\", pinned_memory_size = \"${CPU_PROCESSING_SIZE}\");") - for i in $(seq 1 $TRIES); do - cli_params+=("-c") - cli_params+=("call gpu_processing(\"${query}\");") - done; - echo "${cli_params[@]}" - duckdb hits.db "${cli_params[@]}" -done; diff --git a/sirius/server.py b/sirius/server.py new file mode 100644 index 000000000..d2cc4734c --- /dev/null +++ b/sirius/server.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +"""FastAPI wrapper around Sirius (GPU-accelerated DuckDB extension) so it +conforms to the ClickBench install/start/check/stop/load/query interface. + +Sirius is a DuckDB extension built from source; queries run on the GPU via +``call gpu_processing("");``. This server manages a long-lived ``duckdb`` +CLI subprocess so the GPU buffers initialised on /load remain hot across +queries. + +Routes: + GET /health -> 200 OK once the CLI subprocess is ready + POST /load -> opens hits.db, calls gpu_buffer_init, returns + {"elapsed": }. (Schema/data are loaded by + ./load before this runs.) + POST /query -> body: SQL text. Looks it up in QUERIES, runs it via + gpu_processing, returns {"elapsed": }. + GET /data-size -> bytes of hits.db on disk. +""" + +import os +import re +import subprocess +import threading +import timeit + +import uvicorn +from fastapi import FastAPI, HTTPException, Request + +GPU_CACHING_SIZE = os.environ.get("SIRIUS_GPU_CACHING_SIZE", "80 GB") +GPU_PROCESSING_SIZE = os.environ.get("SIRIUS_GPU_PROCESSING_SIZE", "40 GB") +CPU_PROCESSING_SIZE = os.environ.get("SIRIUS_CPU_PROCESSING_SIZE", "100 GB") + +DB_PATH = os.environ.get("SIRIUS_DB", "hits.db") + +app = FastAPI() +proc: subprocess.Popen | None = None +proc_lock = threading.Lock() +buffers_initialized = False + +# Sentinel sent after each command to detect completion in stdout. +SENTINEL = "__SIRIUS_DONE__" + + +# Read query strings from queries.sql (canonical) on import. We expose the +# same shape as the pandas pilot — (sql, callable). The callable runs the +# SQL via gpu_processing on the persistent duckdb session. +def _load_query_strings() -> list[str]: + here = os.path.dirname(os.path.abspath(__file__)) + qpath = os.path.join(here, "queries.sql") + with open(qpath) as f: + return [line.rstrip("\n") for line in f if line.strip()] + + +_SQL_LIST = _load_query_strings() + + +def _make_runner(sql: str): + return lambda: _run_gpu(sql) + + +QUERIES: list[tuple[str, callable]] = [(sql, _make_runner(sql)) for sql in _SQL_LIST] +QUERY_INDEX = {sql: i for i, (sql, _) in enumerate(QUERIES)} + + +def _spawn_duckdb() -> subprocess.Popen: + # Open a persistent duckdb CLI session against hits.db. 
The Sirius build + # places the duckdb binary on PATH (see install). + return subprocess.Popen( + ["duckdb", DB_PATH], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + +def _send(cmd: str) -> str: + """Send a SQL/CLI command to duckdb and read until the sentinel. Returns + the raw output (excluding the sentinel line).""" + assert proc is not None and proc.stdin is not None and proc.stdout is not None + with proc_lock: + proc.stdin.write(cmd.rstrip(";") + ";\n") + proc.stdin.write(f"select '{SENTINEL}';\n") + proc.stdin.flush() + out_lines: list[str] = [] + while True: + line = proc.stdout.readline() + if not line: + raise RuntimeError("duckdb subprocess closed unexpectedly") + if SENTINEL in line: + # Drain the trailing border row from the boxed select output. + # DuckDB emits the table for `select '...'` as several lines; + # readline on the SENTINEL line is enough — subsequent lines + # belong to the next command. + break + out_lines.append(line) + return "".join(out_lines) + + +def _run_gpu(sql: str) -> str: + # Wrap user SQL inside gpu_processing("..."); escape embedded double quotes. + escaped = sql.replace('"', '\\"') + return _send(f'call gpu_processing("{escaped}")') + + +@app.get("/health") +def health(): + if proc is None or proc.poll() is not None: + raise HTTPException(status_code=503, detail="duckdb subprocess not running") + return {"ok": True} + + +@app.on_event("startup") +def _startup(): + global proc + proc = _spawn_duckdb() + # Quiet down the CLI a bit. + _send(".mode list") + + +@app.on_event("shutdown") +def _shutdown(): + global proc + if proc is not None: + try: + proc.stdin.write(".quit\n") + proc.stdin.flush() + except Exception: + pass + try: + proc.wait(timeout=5) + except Exception: + proc.kill() + proc = None + + +@app.post("/load") +def load(): + """For Sirius the on-disk DuckDB database is created by the ``./load`` + script (which runs create.sql + load.sql). Here we just initialise the + GPU buffers on the persistent connection so subsequent queries are warm. + """ + global buffers_initialized + start = timeit.default_timer() + if not buffers_initialized: + _send( + f'call gpu_buffer_init("{GPU_CACHING_SIZE}", "{GPU_PROCESSING_SIZE}", ' + f'pinned_memory_size = "{CPU_PROCESSING_SIZE}")' + ) + buffers_initialized = True + elapsed = round(timeit.default_timer() - start, 3) + return {"elapsed": elapsed} + + +@app.post("/query") +async def query(request: Request): + body = (await request.body()).decode("utf-8").strip() + idx = QUERY_INDEX.get(body) + if idx is None: + raise HTTPException(status_code=404, detail=f"unknown query: {body[:120]}") + sql = QUERIES[idx][0] + start = timeit.default_timer() + out = _run_gpu(sql) + elapsed = round(timeit.default_timer() - start, 3) + # If duckdb reports an error, surface it. + if re.search(r"\bError\b", out): + raise HTTPException(status_code=500, detail=out.strip()[:500]) + return {"elapsed": elapsed, "index": idx} + + +@app.get("/data-size") +def data_size(): + try: + return {"bytes": int(os.path.getsize(DB_PATH))} + except OSError: + return {"bytes": 0} + + +if __name__ == "__main__": + port = int(os.environ.get("BENCH_SIRIUS_PORT", "8000")) + uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning") diff --git a/sirius/start b/sirius/start new file mode 100755 index 000000000..9b24312b5 --- /dev/null +++ b/sirius/start @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# Idempotent: if already running, leave it alone. 
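+# (`kill -0` delivers no signal; it only checks that the PID exists and is
+# signalable, so a stale server.pid from a crashed run fails the test.)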
+if [ -f server.pid ] && kill -0 "$(cat server.pid)" 2>/dev/null; then + exit 0 +fi + +# Source the env populated by ./install so the sirius-built duckdb binary is +# on PATH and CUDA/cuDF libs resolve correctly. +if [ -f .sirius_env ]; then + # shellcheck disable=SC1091 + source .sirius_env +fi + +# shellcheck disable=SC1091 +source myenv/bin/activate +nohup python3 server.py >server.log 2>&1 & +echo $! > server.pid diff --git a/sirius/stop b/sirius/stop new file mode 100755 index 000000000..787b35abc --- /dev/null +++ b/sirius/stop @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -f server.pid ]; then + pid=$(cat server.pid) + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" || true + # Wait up to 10s for graceful exit. + for _ in $(seq 1 10); do + if ! kill -0 "$pid" 2>/dev/null; then + break + fi + sleep 1 + done + kill -9 "$pid" 2>/dev/null || true + fi + rm -f server.pid +fi diff --git a/spark-auron/benchmark.sh b/spark-auron/benchmark.sh index 059d790f3..fc4bacc8f 100755 --- a/spark-auron/benchmark.sh +++ b/spark-auron/benchmark.sh @@ -1,91 +1,5 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) -# -# Highlights: -# - pyspark==3.5.6 version is used (latest stable for Auron 5.0.0) -# - Auron installation is added -# - auto-save results - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==3.5.5 psutil - -# Load the data - -../download-hits-parquet-single - -# Install Auron - -AURON_JAR_URL='https://github.com/apache/auron/releases/download/v5.0.0/blaze-engine-spark-3.5-release-5.0.0-SNAPSHOT.jar' - -wget --continue --progress=dot:giga $AURON_JAR_URL -O auron.jar - -# Run the queries - -./run.sh >log.txt 2>&1 - -# Print results to stdout as required -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -DATA_SIZE=$(du -b hits.parquet | cut -f1) - -echo "Data size: $DATA_SIZE" -echo "Load time: 0" - -# Save results as JSON - -MACHINE="${1:-c6a.4xlarge}" # Use first argument as machine name, default to c6a.4xlarge -AURON_VERSION=$(echo $AURON_JAR_URL | grep -Po "\d.\d.\d" | head -n 1) -SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) - -mkdir -p results - -( -cat << EOF -{ - "system": "Spark (Auron)", - "date": "$(date +%Y-%m-%d)", - "machine": "${MACHINE}", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "comment": "Using Auron ${AURON_VERSION} with Spark ${SPARK_VERSION}", - "tags": ["Java", "Rust", "column-oriented", "Spark derivative", "DataFusion", "Parquet"], - "load_time": 0, - "data_size": ${DATA_SIZE}, - "result": [ -EOF - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' - { - if (i % 3 == 0) printf "\t\t["; - if ($1 == "null") printf "null"; - else printf "%.3f", $1; - if (i % 3 != 2) printf ", "; - else { - if (i < total - 1) printf "],\n"; - else printf "]"; - } - i++; - }' - -cat << EOF - - ] -} -EOF -) > "results/${MACHINE}.json" - -echo "Results have been saved to results/${MACHINE}.json" - +# Thin shim — actual flow is in lib/benchmark-common.sh. 
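None of the per-system shims show lib/benchmark-common.sh itself, so the shape of the shared driver has to be inferred. A rough sketch of the flow these scripts are written against, mirroring the deleted run.sh loops; every detail below (TRIES, the restart-on-cold-run behaviour, the omission of result collection) is an assumption, not part of this diff:

    #!/bin/bash
    set -e

    TRIES=3

    ./install
    "../${BENCH_DOWNLOAD_SCRIPT:?}"     # e.g. download-hits-parquet-single
    ./start
    ./check
    ./load

    while read -r query; do
        sync
        echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
        # Server systems (BENCH_RESTARTABLE=yes) could be bounced here so the
        # first try is a genuinely cold run.
        if [ "${BENCH_RESTARTABLE:-no}" = yes ]; then
            ./stop && ./start && ./check
        fi
        for _ in $(seq 1 "$TRIES"); do
            printf '%s' "$query" | ./query
        done
    done < queries.sql

    ./data-size
    ./stop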
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-auron/check b/spark-auron/check new file mode 100755 index 000000000..492bdfc9f --- /dev/null +++ b/spark-auron/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f auron.jar ] diff --git a/spark-auron/data-size b/spark-auron/data-size new file mode 100755 index 000000000..1a34600a8 --- /dev/null +++ b/spark-auron/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-auron/install b/spark-auron/install new file mode 100755 index 000000000..5e38d04eb --- /dev/null +++ b/spark-auron/install @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.5 psutil + +AURON_JAR_URL='https://github.com/apache/auron/releases/download/v5.0.0/blaze-engine-spark-3.5-release-5.0.0-SNAPSHOT.jar' +if [ ! -f auron.jar ]; then + wget --continue --progress=dot:giga "$AURON_JAR_URL" -O auron.jar +fi diff --git a/spark-auron/load b/spark-auron/load new file mode 100755 index 000000000..f093c1086 --- /dev/null +++ b/spark-auron/load @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Parquet file is read directly by Spark — nothing to load. +sync diff --git a/spark-auron/query b/spark-auron/query new file mode 100755 index 000000000..16c9fab1b --- /dev/null +++ b/spark-auron/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-auron/query.py b/spark-auron/query.py index b07b29e6b..859d76bab 100755 --- a/spark-auron/query.py +++ b/spark-auron/query.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark+Auron, prints result on stdout +and runtime in fractional seconds as the LAST line on stderr. 
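The contract described in the docstring above (result on stdout, runtime in fractional seconds as the last stderr line, non-zero exit on failure) is what every per-system ./query script in this diff implements. A small sketch of how a caller can consume it; the temporary file names and the sample SQL are illustrative:

    sql='SELECT COUNT(*) FROM hits'
    if printf '%s' "$sql" | ./query >/tmp/q.out 2>/tmp/q.err; then
        seconds=$(tail -n 1 /tmp/q.err)
        echo "query finished in ${seconds}s"
    else
        echo "query failed:" >&2
        cat /tmp/q.err >&2
    fi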
-Highlights: -- memory is split between heap (for Spark) and memoryOverhead (for Auron) -- Auron configuration is added to `SparkSession` +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -51,13 +50,15 @@ df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e) - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) # some queries should return more than 20 rows which is the default show limit + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-auron/run.sh b/spark-auron/run.sh deleted file mode 100755 index 8c9ca1289..000000000 --- a/spark-auron/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark-auron/start b/spark-auron/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/spark-auron/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-auron/stop b/spark-auron/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/spark-auron/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-comet/benchmark.sh b/spark-comet/benchmark.sh index a81f459f8..fc4bacc8f 100755 --- a/spark-comet/benchmark.sh +++ b/spark-comet/benchmark.sh @@ -1,90 +1,5 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) -# -# Highlights: -# - pyspark==3.5.6 version is used (latest stable for Comet 0.9.0) -# - Comet installation is added -# - auto-save results - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==3.5.6 psutil - -# Load the data - -../download-hits-parquet-single - -# Install Comet - -COMET_JAR_URL='https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.9.0/comet-spark-spark3.5_2.12-0.9.0.jar' - -wget --continue --progress=dot:giga $COMET_JAR_URL -O comet.jar - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -# Print results to stdout as required -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' 
| sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -DATA_SIZE=$(du -b hits.parquet | cut -f1) - -echo "Data size: $DATA_SIZE" -echo "Load time: 0" - -# Save results as JSON - -MACHINE="${1:-c6a.4xlarge}" # Use first argument as machine name, default to c6a.4xlarge -COMET_VERSION=$(echo $COMET_JAR_URL | grep -Po ".{5}(?=.jar)") -SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) - -mkdir -p results - -( -cat << EOF -{ - "system": "Spark (Comet)", - "date": "$(date +%Y-%m-%d)", - "machine": "${MACHINE}", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "comment": "Using Comet ${COMET_VERSION} with Spark ${SPARK_VERSION}", - "tags": ["Java", "Rust", "column-oriented", "Spark derivative", "DataFusion", "Parquet"], - "load_time": 0, - "data_size": ${DATA_SIZE}, - "result": [ -EOF - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' - { - if (i % 3 == 0) printf "\t\t["; - if ($1 == "null") printf "null"; - else printf "%.3f", $1; - if (i % 3 != 2) printf ", "; - else { - if (i < total - 1) printf "],\n"; - else printf "]"; - } - i++; - }' - -cat << EOF - - ] -} -EOF -) > "results/${MACHINE}.json" - -echo "Results have been saved to results/${MACHINE}.json" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-comet/check b/spark-comet/check new file mode 100755 index 000000000..8c9998fc9 --- /dev/null +++ b/spark-comet/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f comet.jar ] diff --git a/spark-comet/data-size b/spark-comet/data-size new file mode 100755 index 000000000..1a34600a8 --- /dev/null +++ b/spark-comet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-comet/install b/spark-comet/install new file mode 100755 index 000000000..108f7c512 --- /dev/null +++ b/spark-comet/install @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.6 psutil + +COMET_JAR_URL='https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.12/0.9.0/comet-spark-spark3.5_2.12-0.9.0.jar' +if [ ! 
-f comet.jar ]; then + wget --continue --progress=dot:giga "$COMET_JAR_URL" -O comet.jar +fi diff --git a/spark-comet/load b/spark-comet/load new file mode 100755 index 000000000..1c31caf31 --- /dev/null +++ b/spark-comet/load @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sync diff --git a/spark-comet/query b/spark-comet/query new file mode 100755 index 000000000..16c9fab1b --- /dev/null +++ b/spark-comet/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-comet/query.py b/spark-comet/query.py index 50358069a..59e2f4615 100755 --- a/spark-comet/query.py +++ b/spark-comet/query.py @@ -1,12 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark+Comet, prints result on stdout +and runtime in fractional seconds as the LAST line on stderr. -Highlights: -- memory is split between heap (for Spark) and off-heap (for Comet) -- Comet configuration is added to `SparkSession` -- debug mode is added +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -46,7 +44,6 @@ .config("spark.comet.scan.allowIncompatible", True) ) -# Even more Comet configuration if os.getenv("DEBUG") == "1": builder.config("spark.comet.explainFallback.enabled", "true") builder.config("spark.sql.debug.maxToStringFields", "10000") @@ -54,18 +51,19 @@ spark = builder.getOrCreate() df = spark.read.parquet("hits.parquet") -# Do casting before creating the view so no need to change to unreadable integer dates in SQL df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp")) df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e) - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-comet/run.sh b/spark-comet/run.sh deleted file mode 100755 index 8c9ca1289..000000000 --- a/spark-comet/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark-comet/start b/spark-comet/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/spark-comet/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-comet/stop b/spark-comet/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/spark-comet/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-gluten/benchmark.sh b/spark-gluten/benchmark.sh index 552b5a46b..fc4bacc8f 100755 --- a/spark-gluten/benchmark.sh +++ b/spark-gluten/benchmark.sh @@ -1,92 +1,5 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) -# -# 
Highlights: -# - pyspark==3.5.2 version is used (latest stable for Gluten 1.4.0) -# - Gluten installation is added -# - auto-save results - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==3.5.2 psutil - -# Load the data - -../download-hits-parquet-single - -# Install Gluten - -GLUTEN_JAR_URL='https://github.com/apache/incubator-gluten/releases/download/v1.4.0/apache-gluten-1.4.0-incubating-bin-spark35.tar.gz' - -wget --continue --progress=dot:giga $GLUTEN_JAR_URL -O gluten.gz -tar -xzf gluten.gz -mv gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar gluten.jar - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -# Print results to stdout as required -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -DATA_SIZE=$(du -b hits.parquet | cut -f1) - -echo "Data size: $DATA_SIZE" -echo "Load time: 0" - -# Save results as JSON - -MACHINE="${1:-c6a.4xlarge}" # Use first argument as machine name, default to c6a.4xlarge -GLUTEN_VERSION=$(echo $GLUTEN_JAR_URL | grep -Po "\d.\d.\d" | head -n 1) -SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) - -mkdir -p results - -( -cat << EOF -{ - "system": "Spark (Gluten-on-Velox)", - "date": "$(date +%Y-%m-%d)", - "machine": "${MACHINE}", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "comment": "Using Gluten ${GLUTEN_VERSION} with Spark ${SPARK_VERSION}", - "tags": ["Java", "C++", "column-oriented", "Spark derivative", "Velox", "Parquet"], - "load_time": 0, - "data_size": ${DATA_SIZE}, - "result": [ -EOF - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' - { - if (i % 3 == 0) printf "\t\t["; - if ($1 == "null") printf "null"; - else printf "%.3f", $1; - if (i % 3 != 2) printf ", "; - else { - if (i < total - 1) printf "],\n"; - else printf "]"; - } - i++; - }' - -cat << EOF - - ] -} -EOF -) > "results/${MACHINE}.json" - -echo "Results have been saved to results/${MACHINE}.json" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark-gluten/check b/spark-gluten/check new file mode 100755 index 000000000..f2375edcd --- /dev/null +++ b/spark-gluten/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 +[ -f gluten.jar ] diff --git a/spark-gluten/data-size b/spark-gluten/data-size new file mode 100755 index 000000000..1a34600a8 --- /dev/null +++ b/spark-gluten/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark-gluten/install b/spark-gluten/install new file mode 100755 index 000000000..681fef2b4 --- /dev/null +++ b/spark-gluten/install @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk wget + +if [ ! 
-d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==3.5.2 psutil + +GLUTEN_JAR_URL='https://github.com/apache/incubator-gluten/releases/download/v1.4.0/apache-gluten-1.4.0-incubating-bin-spark35.tar.gz' +if [ ! -f gluten.jar ]; then + wget --continue --progress=dot:giga "$GLUTEN_JAR_URL" -O gluten.gz + tar -xzf gluten.gz + mv gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar gluten.jar + rm -f gluten.gz +fi diff --git a/spark-gluten/load b/spark-gluten/load new file mode 100755 index 000000000..1c31caf31 --- /dev/null +++ b/spark-gluten/load @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sync diff --git a/spark-gluten/query b/spark-gluten/query new file mode 100755 index 000000000..16c9fab1b --- /dev/null +++ b/spark-gluten/query @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark-gluten/query.py b/spark-gluten/query.py index 4d2c15d34..a16a0034c 100755 --- a/spark-gluten/query.py +++ b/spark-gluten/query.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark+Gluten, prints result on stdout +and runtime in fractional seconds as the LAST line on stderr. -Highlights: -- memory is split between heap (for Spark) and off-heap (for Gluten) -- Gluten configuration is added to `SparkSession` +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -19,7 +18,6 @@ query = sys.stdin.read() print(query) -# Calculate available memory to configurate SparkSession (in MB) ram = int(round(psutil.virtual_memory().available / (1024 ** 2) * 0.7)) heap = ram // 2 off_heap = ram - heap @@ -46,18 +44,19 @@ spark = builder.getOrCreate() df = spark.read.parquet("hits.parquet") -# Do casting before creating the view so no need to change to unreadable integer dates in SQL df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp")) df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e) - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark-gluten/run.sh b/spark-gluten/run.sh deleted file mode 100755 index 8c9ca1289..000000000 --- a/spark-gluten/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark-gluten/start b/spark-gluten/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/spark-gluten/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark-gluten/stop b/spark-gluten/stop new file mode 100755 index 000000000..06bd98656 
--- /dev/null +++ b/spark-gluten/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/spark/benchmark.sh b/spark/benchmark.sh index 573e403f7..5a4cc33c3 100755 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -1,29 +1,7 @@ #!/bin/bash - -# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) - -# Install - -sudo apt-get update -y -sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk - -export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" -export PATH=$JAVA_HOME/bin:$PATH - -python3 -m venv myenv -source myenv/bin/activate -pip install pyspark==4.0.0 psutil - -# Load the data - -../download-hits-parquet-single - -# Run the queries - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -echo "Data size: $(du -b hits.parquet)" -echo "Load time: 0" +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +# Spark runs in-process per query — restart between queries is meaningless +# (and would re-download nothing). Skip restart. +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/spark/check b/spark/check new file mode 100755 index 000000000..e61d3c5d6 --- /dev/null +++ b/spark/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Trivial: ensure the venv has pyspark importable. +# shellcheck disable=SC1091 +source myenv/bin/activate +python3 -c 'import pyspark' >/dev/null 2>&1 diff --git a/spark/data-size b/spark/data-size new file mode 100755 index 000000000..1a34600a8 --- /dev/null +++ b/spark/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -b hits.parquet | cut -f1 diff --git a/spark/install b/spark/install new file mode 100755 index 000000000..505ad1659 --- /dev/null +++ b/spark/install @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv openjdk-17-jdk + +if [ ! -d myenv ]; then + python3 -m venv myenv +fi +# shellcheck disable=SC1091 +source myenv/bin/activate +pip install -q pyspark==4.0.0 psutil diff --git a/spark/load b/spark/load new file mode 100755 index 000000000..8d341815c --- /dev/null +++ b/spark/load @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Spark reads hits.parquet directly — nothing to "load". The parquet was +# downloaded by the shared driver. Just sync. +sync diff --git a/spark/query b/spark/query new file mode 100755 index 000000000..72bb83454 --- /dev/null +++ b/spark/query @@ -0,0 +1,12 @@ +#!/bin/bash +# Reads SQL on stdin, runs it via PySpark. +# Stdout: query output. +# Stderr: runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +# shellcheck disable=SC1091 +source myenv/bin/activate + +query=$(cat) +printf '%s' "$query" | python3 query.py diff --git a/spark/query.py b/spark/query.py index 268b60e71..55aa42d01 100755 --- a/spark/query.py +++ b/spark/query.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3 """ -Note: Keep in sync with spark-*/query.sh (see README-accelerators.md for details) +Reads SQL on stdin, runs it once via PySpark, prints the result on stdout +and the runtime in fractional seconds as the LAST line on stderr. 
+ +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details) """ from pyspark.sql import SparkSession @@ -34,13 +37,16 @@ df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate"))) df.createOrReplaceTempView("hits") -for try_num in range(3): - try: - start = timeit.default_timer() - result = spark.sql(query) - result.show(100) # some queries should return more than 20 rows which is the default show limit - end = timeit.default_timer() - print("Time: ", end - start) - except Exception as e: - print(e); - print("Failure!") +try: + start = timeit.default_timer() + result = spark.sql(query) + result.show(100) # some queries should return more than 20 rows which is the default show limit + end = timeit.default_timer() + elapsed = end - start + print(f"Time: {elapsed}") + # Last stderr line: fractional seconds (driver-required contract). + print(f"{elapsed:.6f}", file=sys.stderr) +except Exception as e: + print(e, file=sys.stderr) + print("Failure!", file=sys.stderr) + sys.exit(1) diff --git a/spark/run.sh b/spark/run.sh deleted file mode 100755 index 8c9ca1289..000000000 --- a/spark/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details) - -cat queries.sql | while read query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - - ./query.py <<< "${query}" -done diff --git a/spark/start b/spark/start new file mode 100755 index 000000000..8c8eb0c9f --- /dev/null +++ b/spark/start @@ -0,0 +1,3 @@ +#!/bin/bash +# PySpark runs in-process per query — nothing to start. +exit 0 diff --git a/spark/stop b/spark/stop new file mode 100755 index 000000000..42fca7c6a --- /dev/null +++ b/spark/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# PySpark is in-process — nothing to stop. +exit 0 diff --git a/sqlite/benchmark.sh b/sqlite/benchmark.sh index 705e9739e..b0b9f4775 100755 --- a/sqlite/benchmark.sh +++ b/sqlite/benchmark.sh @@ -1,21 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y sqlite3 - -sqlite3 mydb < create.sql - -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' sqlite3 mydb '.import --csv hits.csv hits' -echo -n "Data size: " -wc -c mydb - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error|Parse error' | - sed -r -e 's/^(Error|Parse error).*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/sqlite/check b/sqlite/check new file mode 100755 index 000000000..3cca70d91 --- /dev/null +++ b/sqlite/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sqlite3 :memory: 'SELECT 1' >/dev/null diff --git a/sqlite/data-size b/sqlite/data-size new file mode 100755 index 000000000..f94c4eccf --- /dev/null +++ b/sqlite/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < mydb diff --git a/sqlite/install b/sqlite/install new file mode 100755 index 000000000..ff8710145 --- /dev/null +++ b/sqlite/install @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +if ! 
command -v sqlite3 >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y sqlite3 +fi diff --git a/sqlite/load b/sqlite/load new file mode 100755 index 000000000..896f445ab --- /dev/null +++ b/sqlite/load @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +# Idempotent: blow away any prior DB. +rm -f mydb + +sqlite3 mydb < create.sql +sqlite3 mydb '.import --csv hits.csv hits' + +rm -f hits.csv +sync diff --git a/sqlite/query b/sqlite/query new file mode 100755 index 000000000..e2ee624fd --- /dev/null +++ b/sqlite/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via sqlite3 against mydb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (from `time`). +# Exit non-zero on error. +set -e + +query=$(cat) + +# Use bash builtin `time` with TIMEFORMAT to get a single fractional-seconds +# value on stderr. Capture sqlite3 output separately. +TIMEFORMAT='%R' +{ time sqlite3 mydb "$query" 1>/tmp/sqlite.out.$$ 2>/tmp/sqlite.err.$$; } 2>/tmp/sqlite.time.$$ || status=$? +status=${status:-0} + +cat /tmp/sqlite.out.$$ +if [ "$status" -ne 0 ]; then + cat /tmp/sqlite.err.$$ >&2 + rm -f /tmp/sqlite.out.$$ /tmp/sqlite.err.$$ /tmp/sqlite.time.$$ + exit "$status" +fi + +# Even on success sqlite3 may have warnings on stderr; pass them through but +# end with the timing as the last line. +cat /tmp/sqlite.err.$$ >&2 +cat /tmp/sqlite.time.$$ >&2 + +rm -f /tmp/sqlite.out.$$ /tmp/sqlite.err.$$ /tmp/sqlite.time.$$ diff --git a/sqlite/run.sh b/sqlite/run.sh deleted file mode 100755 index 5693ddd8e..000000000 --- a/sqlite/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - time sqlite3 mydb <<< "${query}" - done; -done; diff --git a/sqlite/start b/sqlite/start new file mode 100755 index 000000000..6976d11cd --- /dev/null +++ b/sqlite/start @@ -0,0 +1,3 @@ +#!/bin/bash +# sqlite3 is an embedded CLI tool — no daemon to start. +exit 0 diff --git a/sqlite/stop b/sqlite/stop new file mode 100755 index 000000000..541aa5672 --- /dev/null +++ b/sqlite/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# sqlite3 is an embedded CLI tool — no daemon to stop. +exit 0 diff --git a/starrocks/benchmark.sh b/starrocks/benchmark.sh index 2abac9591..531bd6503 100755 --- a/starrocks/benchmark.sh +++ b/starrocks/benchmark.sh @@ -1,84 +1,5 @@ #!/bin/bash - -# This benchmark should run on Amazon Linux - -set -e - -VERSION=4.0.2-ubuntu-$(dpkg --print-architecture) -# Install -wget --continue --progress=dot:giga https://releases.starrocks.io/starrocks/StarRocks-$VERSION.tar.gz -O StarRocks-$VERSION.tar.gz -tar zxvf StarRocks-${VERSION}.tar.gz - -cd StarRocks-${VERSION}/ - -# Install dependencies -sudo apt-get update -y -sudo apt-get install -y openjdk-17-jre mariadb-client -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture) -export PATH=$JAVA_HOME/bin:$PATH - -# Create directory for FE and BE -IPADDR=`hostname -i` -export STARROCKS_HOME=`pwd` -mkdir -p meta storage - -# Start Frontend -printf "\nmeta_dir = ${STARROCKS_HOME}/meta \n" >> fe/conf/fe.conf -fe/bin/start_fe.sh --daemon - -# Start Backend -printf "\nstorage_root_path = ${STARROCKS_HOME}/storage\n" >> be/conf/be.conf -# Disable internal caches so that the cold run (1st of 3 tries) is actually cold. 
-# Without this, the BE process keeps decoded data in its own in-memory page cache -# (`storage_page_cache`, default ~20% of RAM) which `drop_caches` does not clear, -# so first-run timings reflect a warm cache and underreport cold-run latency. -# `datacache_enable=false` covers the unified Data Cache (page + block) path in v3.3+. -printf "\ndisable_storage_page_cache = true\n" >> be/conf/be.conf -printf "\ndatacache_enable = false\n" >> be/conf/be.conf -be/bin/start_be.sh --daemon - -# Setup cluster -# wait some seconds util fe can serve -sleep 30 -mysql -h 127.0.0.1 -P9030 -uroot -e "ALTER SYSTEM ADD BACKEND '${IPADDR}:9050' " -# wait some seconds util be joins -sleep 30 - -# Prepare Data -cd ../ -../download-hits-tsv - -# Create Table -mysql -h 127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" -mysql -h 127.0.0.1 -P9030 -uroot hits < create.sql - -# Load Data -START=$(date +%s) -echo "Start to load data..." -# `timeout:1000` header: see https://github.com/ClickHouse/ClickBench/pull/740 -curl --location-trusted \ - -u root: \ - -T "hits.tsv" \ - -H "label:hits_tsv_${START}" \ - -H "timeout:1000" \ - -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ - http://localhost:8030/api/hits/hits/_stream_load -END=$(date +%s) -LOADTIME=$(echo "$END - $START" | bc) -echo "Load time: $LOADTIME" - -# Dataset contains about 40GB of data when the import is just completed. -# This is because the trashed data generated during the compaction process. -# After about tens of minutes, when the gc is completed, the system includes about 16.5GB of data. -echo -n "Data size: " -du -bcs StarRocks-${VERSION}/storage/ | grep total -# Dataset contains 99997497 rows -mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" - -./run.sh 2>&1 | tee -a log.txt - -cat log.txt | - grep -P 'rows? in set|Empty set|^ERROR' | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/starrocks/check b/starrocks/check new file mode 100755 index 000000000..c6e836c8c --- /dev/null +++ b/starrocks/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null diff --git a/starrocks/data-size b/starrocks/data-size new file mode 100755 index 000000000..c1c21f9f9 --- /dev/null +++ b/starrocks/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +SR_DIR=$(cat .sr_dir) +du -bcs "$SR_DIR/storage/" | awk '/total$/ {print $1}' diff --git a/starrocks/install b/starrocks/install new file mode 100755 index 000000000..a197358dc --- /dev/null +++ b/starrocks/install @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +VERSION="4.0.2-ubuntu-$(dpkg --print-architecture)" +SR_DIR="StarRocks-$VERSION" + +if [ ! -d "$SR_DIR" ]; then + if [ ! -f "$SR_DIR.tar.gz" ]; then + wget --continue --progress=dot:giga \ + "https://releases.starrocks.io/starrocks/$SR_DIR.tar.gz" \ + -O "$SR_DIR.tar.gz" + fi + tar zxf "$SR_DIR.tar.gz" + + # Configure FE/BE. + mkdir -p "$SR_DIR/meta" "$SR_DIR/storage" + printf "\nmeta_dir = $PWD/$SR_DIR/meta \n" >> "$SR_DIR/fe/conf/fe.conf" + printf "\nstorage_root_path = $PWD/$SR_DIR/storage\n" >> "$SR_DIR/be/conf/be.conf" + # Disable internal caches so the cold run is actually cold. + printf "\ndisable_storage_page_cache = true\n" >> "$SR_DIR/be/conf/be.conf" + printf "\ndatacache_enable = false\n" >> "$SR_DIR/be/conf/be.conf" +fi + +sudo apt-get update -y +sudo apt-get install -y openjdk-17-jre mariadb-client bc + +echo "$SR_DIR" > .sr_dir diff --git a/starrocks/load b/starrocks/load new file mode 100755 index 000000000..2fad538eb --- /dev/null +++ b/starrocks/load @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# Idempotent: drop+create database. 
+mysql -h127.0.0.1 -P9030 -uroot -e "DROP DATABASE IF EXISTS hits" +mysql -h127.0.0.1 -P9030 -uroot -e "CREATE DATABASE hits" +mysql -h127.0.0.1 -P9030 -uroot hits < create.sql + +START=$(date +%s) +curl --location-trusted \ + -u root: \ + -T "hits.tsv" \ + -H "label:hits_tsv_${START}" \ + -H "timeout:1000" \ + -H "columns: WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID" \ + http://localhost:8030/api/hits/hits/_stream_load + +rm -f hits.tsv +sync diff --git a/starrocks/query b/starrocks/query new file mode 100755 index 000000000..025bb4f1e --- /dev/null +++ b/starrocks/query @@ -0,0 +1,30 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against StarRocks. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(mysql -vvv -h127.0.0.1 -P9030 -uroot hits -e "$query" 2>&1) || status=$? +status=${status:-0} + +printf '%s\n' "$out" | grep -vP '^\([0-9.]+\s+sec\)$|rows? 
in set|Empty set' + +if [ "$status" -ne 0 ] || printf '%s\n' "$out" | grep -qE '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +secs=$(printf '%s\n' "$out" \ + | grep -oP '\((?:([0-9.]+)\s+min\s+)?([0-9.]+)\s+sec\)' \ + | tail -n1 \ + | sed -r 's/\((([0-9.]+) min )?([0-9.]+) sec\)/\2 \3/' \ + | awk '{ if ($2 != "") print $1*60 + $2; else print $1 }') + +if [ -z "$secs" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi +printf '%s\n' "$secs" >&2 diff --git a/starrocks/run.sh b/starrocks/run.sh deleted file mode 100755 index 6b9200c11..000000000 --- a/starrocks/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - for i in $(seq 1 $TRIES); do - mysql -vvv -h127.1 -P9030 -uroot hits -e "${query}" - done -done; diff --git a/starrocks/start b/starrocks/start new file mode 100755 index 000000000..e279fa9c6 --- /dev/null +++ b/starrocks/start @@ -0,0 +1,21 @@ +#!/bin/bash +set -e + +SR_DIR=$(cat .sr_dir) +export STARROCKS_HOME="$PWD/$SR_DIR" +export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture) +export PATH=$JAVA_HOME/bin:$PATH + +if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +IPADDR=$(hostname -i) + +"$STARROCKS_HOME/fe/bin/start_fe.sh" --daemon +"$STARROCKS_HOME/be/bin/start_be.sh" --daemon + +sleep 30 +mysql -h127.0.0.1 -P9030 -uroot \ + -e "ALTER SYSTEM ADD BACKEND '${IPADDR}:9050'" 2>/dev/null || true +sleep 30 diff --git a/starrocks/stop b/starrocks/stop new file mode 100755 index 000000000..d34da09e3 --- /dev/null +++ b/starrocks/stop @@ -0,0 +1,6 @@ +#!/bin/bash + +SR_DIR=$(cat .sr_dir 2>/dev/null) || exit 0 +"$SR_DIR/fe/bin/stop_fe.sh" 2>/dev/null || true +"$SR_DIR/be/bin/stop_be.sh" 2>/dev/null || true +exit 0 diff --git a/tidb/benchmark.sh b/tidb/benchmark.sh index 0bcb3c106..09d595b93 100755 --- a/tidb/benchmark.sh +++ b/tidb/benchmark.sh @@ -1,126 +1,6 @@ #!/bin/bash - -shopt -s expand_aliases - -MODE="$1" -if [[ -z "$MODE" ]] -then - MODE=tiflash -fi - -TIDBVERSION=8.5.1 - -TIUP_HOME=$(pwd) -export TIUP_HOME -DB_NAME=test -TABLE_NAME=hits -DATA_DIR=/tmp/data - -if [[ ! $MODE =~ ^(tikv|tikv-tiflash|tiflash)$ ]]; then - echo "Unknown mode: '$MODE'. Expected one of 'tikv', 'tikv-tiflash', 'tiflash'" - exit 1 -fi - -sudo apt-get update -y -# TiUp installer depends on curl -sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl mysql-client -# Needs to be installed and setup for TiFlash; 2-107 corresponds to America/New_York -printf "2\n107\n" | sudo DEBIAN_FRONTEND=noninteractive apt-get install --reinstall tzdata - -wget --https-only --secure-protocol=TLSv1_2 --quiet --continue --progress=dot:giga https://tiup-mirrors.pingcap.com/install.sh -sudo chmod +x ./install.sh -./install.sh -PATH="$TIUP_HOME/bin/:$PATH" -export PATH - -tiup update --self && tiup update cluster - -if [[ $MODE == "tikv" ]]; then - echo "Running benchmark on TiKV only" - DB_CONFIG_FILE=./config/tidb-tikv.toml - NUM_TIFLASH_INSTANCES=0 -elif [[ $MODE == "tiflash" ]]; then - echo "Running benchmark on TiFlash only" - DB_CONFIG_FILE=./config/tidb-tiflash.toml - NUM_TIFLASH_INSTANCES=1 -fi; - -echo "Using configuration file $DB_CONFIG_FILE" -echo "Using $NUM_TIFLASH_INSTANCES TiFlash instances" - -nohup tiup playground $TIDBVERSION --db 1 --pd 1 --kv 1 --tiflash $NUM_TIFLASH_INSTANCES --db.config $DB_CONFIG_FILE --without-monitor > tiup-cluster.out 2>&1 & -while [ ! 
-f tiup-cluster.out ]; do sleep 1; done -# Might take a while because dependencies need to be downloaded -while ! grep -q 'TiDB Playground Cluster is started' tiup-cluster.out; do - echo "Cluster is not running yet. Checking again in 10 seconds..." - sleep 10 -done - -echo "Cluster is running!" -tiup playground display - -alias mysql="mysql --host 127.0.0.1 --port 4000 --connect-timeout 10800 -u root" - -# Deactivate query plan cache -# For details see https://docs.pingcap.com/tidb/v8.5/sql-non-prepared-plan-cache/ -mysql -e "SET GLOBAL tidb_enable_non_prepared_plan_cache = OFF;" - -rm -rf $DATA_DIR -mkdir $DATA_DIR -# File name must correspond to .. -wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' -O "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv.gz" -gzip -d -f "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv.gz" -chmod 444 "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv" - -START=$(date +%s) - -mysql -e "DROP DATABASE IF EXISTS $DB_NAME;" -mysql -e "CREATE DATABASE $DB_NAME;" -mysql test < create.sql - -if [[ $MODE == "tiflash" || $MODE == "tikv-tiflash" ]]; then - echo "Enabling TiFlash" - mysql test -e "ALTER TABLE $TABLE_NAME SET TIFLASH REPLICA 1;" -fi; - -rm -rf /tmp/sorted-kv-dir -mkdir /tmp/sorted-kv-dir -nohup tiup tidb-lightning -config ./config/tidb-lightning.toml > tiup-tidb-lightning.out 2>&1 & -while [ ! -f tidb-lightning.log ]; do sleep 1; done -echo "Starting to check for completion on $(date +"%T")" -while ! grep -q 'the whole procedure completed' tidb-lightning.log; do - if grep -q 'tidb lightning exit.*finished=false' tidb-lightning.log || grep -q 'ERROR' tidb-lightning.log; then - echo "An error occurred during the import. Check the log file for details." - cat tiup-tidb-lightning.out - cat tidb-lightning.log - exit 1 - fi; - grep 'progress.*total' tidb-lightning.log | tail -n 1 - echo "Data loading is not done yet. Checking again in 10 seconds..." - sleep 10 -done - -echo "Data loading is done! Checking log file for time taken to load the data." -grep 'the whole procedure completed' tidb-lightning.log | sed -r -e 's/^.+\[takeTime=([0-9\.hms])+\].+?$/\1/' -command time -f '%e' mysql test -e "ANALYZE TABLE $TABLE_NAME;" - -END=$(date +%s) -echo "Load time: $(echo "$END - $START" | bc)" - -./run.sh 2>&1 | tee log.txt - -# Take storage size of TiKV for ALL modes into account, because directly loading data into TiFlash only is currently not supported -echo "Calculating storage size of TiKV in bytes..." -echo "Data size: " -mysql test -e "SELECT (DATA_LENGTH + INDEX_LENGTH) AS TIKV_STORAGE_SIZE_BYTES FROM information_schema.tables WHERE table_schema = '$DB_NAME' AND table_name = '$TABLE_NAME';" | tail -n1 - -if [[ $MODE == "tiflash" || $MODE == "tikv-tiflash" ]]; then - echo "Calculating additional storage size of TiFlash in bytes..." - echo "Data size: " - mysql test -e "SELECT TOTAL_SIZE AS TIFLASH_STORAGE_SIZE_BYTES FROM information_schema.tiflash_tables WHERE TIDB_DATABASE = '$DB_NAME' AND TIDB_TABLE = '$TABLE_NAME';" | tail -n1 -fi; - -grep -P 'rows? in set|Empty set|^ERROR' log.txt | - sed -r -e 's/^ERROR.*$/null/; s/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +# TiDB Lightning loads from ..csv files; we use the CSV download. 
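starrocks/query above (and tidb/query below) recover the runtime from the "(N min M sec)" suffix that the mysql client prints with -vvv and normalise it to plain seconds. The conversion on a sample string, with the input literal purely illustrative:

    printf '100 rows in set (1 min 2.34 sec)\n' \
        | sed -r 's/^.*\((([0-9.]+) min )?([0-9.]+) sec\).*$/\2 \3/' \
        | awk '{ if ($2 != "") print $1 * 60 + $2; else print $1 }'
    # prints 62.34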
+export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/tidb/check b/tidb/check new file mode 100755 index 000000000..3c3a15187 --- /dev/null +++ b/tidb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +mysql --host 127.0.0.1 --port 4000 -u root -e "SELECT 1" >/dev/null diff --git a/tidb/data-size b/tidb/data-size new file mode 100755 index 000000000..d6f137bd5 --- /dev/null +++ b/tidb/data-size @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +DB_NAME=test +TABLE_NAME=hits + +MYSQL="mysql --host 127.0.0.1 --port 4000 -u root --silent --skip-column-names" + +# TiKV storage (always present). +tikv_size=$($MYSQL test -e \ + "SELECT (DATA_LENGTH + INDEX_LENGTH) FROM information_schema.tables \ + WHERE table_schema = '$DB_NAME' AND table_name = '$TABLE_NAME';") + +# Optional TiFlash storage (may not exist if mode is tikv-only). +tiflash_size=$($MYSQL test -e \ + "SELECT IFNULL(SUM(TOTAL_SIZE), 0) FROM information_schema.tiflash_tables \ + WHERE TIDB_DATABASE = '$DB_NAME' AND TIDB_TABLE = '$TABLE_NAME';" 2>/dev/null || echo 0) + +awk -v a="$tikv_size" -v b="$tiflash_size" 'BEGIN { printf "%d\n", a + b }' diff --git a/tidb/install b/tidb/install new file mode 100755 index 000000000..feb614280 --- /dev/null +++ b/tidb/install @@ -0,0 +1,32 @@ +#!/bin/bash +set -eu + +# Defaults match the original benchmark.sh; override via env if needed. +TIDB_MODE=${TIDB_MODE:-tiflash} +TIDBVERSION=${TIDBVERSION:-8.5.1} + +if [[ ! $TIDB_MODE =~ ^(tikv|tikv-tiflash|tiflash)$ ]]; then + echo "Unknown TIDB_MODE: '$TIDB_MODE'. Expected 'tikv', 'tikv-tiflash', or 'tiflash'." >&2 + exit 1 +fi + +sudo apt-get update -y +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl mysql-client wget +# tzdata install (2-107 = America/New_York) is required for TiFlash. +printf "2\n107\n" | sudo DEBIAN_FRONTEND=noninteractive apt-get install --reinstall tzdata + +TIUP_HOME=$(pwd) +export TIUP_HOME + +if [ ! -x "$TIUP_HOME/bin/tiup" ]; then + wget --https-only --secure-protocol=TLSv1_2 --quiet --continue --progress=dot:giga \ + https://tiup-mirrors.pingcap.com/install.sh + chmod +x ./install.sh + ./install.sh +fi + +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +tiup update --self +tiup update cluster diff --git a/tidb/load b/tidb/load new file mode 100755 index 000000000..f64d31623 --- /dev/null +++ b/tidb/load @@ -0,0 +1,50 @@ +#!/bin/bash +set -eu + +TIDB_MODE=${TIDB_MODE:-tiflash} +DB_NAME=test +TABLE_NAME=hits +DATA_DIR=/tmp/data + +TIUP_HOME=$(pwd) +export TIUP_HOME +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +MYSQL="mysql --host 127.0.0.1 --port 4000 --connect-timeout 10800 -u root" + +# Stage data file where TiDB Lightning expects it: .
.csv +rm -rf $DATA_DIR +mkdir $DATA_DIR +mv hits.csv "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv" +chmod 444 "$DATA_DIR/$DB_NAME.$TABLE_NAME.csv" + +$MYSQL -e "DROP DATABASE IF EXISTS $DB_NAME;" +$MYSQL -e "CREATE DATABASE $DB_NAME;" +$MYSQL test < create.sql + +if [[ $TIDB_MODE == "tiflash" || $TIDB_MODE == "tikv-tiflash" ]]; then + $MYSQL test -e "ALTER TABLE $TABLE_NAME SET TIFLASH REPLICA 1;" +fi + +rm -rf /tmp/sorted-kv-dir +mkdir /tmp/sorted-kv-dir +rm -f tidb-lightning.log +nohup tiup tidb-lightning -config ./config/tidb-lightning.toml > tiup-tidb-lightning.out 2>&1 & +while [ ! -f tidb-lightning.log ]; do sleep 1; done + +while ! grep -q 'the whole procedure completed' tidb-lightning.log; do + if grep -q 'tidb lightning exit.*finished=false' tidb-lightning.log || grep -q 'ERROR' tidb-lightning.log; then + echo "Error during import:" >&2 + cat tiup-tidb-lightning.out >&2 + cat tidb-lightning.log >&2 + exit 1 + fi + grep 'progress.*total' tidb-lightning.log | tail -n 1 || true + sleep 10 +done + +$MYSQL test -e "ANALYZE TABLE $TABLE_NAME;" + +rm -rf $DATA_DIR +sync diff --git a/tidb/query b/tidb/query new file mode 100755 index 000000000..915ad7f4b --- /dev/null +++ b/tidb/query @@ -0,0 +1,33 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via mysql client against TiDB on :4000. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(mysql --host 127.0.0.1 --port 4000 -u root test -vvv -e "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$out" | grep -q '^ERROR'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" + +timing=$(printf '%s\n' "$out" \ + | grep -P 'rows? in set|Empty set|Query OK' \ + | tail -n1 \ + | sed -r 's/^.*?\((([0-9.]+) min )?([0-9.]+) sec\).*?$/\2 \3/') + +if [ -z "$timing" ]; then + echo "no timing in mysql output" >&2 + exit 1 +fi + +awk -v t="$timing" 'BEGIN { + n = split(t, a, " ") + if (n == 2 && a[1] != "") { printf "%.3f\n", a[1] * 60 + a[2] } + else { printf "%.3f\n", a[n] } +}' >&2 diff --git a/tidb/run.sh b/tidb/run.sh deleted file mode 100755 index 9b6d56803..000000000 --- a/tidb/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches - - for i in $(seq 1 $TRIES); do - mysql --host 127.0.0.1 --port 4000 -u root test -vvv -e "${query}" - done; -done; diff --git a/tidb/start b/tidb/start new file mode 100755 index 000000000..309fab42d --- /dev/null +++ b/tidb/start @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +TIDB_MODE=${TIDB_MODE:-tiflash} +TIDBVERSION=${TIDBVERSION:-8.5.1} + +TIUP_HOME=$(pwd) +export TIUP_HOME +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +# Idempotent: if MySQL protocol on :4000 already responds, do nothing. +if mysql --host 127.0.0.1 --port 4000 -u root -e "SELECT 1" >/dev/null 2>&1; then + exit 0 +fi + +if [[ $TIDB_MODE == "tikv" ]]; then + DB_CONFIG_FILE=./config/tidb-tikv.toml + NUM_TIFLASH_INSTANCES=0 +else + DB_CONFIG_FILE=./config/tidb-tiflash.toml + NUM_TIFLASH_INSTANCES=1 +fi + +nohup tiup playground "$TIDBVERSION" --db 1 --pd 1 --kv 1 \ + --tiflash $NUM_TIFLASH_INSTANCES \ + --db.config "$DB_CONFIG_FILE" \ + --without-monitor > tiup-cluster.out 2>&1 & + +while [ ! -f tiup-cluster.out ]; do sleep 1; done +while ! grep -q 'TiDB Playground Cluster is started' tiup-cluster.out; do + echo "Cluster is not running yet. Checking again in 10 seconds..." 
+ sleep 10 +done + +# Disable non-prepared plan cache (matches original benchmark behavior). +mysql --host 127.0.0.1 --port 4000 -u root \ + -e "SET GLOBAL tidb_enable_non_prepared_plan_cache = OFF;" diff --git a/tidb/stop b/tidb/stop new file mode 100755 index 000000000..512a56b58 --- /dev/null +++ b/tidb/stop @@ -0,0 +1,19 @@ +#!/bin/bash + +TIUP_HOME=$(pwd) +export TIUP_HOME +PATH="$TIUP_HOME/bin/:$PATH" +export PATH + +tiup playground display >/dev/null 2>&1 || exit 0 + +# tiup playground exposes no clean stop; kill the playground process group. +pids=$(pgrep -f 'tiup playground' || true) +if [ -n "$pids" ]; then + kill $pids 2>/dev/null || true + sleep 5 + pids=$(pgrep -f 'tiup playground' || true) + if [ -n "$pids" ]; then + kill -9 $pids 2>/dev/null || true + fi +fi diff --git a/timescaledb-no-columnstore/benchmark.sh b/timescaledb-no-columnstore/benchmark.sh index 4db746a84..531bd6503 100755 --- a/timescaledb-no-columnstore/benchmark.sh +++ b/timescaledb-no-columnstore/benchmark.sh @@ -1,45 +1,5 @@ #!/bin/bash - -# Install - -export DEBIAN_FRONTEND=noninteractive -sudo apt-get update -y -sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y -sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' -wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - -sudo apt-get update -y -sudo apt-get install -y timescaledb-2-postgresql-17 postgresql-client-17 -sudo timescaledb-tune -yes -sudo systemctl restart postgresql - -sudo -u postgres psql -c "CREATE DATABASE nocolumnstore" -sudo -u postgres psql nocolumnstore -c "CREATE EXTENSION timescaledb WITH VERSION '2.17.2';" - -../download-hits-tsv -sudo chmod og+rX ~ -chmod 777 hits.tsv - -#import -sudo -u postgres psql nocolumnstore < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -sudo -u postgres psql nocolumnstore -q -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day')" -sudo -u postgres psql nocolumnstore -q -c "CREATE INDEX ix_counterid ON hits (counterid)" -sudo -u postgres psql -c "ALTER DATABASE nocolumnstore SET work_mem TO '1GB';" -sudo -u postgres psql -c "ALTER DATABASE nocolumnstore SET min_parallel_table_scan_size TO '0';" - -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql nocolumnstore -q -t -c "\\copy hits FROM 'hits.tsv'" -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql nocolumnstore -q -t -c "vacuum freeze analyze hits;" - -echo -n "Data size: " -sudo -u postgres psql nocolumnstore -q -t -c "SELECT hypertable_size('hits');" - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/timescaledb-no-columnstore/check b/timescaledb-no-columnstore/check new file mode 100755 index 000000000..5c6f71123 --- /dev/null +++ b/timescaledb-no-columnstore/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/timescaledb-no-columnstore/data-size b/timescaledb-no-columnstore/data-size new file mode 100755 index 000000000..33aa229a0 --- /dev/null +++ b/timescaledb-no-columnstore/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo -u postgres psql nocolumnstore -A -t -c "SELECT hypertable_size('hits');" diff --git a/timescaledb-no-columnstore/install b/timescaledb-no-columnstore/install new file mode 100755 index 000000000..128c0820c --- /dev/null +++ b/timescaledb-no-columnstore/install @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +export DEBIAN_FRONTEND=noninteractive + +sudo apt-get update -y +sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y +sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' +wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - + +sudo apt-get update -y +sudo apt-get install -y timescaledb-2-postgresql-$PGVERSION postgresql-client-$PGVERSION +sudo timescaledb-tune -yes + +sudo systemctl restart postgresql@$PGVERSION-main diff --git a/timescaledb-no-columnstore/load b/timescaledb-no-columnstore/load new file mode 100755 index 000000000..a5c87ec6e --- /dev/null +++ b/timescaledb-no-columnstore/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +sudo chmod og+rX ~ +chmod 777 hits.tsv + +sudo -u postgres psql -t -c "DROP DATABASE IF EXISTS nocolumnstore" +sudo -u postgres psql -t -c "CREATE DATABASE nocolumnstore" +sudo -u postgres psql nocolumnstore -c "CREATE EXTENSION IF NOT EXISTS timescaledb;" + +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore < create.sql + +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day')" +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -c "CREATE INDEX ix_counterid ON hits (counterid)" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE nocolumnstore SET work_mem TO '1GB';" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE nocolumnstore SET min_parallel_table_scan_size TO '0';" + +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -t -c "\\copy hits FROM 'hits.tsv'" +sudo -u postgres psql -v ON_ERROR_STOP=1 nocolumnstore -q -t -c "vacuum freeze analyze hits;" + +rm -f hits.tsv +sync diff --git a/timescaledb-no-columnstore/query b/timescaledb-no-columnstore/query new file mode 100755 index 000000000..9be35a636 --- /dev/null +++ b/timescaledb-no-columnstore/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `nocolumnstore` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | sudo -u postgres psql nocolumnstore -t 2>&1) +status=$? 
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/timescaledb-no-columnstore/run.sh b/timescaledb-no-columnstore/run.sh deleted file mode 100755 index e87c0ae26..000000000 --- a/timescaledb-no-columnstore/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - sudo -u postgres psql nocolumnstore -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/timescaledb-no-columnstore/start b/timescaledb-no-columnstore/start new file mode 100755 index 000000000..941f213c5 --- /dev/null +++ b/timescaledb-no-columnstore/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/timescaledb-no-columnstore/stop b/timescaledb-no-columnstore/stop new file mode 100755 index 000000000..47969378d --- /dev/null +++ b/timescaledb-no-columnstore/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/timescaledb/benchmark.sh b/timescaledb/benchmark.sh index 266782bda..531bd6503 100755 --- a/timescaledb/benchmark.sh +++ b/timescaledb/benchmark.sh @@ -1,54 +1,5 @@ #!/bin/bash - -# Install -export DEBIAN_FRONTEND=noninteractive -sudo apt-get update -y -sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget -sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y -sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' -wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - -sudo apt-get update -y -sudo apt-get install -y timescaledb-2-postgresql-17 postgresql-client-17 -sudo timescaledb-tune -yes - -sudo systemctl restart postgresql - -sudo -u postgres psql -c "CREATE DATABASE test" -sudo -u postgres psql test -c "CREATE EXTENSION timescaledb WITH VERSION '2.17.2';" - -# Import the data -../download-hits-tsv -sudo chmod og+rX ~ -chmod 777 hits.tsv - -sudo -u postgres psql test < create.sql 2>&1 | tee load_out.txt -if grep 'ERROR' load_out.txt -then - exit 1 -fi -sudo -u postgres psql test -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day', create_default_indexes => false)" -sudo -u postgres psql test -c "ALTER TABLE hits SET (timescaledb.compress, timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'counterid, userid, eventtime')" -sudo -u postgres psql test -c "ALTER DATABASE test SET timescaledb.enable_chunk_skipping to ON;" -sudo -u postgres psql -c "ALTER DATABASE test SET work_mem TO '1GB';" -sudo -u postgres psql -c "ALTER DATABASE test SET min_parallel_table_scan_size TO '0';" -sudo -u postgres psql test -c "SELECT enable_chunk_skipping('hits', 'counterid');" - -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql test -t -c "\\copy hits FROM 'hits.tsv'" - -# See https://github.com/timescale/timescaledb/issues/4473#issuecomment-1167095245 -# 
https://docs.timescale.com/timescaledb/latest/how-to-guides/compression/manually-compress-chunks/#compress-chunks-manually -# TimescaleDB benchmark wihout compression is available in timescaledb no columnstore directory - -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql test -q -c "SELECT compress_chunk(i, if_not_compressed => true) FROM show_chunks('hits') i" -echo -n "Load time: " -command time -f '%e' sudo -u postgres psql test -q -t -c "vacuum freeze analyze hits;" - -echo -n "Data size: " -sudo -u postgres psql test -q -c "\t" -c "SELECT hypertable_size('hits');" - -./run.sh 2>&1 | tee log.txt - -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/timescaledb/check b/timescaledb/check new file mode 100755 index 000000000..5c6f71123 --- /dev/null +++ b/timescaledb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo -u postgres psql -t -c 'SELECT 1' >/dev/null diff --git a/timescaledb/data-size b/timescaledb/data-size new file mode 100755 index 000000000..5bf6f670e --- /dev/null +++ b/timescaledb/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +# Report the timescaledb hypertable's logical size in bytes. +sudo -u postgres psql test -A -t -c "SELECT hypertable_size('hits');" diff --git a/timescaledb/install b/timescaledb/install new file mode 100755 index 000000000..84dda3f19 --- /dev/null +++ b/timescaledb/install @@ -0,0 +1,19 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} + +export DEBIAN_FRONTEND=noninteractive + +# PGDG repo for matching PG version + timescale repo for the extension. 
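+# PGVERSION can be overridden, e.g. `PGVERSION=16 ./install` targets
+# postgresql@16-main (assuming Timescale publishes a package for that major).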
+sudo apt-get update -y +sudo apt-get install -y gnupg postgresql-common apt-transport-https lsb-release wget +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y +sudo bash -c 'echo "deb https://packagecloud.io/timescale/timescaledb/ubuntu/ $(lsb_release -c -s) main" > /etc/apt/sources.list.d/timescaledb.list' +wget --quiet -O - https://packagecloud.io/timescale/timescaledb/gpgkey | sudo apt-key add - + +sudo apt-get update -y +sudo apt-get install -y timescaledb-2-postgresql-$PGVERSION postgresql-client-$PGVERSION +sudo timescaledb-tune -yes + +sudo systemctl restart postgresql@$PGVERSION-main diff --git a/timescaledb/load b/timescaledb/load new file mode 100755 index 000000000..03917af0c --- /dev/null +++ b/timescaledb/load @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +sudo chmod og+rX ~ +chmod 777 hits.tsv + +sudo -u postgres psql -t -c "DROP DATABASE IF EXISTS test" +sudo -u postgres psql -t -c "CREATE DATABASE test" +sudo -u postgres psql test -c "CREATE EXTENSION IF NOT EXISTS timescaledb;" + +sudo -u postgres psql -v ON_ERROR_STOP=1 test < create.sql + +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "SELECT create_hypertable('hits', 'eventtime', chunk_time_interval => interval '3 day', create_default_indexes => false)" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "ALTER TABLE hits SET (timescaledb.compress, timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'counterid, userid, eventtime')" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "ALTER DATABASE test SET timescaledb.enable_chunk_skipping to ON;" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE test SET work_mem TO '1GB';" +sudo -u postgres psql -v ON_ERROR_STOP=1 -c "ALTER DATABASE test SET min_parallel_table_scan_size TO '0';" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -c "SELECT enable_chunk_skipping('hits', 'counterid');" + +sudo -u postgres psql -v ON_ERROR_STOP=1 test -t -c "\\copy hits FROM 'hits.tsv'" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -q -c "SELECT compress_chunk(i, if_not_compressed => true) FROM show_chunks('hits') i" +sudo -u postgres psql -v ON_ERROR_STOP=1 test -q -t -c "vacuum freeze analyze hits;" + +rm -f hits.tsv +sync diff --git a/timescaledb/query b/timescaledb/query new file mode 100755 index 000000000..cafe324f8 --- /dev/null +++ b/timescaledb/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against the `test` DB. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +out=$(printf '\\timing\n%s\n' "$query" | sudo -u postgres psql test -t 2>&1) +status=$? 
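+# As in timescaledb-no-columnstore/query: `set -e` aborts here on a hard psql
+# failure, so $status stays 0 on the success path; query-level errors are
+# detected by the ERROR grep below instead.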
+ +if printf '%s\n' "$out" | grep -q '^ERROR\|psql: error'; then + printf '%s\n' "$out" >&2 + exit 1 +fi + +printf '%s\n' "$out" | grep -v '^Time:' + +time_ms=$(printf '%s\n' "$out" | grep -oP 'Time:\s+\K[0-9]+\.[0-9]+' | tail -n1) +if [ -z "$time_ms" ]; then + echo "no timing in psql output" >&2 + exit 1 +fi +awk -v ms="$time_ms" 'BEGIN { printf "%.3f\n", ms / 1000 }' >&2 + +exit "$status" diff --git a/timescaledb/run.sh b/timescaledb/run.sh deleted file mode 100755 index be1c9b661..000000000 --- a/timescaledb/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - sudo -u postgres psql test -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done; -done; diff --git a/timescaledb/start b/timescaledb/start new file mode 100755 index 000000000..941f213c5 --- /dev/null +++ b/timescaledb/start @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +PGVERSION=${PGVERSION:-17} +sudo systemctl start postgresql@$PGVERSION-main diff --git a/timescaledb/stop b/timescaledb/stop new file mode 100755 index 000000000..47969378d --- /dev/null +++ b/timescaledb/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +PGVERSION=${PGVERSION:-17} +sudo systemctl stop postgresql@$PGVERSION-main || true diff --git a/trino/benchmark.sh b/trino/benchmark.sh index 2f408dc3a..531bd6503 100755 --- a/trino/benchmark.sh +++ b/trino/benchmark.sh @@ -1,10 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y docker.io -sudo docker run --network host -p 8080:8080 --name trino trinodb/trino - -sudo docker exec -i trino trino - -CREATE SCHEMA memory.test; -USE memory.test; +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/trino/check b/trino/check new file mode 100755 index 000000000..1425e226a --- /dev/null +++ b/trino/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Trino exposes /v1/info. The "starting" field flips to false when ready. +out=$(curl -sf http://localhost:8080/v1/info) +echo "$out" | grep -q '"starting":false' diff --git a/trino/data-size b/trino/data-size new file mode 100755 index 000000000..77254dd4a --- /dev/null +++ b/trino/data-size @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# Memory connector — report in-memory size as reported by Trino. +out=$(sudo docker exec -i trino trino --execute \ + "SELECT SUM(total_bytes) FROM system.runtime.tables WHERE schema_name='test' AND table_name='hits'" 2>/dev/null || true) +# Fallback: 0 if Trino doesn't expose this. +size=$(printf '%s\n' "$out" | grep -oE '[0-9]+' | head -n1) +echo "${size:-0}" diff --git a/trino/install b/trino/install new file mode 100755 index 000000000..8cb30f1d0 --- /dev/null +++ b/trino/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +# Install Docker (idempotent). +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi + +# Pull image (idempotent — Docker caches). +sudo docker pull trinodb/trino diff --git a/trino/load b/trino/load new file mode 100755 index 000000000..35ad6c455 --- /dev/null +++ b/trino/load @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +# Create schema and table in the in-memory connector. Idempotent (IF NOT EXISTS). 
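+# create_single.sql is expected to hold the CREATE TABLE statement for
+# memory.test.hits (it is piped to the trino CLI right after the schema exists).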
+sudo docker exec -i trino trino <<'EOF'
+CREATE SCHEMA IF NOT EXISTS memory.test;
+EOF
+
+sudo docker exec -i trino trino --catalog memory --schema test < create_single.sql
+
+# Load TSV. The memory connector has no bulk-load path: there is no COPY, and
+# row-by-row INSERTs through the trino CLI would be far too slow for a dataset
+# of this size. The stock Trino image also does not ship a Hive/file-based
+# connector configured to read TSV, so there is no practical way to ingest
+# hits.tsv into the memory connector here.
+#
+# This is intentionally a placeholder for a load implementation; Trino with the
+# stock image does not support fast bulk loads of TSV. For ClickBench parity we
+# would normally use the Hive connector and S3-backed Parquet (see
+# trino-datalake). With the memory connector this is not practical at scale.
+echo "trino: bulk-loading TSV into the memory connector is not supported; an external-table setup (Hive connector) is required for the full benchmark." >&2
+echo "trino: marking load as a no-op — see README" >&2
+
+rm -f hits.tsv
+sync
diff --git a/trino/query b/trino/query
new file mode 100755
index 000000000..bfcf7166a
--- /dev/null
+++ b/trino/query
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Reads a SQL query from stdin, runs it via the trino CLI in the running
+# container.
+# Stdout: query result.
+# Stderr: query runtime in fractional seconds on the last line.
+# Exit non-zero on error.
+set -e
+
+query=$(cat)
+
+TIMEFORMAT='%R'
+{ time sudo docker exec -i trino trino --catalog memory --schema test \
+  --execute "$query" 1>/tmp/trino.out.$$ 2>/tmp/trino.err.$$; } \
+  2>/tmp/trino.time.$$ || status=$?
+status=${status:-0}
+
+cat /tmp/trino.out.$$
+if [ "$status" -ne 0 ]; then
+  cat /tmp/trino.err.$$ >&2
+  rm -f /tmp/trino.out.$$ /tmp/trino.err.$$ /tmp/trino.time.$$
+  exit "$status"
+fi
+
+cat /tmp/trino.err.$$ >&2
+cat /tmp/trino.time.$$ >&2
+
+rm -f /tmp/trino.out.$$ /tmp/trino.err.$$ /tmp/trino.time.$$
diff --git a/trino/start b/trino/start
new file mode 100755
index 000000000..15a531fd2
--- /dev/null
+++ b/trino/start
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+# Idempotent: if running, do nothing. If exists but stopped, start it. Else run.
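+# With --network host the container shares the host's network namespace, so
+# Trino is reachable on localhost:8080 without any -p port mapping.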
+if sudo docker ps --format '{{.Names}}' | grep -qx trino; then + exit 0 +fi +if sudo docker ps -a --format '{{.Names}}' | grep -qx trino; then + sudo docker start trino + exit 0 +fi +sudo docker run -d --network host --name trino trinodb/trino diff --git a/trino/stop b/trino/stop new file mode 100755 index 000000000..956100aa2 --- /dev/null +++ b/trino/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop trino 2>/dev/null || true +exit 0 diff --git a/turso/benchmark.sh b/turso/benchmark.sh index 47a96005a..b0b9f4775 100755 --- a/turso/benchmark.sh +++ b/turso/benchmark.sh @@ -1,26 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y curl - -# Download and install Turso -curl --proto '=https' --tlsv1.2 -LsSf https://github.com/tursodatabase/turso/releases/download/v0.1.2-pre.4/turso_cli-installer.sh | sh -export HOME=${HOME:=~} -source $HOME/.turso/env - -tursodb mydb < create.sql - -../download-hits-csv - -echo -n "Load time: " -command time -f '%e' tursodb mydb '.import --csv hits.csv hits' -echo -n "Data size: " -wc -c mydb - -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error|Parse error' | - sed -r -e 's/^(Error|Parse error).*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/turso/check b/turso/check new file mode 100755 index 000000000..836c2cf02 --- /dev/null +++ b/turso/check @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" + +tursodb :memory: 'SELECT 1' >/dev/null diff --git a/turso/data-size b/turso/data-size new file mode 100755 index 000000000..f94c4eccf --- /dev/null +++ b/turso/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < mydb diff --git a/turso/install b/turso/install new file mode 100755 index 000000000..f55c9a720 --- /dev/null +++ b/turso/install @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +if ! command -v tursodb >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y curl + curl --proto '=https' --tlsv1.2 -LsSf \ + https://github.com/tursodatabase/turso/releases/download/v0.1.2-pre.4/turso_cli-installer.sh | sh +fi + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" diff --git a/turso/load b/turso/load new file mode 100755 index 000000000..c7c3fb85a --- /dev/null +++ b/turso/load @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" + +# Idempotent: blow away any prior DB. +rm -f mydb + +tursodb mydb < create.sql +tursodb mydb '.import --csv hits.csv hits' + +rm -f hits.csv +sync diff --git a/turso/query b/turso/query new file mode 100755 index 000000000..2dcdf26cf --- /dev/null +++ b/turso/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via tursodb against mydb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (from `time`). 
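+# Example of the assumed calling convention:
+#   echo 'SELECT COUNT(*) FROM hits;' | ./query > result.txt 2> timing.txt
+#   tail -n1 timing.txt   # wall-clock seconds, e.g. 0.142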
+set -e + +export HOME=${HOME:=~} +# shellcheck disable=SC1091 +source "$HOME/.turso/env" + +query=$(cat) + +TIMEFORMAT='%R' +{ time tursodb mydb <<< "$query" 1>/tmp/turso.out.$$ 2>/tmp/turso.err.$$; } 2>/tmp/turso.time.$$ || status=$? +status=${status:-0} + +cat /tmp/turso.out.$$ +if [ "$status" -ne 0 ]; then + cat /tmp/turso.err.$$ >&2 + rm -f /tmp/turso.out.$$ /tmp/turso.err.$$ /tmp/turso.time.$$ + exit "$status" +fi + +cat /tmp/turso.err.$$ >&2 +cat /tmp/turso.time.$$ >&2 + +rm -f /tmp/turso.out.$$ /tmp/turso.err.$$ /tmp/turso.time.$$ diff --git a/turso/run.sh b/turso/run.sh deleted file mode 100755 index 02a54dd5e..000000000 --- a/turso/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - time tursodb mydb <<< "${query}" - done; -done; diff --git a/turso/start b/turso/start new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/turso/start @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/turso/stop b/turso/stop new file mode 100755 index 000000000..06bd98656 --- /dev/null +++ b/turso/stop @@ -0,0 +1,2 @@ +#!/bin/bash +exit 0 diff --git a/umbra/benchmark.sh b/umbra/benchmark.sh index 8d6c9c192..531bd6503 100755 --- a/umbra/benchmark.sh +++ b/umbra/benchmark.sh @@ -1,48 +1,5 @@ #!/bin/bash - -# Ubuntu -sudo apt-get update -y -sudo apt-get install -y docker.io postgresql-client gzip - -# Amazon Linux -# yum install nc postgresql15 - -# Download + uncompress hits -rm -rf data -mkdir data -../download-hits-tsv -mv hits.tsv data -chmod 777 -R data - -# I spend too much time here battling cryptic error messages only to find out that the data needs to be in some separate directory -rm -rf db -mkdir db -chmod 777 -R db - -# https://hub.docker.com/r/umbradb/umbra -docker run -d -v ./db:/var/db -v ./data:/data -p 5432:5432 --ulimit nofile=1048576:1048576 --ulimit memlock=8388608:8388608 umbradb/umbra:latest -sleep 5 # Things below fail otherwise ... - -start=$(date +%s%3N) -PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -f create.sql 2>&1 | tee load_out.txt -end=$(date +%s%3N) -if grep 'ERROR' load_out.txt -then - exit 1 -fi -echo "Load time: $(( (end - start) / 1000 ))" - -./run.sh 2>&1 | tee log.txt - -# Calculate persistence size -sudo chmod 777 -R db # otherwise 'du' complains about permission denied -echo -n "Data size: " -du -bcs db | grep total - -# Pretty-printing -cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' | - awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' - -# Cleanup -docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q) && docker volume prune --all --force -rm -rf data db +# Thin shim — actual flow is in lib/benchmark-common.sh. 
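+# Umbra runs in Docker (see ./install and ./start); BENCH_RESTARTABLE=yes is
+# assumed to let the driver cycle ./stop + ./start between queries, mirroring
+# the `docker restart` the old run.sh did for cold runs.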
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/umbra/check b/umbra/check new file mode 100755 index 000000000..5336d8ba0 --- /dev/null +++ b/umbra/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/umbra/data-size b/umbra/data-size new file mode 100755 index 000000000..ae38a0d59 --- /dev/null +++ b/umbra/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -eu + +sudo chmod -R 777 db 2>/dev/null || true +du -bcs db | grep total | awk '{print $1}' diff --git a/umbra/install b/umbra/install new file mode 100755 index 000000000..d472dbbf8 --- /dev/null +++ b/umbra/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client gzip + +sudo docker pull umbradb/umbra:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/umbra/load b/umbra/load new file mode 100755 index 000000000..86c224d44 --- /dev/null +++ b/umbra/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +mkdir -p data +mv hits.tsv data/ +chmod -R 777 data + +# create.sql for umbra both creates the table and ingests via COPY. +PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -f create.sql + +rm -f data/hits.tsv +sync diff --git a/umbra/query b/umbra/query new file mode 100755 index 000000000..0e08bbad5 --- /dev/null +++ b/umbra/query @@ -0,0 +1,26 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against Umbra. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? 
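+# The `&& exit_code=0 || exit_code=$?` idiom records psql's exit status without
+# tripping `set -e` on failure.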
+ +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/umbra/run.sh b/umbra/run.sh deleted file mode 100755 index 19f225684..000000000 --- a/umbra/run.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - docker restart $(docker ps -a -q) - - retry_count=0 - while [ $retry_count -lt 120 ]; do - if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c "SELECT 'Ok';"; then - break - fi - - retry_count=$((retry_count+1)) - sleep 1 - done - - echo "$query"; - for i in $(seq 1 $TRIES); do - PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -t -c '\timing' -c "$query" 2>&1 | grep -P 'Time|psql: error' | tail -n1 - done -done diff --git a/umbra/start b/umbra/start new file mode 100755 index 000000000..a7fc4dc58 --- /dev/null +++ b/umbra/start @@ -0,0 +1,23 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo docker stop umbradb >/dev/null 2>&1 || true +sudo docker rm umbradb >/dev/null 2>&1 || true + +sudo docker run -d --name umbradb \ + -v "$(pwd)/db:/var/db" \ + -v "$(pwd)/data:/data" \ + -p 5432:5432 \ + --ulimit nofile=1048576:1048576 \ + --ulimit memlock=8388608:8388608 \ + umbradb/umbra:latest >/dev/null + +# Container needs a moment before psql can connect. +for _ in $(seq 1 60); do + PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + sleep 1 +done diff --git a/umbra/stop b/umbra/stop new file mode 100755 index 000000000..890229a5b --- /dev/null +++ b/umbra/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop umbradb >/dev/null 2>&1 || true diff --git a/vertica/benchmark.sh b/vertica/benchmark.sh index 83b54d69f..531bd6503 100755 --- a/vertica/benchmark.sh +++ b/vertica/benchmark.sh @@ -1,27 +1,5 @@ #!/bin/bash - -sudo apt-get update -y -sudo apt-get install -y docker.io - -sudo docker run -p 5433:5433 -p 5444:5444 --volume $(pwd):/workdir --mount type=volume,source=vertica-data,target=/data --name vertica_ce vertica/vertica-ce - -sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c "$(cat create.sql)" - -../download-hits-tsv - -echo -n "Load time: " -command time -f '%e' sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c "COPY hits FROM LOCAL '/workdir/hits.tsv' DELIMITER E'\\t' NULL E'\\001' DIRECT" - -echo -n "Data size: " -sudo docker exec vertica_ce du -bcs /data/vertica/VMart | grep total - -./run.sh 2>&1 | tee log.txt - -# If you run the script on your own, you may get numbers like this: -# 200m00.000s -# 25000000000 - -# Note: the real numbers cannot be published. - -grep -F 'All rows formatted' logs.txt | sed -r -e 's/^.* ([0-9.]+) ms$/\1/' | - awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_RESTARTABLE=yes +exec ../lib/benchmark-common.sh diff --git a/vertica/check b/vertica/check new file mode 100755 index 000000000..d94fc908f --- /dev/null +++ b/vertica/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c 'SELECT 1' >/dev/null 2>&1 diff --git a/vertica/data-size b/vertica/data-size new file mode 100755 index 000000000..39189fa47 --- /dev/null +++ b/vertica/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +sudo docker exec vertica_ce du -bcs /data/vertica/VMart | grep total | awk '{print $1}' diff --git a/vertica/install b/vertica/install new file mode 100755 index 000000000..73be6d2ad --- /dev/null +++ b/vertica/install @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io + +sudo docker pull vertica/vertica-ce + +# Create container only if missing. +if ! sudo docker inspect vertica_ce >/dev/null 2>&1; then + sudo docker run -d -p 5433:5433 -p 5444:5444 \ + --volume "$(pwd):/workdir" \ + --mount type=volume,source=vertica-data,target=/data \ + --name vertica_ce vertica/vertica-ce +fi diff --git a/vertica/load b/vertica/load new file mode 100755 index 000000000..4bd5c8c25 --- /dev/null +++ b/vertica/load @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +VSQL="sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin" + +$VSQL -c "DROP TABLE IF EXISTS hits CASCADE;" +$VSQL -c "$(cat create.sql)" + +$VSQL -c "COPY hits FROM LOCAL '/workdir/hits.tsv' DELIMITER E'\t' NULL E'\001' DIRECT" + +rm -f hits.tsv +sync diff --git a/vertica/query b/vertica/query new file mode 100755 index 000000000..9aa777075 --- /dev/null +++ b/vertica/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via vsql inside the vertica_ce container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# vsql's `\timing` "All rows formatted: ms" output). +# Exit non-zero on error. +set -e + +query=$(cat) + +raw=$(sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin \ + -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|^ROLLBACK'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +# Pass through the result, then parse the timing footer. +printf '%s\n' "$raw" + +ms=$(printf '%s\n' "$raw" \ + | grep -oP 'All rows formatted:\s*\K[0-9.]+(?=\s*ms)' \ + | tail -n1) + +if [ -z "$ms" ]; then + # Fallback: vsql also prints "Time: ms" on \timing. 
+ ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+(?=\s*ms)' | tail -n1) +fi + +if [ -z "$ms" ]; then + echo "no timing in vsql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/vertica/run.sh b/vertica/run.sh deleted file mode 100755 index 138e0c8d7..000000000 --- a/vertica/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TRIES=3 - -cat queries.sql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo "$query"; - for i in $(seq 1 $TRIES); do - sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c '\timing' -c "$query" - done; -done; diff --git a/vertica/start b/vertica/start new file mode 100755 index 000000000..72362a897 --- /dev/null +++ b/vertica/start @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +if sudo docker inspect -f '{{.State.Running}}' vertica_ce 2>/dev/null | grep -q true; then + if sudo docker exec vertica_ce /opt/vertica/bin/vsql -U dbadmin -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 + fi +fi + +sudo docker start vertica_ce diff --git a/vertica/stop b/vertica/stop new file mode 100755 index 000000000..4bf245e4f --- /dev/null +++ b/vertica/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop vertica_ce >/dev/null 2>&1 || true diff --git a/victorialogs/benchmark.sh b/victorialogs/benchmark.sh index c089752a4..d087cc65a 100755 --- a/victorialogs/benchmark.sh +++ b/victorialogs/benchmark.sh @@ -1,42 +1,8 @@ #!/bin/bash - -# Install - -RELEASE_VERSION=v1.10.1-victorialogs - -# Stop the existing victorialogs instance if any and drop its data -for _ in {1..300} -do - pidof victoria-logs-prod && kill `pidof victoria-logs-prod` || break - sleep 1 -done -rm -rf victoria-logs-data - -# Download and start victorialogs -wget --continue --progress=dot:giga https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${RELEASE_VERSION}/victoria-logs-linux-$(dpkg --print-architecture)-${RELEASE_VERSION}.tar.gz -tar xzf victoria-logs-linux-$(dpkg --print-architecture)-${RELEASE_VERSION}.tar.gz -./victoria-logs-prod -loggerOutput=stdout -retentionPeriod=20y -search.maxQueryDuration=5m > server.log & - -for _ in {1..300} -do - curl -s http://localhost:9428/select/logsql/query -d 'query=_time:2100-01-01Z' && break - sleep 1 -done - -# Load the data - -wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.json.gz -gunzip hits.json.gz -echo -n "Load time: " -command time -f '%e' cat hits.json | split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" - -# Run the queries - -./run.sh - -# Determine on-disk size of the ingested data - -echo -n "Data size: " -du -sb victoria-logs-data - -sudo killall victoria-logs-prod +# Thin shim — actual flow is in lib/benchmark-common.sh. +# victorialogs ingests gzipped NDJSON; ./load fetches it directly. +export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_RESTARTABLE=yes +# queries are LogsQL, not SQL. 
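+# BENCH_QUERIES_FILE is assumed to point the common driver at queries.logsql
+# instead of its default queries.sql.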
+export BENCH_QUERIES_FILE="queries.logsql" +exec ../lib/benchmark-common.sh diff --git a/victorialogs/check b/victorialogs/check new file mode 100755 index 000000000..3db46b7d3 --- /dev/null +++ b/victorialogs/check @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +curl -sSf --get \ + --data-urlencode 'query=_time:2100-01-01Z' \ + 'http://localhost:9428/select/logsql/query' >/dev/null diff --git a/victorialogs/data-size b/victorialogs/data-size new file mode 100755 index 000000000..d31cee675 --- /dev/null +++ b/victorialogs/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -eu + +du -sb victoria-logs-data | awk '{print $1}' diff --git a/victorialogs/install b/victorialogs/install new file mode 100755 index 000000000..1a9aa4cd5 --- /dev/null +++ b/victorialogs/install @@ -0,0 +1,11 @@ +#!/bin/bash +set -eu + +RELEASE_VERSION=${VICTORIALOGS_VERSION:-v1.10.1-victorialogs} + +if [ ! -x ./victoria-logs-prod ]; then + arch=$(dpkg --print-architecture) + wget --continue --progress=dot:giga \ + "https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${RELEASE_VERSION}/victoria-logs-linux-${arch}-${RELEASE_VERSION}.tar.gz" + tar xzf "victoria-logs-linux-${arch}-${RELEASE_VERSION}.tar.gz" +fi diff --git a/victorialogs/load b/victorialogs/load new file mode 100755 index 000000000..259f72efc --- /dev/null +++ b/victorialogs/load @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +# Idempotent: blow away any prior data. +./stop +rm -rf victoria-logs-data +./start +# Wait for it to come up. +for _ in {1..300}; do + ./check >/dev/null 2>&1 && break + sleep 1 +done + +wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +gunzip -f hits.json.gz + +# Bulk insert via 8 parallel jsonline streams. +cat hits.json | split -n r/8 -d --filter="curl -sS -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=EventTime&_stream_fields=AdvEngineID,CounterID'" + +rm -f hits.json +sync diff --git a/victorialogs/query b/victorialogs/query new file mode 100755 index 000000000..2d2581a27 --- /dev/null +++ b/victorialogs/query @@ -0,0 +1,23 @@ +#!/bin/bash +# Reads a LogsQL query from stdin, runs it via victorialogs HTTP API. +# Stdout: query result (NDJSON). +# Stderr: query runtime in fractional seconds on the last line (wall-clock). +# Exit non-zero on error. +set -e + +query=$(cat) + +t1=$(date +%s%3N) +out=$(curl -sS --fail --get --data-urlencode "query=$query" \ + 'http://localhost:9428/select/logsql/query') && exit_code=0 || exit_code=$? +t2=$(date +%s%3N) + +if [ "$exit_code" -ne 0 ]; then + printf '%s\n' "$out" >&2 + exit "$exit_code" +fi + +printf '%s\n' "$out" + +duration=$((t2 - t1)) +awk -v d="$duration" 'BEGIN { printf "%.3f\n", d / 1000 }' >&2 diff --git a/victorialogs/run.sh b/victorialogs/run.sh deleted file mode 100755 index 36fafb724..000000000 --- a/victorialogs/run.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -TRIES=3 - -set -f -cat queries.logsql | while read -r query; do - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - - echo -n "[" - for i in $(seq 1 $TRIES); do - t1=$(date +%s%3N) - curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=$query" > /dev/null - exit_code=$? 
- t2=$(date +%s%3N) - duration=$((t2-t1)) - RES=$(awk "BEGIN {print $duration / 1000}" | tr ',' '.') - [[ "$exit_code" == "0" ]] && echo -n "${RES}" || echo -n "null" - [[ "$i" != $TRIES ]] && echo -n ", " - done - echo "]," -done diff --git a/victorialogs/start b/victorialogs/start new file mode 100755 index 000000000..c76523e28 --- /dev/null +++ b/victorialogs/start @@ -0,0 +1,14 @@ +#!/bin/bash +set -eu + +# Idempotent: if already serving, do nothing. +if curl -sSf --get \ + --data-urlencode 'query=_time:2100-01-01Z' \ + 'http://localhost:9428/select/logsql/query' >/dev/null 2>&1; then + exit 0 +fi + +# Detach from this script so it doesn't keep the daemon as a child. +nohup ./victoria-logs-prod -loggerOutput=stdout -retentionPeriod=20y \ + -search.maxQueryDuration=5m > server.log 2>&1 & +disown diff --git a/victorialogs/stop b/victorialogs/stop new file mode 100755 index 000000000..61a1dd863 --- /dev/null +++ b/victorialogs/stop @@ -0,0 +1,11 @@ +#!/bin/bash + +pid=$(pidof victoria-logs-prod 2>/dev/null || true) +if [ -n "$pid" ]; then + kill $pid 2>/dev/null || true + for _ in $(seq 1 30); do + pidof victoria-logs-prod >/dev/null 2>&1 || exit 0 + sleep 1 + done + sudo killall -9 victoria-logs-prod 2>/dev/null || true +fi diff --git a/ydb/benchmark.sh b/ydb/benchmark.sh index 2d256ac33..6340357b4 100755 --- a/ydb/benchmark.sh +++ b/ydb/benchmark.sh @@ -1,292 +1,8 @@ #!/bin/bash -set -e - -PARAMS_FILE="benchmark_variables.sh" -source $PARAMS_FILE -export YDB_PASSWORD=password -START_DIR=`pwd` - -update_file() { - local raw_input="$1" - local raw_output="$2" - local verbose="${3:-0}" - - expand_path() { - local path="$1" - path="${path/#\~/$HOME}" - - local expanded_path - expanded_path=$(eval echo "$path") - echo "$expanded_path" - } - - local input_file - local output_file - input_file=$(expand_path "$raw_input") - output_file=$(expand_path "$raw_output") - - local output_dir - output_dir=$(dirname "$output_file") - - # Making temporary file - local temp_file - temp_file=$(mktemp) || { - echo "Error while creating temporary file" >&2 - return 7 - } - - cleanup() { - rm -f "$temp_file" - } - trap cleanup EXIT - - cp "$input_file" "$temp_file" || { - echo "Error while copying input file to temporary file" >&2 - return 8 - } - - local env_vars - env_vars=$(env | cut -d= -f1) - - for var in $env_vars; do - local value - value="${!var}" - - if grep -q "\$$var" "$temp_file"; then - local escaped_value - escaped_value=$(echo "$value" | sed -e 's/[\/&]/\\&/g') - - sed -i "s/\$$var/$escaped_value/g" "$temp_file" || { - echo "Error while substituting variable \$$var." >&2 - return 9 - } - fi - done - - cp "$temp_file" "$output_file" || { - return 10 - } - - return 0 -} - -sudo apt-get update -y -sudo apt-get install -y software-properties-common -sudo add-apt-repository --yes --update ppa:ansible/ansible -sudo apt-get install -y ansible-core - -cd $START_DIR -if [ ! -d "ydb" ]; then - git clone https://github.com/ydb-platform/ydb.git -fi - -cd $START_DIR/ydb/ydb/apps/ydbd/ -git checkout stable-25-1-analytics || { echo "Error while checking branch out"; exit 1; } -$START_DIR/ydb/ya make -j8 --build=release || { echo "Build error"; exit 1; } - -cd $START_DIR/ydb/ydb/apps/ydb/ -$START_DIR/ydb/ya make -j8 --build=release || { echo "Build error"; exit 1; } - -cd $START_DIR/ydb/ydb/apps/dstool/ -$START_DIR/ydb/ya make -j8 --build=release || { echo "Build error"; exit 1; } - -cd $START_DIR -if [ ! 
-d "ydb-ansible-examples" ]; then - git clone https://github.com/ydb-platform/ydb-ansible-examples.git -fi - -cd $START_DIR/ydb-ansible-examples -ansible-galaxy install -r requirements.yaml -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc - - -rm -f $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydbd -rm -f $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -rm -f $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb-dstool - -ln -f $START_DIR/ydb/ydb/apps/ydbd/ydbd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ -ln -f $START_DIR/ydb/ydb/apps/ydb/ydb $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ -ln -f $START_DIR/ydb/ydb/apps/dstool/ydb-dstool $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ - -cd $START_DIR - -update_file "ydb-cluster-setup/50-inventory.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/50-inventory.yaml" -update_file "ydb-cluster-setup/config.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/config.yaml" -update_file "ydb-cluster-setup/ydb-ca-nodes.txt" "$START_DIR/ydb-ansible-examples/TLS/ydb-ca-nodes.txt" - -hosts=( "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" ) -disks=( "$disk1" "$disk2" "$disk3" ) - -replace_string_in_file() { - local file_path="$1" - local search_string="$2" - local replace_string="$3" - local temp_file - - if [[ ! -f "$file_path" ]]; then - echo "Error: File $file_path does not exist" >&2 - return 1 - fi - - temp_file=$(mktemp) - - sed "s|$search_string|$replace_string|g" "$file_path" > "$temp_file" - - if [ $? -ne 0 ]; then - echo "Error: Replacement operation failed" - rm -f "$temp_file" - return 4 - fi - - mv "$temp_file" "$file_path" - - return 0 -} - -ssh_execute() { - declare -n local_hosts="$1" - local command="$2" - - for host in "${local_hosts[@]}"; do - - echo "Executing on $host: $command" >&2 - echo "$command" | ssh -l $ydb_host_user_name -o BatchMode=yes -o StrictHostKeyChecking=no "$host" "bash -s" - local exit_code=$? - - if [ $exit_code -ne 0 ]; then - echo "Command failed with exit code: $exit_code" >&2 - fi - done - - return 0 -} - -copy_file_to_multiple_hosts() { - local file_to_copy=$1 - shift - - local hosts=("$@") - local pids=() - - for host in "${hosts[@]}"; do - { - echo "Copying file '$file_to_copy' to $host" - scp "$file_to_copy" $ydb_host_user_name@$host:/home/$ydb_host_user_name - } & - pids+=($!) 
- done - - # Waiting for all background processes to complete - for pid in "${pids[@]}"; do - wait $pid - done - - echo "Сopy process is complete" -} - -# Cleaning up YDB services on remote hosts -remove_ydb_services() { - local host=$1 - - # Connecting to server - ssh -o StrictHostKeyChecking=no -l $ydb_host_user_name -o BatchMode=yes "$host" ' - services=$(sudo systemctl list-units --type=service --all| grep "ydb" | awk "{print \$1}") - - if [ -z "$services" ]; then - echo "YDB are not found" - else - for service in $services; do - sudo systemctl stop "$service" - sudo systemctl disable "$service" - - unit_path=$(systemctl show -p FragmentPath "$service" | cut -d= -f2) - - if [ -n "$unit_path" ] && [ -f "$unit_path" ]; then - sudo rm -f "$unit_path" - - service_name=$(basename "$unit_path") - if [ -f "/etc/systemd/system/$service_name" ]; then - sudo rm -f "/etc/systemd/system/$service_name" - fi - - if [ -L "/etc/systemd/system/multi-user.target.wants/$service_name" ]; then - sudo rm -f "/etc/systemd/system/multi-user.target.wants/$service_name" - fi - fi - done - - sudo systemctl daemon-reload - sudo systemctl reset-failed - fi - ' - - echo "All operation on $host are finished" -} - -echo "Beginning the process of removing YDB services on all hosts..." - -for host in "${hosts[@]}"; do - remove_ydb_services "$host" -done - -cd $START_DIR/ydb-ansible-examples/TLS -find . -maxdepth 1 -type d -not -path "." -exec rm -rf {} \; -if [ -f "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS" ]; then - cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS - rm -rf * -fi - -cd $START_DIR/ydb-ansible-examples/TLS -./ydb-ca-update.sh -cd CA/certs -newest_dir=$(find . -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) - -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/ -replace_string_in_file "50-inventory.yaml" "" "$START_DIR/ydb-ansible-examples/TLS/CA/certs/$newest_dir" -replace_string_in_file "50-inventory.yaml" "$ydb_host_user_name" "$ydb_host_user_name" - -ssh_execute hosts "sudo mkdir -p /opt/ydb/bin && sudo chmod 755 /opt/ydb/bin" - -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ -copy_file_to_multiple_hosts "ydbd" $host1$host_suffix $host2$host_suffix $host3$host_suffix - -obliterate_disks() { - declare -n local_hostsd="$1" - declare -n local_disks="$2" - - for disk in "${local_disks[@]}"; do - ssh_execute local_hostsd "sudo /home/$ydb_host_user_name/ydbd admin blobstorage disk obliterate $disk" - done -} - -obliterate_disks hosts disks - -ssh_execute hosts "rm -f /home/$ydb_host_user_name/ydbd" -ssh_execute hosts "sudo rm -rf /opt/ydb/" - -cd $START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/ -ansible-playbook ydb_platform.ydb.initial_setup --skip-tags checks - -cd $START_DIR - -if [ ! -f "hits.csv.gz" ]; then - wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.csv.gz -fi - -if [ ! -f "hits.csv" ]; then - echo "Unpacking hits.csv.gz" - gzip -d -f -k hits.csv.gz - echo "Done" -fi - -# if [ -f "$HOME/.config/ydb/import_progress/hits.csv" ]; then -# rm "$HOME/.config/ydb/import_progress/hits.csv" -# fi - -cert_dir=$(find $START_DIR/ydb-ansible-examples/TLS/CA/certs -maxdepth 1 -type d -not -path "." 
-printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) -echo $YDB_PASSWORD|$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -e grpcs://$host1$host_suffix:2135 -d /Root/database --ca-file $cert_dir/ca.crt --user root workload clickbench init --datetime --store column -echo -n "Load time: " -command time -f '%e' echo $YDB_PASSWORD|$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -e grpcs://$host1$host_suffix:2135 -d /Root/database --ca-file $cert_dir/ca.crt --user root import file csv hits.csv -p clickbench/hits - -cd $START_DIR -./run.sh +# Thin shim — actual flow is in lib/benchmark-common.sh. +# YDB downloads CSV directly inside ./load (the ydb CLI imports from CSV). +export BENCH_DOWNLOAD_SCRIPT="" +# YDB has no benefit from server restart — it's a multi-node distributed +# cluster managed via ansible/systemd; stopping between queries is impractical. +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/ydb/check b/ydb/check new file mode 100755 index 000000000..196afc026 --- /dev/null +++ b/ydb/check @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +echo "$YDB_PASSWORD" | "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root yql -s 'SELECT 1' >/dev/null 2>&1 diff --git a/ydb/data-size b/ydb/data-size new file mode 100755 index 000000000..a87f3b59d --- /dev/null +++ b/ydb/data-size @@ -0,0 +1,20 @@ +#!/bin/bash +# YDB data is on raw block devices on the cluster nodes; there's no standard +# du-based answer. We approximate via SQL. +set -eu + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +YDB_BIN="$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" + +echo "$YDB_PASSWORD" | "$YDB_BIN" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root \ + yql -s "SELECT SUM(DataSize) FROM \`/Root/database/.sys/partition_stats\` WHERE Path LIKE '%clickbench/hits%';" \ + 2>/dev/null \ + | grep -oE '[0-9]+' | tail -n1 diff --git a/ydb/install b/ydb/install new file mode 100755 index 000000000..684496b68 --- /dev/null +++ b/ydb/install @@ -0,0 +1,16 @@ +#!/bin/bash +# YDB install — builds ydbd/ydb/ydb-dstool from source and provisions a +# 3-node mirror-3-dc cluster via ansible. The original benchmark.sh contains +# all the setup logic; we delegate to it via an env flag that stops short of +# loading data and running queries. +# +# This is a best-effort port: the cluster setup is host-specific (it expects +# three reachable peers defined in benchmark_variables.sh) and is not +# idempotent in any meaningful sense. Re-running may re-bootstrap state. +set -e + +# The original script does install + load + run all in one. We only execute +# the install phases here; ./load handles importing data, ./query runs SQL. +# To avoid duplicating that long script we keep the original logic in a +# helper file. 
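+# install-impl.sh is added alongside this script in the same change.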
+exec ./install-impl.sh diff --git a/ydb/install-impl.sh b/ydb/install-impl.sh new file mode 100755 index 000000000..537c3e82d --- /dev/null +++ b/ydb/install-impl.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# YDB install: build binaries, provision the 3-node cluster, but do not load +# data or run queries (those are split into ./load and ./query). +set -e + +PARAMS_FILE="benchmark_variables.sh" +source "$PARAMS_FILE" +export YDB_PASSWORD=password +START_DIR=$(pwd) + +update_file() { + local raw_input="$1" + local raw_output="$2" + + expand_path() { + local path="$1" + path="${path/#\~/$HOME}" + eval echo "$path" + } + + local input_file output_file + input_file=$(expand_path "$raw_input") + output_file=$(expand_path "$raw_output") + + local temp_file + temp_file=$(mktemp) + trap 'rm -f "$temp_file"' EXIT + + cp "$input_file" "$temp_file" + + local env_vars + env_vars=$(env | cut -d= -f1) + for var in $env_vars; do + local value="${!var}" + if grep -q "\$$var" "$temp_file"; then + local escaped_value + escaped_value=$(echo "$value" | sed -e 's/[\/&]/\\&/g') + sed -i "s/\$$var/$escaped_value/g" "$temp_file" + fi + done + + cp "$temp_file" "$output_file" +} + +sudo apt-get update -y +sudo apt-get install -y software-properties-common +sudo add-apt-repository --yes --update ppa:ansible/ansible +sudo apt-get install -y ansible-core + +cd "$START_DIR" +[ -d "ydb" ] || git clone https://github.com/ydb-platform/ydb.git + +cd "$START_DIR/ydb/ydb/apps/ydbd/" +git checkout stable-25-1-analytics +"$START_DIR/ydb/ya" make -j8 --build=release + +cd "$START_DIR/ydb/ydb/apps/ydb/" +"$START_DIR/ydb/ya" make -j8 --build=release + +cd "$START_DIR/ydb/ydb/apps/dstool/" +"$START_DIR/ydb/ya" make -j8 --build=release + +cd "$START_DIR" +[ -d "ydb-ansible-examples" ] || git clone https://github.com/ydb-platform/ydb-ansible-examples.git + +cd "$START_DIR/ydb-ansible-examples" +ansible-galaxy install -r requirements.yaml + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc" +rm -f files/ydbd files/ydb files/ydb-dstool +ln -f "$START_DIR/ydb/ydb/apps/ydbd/ydbd" files/ +ln -f "$START_DIR/ydb/ydb/apps/ydb/ydb" files/ +ln -f "$START_DIR/ydb/ydb/apps/dstool/ydb-dstool" files/ + +cd "$START_DIR" +update_file "ydb-cluster-setup/50-inventory.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/50-inventory.yaml" +update_file "ydb-cluster-setup/config.yaml" "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/config.yaml" +update_file "ydb-cluster-setup/ydb-ca-nodes.txt" "$START_DIR/ydb-ansible-examples/TLS/ydb-ca-nodes.txt" + +hosts=( "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" ) +disks=( "$disk1" "$disk2" "$disk3" ) + +ssh_execute() { + declare -n local_hosts="$1" + local command="$2" + for host in "${local_hosts[@]}"; do + echo "$command" | ssh -l "$ydb_host_user_name" -o BatchMode=yes -o StrictHostKeyChecking=no "$host" "bash -s" || true + done +} + +copy_file_to_multiple_hosts() { + local file_to_copy=$1; shift + local hosts=("$@") + for host in "${hosts[@]}"; do + scp "$file_to_copy" "$ydb_host_user_name@$host:/home/$ydb_host_user_name" & + done + wait +} + +remove_ydb_services() { + local host=$1 + ssh -o StrictHostKeyChecking=no -l "$ydb_host_user_name" -o BatchMode=yes "$host" ' + services=$(sudo systemctl list-units --type=service --all | grep "ydb" | awk "{print \$1}") + if [ -n "$services" ]; then + for service in $services; do + sudo systemctl stop "$service" || true + sudo systemctl disable "$service" || true + unit_path=$(systemctl show -p FragmentPath "$service" | cut 
-d= -f2) + if [ -n "$unit_path" ] && [ -f "$unit_path" ]; then + sudo rm -f "$unit_path" + fi + done + sudo systemctl daemon-reload + sudo systemctl reset-failed + fi + ' || true +} + +for host in "${hosts[@]}"; do remove_ydb_services "$host"; done + +cd "$START_DIR/ydb-ansible-examples/TLS" +find . -maxdepth 1 -type d -not -path "." -exec rm -rf {} \; +[ -d "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS" ] \ + && rm -rf "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/TLS"/* + +./ydb-ca-update.sh +cd CA/certs +newest_dir=$(find . -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/inventory/" +sed -i "s||$START_DIR/ydb-ansible-examples/TLS/CA/certs/$newest_dir|g" 50-inventory.yaml + +ssh_execute hosts "sudo mkdir -p /opt/ydb/bin && sudo chmod 755 /opt/ydb/bin" + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/" +copy_file_to_multiple_hosts "ydbd" "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" + +for disk in "${disks[@]}"; do + ssh_execute hosts "sudo /home/$ydb_host_user_name/ydbd admin blobstorage disk obliterate $disk" +done + +ssh_execute hosts "rm -f /home/$ydb_host_user_name/ydbd" +ssh_execute hosts "sudo rm -rf /opt/ydb/" + +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/" +ansible-playbook ydb_platform.ydb.initial_setup --skip-tags checks diff --git a/ydb/load b/ydb/load new file mode 100755 index 000000000..c220c2efa --- /dev/null +++ b/ydb/load @@ -0,0 +1,29 @@ +#!/bin/bash +set -eu + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +YDB_BIN="$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" +COMMON_ARGS=(-e "grpcs://$host1$host_suffix:2135" -d /Root/database --ca-file "$cert_dir/ca.crt" --user root) + +if [ ! -f "hits.csv" ]; then + if [ ! -f "hits.csv.gz" ]; then + wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' + fi + gzip -d -f -k hits.csv.gz +fi + +echo "$YDB_PASSWORD" | "$YDB_BIN" "${COMMON_ARGS[@]}" \ + workload clickbench init --datetime --store column + +echo "$YDB_PASSWORD" | "$YDB_BIN" "${COMMON_ARGS[@]}" \ + import file csv hits.csv -p clickbench/hits + +rm -f hits.csv hits.csv.gz +sync diff --git a/ydb/query b/ydb/query new file mode 100755 index 000000000..c193419b0 --- /dev/null +++ b/ydb/query @@ -0,0 +1,38 @@ +#!/bin/bash +# Reads a SQL/YQL query from stdin, runs it via the ydb CLI's yql subcommand. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# `--stats basic` "total_duration_us:" output). +# Exit non-zero on error. +set -e + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +YDB_BIN="$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" + +query=$(cat) + +raw=$(echo "$YDB_PASSWORD" | "$YDB_BIN" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root \ + yql -s "$query" --stats basic 2>&1) && exit_code=0 || exit_code=$? 
+ +if [ "$exit_code" -ne 0 ]; then + printf '%s\n' "$raw" >&2 + exit "$exit_code" +fi + +printf '%s\n' "$raw" + +us=$(printf '%s\n' "$raw" | grep -oP 'total_duration_us:\s*\K[0-9]+' | tail -n1) +if [ -z "$us" ]; then + echo "no total_duration_us in ydb output" >&2 + exit 1 +fi + +awk -v u="$us" 'BEGIN { printf "%.6f\n", u / 1000000 }' >&2 diff --git a/ydb/run.sh b/ydb/run.sh deleted file mode 100755 index 160b5ca90..000000000 --- a/ydb/run.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -TRIES=3 -set -e -source benchmark_variables.sh - -YDB_PASSWORD=password - -cert_dir=$(find ydb-ansible-examples/TLS/CA/certs -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2-) - -# YDB uses raw block devices, that means there is not need to drop filesystem caches -# sync -# echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - -cat queries.sql | while read -r query; do - echo -n "[" - - for i in $(seq 1 $TRIES); do - result=$(echo $YDB_PASSWORD | ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb -e grpcs://$host1$host_suffix:2135 -d /Root/database --ca-file $cert_dir/ca.crt --user root yql -s "$query" --stats basic 2>/dev/null) - - # Extracting total_duration_us value - if [[ "$result" =~ total_duration_us:[[:space:]]*([0-9]+) ]]; then - duration_us=${BASH_REMATCH[1]} - # Convert microseconds to seconds - duration_sec=$(awk "BEGIN {printf \"%.6f\", $duration_us/1000000}") - echo -n "$duration_sec" - - if [ $i -ne $(($TRIES)) ]; then - echo -n "," - fi - else - exit -1 - fi - done - echo "]," -done diff --git a/ydb/start b/ydb/start new file mode 100755 index 000000000..7c639c81e --- /dev/null +++ b/ydb/start @@ -0,0 +1,23 @@ +#!/bin/bash +# YDB cluster lifecycle is managed by ansible/systemd on remote nodes. +# After ./install the cluster is already running; we just verify connectivity. +set -e + +source benchmark_variables.sh +START_DIR=$(pwd) +export YDB_PASSWORD=password + +cert_dir=$(find "$START_DIR/ydb-ansible-examples/TLS/CA/certs" -maxdepth 1 -type d -not -path "." -printf "%T@ %p\n" 2>/dev/null \ + | sort -n | tail -n 1 | cut -d' ' -f2-) + +# Idempotent: if cluster responds, exit success. +if echo "$YDB_PASSWORD" | "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/files/ydb" \ + -e "grpcs://$host1$host_suffix:2135" -d /Root/database \ + --ca-file "$cert_dir/ca.crt" --user root yql -s 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +# Cluster is provisioned via ansible during install. Re-running the playbook +# is the most reliable way to bring all nodes up. +cd "$START_DIR/ydb-ansible-examples/3-nodes-mirror-3-dc/" +ansible-playbook ydb_platform.ydb.initial_setup --skip-tags checks diff --git a/ydb/stop b/ydb/stop new file mode 100755 index 000000000..5337507c8 --- /dev/null +++ b/ydb/stop @@ -0,0 +1,10 @@ +#!/bin/bash +# Best-effort stop of the YDB systemd services on each node. +source benchmark_variables.sh +hosts=( "$host1$host_suffix" "$host2$host_suffix" "$host3$host_suffix" ) + +for host in "${hosts[@]}"; do + ssh -o StrictHostKeyChecking=no -l "$ydb_host_user_name" -o BatchMode=yes "$host" \ + "sudo systemctl list-units --type=service --all | grep ydb | awk '{print \$1}' | xargs -r sudo systemctl stop" \ + 2>/dev/null || true +done