Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
21 changes: 21 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,24 @@
*.parquet
hits.csv
hits.tsv

# Per-system runtime artifacts produced by benchmark.sh
result.csv
log.txt
load_out.txt
server.log
server.pid
arc_token.txt
data-size.txt
.doris_home
.sirius_env

# Per-system data files
hits.db
mydb
hits.hyper
hits.vortex
*.vortex

# Python venvs created by install scripts
myenv/
207 changes: 4 additions & 203 deletions arc/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,204 +1,5 @@
#!/bin/bash
# Arc ClickBench Complete Benchmark Script (Go Binary Version)
set -e

# ============================================================
# 1. INSTALL ARC FROM .DEB PACKAGE
# ============================================================
echo "Installing Arc from .deb package..."

# Fetch latest Arc version from GitHub releases
# NOTE(review): grep-scraping the JSON is fragile — a formatting change in
# the API response breaks version detection; jq would be sturdier.
echo "Fetching latest Arc version..."
ARC_VERSION=$(curl -s https://api.github.com/repos/Basekick-Labs/arc/releases/latest | grep -oP '"tag_name": "v\K[^"]+')
if [ -z "$ARC_VERSION" ]; then
echo "Error: Could not fetch latest Arc version from GitHub"
exit 1
fi
echo "Latest Arc version: $ARC_VERSION"

# Pick the .deb matching the CPU architecture (arm64 vs amd64).
ARCH=$(uname -m)
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/arc_${ARC_VERSION}_arm64.deb"
DEB_FILE="arc_${ARC_VERSION}_arm64.deb"
else
DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/arc_${ARC_VERSION}_amd64.deb"
DEB_FILE="arc_${ARC_VERSION}_amd64.deb"
fi

echo "Detected architecture: $ARCH -> $DEB_FILE"

# Reuse a previously downloaded package if present.
# NOTE(review): a failed `wget -O` still leaves a partial file behind, which
# this existence check would then treat as a complete download on rerun.
if [ ! -f "$DEB_FILE" ]; then
wget -q "$DEB_URL" -O "$DEB_FILE"
fi

# dpkg may fail on missing dependencies; apt-get -f resolves and finishes.
sudo dpkg -i "$DEB_FILE" || sudo apt-get install -f -y
echo "[OK] Arc installed"

# ============================================================
# 2. PRINT SYSTEM INFO (Arc defaults)
# ============================================================
# Informational only — these values are printed, not applied to Arc's
# configuration here (presumably Arc derives the same defaults itself;
# verify against Arc's docs).
CORES=$(nproc)
TOTAL_MEM_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}')
TOTAL_MEM_GB=$((TOTAL_MEM_KB / 1024 / 1024))
MEM_LIMIT_GB=$((TOTAL_MEM_GB * 80 / 100)) # 80% of system RAM

echo ""
echo "System Configuration:"
echo " CPU cores: $CORES"
echo " Connections: $((CORES * 2)) (cores × 2)"
echo " Threads: $CORES (same as cores)"
echo " Memory limit: ${MEM_LIMIT_GB}GB (80% of ${TOTAL_MEM_GB}GB total)"
echo ""

# ============================================================
# 3. START ARC AND CAPTURE TOKEN FROM LOGS
# ============================================================
echo "Starting Arc service..."

# Check if we already have a valid token from a previous run
if [ -f "arc_token.txt" ]; then
EXISTING_TOKEN=$(cat arc_token.txt)
echo "Found existing token file, will verify after Arc starts..."
fi

sudo systemctl start arc

# Wait for Arc to be ready — poll the unauthenticated /health endpoint
# once per second for up to 30 seconds; on timeout dump the service
# journal to aid debugging and abort.
echo "Waiting for Arc to be ready..."
for i in {1..30}; do
if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
echo "[OK] Arc is ready!"
break
fi
if [ $i -eq 30 ]; then
echo "Error: Arc failed to start within 30 seconds"
sudo journalctl -u arc --no-pager | tail -50
exit 1
fi
sleep 1
done

# Try to get token - either from existing file or from logs (first run)
ARC_TOKEN=""

# First, check if existing token works by sending it with a health probe.
if [ -n "$EXISTING_TOKEN" ]; then
if curl -sf http://localhost:8000/health -H "x-api-key: $EXISTING_TOKEN" > /dev/null 2>&1; then
ARC_TOKEN="$EXISTING_TOKEN"
echo "[OK] Using existing token from arc_token.txt"
else
echo "Existing token invalid, looking for new token in logs..."
fi
fi

# If no valid token yet, try to extract from logs (first run scenario).
# Arc prints the admin token once on first start; persist it so later
# runs can reuse it after the journal entry has rotated away.
if [ -z "$ARC_TOKEN" ]; then
ARC_TOKEN=$(sudo journalctl -u arc --no-pager | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' | head -1)
if [ -n "$ARC_TOKEN" ]; then
echo "[OK] Captured new token from logs"
echo "$ARC_TOKEN" > arc_token.txt
else
echo "Error: Could not find or validate API token"
echo "If this is not the first run, Arc's database may need to be reset:"
echo " sudo rm -rf /var/lib/arc/data/arc.db"
exit 1
fi
fi

# Print only a prefix of the secret to avoid leaking the full token in logs.
echo "Token: ${ARC_TOKEN:0:20}..."

# ============================================================
# 4. DOWNLOAD DATASET
# ============================================================
DATASET_FILE="hits.parquet"
DATASET_URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
# Known byte size of the upstream parquet file; used as a completeness check.
EXPECTED_SIZE=14779976446

# Download only if missing or size-mismatched (interrupted download).
# stat -c%s is GNU coreutils; stat -f%z is the BSD/macOS fallback.
if [ -f "$DATASET_FILE" ]; then
CURRENT_SIZE=$(stat -c%s "$DATASET_FILE" 2>/dev/null || stat -f%z "$DATASET_FILE" 2>/dev/null)
if [ "$CURRENT_SIZE" -eq "$EXPECTED_SIZE" ]; then
echo "[OK] Dataset already downloaded (14GB)"
else
echo "Re-downloading dataset (size mismatch)..."
rm -f "$DATASET_FILE"
wget --continue --progress=dot:giga "$DATASET_URL"
fi
else
echo "Downloading ClickBench dataset (14GB)..."
wget --continue --progress=dot:giga "$DATASET_URL"
fi

# ============================================================
# 5. LOAD DATA INTO ARC
# ============================================================
# "Loading" is a plain copy into Arc's data tree; no ingestion command runs.
echo "Loading data into Arc..."

# Determine Arc's data directory (default: /var/lib/arc/data)
ARC_DATA_DIR="/var/lib/arc/data"
TARGET_DIR="$ARC_DATA_DIR/clickbench/hits"
TARGET_FILE="$TARGET_DIR/hits.parquet"

sudo mkdir -p "$TARGET_DIR"

# Skip the 14GB copy when source and target sizes already match.
# NOTE(review): target stat runs unprivileged; if /var/lib/arc is
# root-only this yields an empty size and forces a redundant re-copy.
if [ -f "$TARGET_FILE" ]; then
SOURCE_SIZE=$(stat -c%s "$DATASET_FILE" 2>/dev/null || stat -f%z "$DATASET_FILE" 2>/dev/null)
TARGET_SIZE=$(stat -c%s "$TARGET_FILE" 2>/dev/null || stat -f%z "$TARGET_FILE" 2>/dev/null)
if [ "$SOURCE_SIZE" -eq "$TARGET_SIZE" ]; then
echo "[OK] Data already loaded"
else
echo "Reloading data (size mismatch)..."
sudo cp "$DATASET_FILE" "$TARGET_FILE"
fi
else
sudo cp "$DATASET_FILE" "$TARGET_FILE"
echo "[OK] Data loaded to $TARGET_FILE"
fi

# ============================================================
# 6. SET ENVIRONMENT AND RUN BENCHMARK
# ============================================================
# run.sh reads these to reach the Arc HTTP API (presumably the standard
# ClickBench query driver — verify against run.sh).
export ARC_URL="http://localhost:8000"
export ARC_API_KEY="$ARC_TOKEN"
export DATABASE="clickbench"
export TABLE="hits"

echo ""
echo "Running ClickBench queries (true cold runs)..."
echo "================================================"
# Capture full query output for the results-formatting step below.
./run.sh 2>&1 | tee log.txt

# ============================================================
# 7. STOP ARC AND FORMAT RESULTS
# ============================================================
echo "Stopping Arc..."
sudo systemctl stop arc

# Format results as proper JSON array: pick timing lines (a float or the
# literal "null" at line start) and group them 3 per query, as
# "[t1, t2, t3]," rows — assumes run.sh emits exactly 3 timings per query.
# NOTE(review): every row gets a trailing comma, including the last one;
# downstream tooling presumably tolerates or strips it — confirm.
cat log.txt | grep -oE '^[0-9]+\.[0-9]+|^null' | \
awk '{
if (NR % 3 == 1) printf "[";
printf "%s", $1;
if (NR % 3 == 0) print "],";
else printf ", ";
}' > results.txt

echo ""
echo "[OK] Benchmark complete!"
echo "================================================"
# Load time is reported as 0 because "loading" was a file copy, and the
# data size reported is the raw parquet size constant from section 4.
echo "Load time: 0"
echo "Data size: $EXPECTED_SIZE"
cat results.txt
echo "================================================"

# ============================================================
# 8. CLEANUP
# ============================================================
echo "Cleaning up..."

# Uninstall Arc package (best-effort; ignore failure if not installed)
sudo dpkg -r arc || true

# Remove Arc data directory — also deletes arc.db and the loaded dataset.
sudo rm -rf /var/lib/arc

echo "[OK] Cleanup complete"
# Thin shim — actual flow is in lib/benchmark-common.sh.
# Select the single-file parquet download helper for this system.
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
# Presumably signals that the benchmark can resume a partial run — confirm
# against lib/benchmark-common.sh.
export BENCH_RESTARTABLE=yes
# NOTE(review): relative path assumes the script is invoked with cwd = arc/.
exec ../lib/benchmark-common.sh
11 changes: 11 additions & 0 deletions arc/check
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Health probe for Arc: exit 0 iff GET $ARC_URL/health succeeds.
# When a stored API token exists (arc_token.txt), send it as x-api-key;
# otherwise probe unauthenticated.
set -e

ARC_URL="${ARC_URL:-http://localhost:8000}"

# Assemble the curl invocation incrementally so both paths share one call.
curl_args=(-sf "$ARC_URL/health")

token=$(cat arc_token.txt 2>/dev/null || true)
if [ -n "$token" ]; then
  curl_args+=(-H "x-api-key: $token")
fi

curl "${curl_args[@]}" >/dev/null
10 changes: 10 additions & 0 deletions arc/data-size
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Report the size in bytes of the parquet file staged in Arc's data
# directory. Falls back to the known upstream dataset size when the
# file is absent.
set -e

readonly PARQUET="/var/lib/arc/data/clickbench/hits/hits.parquet"

# Guard clause: no staged file -> emit the canonical size and stop.
if [ ! -f "$PARQUET" ]; then
  echo 14779976446
  exit 0
fi

# sudo: /var/lib/arc may be readable by root only.
sudo stat -c%s "$PARQUET"
28 changes: 28 additions & 0 deletions arc/install
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Install Arc from the latest GitHub .deb release. Idempotent: exits
# immediately if the package is installed and reuses a previously
# downloaded .deb.
set -e

# 'ii' = package in desired+actual state "installed".
if dpkg -l arc 2>/dev/null | grep -q '^ii '; then
  exit 0
fi

# Resolve the latest release tag, e.g. "v1.2.3" -> "1.2.3".
# jq (already required by arc/query) parses the JSON robustly instead of
# grep-scraping; curl -f makes HTTP errors fail the substitution.
ARC_VERSION=$(curl -sf https://api.github.com/repos/Basekick-Labs/arc/releases/latest \
  | jq -r '.tag_name | ltrimstr("v")')
if [ -z "$ARC_VERSION" ] || [ "$ARC_VERSION" = "null" ]; then
  echo "Error: Could not fetch latest Arc version from GitHub" >&2
  exit 1
fi

# Pick the package matching the CPU architecture.
ARCH=$(uname -m)
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
  DEB_FILE="arc_${ARC_VERSION}_arm64.deb"
else
  DEB_FILE="arc_${ARC_VERSION}_amd64.deb"
fi
DEB_URL="https://github.com/Basekick-Labs/arc/releases/download/v${ARC_VERSION}/${DEB_FILE}"

# Download to a temporary name and rename only on success, so an
# interrupted/failed download can never be mistaken for a complete .deb
# by the existence check on a rerun.
if [ ! -f "$DEB_FILE" ]; then
  wget -q "$DEB_URL" -O "${DEB_FILE}.part"
  mv "${DEB_FILE}.part" "$DEB_FILE"
fi

# dpkg may fail on missing dependencies; apt-get -f pulls them in and
# completes the installation.
sudo dpkg -i "$DEB_FILE" || sudo apt-get install -f -y
20 changes: 20 additions & 0 deletions arc/load
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Stage the ClickBench parquet file into Arc's data directory; Arc picks
# it up and indexes it on startup. Idempotent: skips the 14GB copy when
# the target already matches, and tolerates a missing local hits.parquet
# on restart runs (this script deletes the source at the end to free
# disk space, so a rerun must not assume it exists).
set -e

ARC_DATA_DIR="/var/lib/arc/data"
TARGET_DIR="$ARC_DATA_DIR/clickbench/hits"
TARGET_FILE="$TARGET_DIR/hits.parquet"

sudo mkdir -p "$TARGET_DIR"

if [ ! -f hits.parquet ]; then
  # Source already consumed by a previous run: the staged copy must exist,
  # otherwise there is nothing to load and we must fail loudly.
  # sudo: /var/lib/arc may be readable by root only.
  sudo test -f "$TARGET_FILE"
elif [ -f "$TARGET_FILE" ] && \
     [ "$(stat -c%s hits.parquet)" -eq "$(sudo stat -c%s "$TARGET_FILE")" ]; then
  : # already loaded — sizes match, skip the copy
else
  sudo cp hits.parquet "$TARGET_FILE"
fi

# Free up local space.
rm -f hits.parquet
sync
49 changes: 49 additions & 0 deletions arc/query
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Reads a SQL query from stdin, POSTs it to Arc's HTTP API.
# Stdout: query response body (JSON).
# Stderr: query runtime in fractional seconds on the last line (extracted
# from Arc's journal log line `execution_time_ms=N`).
# Exit non-zero on error.
set -e

ARC_URL="${ARC_URL:-http://localhost:8000}"
# '|| true' keeps a missing token file from killing the script under
# set -e (a failing command substitution in an assignment is fatal);
# matches the guard used in arc/check.
ARC_API_KEY="${ARC_API_KEY:-$(cat arc_token.txt 2>/dev/null || true)}"

query=$(cat)

# Build JSON payload with proper escaping.
JSON_PAYLOAD=$(jq -Rs '{sql: .}' <<<"$query")

# Mark journal position so we can locate the matching execution_time_ms
# entry. journalctl --since parses bare timestamps as LOCAL time, so the
# marker must be local as well — 'date -u' would skew the window by the
# UTC offset and (east of UTC never, west of UTC always) place the marker
# in the future, missing the entry entirely.
LOG_MARKER=$(date +"%Y-%m-%d %H:%M:%S")

RESPONSE=$(curl -s -w "\n%{http_code}" \
  -X POST "$ARC_URL/api/v1/query" \
  -H "x-api-key: $ARC_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$JSON_PAYLOAD" \
  --max-time 300)

# curl -w appended "\n<code>": the status code is everything after the
# last newline, the body everything before it. Parameter expansion is
# portable (avoids GNU-only 'head -n -1').
HTTP_CODE=${RESPONSE##*$'\n'}
BODY=${RESPONSE%$'\n'*}

if [ "$HTTP_CODE" != "200" ]; then
  printf 'arc query failed: HTTP %s\n%s\n' "$HTTP_CODE" "$BODY" >&2
  exit 1
fi

# Result body to stdout.
printf '%s\n' "$BODY"

# Extract execution_time_ms from Arc's journal. The log line can lag the
# HTTP response, so poll briefly (up to ~2s) instead of one fixed sleep.
EXEC_MS=""
for _ in {1..20}; do
  EXEC_MS=$(sudo journalctl -u arc --since="$LOG_MARKER" --no-pager 2>/dev/null \
    | grep -oP 'execution_time_ms=\K[0-9]+' | tail -1)
  if [ -n "$EXEC_MS" ]; then
    break
  fi
  sleep 0.1
done

if [ -z "$EXEC_MS" ]; then
  echo "Could not extract execution_time_ms from arc journal" >&2
  exit 1
fi

# Convert ms -> seconds and emit on stderr.
awk -v ms="$EXEC_MS" 'BEGIN { printf "%.4f\n", ms / 1000 }' >&2
Loading