diff --git a/spark-gluten-clickhouse/README.md b/spark-gluten-clickhouse/README.md
new file mode 100644
index 000000000..6dc439ada
--- /dev/null
+++ b/spark-gluten-clickhouse/README.md
@@ -0,0 +1,29 @@
+This entry runs Apache Spark with the [Apache Gluten](https://gluten.apache.org/) plugin configured to use the **ClickHouse backend** ('ch'). Gluten loads `libch.so` (a fork of ClickHouse v23.1) into the Spark executor JVM and runs the columnar physical plan natively through it. See also [`spark-gluten/`](../spark-gluten/) (Velox backend) and the [accelerators README](../spark/README-accelerators.md).
+
+## Run
+
+`./benchmark.sh` builds everything from source (no pre-built bundle is published for the CH backend) and then runs all 43 queries. Optional first argument is the machine spec, e.g. `./benchmark.sh c6a.8xlarge`.
+
+## Notes
+
+### Build
+
+The CH backend is not part of Apache Gluten's release tarball — only the Velox bundle is published. As a result `benchmark.sh` builds two things from source:
+
+1. **`libch.so`** — built from [Kyligence/ClickHouse](https://github.com/Kyligence/ClickHouse) at the branch pinned in `gluten/cpp-ch/clickhouse.version`. The build uses Clang 18 / cmake / ninja.
+2. **The Gluten Spark plugin** — built via Maven with `-P backends-clickhouse,spark-3.5`. JDK 8 is required at compile time (Gluten's POM); Spark itself runs under JDK 17.
+
+Building libch.so essentially compiles ClickHouse from source: it is **memory-hungry** (Gluten's docs note that 64 GB RAM is recommended). On a c6a.4xlarge (32 GB RAM) the compile may OOM; use c6a.8xlarge or larger for a clean run.
+
+### Configuration
+
+- `spark.gluten.sql.columnar.backend.lib=ch` selects the ClickHouse backend over Velox.
+- `spark.gluten.sql.columnar.libpath=` points to the native library. The build location is `gluten/cpp-ch/build/utils/extern-local-engine/libch.so`; `benchmark.sh` symlinks it as `libch.so` in the entry directory. 
+- Memory is split 50/50 between Spark heap and Gluten off-heap, identical to the Velox entry — the CH backend also runs off-heap via JNI. +- Queries use ClickHouse-style regex backreferences (`\1`) rather than Spark's `$1`, since the regex evaluation happens inside libch.so. See the discussion in [`spark-gluten/README.md`](../spark-gluten/README.md) and [Gluten issue #7545](https://github.com/apache/incubator-gluten/issues/7545). + +### Links + +- [Gluten ClickHouse-backend getting started](https://gluten.apache.org/docs/get-started/ClickHouse/). +- [Gluten release page](https://gluten.apache.org/downloads/) (Velox bundles only). +- [Kyligence/ClickHouse fork](https://github.com/Kyligence/ClickHouse) (the source of libch.so). diff --git a/spark-gluten-clickhouse/benchmark.sh b/spark-gluten-clickhouse/benchmark.sh new file mode 100755 index 000000000..8b19fb39c --- /dev/null +++ b/spark-gluten-clickhouse/benchmark.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +# Spark + Apache Gluten with the ClickHouse backend ('ch'). Unlike the +# Velox backend, no pre-built bundle is published for the CH backend, so +# this script builds both libch.so (a ClickHouse fork) and the Gluten +# Spark plugin from source. +# +# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) +# +# The ClickHouse compile is RAM-hungry; building on c6a.4xlarge (32 GB) +# may OOM. A larger machine (>= 64 GB RAM, c6a.8xlarge or above) is +# recommended. + +set -e + +GLUTEN_VERSION=v1.4.0 +SPARK_PROFILE=spark-3.5 + +# Install build prerequisites: +# - Java 8 to build Gluten via Maven (Gluten's pom requires JDK 8) +# - Java 17 to run Spark (auto-selected via JAVA_HOME below) +# - Clang 18, cmake, ninja, etc. 
to build libch.so +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv \ + openjdk-8-jdk-headless openjdk-17-jdk-headless \ + maven git cmake ccache ninja-build nasm yasm gawk \ + lsb-release wget software-properties-common gnupg + +# Install Clang 18 (required by libch.so build). +wget -O - https://apt.llvm.org/llvm.sh | sudo bash -s -- 18 + +export CC=clang-18 +export CXX=clang++-18 + +# pyspark venv +python3 -m venv myenv +source myenv/bin/activate +pip install pyspark==3.5.2 psutil + +# Load the data +../download-hits-parquet-single + +# Clone Gluten and the Kyligence ClickHouse fork that the CH backend wraps. +GLUTEN_DIR="$PWD/gluten" +if [ ! -d "$GLUTEN_DIR" ]; then + git clone --depth 1 --branch "$GLUTEN_VERSION" \ + https://github.com/apache/gluten.git "$GLUTEN_DIR" +fi + +CH_BRANCH=$(grep '^CH_BRANCH=' "$GLUTEN_DIR/cpp-ch/clickhouse.version" | cut -d= -f2) +CH_DIR="$GLUTEN_DIR/cpp-ch/ClickHouse" +if [ ! -d "$CH_DIR" ]; then + git clone --recursive --shallow-submodules \ + --branch "$CH_BRANCH" \ + https://github.com/Kyligence/ClickHouse.git "$CH_DIR" +fi + +# Build libch.so. The wrapper at cpp-ch/build_ch invokes the inner +# ClickHouse build, whose final artifact ends up at cpp-ch/build/. +LIBCH_SO="$GLUTEN_DIR/cpp-ch/build/utils/extern-local-engine/libch.so" +if [ ! -f "$LIBCH_SO" ]; then + bash "$GLUTEN_DIR/ep/build-clickhouse/src/build_clickhouse.sh" +fi + +# Build the Gluten Spark plugin against the CH backend. JDK 8 is required +# at compile time per Gluten's pom; Spark itself runs under JDK 17 below. +# pyspark wheels ship Scala 2.12 jars, so build with scala-2.12 to match. 
+JAVA_HOME_8="/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture)" +( + cd "$GLUTEN_DIR" + JAVA_HOME="$JAVA_HOME_8" PATH="$JAVA_HOME_8/bin:$PATH" \ + mvn -B clean package \ + -Pbackends-clickhouse -P"$SPARK_PROFILE" -Pscala-2.12 \ + -DskipTests -Dcheckstyle.skip +) + +# Symlink the produced uber jar (jar-with-dependencies) and libch.so into +# the entry directory; query.py expects them as ./gluten.jar and ./libch.so. +GLUTEN_JAR=$(ls "$GLUTEN_DIR"/backends-clickhouse/target/gluten-*-spark-3.5-jar-with-dependencies.jar 2>/dev/null | head -n1) +if [ -z "$GLUTEN_JAR" ]; then + echo "ERROR: could not locate built Gluten CH-backend jar" >&2 + ls "$GLUTEN_DIR/backends-clickhouse/target/" >&2 || true + exit 1 +fi +ln -sf "$GLUTEN_JAR" gluten.jar +ln -sf "$LIBCH_SO" libch.so + +# Run Spark queries under JDK 17. +export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/" +export PATH="$JAVA_HOME/bin:$PATH" + +./run.sh 2>&1 | tee log.txt + +# Print results to stdout as required +cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' | + awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' + +DATA_SIZE=$(du -b hits.parquet | cut -f1) + +echo "Data size: $DATA_SIZE" +echo "Load time: 0" + +# Save results as JSON +MACHINE="${1:-c6a.8xlarge}" +SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3) +GLUTEN_TAG="${GLUTEN_VERSION#v}" + +mkdir -p results + +( +cat << EOF +{ + "system": "Spark (Gluten-on-ClickHouse)", + "date": "$(date +%Y-%m-%d)", + "machine": "${MACHINE}", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "comment": "Apache Gluten ${GLUTEN_TAG} with the ClickHouse backend (libch.so), Spark ${SPARK_VERSION}", + "tags": ["Java", "C++", "column-oriented", "Spark derivative", "ClickHouse", "Parquet"], + "load_time": 0, + "data_size": ${DATA_SIZE}, + "result": [ +EOF + +cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' 
| sed -r -e 's/Time: //; s/^Failure!$/null/' | + awk -v total=$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt) ' + { + if (i % 3 == 0) printf "\t\t["; + if ($1 == "null") printf "null"; + else printf "%.3f", $1; + if (i % 3 != 2) printf ", "; + else { + if (i < total - 1) printf "],\n"; + else printf "]"; + } + i++; + }' + +cat << EOF + + ] +} +EOF +) > "results/${MACHINE}.json" + +echo "Results have been saved to results/${MACHINE}.json" diff --git a/spark-gluten-clickhouse/queries.sql b/spark-gluten-clickhouse/queries.sql new file mode 100644 index 000000000..31f65fc89 --- /dev/null +++ b/spark-gluten-clickhouse/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY 
SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), 
SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), 
SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, 
Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/spark-gluten-clickhouse/query.py b/spark-gluten-clickhouse/query.py new file mode 100755 index 000000000..5c43a4efb --- /dev/null +++ b/spark-gluten-clickhouse/query.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +""" +Spark + Apache Gluten using the ClickHouse backend ('ch'). The CH backend +loads libch.so (a fork of ClickHouse v23.1) into the Spark executor JVM +and runs the columnar physical plan natively. + +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details). +""" + +import os +import sys +import timeit + +import psutil +from pyspark.sql import SparkSession +import pyspark.sql.functions as F + + +query = sys.stdin.read() +print(query) + +# Calculate available memory to configure SparkSession (in MB). +# The CH backend runs off-heap (via JNI into libch.so), so split available +# memory between Spark's JVM heap and the off-heap pool the same way the +# Velox backend does. 
+ram = int(round(psutil.virtual_memory().available / (1024 ** 2) * 0.7))
+heap = ram // 2
+off_heap = ram - heap
+print(f"SparkSession will use {heap} MB of heap and {off_heap} MB of off-heap memory (total {ram} MB)")
+
+builder = (
+    SparkSession
+    .builder
+    .appName("ClickBench")
+    .config("spark.driver", "local[*]")
+    .config("spark.driver.memory", f"{heap}m")
+    .config("spark.sql.parquet.binaryAsString", True)
+
+    # Gluten + ClickHouse backend configuration
+    .config("spark.jars", "gluten.jar")
+    .config("spark.driver.extraClassPath", "gluten.jar")
+    .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
+    .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
+    .config("spark.gluten.sql.columnar.backend.lib", "ch")
+    .config("spark.gluten.sql.columnar.libpath", os.path.abspath("libch.so"))
+    .config("spark.memory.offHeap.enabled", "true")
+    .config("spark.memory.offHeap.size", f"{off_heap}m")
+    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
+)
+
+spark = builder.getOrCreate()
+
+df = spark.read.parquet("hits.parquet")
+df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp"))
+df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate")))
+df.createOrReplaceTempView("hits")
+
+for try_num in range(3):
+    try:
+        start = timeit.default_timer()
+        result = spark.sql(query)
+        result.show(100)
+        end = timeit.default_timer()
+        print("Time: ", end - start)
+    except Exception as e:
+        print(e)
+        print("Failure!")
diff --git a/spark-gluten-clickhouse/run.sh b/spark-gluten-clickhouse/run.sh
new file mode 100755
index 000000000..8c9ca1289
--- /dev/null
+++ b/spark-gluten-clickhouse/run.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details)
+
+# read -r is required: queries.sql contains backslashes (e.g. the '\1'
+# regex backreference in Q28); without -r, read would strip them.
+cat queries.sql | while read -r query; do
+    sync
+    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
+
+    ./query.py <<< "${query}"
+done
diff --git a/spark-gluten-clickhouse/template.json b/spark-gluten-clickhouse/template.json
new file mode 100644
index 000000000..6ae1ad287
--- /dev/null
+++ b/spark-gluten-clickhouse/template.json
@@ -0,0 +1,14 @@
+{
+    "system": "Spark (Gluten-on-ClickHouse)",
+    "proprietary": "no",
+    "hardware": "cpu",
+    "tuned": "no",
+    "tags": [
+        "Java",
+        "C++",
+        "column-oriented",
+        "Spark derivative",
+        "ClickHouse",
+        "Parquet"
+    ]
+}