diff --git a/README.md b/README.md index 244f1560e..631570d66 100644 --- a/README.md +++ b/README.md @@ -311,7 +311,7 @@ Please help us add more systems and run the benchmarks on more types of VMs: - [ ] MS SQL Server with Column Store Index (without publishing) - [ ] OceanBase - [ ] Planetscale (without publishing) -- [ ] Quickwit +- [x] Quickwit - [ ] Redshift Spectrum - [ ] Seafowl - [ ] ShitholeDB diff --git a/quickwit/README.md b/quickwit/README.md new file mode 100644 index 000000000..d987f12b3 --- /dev/null +++ b/quickwit/README.md @@ -0,0 +1,49 @@ +# Quickwit + +[Quickwit](https://quickwit.io) is a Rust-based search engine for log analytics, built on top of [Tantivy](https://github.com/quickwit-oss/tantivy). It exposes an Elasticsearch-compatible REST API for ingestion and search, but does not implement an SQL endpoint, so this benchmark uses the native Elasticsearch query DSL directly. + +## Methodology + +Infrastructure: +- Single-node Quickwit **v0.9.0-rc** (Docker `quickwit/quickwit:v0.9.0-rc`). + + Stable **0.8.2** is missing `cardinality`, `wildcard`, and several other features the benchmark relies on, so we use the v0.9 release candidate. The v0.9 line is still unreleased — as soon as a stable v0.9.x ships, bump `QW_IMAGE` in `benchmark.sh`. + +Index configuration (`index_config.yaml`): +- All scalar fields declared with `fast: true` so they can participate in aggregations and sorts. +- Keyword-like text fields use the `raw` tokenizer with the `raw` fast-field normalizer to mimic Elasticsearch's `keyword` mapping. +- `EventTime` is set as the index's timestamp field, providing time-based pruning. + +Ingestion (`benchmark.sh`): +- Streams `hits.json.gz` decompressed into `quickwit tool local-ingest`, which builds splits directly on local storage. We do **not** use the Elasticsearch bulk endpoint: v0.9's sharded ingest-v2 API caps single-node throughput to a few MB/s in our testing and stalls waiting for shards to scale. 
`local-ingest` bypasses the ingest pipeline entirely. +- The server picks up the new splits on its next metastore poll (default 30 s). + +Queries (`queries.json`): +- Each query in `queries.sql` is hand-translated to the Elasticsearch DSL on the corresponding line of `queries.json`, and submitted to `/api/v1/_elastic/hits/_search`. +- Timing is taken from the `took` field returned by Quickwit (milliseconds, engine-internal). +- Queries that are not expressible in Quickwit's DSL are recorded as `null`. + +## Unsupported queries + +The following ClickBench queries cannot currently be expressed in Quickwit's Elasticsearch-compatible DSL and are reported as `null`: + +| Q | Reason | +|----|-----------------------------------------------------------------------| +| 19 | `extract(minute FROM …)` — no scripted/runtime fields | +| 26 | `ORDER BY` on text field — `sort by field on type text is currently not supported` | +| 27 | `ORDER BY` on text field | +| 28 | `AVG(length(URL))` — no scripted/runtime fields | +| 29 | `REGEXP_REPLACE` — not supported | +| 30 | `SUM(col + N)` — no scripted aggregations | +| 36 | `ClientIP - N` — no scripted aggregations | +| 40 | `CASE WHEN …` — no scripted/runtime fields | + +All other 35 queries run through the native Elasticsearch DSL, including `cardinality` (Q5/6/9/10/11/12/14/23) and `wildcard` (Q21/22/23/24). + +## Running + +```bash +bash benchmark.sh +``` + +Installs Docker and Quickwit, creates the index, downloads `hits.json.gz`, runs `local-ingest`, then runs `run.sh`, which restarts Quickwit and drops the OS page cache once before each query and then times that query three times (only the first run is cold).
diff --git a/quickwit/benchmark.sh b/quickwit/benchmark.sh
new file mode 100755
index 000000000..8d438a265
--- /dev/null
+++ b/quickwit/benchmark.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+set -eo pipefail
+
+export DEBIAN_FRONTEND=noninteractive
+
+# Install prerequisites quietly
+sudo apt-get update -qq >/dev/null
+sudo apt-get install -y -qq wget curl jq bc docker.io >/dev/null
+sudo systemctl start docker
+
+# We use the Quickwit v0.9 release candidate. Stable v0.8.2 is missing
+# `cardinality`, `wildcard`, and several other features the benchmark relies
+# on; only the v0.9 line (still unreleased as of writing) provides them.
+QW_IMAGE="quickwit/quickwit:v0.9.0-rc"
+sudo docker pull -q "$QW_IMAGE" >/dev/null
+
+# Quickwit's data directory (shared between the server and the local-ingest
+# container).
+QW_DATA="$(pwd)/qwdata"
+sudo rm -rf "$QW_DATA"
+mkdir -p "$QW_DATA"
+
+# Remove any `qw` container left behind by an earlier, aborted run: under
+# `set -e` the cleanup at the end of this script is skipped on failure, and
+# `docker run --name qw` refuses to reuse an existing container name.
+sudo docker rm -f qw >/dev/null 2>&1 || true
+
+# Start the server in the background. Quickwit defaults: REST on 7280, gRPC on 7281.
+# Mount node-config.yaml on top of the image's default config to bump the
+# searcher timeouts (defaults are 30s, which is too low for some of the
+# nested high-cardinality aggregations on the full 100M-row dataset).
+sudo docker run -d --name qw --network host \
+  -v "$QW_DATA":/quickwit/qwdata \
+  -v "$(pwd)/node-config.yaml":/quickwit/config/quickwit.yaml \
+  "$QW_IMAGE" run >/dev/null
+echo "Quickwit container started"
+
+# Wait up to 60 seconds for the server to come up, and abort if it never
+# does: every later step talks to the REST API.
+QW_READY=0
+for ((i = 1; i <= 60; i++)); do
+  if curl -sS -f http://localhost:7280/api/v1/version >/dev/null 2>&1; then
+    echo "Quickwit is ready"
+    QW_READY=1
+    break
+  fi
+  sleep 1
+done
+if [ "$QW_READY" != "1" ]; then
+  echo "Quickwit did not become ready within 60 seconds" >&2
+  sudo docker logs --tail 50 qw >&2 || true
+  exit 1
+fi
+
+# Create the index from the YAML config.
+curl -sS -X POST http://localhost:7280/api/v1/indexes \
+  -H 'Content-Type: application/yaml' \
+  --data-binary @index_config.yaml | jq -r '.index_uid // .message'
+
+# Download the data quietly (the dataset is ~14 GB; full progress would
+# dominate the captured benchmark log).
+wget --continue -q 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
+
+START=$(date +%s)
+
+# Use `quickwit tool local-ingest` instead of the Elasticsearch-compatible
+# bulk endpoint. v0.9's sharded ingest-v2 API caps single-node throughput
+# to a few MB/s and gets stuck waiting for shards to scale, while
+# `local-ingest` builds splits directly and writes them to the index
+# storage. The running server picks up new splits on its next metastore
+# poll (default 30s).
+#
+# local-ingest emits a "Num docs ... Thrghput ... Time" progress line
+# roughly once per second; we throttle that to once per ~30 seconds so
+# the captured log stays compact, and pass the surrounding lines through
+# unchanged.
+zcat hits.json.gz | sudo docker run --rm -i --network host \
+  -v "$QW_DATA":/quickwit/qwdata \
+  "$QW_IMAGE" tool local-ingest --index hits -y 2>&1 \
+  | awk '/Num docs/ { n = systime(); if (n - last >= 30) { print; fflush(); last = n } next }
+         { print; fflush() }'
+
+# Wait long enough for the server to refresh its metastore view.
+sleep 35
+
+# Show stats.
+curl -sS "http://localhost:7280/api/v1/indexes/hits/describe" \
+  | jq '{num_published_docs, num_published_splits, size_published_splits}' \
+  | tee stats.json
+
+END=$(date +%s)
+echo "Load time: $((END - START))"
+
+# Data size on disk.
+echo -n "Data size: "
+sudo du -sb "$QW_DATA" | awk '{print $1}'
+
+# Run queries
+chmod +x run.sh
+./run.sh
+
+sudo docker stop qw 2>/dev/null || true
+sudo docker rm qw 2>/dev/null || true
diff --git a/quickwit/index_config.yaml b/quickwit/index_config.yaml new file mode 100644 index 000000000..05b593a18 --- /dev/null +++ b/quickwit/index_config.yaml @@ -0,0 +1,149 @@ +version: 0.8 + +index_id: hits + +doc_mapping: + mode: strict + timestamp_field: EventTime + field_mappings: + - {name: WatchID, type: i64, indexed: true, fast: true} + - {name: JavaEnable, type: i64, indexed: true, fast: true} + - {name: Title, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: GoodEvent, type: i64, indexed: true, fast: true} + - name: EventTime + type: datetime + input_formats: ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", unix_timestamp, rfc3339] + output_format: unix_timestamp_secs + indexed: true + fast: true + fast_precision: seconds + - name: EventDate + type: datetime + input_formats: ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", unix_timestamp, rfc3339] + output_format: unix_timestamp_secs + indexed: true + fast: true + fast_precision: seconds + - {name: CounterID, type: i64, indexed: true, fast: true} + - {name: ClientIP, type: i64, indexed: true, fast: true} + - {name: RegionID, type: i64, indexed: true, fast: true} + - {name: UserID, type: i64, indexed: true, fast: true} + - {name: CounterClass, type: i64, indexed: true, fast: true} + - {name: OS, type: i64, indexed: true, fast: true} + - {name: UserAgent, type: i64, indexed: true, fast: true} + - {name: URL, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: Referer, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: IsRefresh, type: i64, indexed: true, fast: true} + - {name: RefererCategoryID, type: i64, indexed: true, fast: true} + - {name: RefererRegionID, type: i64, indexed: true, fast: true} + - {name: URLCategoryID, type: i64, indexed: true, fast: true} + - {name: URLRegionID, type: i64, indexed: 
true, fast: true} + - {name: ResolutionWidth, type: i64, indexed: true, fast: true} + - {name: ResolutionHeight, type: i64, indexed: true, fast: true} + - {name: ResolutionDepth, type: i64, indexed: true, fast: true} + - {name: FlashMajor, type: i64, indexed: true, fast: true} + - {name: FlashMinor, type: i64, indexed: true, fast: true} + - {name: FlashMinor2, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: NetMajor, type: i64, indexed: true, fast: true} + - {name: NetMinor, type: i64, indexed: true, fast: true} + - {name: UserAgentMajor, type: i64, indexed: true, fast: true} + - {name: UserAgentMinor, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: CookieEnable, type: i64, indexed: true, fast: true} + - {name: JavascriptEnable, type: i64, indexed: true, fast: true} + - {name: IsMobile, type: i64, indexed: true, fast: true} + - {name: MobilePhone, type: i64, indexed: true, fast: true} + - {name: MobilePhoneModel, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: Params, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: IPNetworkID, type: i64, indexed: true, fast: true} + - {name: TraficSourceID, type: i64, indexed: true, fast: true} + - {name: SearchEngineID, type: i64, indexed: true, fast: true} + - {name: SearchPhrase, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: AdvEngineID, type: i64, indexed: true, fast: true} + - {name: IsArtifical, type: i64, indexed: true, fast: true} + - {name: WindowClientWidth, type: i64, indexed: true, fast: true} + - {name: WindowClientHeight, type: i64, indexed: true, fast: true} + - {name: ClientTimeZone, type: i64, indexed: true, fast: true} + - name: ClientEventTime + type: datetime + input_formats: ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", unix_timestamp, rfc3339] + output_format: unix_timestamp_secs + indexed: true + fast: true + fast_precision: seconds + - {name: SilverlightVersion1, type: i64, indexed: true, fast: true} + - {name: SilverlightVersion2, type: 
i64, indexed: true, fast: true} + - {name: SilverlightVersion3, type: i64, indexed: true, fast: true} + - {name: SilverlightVersion4, type: i64, indexed: true, fast: true} + - {name: PageCharset, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: CodeVersion, type: i64, indexed: true, fast: true} + - {name: IsLink, type: i64, indexed: true, fast: true} + - {name: IsDownload, type: i64, indexed: true, fast: true} + - {name: IsNotBounce, type: i64, indexed: true, fast: true} + - {name: FUniqID, type: i64, indexed: true, fast: true} + - {name: OriginalURL, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: HID, type: i64, indexed: true, fast: true} + - {name: IsOldCounter, type: i64, indexed: true, fast: true} + - {name: IsEvent, type: i64, indexed: true, fast: true} + - {name: IsParameter, type: i64, indexed: true, fast: true} + - {name: DontCountHits, type: i64, indexed: true, fast: true} + - {name: WithHash, type: i64, indexed: true, fast: true} + - {name: HitColor, type: text, tokenizer: raw, fast: {normalizer: raw}} + - name: LocalEventTime + type: datetime + input_formats: ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", unix_timestamp, rfc3339] + output_format: unix_timestamp_secs + indexed: true + fast: true + fast_precision: seconds + - {name: Age, type: i64, indexed: true, fast: true} + - {name: Sex, type: i64, indexed: true, fast: true} + - {name: Income, type: i64, indexed: true, fast: true} + - {name: Interests, type: i64, indexed: true, fast: true} + - {name: Robotness, type: i64, indexed: true, fast: true} + - {name: RemoteIP, type: i64, indexed: true, fast: true} + - {name: WindowName, type: i64, indexed: true, fast: true} + - {name: OpenerName, type: i64, indexed: true, fast: true} + - {name: HistoryLength, type: i64, indexed: true, fast: true} + - {name: BrowserLanguage, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: BrowserCountry, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: SocialNetwork, type: text, 
tokenizer: raw, fast: {normalizer: raw}} + - {name: SocialAction, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: HTTPError, type: i64, indexed: true, fast: true} + - {name: SendTiming, type: i64, indexed: true, fast: true} + - {name: DNSTiming, type: i64, indexed: true, fast: true} + - {name: ConnectTiming, type: i64, indexed: true, fast: true} + - {name: ResponseStartTiming, type: i64, indexed: true, fast: true} + - {name: ResponseEndTiming, type: i64, indexed: true, fast: true} + - {name: FetchTiming, type: i64, indexed: true, fast: true} + - {name: SocialSourceNetworkID, type: i64, indexed: true, fast: true} + - {name: SocialSourcePage, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: ParamPrice, type: i64, indexed: true, fast: true} + - {name: ParamOrderID, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: ParamCurrency, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: ParamCurrencyID, type: i64, indexed: true, fast: true} + - {name: OpenstatServiceName, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: OpenstatCampaignID, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: OpenstatAdID, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: OpenstatSourceID, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: UTMSource, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: UTMMedium, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: UTMCampaign, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: UTMContent, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: UTMTerm, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: FromTag, type: text, tokenizer: raw, fast: {normalizer: raw}} + - {name: HasGCLID, type: i64, indexed: true, fast: true} + - {name: RefererHash, type: i64, indexed: true, fast: true} + - {name: URLHash, type: i64, indexed: true, fast: true} + - {name: CLID, type: i64, indexed: 
true, fast: true} + + store_source: false + +indexing_settings: + commit_timeout_secs: 30 + merge_policy: + type: stable_log + merge_factor: 10 + max_merge_factor: 12 + +search_settings: + default_search_fields: [] diff --git a/quickwit/node-config.yaml b/quickwit/node-config.yaml new file mode 100644 index 000000000..19a8f8457 --- /dev/null +++ b/quickwit/node-config.yaml @@ -0,0 +1,17 @@ +version: 0.8 + +searcher: + # Bump the per-request and leaf-search timeouts above the 30s default — + # a few of the high-cardinality aggregations on the full 100M-row ClickBench + # dataset (e.g. WatchID + ClientIP nested terms) take longer than that. + request_timeout_secs: 60 + leaf_request_timeout_secs: 60 + + # Disable the per-split partial result cache so warm runs don't replay a + # memoized answer. The other in-memory caches (fast_field_cache, + # split_footer_cache, predicate_cache) are data-level caches (analogous to + # ClickHouse's query condition cache) and are kept at their defaults; + # run.sh restarts the container before each query so they also start cold + # for the first run. 
+ partial_request_cache: + capacity: 0 diff --git a/quickwit/queries.json b/quickwit/queries.json new file mode 100644 index 000000000..b7b298d69 --- /dev/null +++ b/quickwit/queries.json @@ -0,0 +1,43 @@ +{"size":0,"track_total_hits":true,"query":{"match_all":{}}} +{"size":0,"track_total_hits":true,"query":{"bool":{"must_not":[{"term":{"AdvEngineID":0}}]}}} +{"size":0,"track_total_hits":true,"aggs":{"sum_adv":{"sum":{"field":"AdvEngineID"}},"avg_res":{"avg":{"field":"ResolutionWidth"}}}} +{"size":0,"aggs":{"avg_user":{"avg":{"field":"UserID"}}}} +{"size":0,"aggs":{"u":{"cardinality":{"field":"UserID"}}}} +{"size":0,"aggs":{"u":{"cardinality":{"field":"SearchPhrase"}}}} +{"size":0,"aggs":{"min_date":{"min":{"field":"EventDate"}},"max_date":{"max":{"field":"EventDate"}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"AdvEngineID":0}}]}},"aggs":{"by_adv":{"terms":{"field":"AdvEngineID","size":1000,"order":{"_count":"desc"}}}}} +{"size":0,"aggs":{"r":{"terms":{"field":"RegionID","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"aggs":{"r":{"terms":{"field":"RegionID","size":10,"order":{"_count":"desc"}},"aggs":{"sumadv":{"sum":{"field":"AdvEngineID"}},"avgres":{"avg":{"field":"ResolutionWidth"}},"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"MobilePhoneModel":""}}]}},"aggs":{"m":{"terms":{"field":"MobilePhoneModel","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"MobilePhoneModel":""}}]}},"aggs":{"p":{"terms":{"field":"MobilePhone","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}},"m":{"terms":{"field":"MobilePhoneModel","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}} 
+{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"se":{"terms":{"field":"SearchEngineID","size":10,"order":{"_count":"desc"}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10,"order":{"_count":"desc"}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10}}}}}} +null +{"size":10,"query":{"term":{"UserID":435090932899640449}}} +{"size":0,"track_total_hits":true,"query":{"wildcard":{"URL":"*google*"}}} +{"size":0,"query":{"bool":{"filter":[{"wildcard":{"URL":"*google*"}}],"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"filter":[{"wildcard":{"Title":"*Google*"}}],"must_not":[{"wildcard":{"URL":"*.google.*"}},{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":10,"query":{"wildcard":{"URL":"*google*"}},"sort":[{"EventTime":"asc"}]} +{"size":10,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"sort":[{"EventTime":"asc"}]} +null +null +null +null +null 
+{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"se":{"terms":{"field":"SearchEngineID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"w":{"terms":{"field":"WatchID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} +{"size":0,"aggs":{"w":{"terms":{"field":"WatchID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} +null +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"DontCountHits":0}},{"term":{"IsRefresh":0}}],"must_not":[{"term":{"URL":""}}]}},"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"DontCountHits":0}},{"term":{"IsRefresh":0}}],"must_not":[{"term":{"Title":""}}]}},"aggs":{"t":{"terms":{"field":"Title","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"term":{"IsDownload":0}}],"must_not":[{"term":{"IsLink":0}}]}},"aggs":{"u":{"terms":{"field":"URL","size":1010,"order":{"_count":"desc"}}}}} +null 
+{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"bool":{"should":[{"term":{"TraficSourceID":-1}},{"term":{"TraficSourceID":6}}]}},{"term":{"RefererHash":3594120000172545465}}]}},"aggs":{"uh":{"terms":{"field":"URLHash","size":110,"order":{"_count":"desc"}},"aggs":{"ed":{"terms":{"field":"EventDate","size":110,"order":{"_count":"desc"}}}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"term":{"DontCountHits":0}},{"term":{"URLHash":2868770270353813622}}]}},"aggs":{"w":{"terms":{"field":"WindowClientWidth","size":10010,"order":{"_count":"desc"}},"aggs":{"h":{"terms":{"field":"WindowClientHeight","size":10010,"order":{"_count":"desc"}}}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-14","lte":"2013-07-15"}}},{"term":{"IsRefresh":0}},{"term":{"DontCountHits":0}}]}},"aggs":{"dt":{"date_histogram":{"field":"EventTime","fixed_interval":"1m"}}}} diff --git a/quickwit/queries.sql b/quickwit/queries.sql new file mode 100644 index 000000000..7d093d057 --- /dev/null +++ b/quickwit/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, 
COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, 
AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), 
SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND 
IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1010; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1010; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 110; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10010; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 1010;
diff --git a/quickwit/run.sh b/quickwit/run.sh
new file mode 100755
index 000000000..bfbf5f7c2
--- /dev/null
+++ b/quickwit/run.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Run every query in queries.json (one Elasticsearch DSL request per line)
+# against the local Quickwit server and print one "[t1, t2, t3]," row per
+# query, with times in seconds. A line containing the literal `null` marks a
+# query that cannot be expressed in Quickwit and yields "[null, null, null],".
+
+TRIES=3
+SEARCH_URL="http://localhost:7280/api/v1/_elastic/hits/_search"
+
+while IFS= read -r QUERY; do
+  if [ "$QUERY" != "null" ]; then
+    # Restart Quickwit before each query to clear all in-process caches
+    # (fast_field_cache, split_footer_cache, predicate_cache). The only
+    # cache disabled outright in node-config.yaml is partial_request_cache.
+    # Then drop the OS page cache. This makes the first run cold; runs 2
+    # and 3 may benefit from caches re-warmed by run 1.
+    sudo docker restart qw >/dev/null
+    until curl -sS -f http://localhost:7280/api/v1/version >/dev/null 2>&1; do sleep 1; done
+    sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
+  fi
+
+  echo -n "["
+
+  for ((i = 1; i <= TRIES; i++)); do
+    if [ "$QUERY" = "null" ]; then
+      # Query is not expressible in Quickwit (e.g. text-field sort,
+      # scripts, REGEXP_REPLACE).
+      echo -n "null"
+    else
+      QW_RSP=$(curl -s -X POST "$SEARCH_URL" -H 'Content-Type: application/json' -d "$QUERY")
+
+      # Quickwit returns "took" in milliseconds (engine-internal latency).
+      # Responses carrying an "error" or "status" key are reported as null.
+      QW_TIME=$(echo "$QW_RSP" | jq -r 'if has("error") or has("status") then "null" else (.took | tostring) end')
+
+      if [ "$QW_TIME" = "null" ] || [ -z "$QW_TIME" ]; then
+        echo -n "null"
+      else
+        # Convert milliseconds to seconds.
+        printf "%.4f" "$(echo "scale=4; $QW_TIME / 1000" | bc)"
+      fi
+    fi
+
+    # Separate the timings with ", " (no trailing separator).
+    if [ "$i" != "$TRIES" ]; then
+      echo -n ", "
+    fi
+  done
+
+  echo "],"
+done < queries.json
diff --git a/quickwit/template.json b/quickwit/template.json new file mode 100644 index 000000000..022f7ad20 --- /dev/null +++ b/quickwit/template.json @@ -0,0 +1,10 @@ +{ + "system": "Quickwit", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "Rust", + "search" + ] +}