Skip to content

Commit 2debd93

Browse files
committed
perf: refactor donnelly block simd code
1 parent 9881a88 commit 2debd93

17 files changed

Lines changed: 3052 additions & 233 deletions

Cargo.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,11 @@ name = "v6_stem_strategies"
218218
harness = false
219219
required-features = ["test_utils"]
220220

221+
[[bench]]
222+
name = "v6_stem_strategies_focus"
223+
harness = false
224+
required-features = ["test_utils"]
225+
221226
#[[example]]
222227
#name = "avx2-check"
223228
#path = "examples/avx2-check.rs"
@@ -412,6 +417,16 @@ name = "profile_v6_leaf_strategies"
412417
path = "src/bin/profile_v6_leaf_strategies.rs"
413418
required-features = ["profile_v5"]
414419

420+
[[bin]]
421+
name = "profile_v6_stem_exact_stats"
422+
path = "src/bin/profile_v6_stem_exact_stats.rs"
423+
required-features = ["simd", "test_utils"]
424+
425+
[[bin]]
426+
name = "repro_donnelly_block3_exact_divergence"
427+
path = "src/bin/repro_donnelly_block3_exact_divergence.rs"
428+
required-features = ["simd", "test_utils"]
429+
415430
[[test]]
416431
name = "donnelly_simd_regressions"
417432
path = "tests/donnelly_simd_regressions.rs"
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
use codspeed_criterion_compat::{
2+
black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput,
3+
};
4+
use kiddo::dist::SquaredEuclidean;
5+
use kiddo::kd_tree::leaf_strategies::VecOfArenas;
6+
use kiddo::kd_tree::KdTree;
7+
use kiddo::stem_strategies::donnelly_2_pf::DonnellyPf;
8+
use kiddo::stem_strategies::eytzinger_pf_far::EytzingerPfFar;
9+
#[cfg(all(
10+
feature = "simd",
11+
target_arch = "x86_64",
12+
any(target_feature = "avx2", target_feature = "avx512f")
13+
))]
14+
use kiddo::stem_strategies::{Block3, DonnellyMarkerSimd};
15+
use kiddo::stem_strategies::{Donnelly, Eytzinger};
16+
use rand::Rng;
17+
use rand::SeedableRng;
18+
use rand_chacha::ChaCha8Rng;
19+
20+
/// Dimensionality of every point and query (`[f64; K]`) in this benchmark.
const K: usize = 3;
/// Second size parameter handed to the leaf storage and tree types
/// (presumably the leaf bucket size; confirm against the `KdTree` docs).
const B: usize = 32;
/// Number of queries used when `KIDDO_BENCH_QUERIES` does not override it.
const DEFAULT_QUERY_COUNT: usize = 10_000;
/// Number of points used when `KIDDO_BENCH_POINTS` does not override it (2^22).
const DEFAULT_POINT_COUNT: usize = 1 << 22;
/// Seed for the RNG that generates the indexed points.
const POINT_SEED: u64 = 0x5eed_0000_0000_0201;
/// Seed for the RNG that generates the query points.
const QUERY_SEED: u64 = 0x5eed_0000_0000_0202;
26+
27+
/// Leaf storage shared by every tree type in this benchmark
/// (`f64` coordinates, `u32` items, parameterized by `K` and `B`).
type ArenaLeaves = VecOfArenas<f64, u32, K, B>;
28+
/// Tree using the `Eytzinger` stem strategy.
type EytzingerTree = KdTree<f64, u32, Eytzinger<K>, ArenaLeaves, K, B>;
29+
/// Tree using the `EytzingerPfFar` stem strategy.
type EytzingerPfFarTree = KdTree<f64, u32, EytzingerPfFar<K, 8>, ArenaLeaves, K, B>;
30+
/// Tree using the `DonnellyPf` stem strategy.
type DonnellyPfTree = KdTree<f64, u32, DonnellyPf<3, 64, 8, K>, ArenaLeaves, K, B>;
31+
/// Tree using the `DonnellyMarkerSimd<Block3, ..>` stem strategy.
/// Only defined when the `simd` feature is enabled on x86_64 with the
/// AVX2 or AVX-512F target feature.
#[cfg(all(
32+
feature = "simd",
33+
target_arch = "x86_64",
34+
any(target_feature = "avx2", target_feature = "avx512f")
35+
))]
36+
type DonnellySimdTree = KdTree<f64, u32, DonnellyMarkerSimd<Block3, 64, 8, K>, ArenaLeaves, K, B>;
37+
38+
/// Reads a `usize` setting from the environment variable `var`.
///
/// Returns `default` when the variable is unset. Leading and trailing
/// whitespace is ignored when parsing. If the variable is set but is not
/// valid Unicode or does not parse as a `usize`, a warning is written to
/// stderr and `default` is returned, so a mistyped override does not
/// silently change what is being measured.
fn read_usize_env(var: &str, default: usize) -> usize {
    let raw = match std::env::var(var) {
        Ok(raw) => raw,
        // Unset is the normal "no override" case: stay quiet.
        Err(std::env::VarError::NotPresent) => return default,
        Err(std::env::VarError::NotUnicode(_)) => {
            eprintln!(
                "warning: {} is not valid Unicode; using default {}",
                var, default
            );
            return default;
        }
    };

    match raw.trim().parse::<usize>() {
        Ok(value) => value,
        Err(err) => {
            eprintln!(
                "warning: ignoring {}={:?} ({}); using default {}",
                var, raw, err, default
            );
            default
        }
    }
}
44+
45+
/// Generates `point_count` pseudo-random `K`-dimensional points.
///
/// The generator is a `ChaCha8Rng` seeded with `POINT_SEED`, so the output
/// depends only on `point_count`.
fn build_points(point_count: usize) -> Vec<[f64; K]> {
    let mut generator = ChaCha8Rng::seed_from_u64(POINT_SEED);
    let mut generated = Vec::with_capacity(point_count);
    for _ in 0..point_count {
        generated.push(generator.random::<[f64; K]>());
    }
    generated
}
49+
50+
/// Generates `query_count` pseudo-random `K`-dimensional query points.
///
/// The generator is a `ChaCha8Rng` seeded with `QUERY_SEED`, so the output
/// depends only on `query_count`.
fn build_queries(query_count: usize) -> Vec<[f64; K]> {
    let mut generator = ChaCha8Rng::seed_from_u64(QUERY_SEED);
    let mut generated = Vec::with_capacity(query_count);
    for _ in 0..query_count {
        generated.push(generator.random::<[f64; K]>());
    }
    generated
}
54+
55+
fn run_nearest_queries_eytzinger(tree: &EytzingerTree, queries: &[[f64; K]]) -> (f64, u64) {
56+
let mut checksum_dist = 0.0f64;
57+
let mut checksum_item = 0u64;
58+
59+
for query in queries {
60+
let (dist, item) = tree.nearest_one::<SquaredEuclidean<f64>>(black_box(query));
61+
checksum_dist += dist;
62+
checksum_item = checksum_item.wrapping_add(item as u64);
63+
}
64+
65+
(checksum_dist, checksum_item)
66+
}
67+
68+
fn run_nearest_queries_eytzinger_pf_far(
69+
tree: &EytzingerPfFarTree,
70+
queries: &[[f64; K]],
71+
) -> (f64, u64) {
72+
let mut checksum_dist = 0.0f64;
73+
let mut checksum_item = 0u64;
74+
75+
for query in queries {
76+
let (dist, item) = tree.nearest_one::<SquaredEuclidean<f64>>(black_box(query));
77+
checksum_dist += dist;
78+
checksum_item = checksum_item.wrapping_add(item as u64);
79+
}
80+
81+
(checksum_dist, checksum_item)
82+
}
83+
84+
fn run_nearest_queries_donnelly(tree: &DonnellyPfTree, queries: &[[f64; K]]) -> (f64, u64) {
85+
let mut checksum_dist = 0.0f64;
86+
let mut checksum_item = 0u64;
87+
88+
for query in queries {
89+
let (dist, item) = tree.nearest_one::<SquaredEuclidean<f64>>(black_box(query));
90+
checksum_dist += dist;
91+
checksum_item = checksum_item.wrapping_add(item as u64);
92+
}
93+
94+
(checksum_dist, checksum_item)
95+
}
96+
97+
/// Runs one `nearest_one` query (squared Euclidean distance) per entry of
/// `queries` against a `DonnellySimdTree`.
///
/// Returns `(sum of distances, wrapping sum of items)` accumulated in query
/// order; the pair acts as a checksum of the results.
///
/// Only compiled when the `simd` feature is enabled on x86_64 with the AVX2
/// or AVX-512F target feature.
#[cfg(all(
    feature = "simd",
    target_arch = "x86_64",
    any(target_feature = "avx2", target_feature = "avx512f")
))]
fn run_nearest_queries_donnelly_simd(tree: &DonnellySimdTree, queries: &[[f64; K]]) -> (f64, u64) {
    queries
        .iter()
        .fold((0.0f64, 0u64), |(dist_sum, item_sum), query| {
            // `black_box` keeps the optimizer from specializing on the query value.
            let (dist, item) = tree.nearest_one::<SquaredEuclidean<f64>>(black_box(query));
            (dist_sum + dist, item_sum.wrapping_add(item as u64))
        })
}
114+
115+
fn v6_stem_strategies_focus(c: &mut Criterion) {
116+
let query_count = read_usize_env("KIDDO_BENCH_QUERIES", DEFAULT_QUERY_COUNT);
117+
let point_count = read_usize_env("KIDDO_BENCH_POINTS", DEFAULT_POINT_COUNT);
118+
let points = build_points(point_count);
119+
let queries = build_queries(query_count);
120+
121+
let eytzinger_tree: EytzingerTree = KdTree::new_from_slice(&points);
122+
let eytzinger_pf_far_tree: EytzingerPfFarTree = KdTree::new_from_slice(&points);
123+
let donnelly_tree: DonnellyPfTree = KdTree::new_from_slice(&points);
124+
#[cfg(all(
125+
feature = "simd",
126+
target_arch = "x86_64",
127+
any(target_feature = "avx2", target_feature = "avx512f")
128+
))]
129+
let donnelly_simd_tree: DonnellySimdTree = KdTree::new_from_slice(&points);
130+
131+
let mut group = c.benchmark_group("v6 nearest_one stem strategies focus");
132+
group.throughput(Throughput::Elements(query_count as u64));
133+
134+
group.bench_function(BenchmarkId::new("Eytzinger", point_count), |b| {
135+
b.iter(|| black_box(run_nearest_queries_eytzinger(&eytzinger_tree, &queries)));
136+
});
137+
138+
group.bench_function(BenchmarkId::new("Eytzinger PF Far", point_count), |b| {
139+
b.iter(|| {
140+
black_box(run_nearest_queries_eytzinger_pf_far(
141+
&eytzinger_pf_far_tree,
142+
&queries,
143+
))
144+
});
145+
});
146+
147+
group.bench_function(BenchmarkId::new("Donnelly PF", point_count), |b| {
148+
b.iter(|| black_box(run_nearest_queries_donnelly(&donnelly_tree, &queries)));
149+
});
150+
151+
#[cfg(all(
152+
feature = "simd",
153+
target_arch = "x86_64",
154+
any(target_feature = "avx2", target_feature = "avx512f")
155+
))]
156+
group.bench_function(BenchmarkId::new("Donnelly Block SIMD", point_count), |b| {
157+
b.iter(|| {
158+
black_box(run_nearest_queries_donnelly_simd(
159+
&donnelly_simd_tree,
160+
&queries,
161+
))
162+
});
163+
});
164+
165+
group.finish();
166+
}
167+
168+
// Register `v6_stem_strategies_focus` in the `benches` group.
criterion_group!(benches, v6_stem_strategies_focus);
169+
// Generate the benchmark binary's entry point for the `benches` group.
criterion_main!(benches);

justfile

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,12 +99,49 @@ asm-k6-nearest-one-donnelly-block3-fill-avx512-clean:
9999
asm-k6-nearest-one-donnelly-block3-pending-select-avx512-clean:
100100
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "donnelly_block3_pending_select_f64_cargo_asm_hook" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_donnelly_block3_pending_select_avx512_clean.asm
101101

102+
asm-k6-nearest-one-donnelly-block3-pending-fast-path-avx512-clean:
103+
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "donnelly_block3_pending_fast_path_f64_cargo_asm_hook" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_donnelly_block3_pending_fast_path_avx512_clean.asm
104+
105+
asm-k6-nearest-one-donnelly-block3-exact-step-avx512-clean:
106+
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "donnelly_block3_exact_step_f64_cargo_asm_hook" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_donnelly_block3_exact_step_avx512_clean.asm
107+
108+
asm-k6-nearest-one-donnelly-voarena-v3-avx512-clean:
109+
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "v6_nearest_one_donnelly_vec_of_arenas_cargo_asm_hook" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_donnelly_vec_of_arenas_v3_avx512_clean.asm
110+
111+
asm-k6-nearest-one-donnelly-blocksimd-voarena-v3-avx512-clean:
112+
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "v6_nearest_one_donnelly_blocksimd_vec_of_arenas_cargo_asm_hook" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_donnelly_blocksimd_vec_of_arenas_v3_avx512_clean.asm
113+
114+
bench-v6-stem-strategies-focus FEATURES='simd,test_utils,logging_off' POINTS='4194304' QUERIES='10000':
115+
RUSTC_WRAPPER= \
116+
KIDDO_BENCH_POINTS={{POINTS}} \
117+
KIDDO_BENCH_QUERIES={{QUERIES}} \
118+
RUSTFLAGS='-C target-cpu=native' \
119+
cargo criterion --bench v6_stem_strategies_focus --features {{FEATURES}}
120+
121+
profile-v6-stem-exact-stats FEATURES='simd,test_utils,logging_off' POINTS='4194304' QUERIES='10000' REPEATS='1':
122+
RUSTC_WRAPPER= \
123+
KIDDO_PROFILE_POINTS={{POINTS}} \
124+
KIDDO_PROFILE_QUERIES={{QUERIES}} \
125+
KIDDO_PROFILE_QUERY_BATCH_REPEATS={{REPEATS}} \
126+
RUSTFLAGS='-C target-cpu=native' \
127+
cargo run --release --bin profile_v6_stem_exact_stats --features {{FEATURES}}
128+
129+
repro-donnelly-block3-exact-divergence FEATURES='simd,test_utils,logging_off' POINTS='4194304' QUERIES='10000':
130+
RUSTC_WRAPPER= \
131+
KIDDO_REPRO_POINTS={{POINTS}} \
132+
KIDDO_REPRO_QUERIES={{QUERIES}} \
133+
RUSTFLAGS='-C target-cpu=native' \
134+
cargo run --release --bin repro_donnelly_block3_exact_divergence --features {{FEATURES}}
135+
102136
asm-k6-nearest-one-eytz-v3-core-avx512:
103137
cargo asm --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "v6_nearest_one_eytzinger_arithmetic_core_cargo_asm_hook" > v6_nearest_one_eytzinger_v3_core_avx512.asm
104138

105139
asm-k6-nearest-one-eytz-v3-leaf-avx512:
106140
cargo asm --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "kiddo::kd_tree::leaf_view_chunked::nearest_one::avx512::leaf_nearest_one_chunked_nozero_f64_k3::<f64, kiddo::dist::squared_euclidean::SquaredEuclidean<f64>, usize>" > v6_nearest_one_eytzinger_v3_leaf_avx512.asm
107141

142+
asm-k6-nearest-one-arena-leaf-v3-avx512-clean:
143+
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "v6_nearest_one_arena_leaf_cargo_asm_hook" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_arena_leaf_v3_avx512_clean.asm
144+
108145
asm-k6-nearest-one-eytz-v3-leaf-avx512-clean:
109146
RUSTC_WRAPPER= cargo asm --simplify --features simd,cargo_asm,logging_off --lib --target-cpu=native -C="opt-level=2" -C="target-cpu=native" "kiddo::kd_tree::leaf_view_chunked::nearest_one::avx512::leaf_nearest_one_chunked_nozero_f64_k3::<f64, kiddo::dist::squared_euclidean::SquaredEuclidean<f64>, usize>" | python3 scripts/clean_cargo_asm.py > v6_nearest_one_eytzinger_v3_leaf_avx512_clean.asm
110147

0 commit comments

Comments
 (0)