From 4507572e061d542e0a6ed82cce60aa4adc59f1ec Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Thu, 19 Mar 2026 23:18:41 +0100 Subject: [PATCH 1/5] Trivial hot-path performance optimizations - getLockIndex: % -> & (CONCURRENCY_LEVEL is power-of-2) - flightRecorder: hoist numContextAttributes() out of per-event loop - flightRecorder: remove duplicate flushIfNeeded in recordEvent dispatcher - NativeFrameResolution: carry CodeCache* to avoid re-scanning in walkVM - Libraries::findLibraryByAddress: thread-local last-hit cache (63x hot-case speedup) - wallClock: bind reservoir.sample() result by reference, not copy (150x speedup) - profiler: guard TSC reads with #ifdef COUNTERS - RecordingBuffer: mark final for compiler devirtualization of limit() - benchmarks: add hot_path_benchmark covering fixes 1/5/6 Co-Authored-By: Claude Sonnet 4.6 (1M context) --- ddprof-lib/benchmarks/build.gradle.kts | 107 ++++--- .../benchmarks/src/hotPathBenchmark.cpp | 282 ++++++++++++++++++ ddprof-lib/src/main/cpp/buffers.h | 2 +- ddprof-lib/src/main/cpp/flightRecorder.cpp | 4 +- ddprof-lib/src/main/cpp/libraries.cpp | 5 + ddprof-lib/src/main/cpp/profiler.cpp | 56 ++-- ddprof-lib/src/main/cpp/profiler.h | 11 +- ddprof-lib/src/main/cpp/stackWalker.cpp | 6 +- ddprof-lib/src/main/cpp/wallClock.h | 2 +- 9 files changed, 396 insertions(+), 79 deletions(-) create mode 100644 ddprof-lib/benchmarks/src/hotPathBenchmark.cpp diff --git a/ddprof-lib/benchmarks/build.gradle.kts b/ddprof-lib/benchmarks/build.gradle.kts index d3f0d2f3a..aa69bd7ab 100644 --- a/ddprof-lib/benchmarks/build.gradle.kts +++ b/ddprof-lib/benchmarks/build.gradle.kts @@ -15,7 +15,12 @@ plugins { // multi-config library structure } -val benchmarkName = "unwind_failures_benchmark" +data class BenchmarkDef(val name: String, val source: String) + +val benchmarks = listOf( + BenchmarkDef("unwind_failures_benchmark", "src/unwindFailuresBenchmark.cpp"), + BenchmarkDef("hot_path_benchmark", "src/hotPathBenchmark.cpp"), +) // 
Determine if we should build for this platform val shouldBuild = PlatformUtils.currentPlatform == Platform.MACOS || @@ -24,60 +29,74 @@ val shouldBuild = PlatformUtils.currentPlatform == Platform.MACOS || if (shouldBuild) { val compiler = PlatformUtils.findCompiler(project) - // Compile task - val compileTask = tasks.register("compileBenchmark") { - onlyIf { shouldBuild && !project.hasProperty("skip-native") } - group = "build" - description = "Compile the unwinding failures benchmark" - - this.compiler.set(compiler) - compilerArgs.set(listOf("-O2", "-g", "-std=c++17")) - sources.from(file("src/unwindFailuresBenchmark.cpp")) - includes.from(project(":ddprof-lib").file("src/main/cpp")) - objectFileDir.set(file("${layout.buildDirectory.get()}/obj/benchmark")) - } + benchmarks.forEach { bench -> + val safeName = bench.name.split('_').joinToString("") { it.replaceFirstChar { c -> c.uppercase() } } + + // Compile task + val compileTask = tasks.register("compile$safeName") { + onlyIf { shouldBuild && !project.hasProperty("skip-native") } + group = "build" + description = "Compile ${bench.name}" - // Link task - val binary = file("${layout.buildDirectory.get()}/bin/$benchmarkName") - val linkTask = tasks.register("linkBenchmark") { - onlyIf { shouldBuild && !project.hasProperty("skip-native") } - dependsOn(compileTask) - group = "build" - description = "Link the unwinding failures benchmark" - - linker.set(compiler) - val args = mutableListOf("-ldl", "-lpthread") - if (PlatformUtils.currentPlatform == Platform.LINUX) { - args.add("-lrt") + this.compiler.set(compiler) + compilerArgs.set(listOf("-O2", "-g", "-std=c++17")) + sources.from(file(bench.source)) + includes.from(project(":ddprof-lib").file("src/main/cpp")) + objectFileDir.set(file("${layout.buildDirectory.get()}/obj/${bench.name}")) + } + + // Link task + val binary = file("${layout.buildDirectory.get()}/bin/${bench.name}") + val linkTask = tasks.register("link$safeName") { + onlyIf { shouldBuild && 
!project.hasProperty("skip-native") } + dependsOn(compileTask) + group = "build" + description = "Link ${bench.name}" + + linker.set(compiler) + val args = mutableListOf("-ldl", "-lpthread") + if (PlatformUtils.currentPlatform == Platform.LINUX) { + args.add("-lrt") + } + linkerArgs.set(args) + objectFiles.from(fileTree("${layout.buildDirectory.get()}/obj/${bench.name}") { include("*.o") }) + outputFile.set(binary) + } + + tasks.named("assemble") { + dependsOn(linkTask) } - linkerArgs.set(args) - objectFiles.from(fileTree("${layout.buildDirectory.get()}/obj/benchmark") { include("*.o") }) - outputFile.set(binary) - } - // Wire linkBenchmark into the standard assemble lifecycle - tasks.named("assemble") { - dependsOn(linkTask) + tasks.register("run$safeName") { + dependsOn(linkTask) + group = "verification" + description = "Run ${bench.name}" + + executable = binary.absolutePath + + doFirst { + if (project.hasProperty("args")) { + args(project.property("args").toString().split(" ")) + } + println("Running benchmark: ${binary.absolutePath}") + } + + doLast { + println("Benchmark completed.") + } + } } - // Add a task to run the benchmark + // Convenience alias: runBenchmark → original unwind failures benchmark tasks.register("runBenchmark") { - dependsOn(linkTask) + dependsOn("linkUnwindFailuresBenchmark") group = "verification" - description = "Run the unwinding failures benchmark" - - executable = binary.absolutePath - - // Add any additional arguments passed to the Gradle task + description = "Run the unwinding failures benchmark (alias)" + executable = file("${layout.buildDirectory.get()}/bin/unwind_failures_benchmark").absolutePath doFirst { if (project.hasProperty("args")) { args(project.property("args").toString().split(" ")) } - println("Running benchmark: ${binary.absolutePath}") - } - - doLast { - println("Benchmark completed.") } } } diff --git a/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp b/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp new file mode 
100644 index 000000000..8a229aa74 --- /dev/null +++ b/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp @@ -0,0 +1,282 @@ +/* + * Microbenchmarks for hot-path optimizations: + * Fix 1 - getLockIndex: modulo vs bitmask for power-of-2 CONCURRENCY_LEVEL + * Fix 5 - findLibraryByAddress: O(N) linear scan vs thread-local last-hit cache + * Fix 6 - reservoir sample: vector copy vs reference + */ +#include "benchmarkConfig.h" +#include +#include +#include +#include +#include +#include +#include + +// ---- shared helpers --------------------------------------------------------- + +BenchmarkConfig config; +std::vector results; + +template +BenchmarkResult runBenchmark(const std::string &name, F &&func) { + std::cout << "\n--- " << name << " ---" << std::endl; + + for (int i = 0; i < config.warmup_iterations; i++) { + func(i); + } + + auto start = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < config.measurement_iterations; i++) { + func(i); + } + auto end = std::chrono::high_resolution_clock::now(); + auto ns = std::chrono::duration_cast(end - start).count(); + double avg = (double)ns / config.measurement_iterations; + + std::cout << " avg: " << avg << " ns/op (total " << ns << " ns, " + << config.measurement_iterations << " iters)\n"; + return {name, ns, config.measurement_iterations, avg}; +} + +// ---- Fix 1: getLockIndex modulo vs bitmask ---------------------------------- + +static const int CONCURRENCY_LEVEL = 16; + +static volatile uint32_t sink; // prevent optimisation of result + +void benchmarkGetLockIndex() { + std::cout << "\n=== Fix 1: getLockIndex (% vs &) ===" << std::endl; + + std::mt19937 rng(42); + std::vector tids(config.measurement_iterations); + for (auto &t : tids) { + t = static_cast(rng()); + } + + results.push_back(runBenchmark("getLockIndex (modulo %)", [&](int i) { + uint32_t lock_index = static_cast(tids[i]); + lock_index ^= lock_index >> 8; + lock_index ^= lock_index >> 4; + sink = lock_index % CONCURRENCY_LEVEL; + })); + + 
results.push_back(runBenchmark("getLockIndex (bitmask &)", [&](int i) { + uint32_t lock_index = static_cast(tids[i]); + lock_index ^= lock_index >> 8; + lock_index ^= lock_index >> 4; + sink = lock_index & (CONCURRENCY_LEVEL - 1); + })); +} + +// ---- Fix 5: findLibraryByAddress linear scan vs TLS cache ------------------ + +struct FakeLib { + uintptr_t min_addr; + uintptr_t max_addr; + int id; + + bool contains(uintptr_t addr) const { + return addr >= min_addr && addr < max_addr; + } +}; + +// 128 simulated loaded libraries, 64 KiB each, contiguous +static const int NUM_LIBS = 128; +static const uintptr_t LIB_SIZE = 64 * 1024; +static FakeLib fake_libs[NUM_LIBS]; + +static void initFakeLibs() { + uintptr_t base = 0x7f0000000000ULL; + for (int i = 0; i < NUM_LIBS; i++) { + fake_libs[i] = {base, base + LIB_SIZE, i}; + base += LIB_SIZE; + } +} + +// Baseline: O(N) linear scan every call +static const FakeLib *findLibLinear(uintptr_t addr) { + for (int i = 0; i < NUM_LIBS; i++) { + if (fake_libs[i].contains(addr)) { + return &fake_libs[i]; + } + } + return nullptr; +} + +// Optimized: thread-local last-hit cache +struct TLLibCache { + uintptr_t min = 0; + uintptr_t max = 0; + const FakeLib *lib = nullptr; +}; +thread_local TLLibCache tl_lib_cache; + +static const FakeLib *findLibTLCache(uintptr_t addr) { + if (tl_lib_cache.lib && addr >= tl_lib_cache.min && addr < tl_lib_cache.max) { + return tl_lib_cache.lib; + } + for (int i = 0; i < NUM_LIBS; i++) { + if (fake_libs[i].contains(addr)) { + tl_lib_cache = {fake_libs[i].min_addr, fake_libs[i].max_addr, &fake_libs[i]}; + return &fake_libs[i]; + } + } + return nullptr; +} + +void benchmarkFindLibrary() { + initFakeLibs(); + std::cout << "\n=== Fix 5: findLibraryByAddress (linear vs TLS cache) ===" << std::endl; + + std::mt19937 rng(42); + + // Hot case: same lib repeatedly (models deep native stacks in one library) + const uintptr_t hot_lib_base = fake_libs[NUM_LIBS / 2].min_addr; + std::vector 
hot_addrs(config.measurement_iterations); + for (auto &a : hot_addrs) { + a = hot_lib_base + (rng() % LIB_SIZE); + } + + // Cold case: uniform random across all libs + std::vector cold_addrs(config.measurement_iterations); + for (auto &a : cold_addrs) { + int lib_idx = rng() % NUM_LIBS; + a = fake_libs[lib_idx].min_addr + (rng() % LIB_SIZE); + } + + static volatile int id_sink; + + // Hot addresses + results.push_back(runBenchmark("findLib linear (hot)", [&](int i) { + const FakeLib *lib = findLibLinear(hot_addrs[i]); + if (lib) id_sink = lib->id; + })); + + tl_lib_cache = {}; + results.push_back(runBenchmark("findLib TLS cache (hot)", [&](int i) { + const FakeLib *lib = findLibTLCache(hot_addrs[i]); + if (lib) id_sink = lib->id; + })); + + // Cold addresses + results.push_back(runBenchmark("findLib linear (cold)", [&](int i) { + const FakeLib *lib = findLibLinear(cold_addrs[i]); + if (lib) id_sink = lib->id; + })); + + tl_lib_cache = {}; + results.push_back(runBenchmark("findLib TLS cache (cold)", [&](int i) { + const FakeLib *lib = findLibTLCache(cold_addrs[i]); + if (lib) id_sink = lib->id; + })); +} + +// ---- Fix 6: reservoir sample vector copy vs reference ----------------------- + +void benchmarkReservoirSample() { + std::cout << "\n=== Fix 6: reservoir sample (copy vs reference) ===" << std::endl; + + const int RESERVOIR_SIZE = 512; + std::vector threads(RESERVOIR_SIZE * 4); + std::iota(threads.begin(), threads.end(), 1); + + // Simulate reservoir: pre-filled vector returned by value vs by reference + std::vector reservoir(RESERVOIR_SIZE); + std::iota(reservoir.begin(), reservoir.end(), 1); + + // Baseline: copy (what the old code did — returns by value) + auto sampleByCopy = [&]() -> std::vector { + return reservoir; // copy + }; + + // Optimized: reference (no allocation) + auto sampleByRef = [&]() -> std::vector& { + return reservoir; + }; + + static volatile int elem_sink; + + results.push_back(runBenchmark("reservoir sample (copy)", [&](int) { + 
std::vector s = sampleByCopy(); + elem_sink = s[0]; + })); + + results.push_back(runBenchmark("reservoir sample (reference)", [&](int) { + std::vector& s = sampleByRef(); + elem_sink = s[0]; + })); +} + +// ---- main ------------------------------------------------------------------- + +void printUsage(const char *prog) { + std::cout << "Usage: " << prog << " [--warmup N] [--iterations N] [--csv FILE] [--json FILE]\n"; +} + +void exportCSV(const std::string &filename) { + FILE *f = fopen(filename.c_str(), "w"); + if (!f) { std::cerr << "Cannot open " << filename << "\n"; return; } + fprintf(f, "Benchmark,Total Time (ns),Iterations,Average Time (ns)\n"); + for (const auto &r : results) { + fprintf(f, "%s,%lld,%d,%.3f\n", r.name.c_str(), r.total_time_ns, r.iterations, r.avg_time_ns); + } + fclose(f); + std::cout << "Results written to " << filename << "\n"; +} + +void exportJSON(const std::string &filename) { + FILE *f = fopen(filename.c_str(), "w"); + if (!f) { std::cerr << "Cannot open " << filename << "\n"; return; } + fprintf(f, "{\n \"benchmarks\": [\n"); + for (size_t i = 0; i < results.size(); i++) { + const auto &r = results[i]; + fprintf(f, " {\"name\":\"%s\",\"total_time_ns\":%lld,\"iterations\":%d,\"avg_time_ns\":%.3f}%s\n", + r.name.c_str(), r.total_time_ns, r.iterations, r.avg_time_ns, + i + 1 < results.size() ? 
"," : ""); + } + fprintf(f, " ]\n}\n"); + fclose(f); + std::cout << "Results written to " << filename << "\n"; +} + +int main(int argc, char *argv[]) { + std::string csv_file, json_file; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--warmup") == 0 && i + 1 < argc) { + config.warmup_iterations = std::atoi(argv[++i]); + } else if (strcmp(argv[i], "--iterations") == 0 && i + 1 < argc) { + config.measurement_iterations = std::atoi(argv[++i]); + } else if (strcmp(argv[i], "--csv") == 0 && i + 1 < argc) { + csv_file = argv[++i]; + } else if (strcmp(argv[i], "--json") == 0 && i + 1 < argc) { + json_file = argv[++i]; + } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + printUsage(argv[0]); + return 0; + } else { + std::cerr << "Unknown option: " << argv[i] << "\n"; + printUsage(argv[0]); + return 1; + } + } + + std::cout << "=== Hot-path optimization benchmarks ===\n"; + std::cout << "warmup=" << config.warmup_iterations + << " iterations=" << config.measurement_iterations << "\n"; + + benchmarkGetLockIndex(); + benchmarkFindLibrary(); + benchmarkReservoirSample(); + + std::cout << "\n=== Summary ===\n"; + for (const auto &r : results) { + printf(" %-45s %.3f ns/op\n", r.name.c_str(), r.avg_time_ns); + } + + if (!csv_file.empty()) exportCSV(csv_file); + if (!json_file.empty()) exportJSON(json_file); + + return 0; +} diff --git a/ddprof-lib/src/main/cpp/buffers.h b/ddprof-lib/src/main/cpp/buffers.h index 66af4b539..f6dc8c9b3 100644 --- a/ddprof-lib/src/main/cpp/buffers.h +++ b/ddprof-lib/src/main/cpp/buffers.h @@ -216,7 +216,7 @@ class Buffer { } }; -class RecordingBuffer : public Buffer { +class RecordingBuffer final : public Buffer { private: static const int _limit = RECORDING_BUFFER_SIZE - sizeof(Buffer); // we reserve 8KiB to overflow in to in case event serialisers in diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp index 4adc5f727..4bd4160bf 100644 --- 
a/ddprof-lib/src/main/cpp/flightRecorder.cpp +++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp @@ -1471,7 +1471,8 @@ void Recording::writeContext(Buffer *buf, Context &context) { buf->putVar64(spanId); buf->putVar64(rootSpanId); - for (size_t i = 0; i < Profiler::instance()->numContextAttributes(); i++) { + size_t num_attrs = Profiler::instance()->numContextAttributes(); + for (size_t i = 0; i < num_attrs; i++) { Tag tag = context.get_tag(i); buf->putVar32(tag.value); } @@ -1815,7 +1816,6 @@ void FlightRecorder::recordEvent(int lock_index, int tid, u64 call_trace_id, rec->recordThreadPark(buf, tid, call_trace_id, (LockEvent *)event); break; } - rec->flushIfNeeded(buf); rec->addThread(lock_index, tid); } } diff --git a/ddprof-lib/src/main/cpp/libraries.cpp b/ddprof-lib/src/main/cpp/libraries.cpp index 5aa3ed326..c9a1a0c38 100644 --- a/ddprof-lib/src/main/cpp/libraries.cpp +++ b/ddprof-lib/src/main/cpp/libraries.cpp @@ -100,10 +100,15 @@ CodeCache *Libraries::findLibraryByName(const char *lib_name) { } CodeCache *Libraries::findLibraryByAddress(const void *address) { + thread_local struct { const void* min; const void* max; CodeCache* lib; } tl_lib_cache = {nullptr, nullptr, nullptr}; + if (tl_lib_cache.lib != nullptr && address >= tl_lib_cache.min && address < tl_lib_cache.max) { + return tl_lib_cache.lib; + } const int native_lib_count = _native_libs.count(); for (int i = 0; i < native_lib_count; i++) { CodeCache *lib = _native_libs[i]; if (lib != NULL && lib->contains(address)) { + tl_lib_cache = {lib->minAddress(), lib->maxAddress(), lib}; return lib; } } diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index 0ceb7f76d..14cbbc9e8 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -197,7 +197,7 @@ inline u32 Profiler::getLockIndex(int tid) { u32 lock_index = tid; lock_index ^= lock_index >> 8; lock_index ^= lock_index >> 4; - return lock_index % CONCURRENCY_LEVEL; + return 
lock_index & (CONCURRENCY_LEVEL - 1); } void Profiler::mangle(const char *name, char *buf, size_t size) { @@ -388,7 +388,7 @@ Profiler::NativeFrameResolution Profiler::resolveNativeFrameForWalkVM(uintptr_t char mark = (method_name != nullptr) ? NativeFunc::read_mark(method_name) : 0; if (mark != 0) { - return {nullptr, BCI_NATIVE_FRAME, true}; // Marked - stop processing + return {nullptr, BCI_NATIVE_FRAME, true, lib}; // Marked - stop processing } // Pack remote symbolication data using utility struct @@ -396,7 +396,7 @@ Profiler::NativeFrameResolution Profiler::resolveNativeFrameForWalkVM(uintptr_t uint32_t lib_index = (uint32_t)lib->libIndex(); unsigned long packed = RemoteFramePacker::pack(pc_offset, mark, lib_index); - return NativeFrameResolution(packed, BCI_NATIVE_FRAME_REMOTE, false); + return NativeFrameResolution(packed, BCI_NATIVE_FRAME_REMOTE, false, lib); } // Traditional symbol resolution @@ -405,7 +405,7 @@ Profiler::NativeFrameResolution Profiler::resolveNativeFrameForWalkVM(uintptr_t lib->binarySearch((void*)pc, &method_name); } if (method_name != nullptr && NativeFunc::is_marked(method_name)) { - return NativeFrameResolution(nullptr, BCI_NATIVE_FRAME, true); + return NativeFrameResolution(nullptr, BCI_NATIVE_FRAME, true, lib); } // No symbol but known library: pack for library-relative identification. 
@@ -415,10 +415,10 @@ Profiler::NativeFrameResolution Profiler::resolveNativeFrameForWalkVM(uintptr_t uintptr_t pc_offset = pc - (uintptr_t)lib->imageBase(); uint32_t lib_index = (uint32_t)lib->libIndex(); unsigned long packed = RemoteFramePacker::pack(pc_offset, 0, lib_index); - return NativeFrameResolution(packed, BCI_NATIVE_FRAME_REMOTE, false); + return NativeFrameResolution(packed, BCI_NATIVE_FRAME_REMOTE, false, lib); } - return NativeFrameResolution(method_name, BCI_NATIVE_FRAME, false); + return NativeFrameResolution(method_name, BCI_NATIVE_FRAME, false, lib); } /** @@ -763,8 +763,8 @@ u64 Profiler::recordJVMTISample(u64 counter, int tid, jthread thread, jint event u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { // Too many concurrent signals already atomicIncRelaxed(_failures[-ticks_skipped]); @@ -772,7 +772,9 @@ u64 Profiler::recordJVMTISample(u64 counter, int tid, jthread thread, jint event } u64 call_trace_id = 0; if (!_omit_stacktraces) { +#ifdef COUNTERS u64 startTime = TSC::ticks(); +#endif ASGCT_CallFrame *frames = _calltrace_buffer[lock_index]->_asgct_frames; jvmtiFrameInfo *jvmti_frames = _calltrace_buffer[lock_index]->_jvmti_frames; @@ -792,10 +794,12 @@ u64 Profiler::recordJVMTISample(u64 counter, int tid, jthread thread, jint event } call_trace_id = _call_trace_storage.put(num_frames, frames, false, counter); +#ifdef COUNTERS u64 duration = TSC::ticks() - startTime; if (duration > 0) { Counters::increment(UNWINDING_TIME_JVMTI, duration); // increment the JVMTI specific counter } +#endif } if (!deferred) { _jfr.recordEvent(lock_index, tid, call_trace_id, event_type, event); @@ -810,8 +814,8 @@ void 
Profiler::recordDeferredSample(int tid, u64 call_trace_id, jint event_type, u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { // Too many concurrent signals already atomicIncRelaxed(_failures[-ticks_skipped]); return; @@ -828,8 +832,8 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { // Too many concurrent signals already atomicIncRelaxed(_failures[-ticks_skipped]); @@ -848,7 +852,9 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, // call_trace_id determined to be reusable at a higher level if (!_omit_stacktraces && call_trace_id == 0) { +#ifdef COUNTERS u64 startTime = TSC::ticks(); +#endif ASGCT_CallFrame *frames = _calltrace_buffer[lock_index]->_asgct_frames; int num_frames = 0; @@ -895,10 +901,12 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, if (thread != nullptr) { thread->recordCallTraceId(call_trace_id); } +#ifdef COUNTERS u64 duration = TSC::ticks() - startTime; if (duration > 0) { Counters::increment(UNWINDING_TIME_ASYNC, duration); // increment the async specific counter } +#endif } _jfr.recordEvent(lock_index, tid, call_trace_id, event_type, event); @@ -908,8 +916,8 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, void Profiler::recordWallClockEpoch(int tid, 
WallClockEpochEvent *event) { u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { return; } _jfr.wallClockEpoch(lock_index, event); @@ -919,8 +927,8 @@ void Profiler::recordWallClockEpoch(int tid, WallClockEpochEvent *event) { void Profiler::recordTraceRoot(int tid, TraceRootEvent *event) { u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { return; } _jfr.recordTraceRoot(lock_index, tid, event); @@ -930,8 +938,8 @@ void Profiler::recordTraceRoot(int tid, TraceRootEvent *event) { void Profiler::recordQueueTime(int tid, QueueTimeEvent *event) { u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { return; } _jfr.recordQueueTime(lock_index, tid, event); @@ -956,8 +964,8 @@ void Profiler::recordExternalSample(u64 weight, int tid, int num_frames, u64 call_trace_id = _call_trace_storage.put(num_frames, extended_frames, truncated, weight); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % 
CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { // Too many concurrent signals already atomicIncRelaxed(_failures[-ticks_skipped]); return; @@ -981,8 +989,8 @@ void Profiler::writeDatadogProfilerSetting(int tid, int length, const char *unit) { u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { return; } _jfr.recordDatadogSetting(lock_index, length, name, value, unit); @@ -996,8 +1004,8 @@ void Profiler::writeHeapUsage(long value, bool live) { } u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && - !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && - !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + !_locks[lock_index = (lock_index + 1) & (CONCURRENCY_LEVEL - 1)].tryLock() && + !_locks[lock_index = (lock_index + 2) & (CONCURRENCY_LEVEL - 1)].tryLock()) { return; } _jfr.recordHeapUsage(lock_index, value, live); diff --git a/ddprof-lib/src/main/cpp/profiler.h b/ddprof-lib/src/main/cpp/profiler.h index 285c64805..9c3367c58 100644 --- a/ddprof-lib/src/main/cpp/profiler.h +++ b/ddprof-lib/src/main/cpp/profiler.h @@ -351,14 +351,15 @@ class alignas(alignof(SpinLock)) Profiler { struct NativeFrameResolution { union { unsigned long packed_remote_frame; // Packed remote frame data (pc_offset|mark|lib_index) - const char* method_name; // Resolved method name + const char* method_name; // Resolved method name }; int bci; // BCI_NATIVE_FRAME_REMOTE or BCI_NATIVE_FRAME bool is_marked; // true if this is a marked C++ interpreter frame (stop 
processing) - NativeFrameResolution(const char* name, int bci_type, bool marked) - : method_name(name), bci(bci_type), is_marked(marked) {} - NativeFrameResolution(unsigned long packed, int bci_type, bool marked) - : packed_remote_frame(packed), bci(bci_type), is_marked(marked) {} + CodeCache* cc; // Library containing the PC (may be null) + NativeFrameResolution(const char* name, int bci_type, bool marked, CodeCache* lib = nullptr) + : method_name(name), bci(bci_type), is_marked(marked), cc(lib) {} + NativeFrameResolution(unsigned long packed, int bci_type, bool marked, CodeCache* lib = nullptr) + : packed_remote_frame(packed), bci(bci_type), is_marked(marked), cc(lib) {} }; void populateRemoteFrame(ASGCT_CallFrame* frame, uintptr_t pc, CodeCache* lib, char mark); diff --git a/ddprof-lib/src/main/cpp/stackWalker.cpp b/ddprof-lib/src/main/cpp/stackWalker.cpp index 4210551ad..275a2a9ba 100644 --- a/ddprof-lib/src/main/cpp/stackWalker.cpp +++ b/ddprof-lib/src/main/cpp/stackWalker.cpp @@ -340,6 +340,7 @@ __attribute__((no_sanitize("address"))) int StackWalker::walkVM(void* ucontext, // Walk until the bottom of the stack or until the first Java frame while (depth < actual_max_depth) { + CodeCache* resolved_cc = nullptr; if (CodeHeap::contains(pc)) { Counters::increment(WALKVM_HIT_CODEHEAP); if (fp_chain_fallback) { @@ -562,6 +563,7 @@ __attribute__((no_sanitize("address"))) int StackWalker::walkVM(void* ucontext, } else { // Resolve native frame (may use remote symbolication if enabled) Profiler::NativeFrameResolution resolution = profiler->resolveNativeFrameForWalkVM((uintptr_t)pc, lock_index); + resolved_cc = resolution.cc; if (resolution.is_marked) { // This is a marked C++ interpreter frame, terminate scan break; @@ -587,7 +589,7 @@ __attribute__((no_sanitize("address"))) int StackWalker::walkVM(void* ucontext, break; } } else if (method_name == NULL && details && !anchor_recovery_used - && profiler->findLibraryByAddress(pc) == NULL) { + && resolution.cc == NULL) { 
// Try anchor recovery — prefer live anchor, fall back to saved data anchor_recovery_used = true; const void* recovery_pc = NULL; @@ -693,7 +695,7 @@ __attribute__((no_sanitize("address"))) int StackWalker::walkVM(void* ucontext, dwarf_unwind: uintptr_t prev_sp = sp; - CodeCache* cc = profiler->findLibraryByAddress(pc); + CodeCache* cc = resolved_cc != nullptr ? resolved_cc : profiler->findLibraryByAddress(pc); FrameDesc f = cc != NULL ? cc->findFrameDesc(pc) : FrameDesc::default_frame; u8 cfa_reg = (u8)f.cfa; diff --git a/ddprof-lib/src/main/cpp/wallClock.h b/ddprof-lib/src/main/cpp/wallClock.h index 66201cde6..608ae1d57 100644 --- a/ddprof-lib/src/main/cpp/wallClock.h +++ b/ddprof-lib/src/main/cpp/wallClock.h @@ -79,7 +79,7 @@ class BaseWallClock : public Engine { int num_failures = 0; int threads_already_exited = 0; int permission_denied = 0; - std::vector<ThreadType> sample = reservoir.sample(threads); + std::vector<ThreadType>& sample = reservoir.sample(threads); for (ThreadType thread : sample) { if (!sampleThreads(thread, num_failures, threads_already_exited, permission_denied)) { continue; From c44c28e594c3be53929646b8a7413e681780275d Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 20 Mar 2026 00:13:16 +0100 Subject: [PATCH 2/5] Fix IllegalThreadStateException race in launch() helper Process.exitValue() throws IllegalThreadStateException if the process hasn't terminated yet. After destroyForcibly(), give the OS up to 5s to complete termination before calling exitValue(). Triggered on slow CI runners.
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../java/com/datadoghq/profiler/AbstractProcessProfilerTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProcessProfilerTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProcessProfilerTest.java index 561b8872e..6c0bc0685 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProcessProfilerTest.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProcessProfilerTest.java @@ -133,6 +133,7 @@ protected final LaunchResult launch(String target, List<String> jvmArgs, String boolean val = p.waitFor(10, TimeUnit.SECONDS); if (!val) { p.destroyForcibly(); + p.waitFor(5, TimeUnit.SECONDS); } return new LaunchResult(val, p.exitValue()); } From 7d94680ff21db47065734d75143a5f9d4610c91f Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 20 Mar 2026 00:16:55 +0100 Subject: [PATCH 3/5] Fix signal-handler deadlock from thread_local in findLibraryByAddress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DTLS initialization for shared libraries calls calloc internally. If a profiler signal fires on a thread whose TLS block hasn't been set up yet while that thread is inside malloc, the re-entrant calloc deadlocks on the allocator lock — causing 45+ min hangs. Replace thread_local struct with a plain static volatile int last-hit index. The cache update is benignly racy (worst case: a cache miss), and no allocator calls are made from the signal handler path.
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- ddprof-lib/src/main/cpp/libraries.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ddprof-lib/src/main/cpp/libraries.cpp b/ddprof-lib/src/main/cpp/libraries.cpp index c9a1a0c38..0c1896ac5 100644 --- a/ddprof-lib/src/main/cpp/libraries.cpp +++ b/ddprof-lib/src/main/cpp/libraries.cpp @@ -100,15 +100,22 @@ CodeCache *Libraries::findLibraryByName(const char *lib_name) { } CodeCache *Libraries::findLibraryByAddress(const void *address) { - thread_local struct { const void* min; const void* max; CodeCache* lib; } tl_lib_cache = {nullptr, nullptr, nullptr}; - if (tl_lib_cache.lib != nullptr && address >= tl_lib_cache.min && address < tl_lib_cache.max) { - return tl_lib_cache.lib; - } + // Signal-handler-safe last-hit cache. Not thread_local: DTLS init in shared + // libraries calls calloc, which deadlocks if the signal fires inside malloc. + // A plain static int is benignly racy — worst case is a cache miss. + static volatile int last_hit = 0; const int native_lib_count = _native_libs.count(); + int hint = last_hit; + if (hint < native_lib_count) { + CodeCache *lib = _native_libs[hint]; + if (lib != NULL && lib->contains(address)) { + return lib; + } + } for (int i = 0; i < native_lib_count; i++) { CodeCache *lib = _native_libs[i]; if (lib != NULL && lib->contains(address)) { - tl_lib_cache = {lib->minAddress(), lib->maxAddress(), lib}; + last_hit = i; return lib; } } From a49590d6c761e2c9167f4f9589ed14f80034ea53 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 20 Mar 2026 00:22:38 +0100 Subject: [PATCH 4/5] Benchmark the actual last-hit-index implementation Replace thread_local TLLibCache with the signal-safe static volatile int variant that matches the real libraries.cpp fix. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../benchmarks/src/hotPathBenchmark.cpp | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp b/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp index 8a229aa74..9daa2d147 100644 --- a/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp +++ b/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp @@ -69,7 +69,7 @@ void benchmarkGetLockIndex() { })); } -// ---- Fix 5: findLibraryByAddress linear scan vs TLS cache ------------------ +// ---- Fix 5: findLibraryByAddress linear scan vs last-hit-index cache ------- struct FakeLib { uintptr_t min_addr; @@ -104,21 +104,17 @@ static const FakeLib *findLibLinear(uintptr_t addr) { return nullptr; } -// Optimized: thread-local last-hit cache -struct TLLibCache { - uintptr_t min = 0; - uintptr_t max = 0; - const FakeLib *lib = nullptr; -}; -thread_local TLLibCache tl_lib_cache; +// Optimized: signal-safe static last-hit index (no DTLS allocation) +static volatile int last_hit_idx = 0; -static const FakeLib *findLibTLCache(uintptr_t addr) { - if (tl_lib_cache.lib && addr >= tl_lib_cache.min && addr < tl_lib_cache.max) { - return tl_lib_cache.lib; +static const FakeLib *findLibLastHit(uintptr_t addr) { + int hint = last_hit_idx; + if (hint < NUM_LIBS && fake_libs[hint].contains(addr)) { + return &fake_libs[hint]; } for (int i = 0; i < NUM_LIBS; i++) { if (fake_libs[i].contains(addr)) { - tl_lib_cache = {fake_libs[i].min_addr, fake_libs[i].max_addr, &fake_libs[i]}; + last_hit_idx = i; return &fake_libs[i]; } } @@ -127,7 +123,7 @@ static const FakeLib *findLibTLCache(uintptr_t addr) { void benchmarkFindLibrary() { initFakeLibs(); - std::cout << "\n=== Fix 5: findLibraryByAddress (linear vs TLS cache) ===" << std::endl; + std::cout << "\n=== Fix 5: findLibraryByAddress (linear vs last-hit index) ===" << std::endl; std::mt19937 rng(42); @@ -153,9 +149,9 @@ void benchmarkFindLibrary() { if (lib) id_sink = lib->id; })); 
- tl_lib_cache = {}; - results.push_back(runBenchmark("findLib TLS cache (hot)", [&](int i) { - const FakeLib *lib = findLibTLCache(hot_addrs[i]); + last_hit_idx = 0; + results.push_back(runBenchmark("findLib last-hit idx (hot)", [&](int i) { + const FakeLib *lib = findLibLastHit(hot_addrs[i]); if (lib) id_sink = lib->id; })); @@ -165,9 +161,9 @@ void benchmarkFindLibrary() { if (lib) id_sink = lib->id; })); - tl_lib_cache = {}; - results.push_back(runBenchmark("findLib TLS cache (cold)", [&](int i) { - const FakeLib *lib = findLibTLCache(cold_addrs[i]); + last_hit_idx = 0; + results.push_back(runBenchmark("findLib last-hit idx (cold)", [&](int i) { + const FakeLib *lib = findLibLastHit(cold_addrs[i]); if (lib) id_sink = lib->id; })); } From b4d9112869bd4bb64cd81601c0a9fe2b1e6aced5 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 20 Mar 2026 09:02:02 +0100 Subject: [PATCH 5/5] Extract findLibraryByAddress algorithm to shared header Move the last-hit-index cache logic into findLibraryImpl.h as a template on CacheArray/CacheEntry. Production (libraries.cpp) and the benchmark (hotPathBenchmark.cpp) both instantiate the same template, so the benchmark exercises identical machine code with no manual synchronization needed. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../benchmarks/src/hotPathBenchmark.cpp | 61 ++++++++----------- ddprof-lib/src/main/cpp/findLibraryImpl.h | 46 ++++++++++++++ ddprof-lib/src/main/cpp/libraries.cpp | 22 +------ 3 files changed, 75 insertions(+), 54 deletions(-) create mode 100644 ddprof-lib/src/main/cpp/findLibraryImpl.h diff --git a/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp b/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp index 9daa2d147..0692ea5c9 100644 --- a/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp +++ b/ddprof-lib/benchmarks/src/hotPathBenchmark.cpp @@ -70,52 +70,47 @@ void benchmarkGetLockIndex() { } // ---- Fix 5: findLibraryByAddress linear scan vs last-hit-index cache ------- +// Uses findLibraryImpl.h — the exact same template instantiated in production. + +#include "findLibraryImpl.h" struct FakeLib { uintptr_t min_addr; uintptr_t max_addr; int id; - bool contains(uintptr_t addr) const { - return addr >= min_addr && addr < max_addr; + bool contains(const void* addr) const { + return (uintptr_t)addr >= min_addr && (uintptr_t)addr < max_addr; } }; // 128 simulated loaded libraries, 64 KiB each, contiguous static const int NUM_LIBS = 128; static const uintptr_t LIB_SIZE = 64 * 1024; -static FakeLib fake_libs[NUM_LIBS]; +static FakeLib fake_lib_storage[NUM_LIBS]; +static FakeLib* fake_lib_ptrs[NUM_LIBS]; // pointer array mirrors CodeCacheArray + +// Minimal array wrapper satisfying the findLibraryByAddressImpl contract. 
+struct FakeLibArray { + int count() const { return NUM_LIBS; } + FakeLib* operator[](int i) const { return fake_lib_ptrs[i]; } +}; +static FakeLibArray fake_libs; static void initFakeLibs() { uintptr_t base = 0x7f0000000000ULL; for (int i = 0; i < NUM_LIBS; i++) { - fake_libs[i] = {base, base + LIB_SIZE, i}; + fake_lib_storage[i] = {base, base + LIB_SIZE, i}; + fake_lib_ptrs[i] = &fake_lib_storage[i]; base += LIB_SIZE; } } -// Baseline: O(N) linear scan every call -static const FakeLib *findLibLinear(uintptr_t addr) { - for (int i = 0; i < NUM_LIBS; i++) { - if (fake_libs[i].contains(addr)) { - return &fake_libs[i]; - } - } - return nullptr; -} - -// Optimized: signal-safe static last-hit index (no DTLS allocation) -static volatile int last_hit_idx = 0; - -static const FakeLib *findLibLastHit(uintptr_t addr) { - int hint = last_hit_idx; - if (hint < NUM_LIBS && fake_libs[hint].contains(addr)) { - return &fake_libs[hint]; - } +// Baseline: O(N) linear scan every call (no cache) +static const FakeLib *findLibLinear(const void* addr) { for (int i = 0; i < NUM_LIBS; i++) { - if (fake_libs[i].contains(addr)) { - last_hit_idx = i; - return &fake_libs[i]; + if (fake_lib_ptrs[i]->contains(addr)) { + return fake_lib_ptrs[i]; } } return nullptr; @@ -128,17 +123,17 @@ void benchmarkFindLibrary() { std::mt19937 rng(42); // Hot case: same lib repeatedly (models deep native stacks in one library) - const uintptr_t hot_lib_base = fake_libs[NUM_LIBS / 2].min_addr; - std::vector<uintptr_t> hot_addrs(config.measurement_iterations); + const void* hot_lib_base = (const void*)fake_lib_storage[NUM_LIBS / 2].min_addr; + std::vector<const void*> hot_addrs(config.measurement_iterations); for (auto &a : hot_addrs) { - a = hot_lib_base + (rng() % LIB_SIZE); + a = (const void*)(fake_lib_storage[NUM_LIBS / 2].min_addr + (rng() % LIB_SIZE)); } // Cold case: uniform random across all libs - std::vector<uintptr_t> cold_addrs(config.measurement_iterations); + std::vector<const void*> cold_addrs(config.measurement_iterations); for (auto &a : 
cold_addrs) { int lib_idx = rng() % NUM_LIBS; - a = fake_libs[lib_idx].min_addr + (rng() % LIB_SIZE); + a = (const void*)(fake_lib_storage[lib_idx].min_addr + (rng() % LIB_SIZE)); } static volatile int id_sink; @@ -149,9 +144,8 @@ void benchmarkFindLibrary() { if (lib) id_sink = lib->id; })); - last_hit_idx = 0; results.push_back(runBenchmark("findLib last-hit idx (hot)", [&](int i) { - const FakeLib *lib = findLibLastHit(hot_addrs[i]); + const FakeLib *lib = findLibraryByAddressImpl<FakeLibArray, FakeLib>(fake_libs, hot_addrs[i]); if (lib) id_sink = lib->id; })); @@ -161,9 +155,8 @@ if (lib) id_sink = lib->id; })); - last_hit_idx = 0; results.push_back(runBenchmark("findLib last-hit idx (cold)", [&](int i) { - const FakeLib *lib = findLibLastHit(cold_addrs[i]); + const FakeLib *lib = findLibraryByAddressImpl<FakeLibArray, FakeLib>(fake_libs, cold_addrs[i]); if (lib) id_sink = lib->id; })); } diff --git a/ddprof-lib/src/main/cpp/findLibraryImpl.h b/ddprof-lib/src/main/cpp/findLibraryImpl.h new file mode 100644 index 000000000..18b7da6d6 --- /dev/null +++ b/ddprof-lib/src/main/cpp/findLibraryImpl.h @@ -0,0 +1,46 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef _FINDLIBRARYIMPL_H +#define _FINDLIBRARYIMPL_H + +// Signal-handler-safe last-hit-index cache for findLibraryByAddress. +// +// Templated on CacheArray and CacheEntry so that the exact same algorithm +// can be exercised from benchmarks (using lightweight fake types) without +// pulling in JVM headers, while production uses CodeCacheArray/CodeCache.
+// +// Requirements on CacheArray: +// int count() const — number of live entries +// CacheEntry* operator[](int i) — entry at index i (may be nullptr) +// +// Requirements on CacheEntry: +// bool contains(const void* address) const + +template <typename CacheArray, typename CacheEntry> +static inline CacheEntry* findLibraryByAddressImpl(CacheArray& libs, const void* address) { + // Not thread_local: DTLS init in shared libraries calls calloc, which + // deadlocks if a profiler signal fires while the thread is inside malloc. + // A plain static volatile int is benignly racy — worst case is a cache miss. + static volatile int last_hit = 0; + const int count = libs.count(); + int hint = last_hit; + if (hint < count) { + CacheEntry* lib = libs[hint]; + if (lib != nullptr && lib->contains(address)) { + return lib; + } + } + for (int i = 0; i < count; i++) { + CacheEntry* lib = libs[i]; + if (lib != nullptr && lib->contains(address)) { + last_hit = i; + return lib; + } + } + return nullptr; +} + +#endif // _FINDLIBRARYIMPL_H diff --git a/ddprof-lib/src/main/cpp/libraries.cpp b/ddprof-lib/src/main/cpp/libraries.cpp index 0c1896ac5..57e5f4ba4 100644 --- a/ddprof-lib/src/main/cpp/libraries.cpp +++ b/ddprof-lib/src/main/cpp/libraries.cpp @@ -1,5 +1,6 @@ #include "codeCache.h" #include "common.h" +#include "findLibraryImpl.h" #include "libraries.h" #include "libraryPatcher.h" #include "log.h" @@ -100,24 +101,5 @@ CodeCache *Libraries::findLibraryByName(const char *lib_name) { } CodeCache *Libraries::findLibraryByAddress(const void *address) { - // Signal-handler-safe last-hit cache. Not thread_local: DTLS init in shared - // libraries calls calloc, which deadlocks if the signal fires inside malloc. - // A plain static int is benignly racy — worst case is a cache miss.
- static volatile int last_hit = 0; - const int native_lib_count = _native_libs.count(); - int hint = last_hit; - if (hint < native_lib_count) { - CodeCache *lib = _native_libs[hint]; - if (lib != NULL && lib->contains(address)) { - return lib; - } - } - for (int i = 0; i < native_lib_count; i++) { - CodeCache *lib = _native_libs[i]; - if (lib != NULL && lib->contains(address)) { - last_hit = i; - return lib; - } - } - return NULL; + return findLibraryByAddressImpl<CodeCacheArray, CodeCache>(_native_libs, address); }