From c8d8476666110cd4f8cdc81f72ea44ae0bb88317 Mon Sep 17 00:00:00 2001 From: Erwan Viollet Date: Fri, 29 May 2026 17:02:37 +0200 Subject: [PATCH] feat(pprof): emit per-sample process_language label Heuristically detect the native language family (go/rust/cpp) of each profiled process' main executable and attach it as a pprof per-sample label `process_language`. Unknown/mixed cases fall back to the existing "native" tag (no label emitted). Detection runs once per PID, lazily on first sample, using the Elf* already opened by libdwfl (via dwfl_module_getelf) -- no extra file open. The check is cheap and never reads DWARF: * Go -> .go.buildinfo / .gopclntab ELF section * Rust -> .note.rustc section, or rustc-mangled symbols (`_R...` v0, or legacy `...17h<16 hex>E` tail) * Cpp -> any `_Z...` Itanium-mangled symbol * else -> kUnknown (caller leaves label unset) Symbol-table scan is bounded to 4096 entries. Result is cached on the Process object and cleared with the rest of its state on PID exit. A fallback path opening /proc/<pid>/exe is provided for callers without an Elf* in hand (unit tests, early bring-up). 
--- include/ddprof_process.hpp | 22 ++++ include/native_language.hpp | 49 ++++++++ include/pprof/ddprof_pprof.hpp | 1 + include/unwind_output.hpp | 4 + src/ddprof_process.cc | 18 +++ src/native_language.cc | 203 +++++++++++++++++++++++++++++++++ src/pprof/ddprof_pprof.cc | 7 +- src/unwind.cc | 39 +++++++ test/CMakeLists.txt | 1 + 9 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 include/native_language.hpp create mode 100644 src/native_language.cc diff --git a/include/ddprof_process.hpp b/include/ddprof_process.hpp index 66ba23b6b..dbcccbb20 100644 --- a/include/ddprof_process.hpp +++ b/include/ddprof_process.hpp @@ -10,6 +10,12 @@ #include "ddres_def.hpp" #include "dwfl_wrapper.hpp" #include "logger.hpp" +#include "native_language.hpp" + +// libelf forward declaration +extern "C" { +struct Elf; +} #include #include @@ -40,6 +46,19 @@ class Process { [[nodiscard]] std::string_view get_or_insert_thread_name(pid_t tid); + // Cached native language of the process' main executable. Stays kUnknown + // until detect_language_once() / detect_language_once_from_proc() has run. + NativeLanguage get_language() const { return _language; } + + // Detect (only once) the language using an already-opened Elf* (typically + // libdwfl's main-module Elf*). No-op on subsequent calls. + // Returns true if detection was attempted on this call. + bool detect_language_once(::Elf *main_exe_elf); + + // Fallback path: detect by opening /proc/<pid>/exe ourselves. Use only when + // no Elf* is available yet. 
+ bool detect_language_once_from_proc(std::string_view path_to_proc); + [[nodiscard]] DwflWrapper *get_or_insert_dwfl(); [[nodiscard]] DwflWrapper *get_dwfl(); [[nodiscard]] const DwflWrapper *get_dwfl() const; @@ -57,6 +76,8 @@ class Process { pid_t _pid; CGroupId_t _cgroup_ns; uint64_t _sample_counter{}; + NativeLanguage _language{NativeLanguage::kUnknown}; + bool _language_detected{false}; }; class ProcessHdr { @@ -66,6 +87,7 @@ class ProcessHdr { void flag_visited(pid_t pid); Process &get(pid_t pid); const ContainerId &get_container_id(pid_t pid); + void clear(pid_t pid) { _process_map.erase(pid); } std::vector get_unvisited() const; diff --git a/include/native_language.hpp b/include/native_language.hpp new file mode 100644 index 000000000..773c95d81 --- /dev/null +++ b/include/native_language.hpp @@ -0,0 +1,49 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +#pragma once + +#include +#include +#include + +// libelf forward declaration: callers that pass an Elf* must include +// <libelf.h>. +extern "C" { +struct Elf; +} + +namespace ddprof { + +// Heuristic native-language family for a process' main executable. +// Only meant to refine the "native" language tag for cases where a single +// language clearly dominates a process. NOTE(review): detection returns on the +// first Go/Rust/C++ signal found, so mixed binaries do not yield kUnknown. +enum class NativeLanguage : uint8_t { + kUnknown = 0, // fallback -> reported as "native" + kGo, + kRust, + kCpp, +}; + +// Returns a stable label string for a detected language. +// kUnknown maps to "native" (the existing default tag). +std::string_view to_string(NativeLanguage lang); + +// Detect the native language of an already-opened ELF object. 
+// Intentionally heuristic and cheap: +// * Go -> `.go.buildinfo` / `.gopclntab` ELF section +// * Rust -> `.note.rustc` section or rustc-mangled symbols in +// .dynsym / .symtab (bounded scan) +// * Cpp -> any `_Z`-mangled symbol (Itanium ABI) not matching Rust +// Never reads DWARF. +// Returns kUnknown on null input or unrecognised ELF. +NativeLanguage detect_native_language(::Elf *elf); + +// Convenience wrapper: open `/proc/<pid>/exe` ourselves. Prefer the Elf* +// overload above when the caller already has a handle (e.g. via libdwfl). +NativeLanguage detect_native_language(pid_t pid, std::string_view path_to_proc); + +} // namespace ddprof diff --git a/include/pprof/ddprof_pprof.hpp b/include/pprof/ddprof_pprof.hpp index ca1836a61..a7c00e148 100644 --- a/include/pprof/ddprof_pprof.hpp +++ b/include/pprof/ddprof_pprof.hpp @@ -31,6 +31,7 @@ struct DDProfPProf { ddog_prof_StringId2 thread_id{}; ddog_prof_StringId2 thread_name{}; ddog_prof_StringId2 tracepoint_type{}; + ddog_prof_StringId2 process_language{}; }; /* single profile gathering several value types */ diff --git a/include/unwind_output.hpp b/include/unwind_output.hpp index 753de852e..0b4e67656 100644 --- a/include/unwind_output.hpp +++ b/include/unwind_output.hpp @@ -34,11 +34,15 @@ struct UnwindOutput { container_id = k_container_id_unknown; exe_name = {}; thread_name = {}; + language = {}; } std::vector locs; std::string_view container_id; std::string_view exe_name; std::string_view thread_name; + // Heuristic native language of the process' main executable + // ("go"/"rust"/"cpp"). Empty -> caller falls back to "native". 
+ std::string_view language; int pid; int tid; friend auto operator<=>(const UnwindOutput &, const UnwindOutput &) = default; diff --git a/src/ddprof_process.cc b/src/ddprof_process.cc index 5fccbfa3b..998cc348b 100644 --- a/src/ddprof_process.cc +++ b/src/ddprof_process.cc @@ -145,6 +145,24 @@ const ContainerId &ProcessHdr::get_container_id(pid_t pid) { return p.get_container_id(_path_to_proc); } +bool Process::detect_language_once(::Elf *main_exe_elf) { + if (_language_detected || main_exe_elf == nullptr) { + return false; + } + _language = detect_native_language(main_exe_elf); + _language_detected = true; + return true; +} + +bool Process::detect_language_once_from_proc(std::string_view path_to_proc) { + if (_language_detected) { + return false; + } + _language = detect_native_language(_pid, path_to_proc); + _language_detected = true; + return true; +} + void ProcessHdr::flag_visited(pid_t pid) { _visited_pid.insert(pid); } Process &ProcessHdr::get(pid_t pid) { diff --git a/src/native_language.cc b/src/native_language.cc new file mode 100644 index 000000000..9940aad02 --- /dev/null +++ b/src/native_language.cc @@ -0,0 +1,203 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +#include "native_language.hpp" + +#include "logger.hpp" +#include "unique_fd.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace ddprof { + +namespace { + +constexpr size_t k_max_symbols_scanned = 4096; + +// Returns true if `name` looks like a rustc-mangled symbol. 
+// * v0 mangling: starts with "_R" +// * legacy mangling: Itanium "_ZN...17h<16 hex chars>E" tail +bool looks_like_rust_symbol(std::string_view name) { + if (name.size() > 2 && name[0] == '_' && name[1] == 'R') { + return true; + } + // Legacy mangling: ..."17h" + 16 hex + "E" at the very end. + // We don't need to validate the full Itanium prefix, the tail is unique + // enough for a heuristic. + constexpr size_t k_tail = 20; // "17h" + 16 hex + "E" + if (name.size() < k_tail || name.back() != 'E') { + return false; + } + const size_t pos = name.size() - k_tail; + if (name.compare(pos, 3, "17h") != 0) { + return false; + } + for (size_t i = pos + 3; i < name.size() - 1; ++i) { + const char c = name[i]; + const bool is_hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); + if (!is_hex) { + return false; + } + } + return true; +} + +bool looks_like_cpp_symbol(std::string_view name) { + // Itanium C++ mangling: "_Z..." (and we already ruled out Rust legacy above). + return name.size() > 2 && name[0] == '_' && name[1] == 'Z'; +} + +// Scan a symbol table section. Returns true and sets `out` if a definitive +// signal is found. Bails after k_max_symbols_scanned entries. 
+bool scan_symtab(Elf *elf, Elf_Scn *scn, GElf_Shdr const &shdr, + NativeLanguage &out) { + Elf_Data *data = elf_getdata(scn, nullptr); + if (data == nullptr || shdr.sh_entsize == 0) { + return false; + } + const size_t nsyms = shdr.sh_size / shdr.sh_entsize; + bool saw_cpp = false; + size_t scanned = 0; + for (size_t i = 0; i < nsyms && scanned < k_max_symbols_scanned; ++i) { + GElf_Sym sym; + if (gelf_getsym(data, static_cast(i), &sym) == nullptr) { + continue; + } + const char *raw = elf_strptr(elf, shdr.sh_link, sym.st_name); + if (raw == nullptr || raw[0] == '\0') { + continue; + } + ++scanned; + std::string_view const name(raw); + if (looks_like_rust_symbol(name)) { + out = NativeLanguage::kRust; + return true; // Rust wins immediately + } + if (!saw_cpp && looks_like_cpp_symbol(name)) { + saw_cpp = true; + } + } + if (saw_cpp) { + out = NativeLanguage::kCpp; + return true; + } + return false; +} + +NativeLanguage detect_from_elf_impl(Elf *elf) { + // First pass: section-name probes (cheapest). + size_t shstrndx = 0; + if (elf_getshdrstrndx(elf, &shstrndx) != 0) { + return NativeLanguage::kUnknown; + } + + Elf_Scn *symtab_scn = nullptr; + GElf_Shdr symtab_shdr{}; + Elf_Scn *dynsym_scn = nullptr; + GElf_Shdr dynsym_shdr{}; + + Elf_Scn *scn = nullptr; + while ((scn = elf_nextscn(elf, scn)) != nullptr) { + GElf_Shdr shdr; + if (gelf_getshdr(scn, &shdr) == nullptr) { + continue; + } + const char *name = elf_strptr(elf, shstrndx, shdr.sh_name); + if (name == nullptr) { + continue; + } + std::string_view const sname(name); + if (sname == ".go.buildinfo" || sname == ".gopclntab") { + return NativeLanguage::kGo; + } + if (sname == ".note.rustc") { + return NativeLanguage::kRust; + } + if (shdr.sh_type == SHT_SYMTAB) { + symtab_scn = scn; + symtab_shdr = shdr; + } else if (shdr.sh_type == SHT_DYNSYM) { + dynsym_scn = scn; + dynsym_shdr = shdr; + } + } + + // Second pass: symbol-table heuristics. Prefer .symtab (richer); fall back + // to .dynsym for stripped binaries. 
+ NativeLanguage out = NativeLanguage::kUnknown; + if (symtab_scn != nullptr && scan_symtab(elf, symtab_scn, symtab_shdr, out)) { + return out; + } + if (dynsym_scn != nullptr && scan_symtab(elf, dynsym_scn, dynsym_shdr, out)) { + return out; + } + return NativeLanguage::kUnknown; +} + +} // namespace + +NativeLanguage detect_native_language(::Elf *elf) { + if (elf == nullptr) { + return NativeLanguage::kUnknown; + } + // Safety: libelf needs to have been initialised. ddprof calls + // elf_version(EV_CURRENT) in unwind_init(); make it idempotent here too. + elf_version(EV_CURRENT); + if (elf_kind(elf) != ELF_K_ELF) { + return NativeLanguage::kUnknown; + } + NativeLanguage const result = detect_from_elf_impl(elf); + LG_DBG("[NATIVE-LANG] -> %s", std::string(to_string(result)).c_str()); + return result; +} + +std::string_view to_string(NativeLanguage lang) { + switch (lang) { + case NativeLanguage::kGo: + return "go"; + case NativeLanguage::kRust: + return "rust"; + case NativeLanguage::kCpp: + return "cpp"; + case NativeLanguage::kUnknown: + default: + return "native"; + } +} + +NativeLanguage detect_native_language(pid_t pid, + std::string_view path_to_proc) { + // libelf must be initialised; ddprof already calls elf_version() in + // unwind_init(), but make it idempotent-safe here in case this is invoked + // from a context where it has not been. 
+ elf_version(EV_CURRENT); + + const std::string exe_path = + absl::StrCat(path_to_proc, "/proc/", pid, "/exe"); + const UniqueFd fd{::open(exe_path.c_str(), O_RDONLY | O_CLOEXEC)}; + if (!fd) { + return NativeLanguage::kUnknown; + } + Elf *elf = elf_begin(fd.get(), ELF_C_READ_MMAP, nullptr); + if (elf == nullptr) { + return NativeLanguage::kUnknown; + } + NativeLanguage result = NativeLanguage::kUnknown; + if (elf_kind(elf) == ELF_K_ELF) { + result = detect_from_elf_impl(elf); + } + elf_end(elf); + LG_DBG("[NATIVE-LANG] (from /proc) pid=%d -> %s", pid, + std::string(to_string(result)).c_str()); + return result; +} + +} // namespace ddprof diff --git a/src/pprof/ddprof_pprof.cc b/src/pprof/ddprof_pprof.cc index 1d10719b5..9440ebfb3 100644 --- a/src/pprof/ddprof_pprof.cc +++ b/src/pprof/ddprof_pprof.cc @@ -31,7 +31,7 @@ using namespace std::string_view_literals; namespace ddprof { namespace { -constexpr size_t k_max_pprof_labels{8}; +constexpr size_t k_max_pprof_labels{9}; constexpr std::string_view k_container_id_label = "container_id"sv; constexpr std::string_view k_process_id_label = "process_id"sv; @@ -41,6 +41,7 @@ constexpr std::string_view k_process_name_label = "process_name"sv; constexpr std::string_view k_thread_id_label = "thread id"sv; constexpr std::string_view k_thread_name_label = "thread_name"sv; constexpr std::string_view k_tracepoint_label = "tracepoint_type"sv; +constexpr std::string_view k_process_language_label = "process_language"sv; // Maps a ddog_prof_SampleType to the kebab-case name used in debug log output // (must match what simple_malloc-ut.sh greps for). 
@@ -100,6 +101,7 @@ void init_dict_label_key_ids(DDProfPProf::DictLabelKeyIds &label_keys, label_keys.thread_id = intern_string(dict, k_thread_id_label); label_keys.thread_name = intern_string(dict, k_thread_name_label); label_keys.tracepoint_type = intern_string(dict, k_tracepoint_label); + label_keys.process_language = intern_string(dict, k_process_language_label); } size_t prepare_labels2(const UnwindOutput &uw_output, @@ -141,6 +143,9 @@ size_t prepare_labels2(const UnwindOutput &uw_output, if (!uw_output.thread_name.empty()) { push_label(label_keys.thread_name, uw_output.thread_name); } + if (!uw_output.language.empty()) { + push_label(label_keys.process_language, uw_output.language); + } DDPROF_DCHECK_FATAL(labels_num <= labels.size(), "pprof_aggregate - label buffer exceeded"); diff --git a/src/unwind.cc b/src/unwind.cc index d599f8bdb..da8682c45 100644 --- a/src/unwind.cc +++ b/src/unwind.cc @@ -19,6 +19,7 @@ #include #include +#include namespace ddprof { @@ -49,6 +50,43 @@ void add_exe_name(UnwindState *us) { void add_thread_name(Process &process, UnwindState *us) { us->output.thread_name = process.get_or_insert_thread_name(us->output.tid); } + +// Resolve the main executable's Elf* via libdwfl. Returns nullptr if the +// main-exe module is not registered yet for this PID (e.g. very first sample, +// or unwinding never crossed the main exe). 
+Elf *get_main_exe_elf(UnwindState *us) { + if (us->_dwfl_wrapper == nullptr) { + return nullptr; + } + const DsoHdr::DsoFindRes find_res = + us->dso_hdr.dso_find_first_std_executable(us->pid); + if (!find_res.second) { + return nullptr; + } + const Dso &dso = find_res.first->second; + const FileInfoId_t file_info_id = us->dso_hdr.get_or_insert_file_info(dso); + if (file_info_id <= k_file_info_error) { + return nullptr; + } + DDProfMod *mod = us->_dwfl_wrapper->unsafe_get(file_info_id); + if (mod == nullptr || mod->_mod == nullptr) { + return nullptr; + } + Dwarf_Addr bias = 0; + // dwfl_module_getelf returns the Elf* libdwfl already opened for this + // module (cached). No extra file open on our side. + return dwfl_module_getelf(mod->_mod, &bias); +} + +void add_process_language(Process &process, UnwindState *us) { + // Only resolve the main-exe Elf* while no language is cached for this PID. + if (process.get_language() == NativeLanguage::kUnknown) { + process.detect_language_once(get_main_exe_elf(us)); + } + if (process.get_language() != NativeLanguage::kUnknown) { + us->output.language = to_string(process.get_language()); + } +} } // namespace void unwind_init() { elf_version(EV_CURRENT); } @@ -96,6 +134,7 @@ DDRes unwindstate_unwind(UnwindState *us) { // Add a frame that identifies executable to which these belong add_virtual_base_frame(us); add_container_id(process, us); + add_process_language(process, us); if (us->is_timeline) { // the lookup is only useful in timeline view // keep this as a way to remove the possible overhead of opening the files diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 328f6694e..60fe0739f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -99,6 +99,7 @@ set(PROCESS_SRC ../src/dso_hdr.cc ../src/dwfl_wrapper.cc ../src/dwfl_thread_callbacks.cc + ../src/native_language.cc ../src/procutils.cc ../src/signal_helper.cc ../src/stack_helper.cc