diff --git a/examples/cpp/pyperf/PyPerfNativeStackTrace.cc b/examples/cpp/pyperf/PyPerfNativeStackTrace.cc index d33772695bee..423472a2bfcf 100644 --- a/examples/cpp/pyperf/PyPerfNativeStackTrace.cc +++ b/examples/cpp/pyperf/PyPerfNativeStackTrace.cc @@ -5,14 +5,16 @@ #include "PyPerfNativeStackTrace.h" -#include +#include #include +#include +#include #include -#include - +#include #include #include #include +#include #include "PyPerfLoggingHelper.h" @@ -20,10 +22,21 @@ namespace ebpf { namespace pyperf { // Ideally it was preferable to save this as the context in libunwind accessors, but it's already used by UPT +const uint16_t NativeStackTrace::CacheMaxSizeMB = 56; +const uint16_t NativeStackTrace::CacheMaxTTL = 300; // In seconds const uint8_t *NativeStackTrace::stack = NULL; size_t NativeStackTrace::stack_len = 0; uintptr_t NativeStackTrace::sp = 0; uintptr_t NativeStackTrace::ip = 0; +time_t NativeStackTrace::now; +UnwindCache NativeStackTrace::cache; + +// DEBUG SECTION +bool dbg_Unlimited_Cache = true; +float NativeStackTrace::dbg_maxSize = 0.0; +int NativeStackTrace::dbg_counter = 0; +int dbg_maxNumOfKeys = 0; +// THE END OF DEBUG SECTION NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack, size_t stack_len, uintptr_t ip, uintptr_t sp) : error_occurred(false) { @@ -31,6 +44,7 @@ NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack, NativeStackTrace::stack_len = stack_len; NativeStackTrace::ip = ip; NativeStackTrace::sp = sp; + NativeStackTrace::now = time(NULL); if (stack_len == 0) { return; @@ -43,71 +57,151 @@ NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack, // The UPT implementation of these functions uses ptrace. We want to make sure they aren't getting called my_accessors.access_fpreg = NULL; my_accessors.resume = NULL; - + unw_cursor_t cursor; int res; - unw_addr_space_t as = unw_create_addr_space(&my_accessors, 0); - void *upt = _UPT_create(pid); - if (!upt) { - this->symbols.push_back(std::string("[Error _UPT_create (system OOM)]")); - this->error_occurred = true; - goto out; + std::ostringstream error; + + // Pseudo-proactive way of implementing TTL - whenever any call is made, all expired entries are removed + cache_eviction(); + + // DEBUG SECTION + float dbg_current_cache_size = cache_size_KB(); + if (dbg_maxSize < dbg_current_cache_size) { + logInfo(2, "DEBUGIZA - Increase in cache usage by %.2f KB\n", + dbg_current_cache_size - dbg_maxSize); + dbg_maxSize = dbg_current_cache_size; + dbg_maxNumOfKeys = cache.size(); + dbg_counter = 0; + } else { + dbg_counter++; + if (dbg_counter >= 10) { + // after 10 times we are sure that the cache max size is stable + logInfo(2, "DEBUGIZA - cache usage stops growing\n"); + } } - unw_cursor_t cursor; - // TODO: It's possible to make libunwind use cache using unw_set_caching_policy, which might lead to significent - // performance improvement. We just need to make sure it's not dangerous. For now the overhead is good enough. - res = unw_init_remote(&cursor, as, upt); - if (res) { - std::ostringstream error; - error << "[Error unw_init_remote (" << unw_strerror(res) << ")]"; - this->symbols.push_back(error.str()); - this->error_occurred = true; - goto out; - } - - do { - unw_word_t offset; - char sym[256]; - char *realname; - int status; - - // TODO: This function is very heavy. We should try to do some caching here, maybe in the - // underlying UPT function. - res = unw_get_proc_name(&cursor, sym, sizeof(sym), &offset); - if (res == 0) { - realname = abi::__cxa_demangle(sym, NULL, NULL, &status); - if (status == 0) { - this->symbols.push_back(std::string(realname)); - } else { - this->symbols.push_back(std::string(sym)); - } - free(realname); - } else { - unw_word_t ip; - unw_get_reg(&cursor, UNW_REG_IP, &ip); - unw_word_t sp; - unw_get_reg(&cursor, UNW_REG_SP, &sp); - logInfo(2, - "IP=0x%lx -- error: unable to obtain symbol name for this frame - %s " - "(frame SP=0x%lx)\n", - ip, unw_strerror(res), sp); - this->symbols.push_back(std::string("(missing)")); + logInfo(2, "DEBUGIZA - cache_size=%.2f KB | %d entries \n", + dbg_current_cache_size, cache.size()); + logInfo(2, "DEBUGIZA - cache_max_size=%.2f KB | %d entries \n", dbg_maxSize, + dbg_maxNumOfKeys); + + // THE END OF DEBUG-SECTION + + void *upt; + // Check whether the entry for the process ID is presented in the cache + if (!is_cached(pid)) { + logInfo(3,"The given key %d is not presented in the cache\n", pid); + unw_addr_space_t as; + + as = unw_create_addr_space(&my_accessors, 0); + res = unw_set_caching_policy(as, UNW_CACHE_GLOBAL); + if (res) { + error << "[Error unw_set_caching_policy (" << unw_strerror(res) << ")]"; + this->symbols.push_back(error.str()); this->error_occurred = true; - break; + cleanup(upt, as); + return; } - // Unwind only until we get to the function from which the current Python function is executed. - // On Python3 the main loop function is called "_PyEval_EvalFrameDefault", and on Python2 it's - // "PyEval_EvalFrameEx". - if (memcmp(sym, "_PyEval_EvalFrameDefault", - sizeof("_PyEval_EvalFrameDefault")) == 0 || - memcmp(sym, "PyEval_EvalFrameEx", sizeof("PyEval_EvalFrameEx")) == 0) - { - break; + upt = _UPT_create(pid); + if (!upt) { + this->symbols.push_back(std::string("[Error _UPT_create (system OOM)]")); + this->error_occurred = true; + cleanup(upt, as); + return; } - } while (unw_step(&cursor) > 0); -out: + res = unw_init_remote(&cursor, as, upt); + if (res) { + error << "[Error unw_init_remote (" << unw_strerror(res) << ")]"; + this->symbols.push_back(error.str()); + this->error_occurred = true; + cleanup(upt, as); + return; + } + + // Put to the cache + cache_put(pid, cursor, as, upt); + + } else { + logInfo(2, "Found entry for the given key %d in the cache\n", pid); + // Get cursor from the cache + cursor = cache_get(pid).cursor; + upt =cache_get(pid).upt; + } + + if (cache.at(pid).unwinded) { + // logInfo(2, "DEBUGIZA: unwiding is already done for pid %d\n", pid); + for (std::string proc_name : cache.at(pid).proc_names) { + // logInfo(2, "DEBUGIZA: read unwinded symbol=%s\n", + // std::string(proc_name)); + this->symbols.push_back(proc_name); + } + + } else { + logInfo(2, "DEBUGIZA: process unwinwining for pid %d\n", pid); + int dbg_proc_name_counter = 0; + do { + dbg_proc_name_counter++; + unw_word_t offset; + char sym[256]; + char *realname; + int status; + + // TODO: This function is very heavy. We should try to do some caching here, maybe in the + // underlying UPT function. + res = unw_get_proc_name(&cursor, sym, sizeof(sym), &offset); + if (res == 0) { + logInfo(2, "DEBUGIZA offset = %lx name = %s\n", (long)offset, sym); + realname = abi::__cxa_demangle(sym, NULL, NULL, &status); + if (status == 0) { + this->symbols.push_back(std::string(realname)); + cache[pid].proc_names.push_back(std::string(realname)); + } else { + this->symbols.push_back(std::string(sym)); + cache[pid].proc_names.push_back(std::string(sym)); + } + free(realname); + } else { + unw_word_t ip; + unw_get_reg(&cursor, UNW_REG_IP, &ip); + unw_word_t sp; + unw_get_reg(&cursor, UNW_REG_SP, &sp); + logInfo(2, + "IP=0x%lx -- error: unable to obtain symbol name for this frame - %s " + "(frame SP=0x%lx)\n", + ip, unw_strerror(res), sp); + this->symbols.push_back(std::string("(missing)")); + cache[pid].proc_names.push_back(std::string("(missing)")); + this->error_occurred = true; + break; + } + + // Unwind only until we get to the function from which the current Python function is executed. + // On Python3 the main loop function is called "_PyEval_EvalFrameDefault", and on Python2 it's + // "PyEval_EvalFrameEx". + if (memcmp(sym, "_PyEval_EvalFrameDefault", + sizeof("_PyEval_EvalFrameDefault")) == 0 || + memcmp(sym, "PyEval_EvalFrameEx", sizeof("PyEval_EvalFrameEx")) == 0) + { + break; + } + } while (unw_step(&cursor) > 0); + + logInfo(2, "DEBUGIZA: setting unwind=true for pid %d\n", pid); + + cache[pid].unwinded = true; + logInfo(2, "DEBUGIZA: for pid=%d stored %d proc_names\n", pid, + dbg_proc_name_counter); + } + +} + +void NativeStackTrace::Prune_dead_pid(uint32_t dead_pid) { + cache_delete_key(dead_pid); +} + +void NativeStackTrace::cleanup(void *upt, unw_addr_space_t as) { if (upt) { _UPT_destroy(upt); } @@ -200,5 +294,101 @@ bool NativeStackTrace::error_occured() const { return error_occurred; } +bool NativeStackTrace::is_cached(const uint32_t &key) { + try { + cache.at(key); + return true; + } + catch (const std::out_of_range&) { + logInfo(3, "is_cached: no entry for pid %d\n", key); + } + return false; +} + +UnwindCacheEntry NativeStackTrace::cache_get(const uint32_t &key) { + UnwindCacheEntry &entry = cache.at(key); + return entry; +} + +// cache_put adds a new entry to the unwind cache if its capacity allows +void NativeStackTrace::cache_put(const uint32_t &key, const unw_cursor_t cursor, + const unw_addr_space_t as, void *upt) { + // Check available capacity + // For debug purpose, skip the limit + if (!dbg_Unlimited_Cache) { + if (cache_size() > NativeStackTrace::CacheMaxSizeMB * 1024 * 1024 - + cache_single_entry_size()) { + logInfo( + 2, + "Skipping caching entry for pid %d due to the current cache usage " + "equals to %.2f MB is close to the limit (%d MB)\n", + key, cache_size_KB() / 1024, NativeStackTrace::CacheMaxSizeMB); + return; + } + } + + UnwindCacheEntry entry = {cursor, as, upt, now}; + cache[key] = entry; + logInfo(3, "New entry for pid %d was added to the cache\n", key); +} + +// cache_delete_key removes the element from the cache and destroys unwind address space and UPT +// to ensure that all memory and other resources are freed up +bool NativeStackTrace::cache_delete_key(const uint32_t &key) { + UnwindCacheEntry e; + try { + e = cache_get(key); + } + catch (const std::out_of_range&) { + logInfo(3, "cache_delete_key: no entry for pid %d\n", key); + return false; + } + + cache.erase(key); + cleanup(e.upt, e.as); + logInfo(3, "The entry for pid %d was deleted from the cache\n", key); + return true; +} + +// cache_single_entry_size returns the number of bytes taken by single entry +uint32_t NativeStackTrace::cache_single_entry_size() { + return sizeof(decltype(cache)::key_type) + sizeof(decltype(cache)::mapped_type); +} + +// cache_size returns the number of bytes currently in use by the cache +uint32_t NativeStackTrace::cache_size() { + return sizeof(cache) + cache.size()*cache_single_entry_size(); +} + +// cache_size_KB returns the number of kilobytes currently in use by the cache +float NativeStackTrace::cache_size_KB() { return cache_size() / 1024; } + +// cache_eviction removes elements older than 5 minutes (CacheMaxTTL=300) +void NativeStackTrace::cache_eviction() { + std::vector keys_to_delete; + float _prev_cache_size = cache_size_KB(); + + for (std::map::iterator iter = cache.begin(); + iter != cache.end(); ++iter) { + uint32_t k = iter->first; + const UnwindCacheEntry & e = iter->second; + if (std::abs(difftime(e.timestamp, NativeStackTrace::now)) > NativeStackTrace::CacheMaxTTL) { + // exceeding TTL + keys_to_delete.push_back(k); + } + } + + // Delete expired entries + for( size_t i = 0; i < keys_to_delete.size(); i++ ) { + cache_delete_key(keys_to_delete[i]); + } + + if (keys_to_delete.size() > 0) { + float _cache_size = cache_size_KB(); + logInfo(3,"Evicted %d item(s) from the cache\n", keys_to_delete.size()); + logInfo(3,"The cache usage after eviction action is %.2f KB (released %.2f KB)\n", _cache_size, _prev_cache_size - _cache_size); + } +} + } // namespace pyperf } // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfNativeStackTrace.h b/examples/cpp/pyperf/PyPerfNativeStackTrace.h index 3d06e8b7797e..172376d5393e 100644 --- a/examples/cpp/pyperf/PyPerfNativeStackTrace.h +++ b/examples/cpp/pyperf/PyPerfNativeStackTrace.h @@ -8,32 +8,64 @@ #include #include +#include namespace ebpf { namespace pyperf { +typedef struct { + unw_cursor_t cursor; + unw_addr_space_t as; + void *upt; + time_t timestamp; + std::vector proc_names; + bool unwinded = false; +} UnwindCacheEntry; + +typedef std::map UnwindCache; + class NativeStackTrace { public: explicit NativeStackTrace(uint32_t pid, const uint8_t *raw_stack, size_t stack_len, uintptr_t ip, uintptr_t sp); + static void Prune_dead_pid(uint32_t dead_pid); std::vector get_stack_symbol() const; bool error_occured() const; - private: std::vector symbols; bool error_occurred; + static UnwindCache cache; static const uint8_t *stack; static size_t stack_len; static uintptr_t ip; static uintptr_t sp; + static time_t now; + static int dbg_counter; + static float dbg_maxSize; + + static const uint16_t CacheMaxSizeMB; + static const uint16_t CacheMaxTTL; + + static uint32_t cache_size(); + static uint32_t cache_single_entry_size(); + static float cache_size_KB(); static int access_reg(unw_addr_space_t as, unw_regnum_t regnum, unw_word_t *valp, int write, void *arg); static int access_mem(unw_addr_space_t as, unw_word_t addr, unw_word_t *valp, int write, void *arg); + + static void cleanup(void *upt, unw_addr_space_t as); + + bool is_cached(const uint32_t &key); + void cache_put(const uint32_t &key, const unw_cursor_t cursor, + const unw_addr_space_t as, void *upt); + static UnwindCacheEntry cache_get(const uint32_t &key); + static bool cache_delete_key(const uint32_t &key); + static void cache_eviction(); }; } // namespace pyperf diff --git a/examples/cpp/pyperf/PyPerfProfiler.cc b/examples/cpp/pyperf/PyPerfProfiler.cc index 9db85915ee3e..ce405ccfa130 100644 --- a/examples/cpp/pyperf/PyPerfProfiler.cc +++ b/examples/cpp/pyperf/PyPerfProfiler.cc @@ -27,8 +27,9 @@ #include #include "PyPerfLoggingHelper.h" -#include "PyPerfVersion.h" +#include "PyPerfNativeStackTrace.h" #include "PyPerfProc.h" +#include "PyPerfVersion.h" #include "bcc_elf.h" #include "bcc_proc.h" #include "bcc_syms.h" @@ -239,15 +240,19 @@ bool PyPerfProfiler::populatePidTable() { for (const auto pid : pid_config_keys) { auto pos = std::find(pids.begin(), pids.end(), pid); if (pos == pids.end()) { + // Remove dead pid from config map and native stack trace's cache pid_config_map.remove_value(pid); + NativeStackTrace::Prune_dead_pid(pid); } else { result = true; + // To avoid re-population pids.erase(pos); } } logInfo(3, "Populating pid table\n"); + // Populate only those pids not seen before for (const auto pid : pids) { PidData pidData;