Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h')
-rw-r--r-- | contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h | 175 |
1 file changed, 175 insertions, 0 deletions
diff --git a/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
new file mode 100644
index 000000000000..f55068e98dd4
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -0,0 +1,175 @@
+/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
+#define CTX_PROFILE_CTXINSTRPROFILING_H_
+
+#include "CtxInstrContextNode.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+#include <sanitizer/common_interface_defs.h>
+
+using namespace llvm::ctx_profile;
+
+// Forward-declare for the one unittest checking Arena construction zeroes out
+// its allocatable space.
+class ArenaTest_ZeroInit_Test;
+namespace __ctx_profile {
+
+static constexpr size_t ExpectedAlignment = 8;
+// We really depend on this, see further below. We currently support x86_64.
+// When we want to support other archs, we need to trace the places Alignment
+// is used and adjust accordingly.
+static_assert(sizeof(void *) == ExpectedAlignment);
+
+/// Arena (bump allocator) forming a linked list. Intentionally not thread
+/// safe. Allocation and de-allocation happen using sanitizer APIs. We make
+/// that explicit.
+class Arena final {
+public:
+  // When allocating a new Arena, optionally specify an existing one to append
+  // to, assumed to be the last in the Arena list. We only need to support
+  // appending to the arena list.
+  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
+  static void freeArenaList(Arena *&A);
+
+  uint64_t size() const { return Size; }
+
+  // Allocate S bytes or return nullptr if we don't have that many available.
+  char *tryBumpAllocate(size_t S) {
+    if (Pos + S > Size)
+      return nullptr;
+    Pos += S;
+    return start() + (Pos - S);
+  }
+
+  Arena *next() const { return Next; }
+
+  // The beginning of allocatable memory.
+  const char *start() const { return const_cast<Arena *>(this)->start(); }
+  const char *pos() const { return start() + Pos; }
+
+private:
+  friend class ::ArenaTest_ZeroInit_Test;
+  explicit Arena(uint32_t Size);
+  ~Arena() = delete;
+
+  char *start() { return reinterpret_cast<char *>(&this[1]); }
+
+  Arena *Next = nullptr;
+  uint64_t Pos = 0;
+  const uint64_t Size;
+};
+
+// The memory available for allocation follows the Arena header, and we expect
+// it to be thus aligned.
+static_assert(alignof(Arena) == ExpectedAlignment);
+
+// Verify maintenance to ContextNode doesn't change this invariant, which
+// makes sure the inlined vectors are appropriately aligned.
+static_assert(alignof(ContextNode) == ExpectedAlignment);
+
+/// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
+/// with allocating and zero-initializing the global value (as in, GlobalValue)
+/// for it.
+struct ContextRoot {
+  ContextNode *FirstNode = nullptr;
+  Arena *FirstMemBlock = nullptr;
+  Arena *CurrentMem = nullptr;
+  // This is init-ed by the static zero initializer in LLVM.
+  // Taken is used to ensure only one thread traverses the contextual graph -
+  // either to read it or to write it.
+  // On the server side, the same entrypoint will be entered by numerous
+  // threads, but over time, the profile aggregated by collecting sequentially
+  // on one thread at a time is expected to converge to the aggregate profile
+  // that may have been observable on all the threads.
+  // Note that this is node-by-node aggregation, i.e. summing counters of
+  // nodes at the same position in the graph, not flattening.
+  // Threads that cannot lock Taken (fail TryLock) are given a "scratch
+  // context" - a buffer they can clobber, safely from a memory access
+  // perspective.
+  //
+  // Note about "scratch"-ness: we currently ignore the data written in them
+  // (which is anyway clobbered). The design allows for that not to be the
+  // case - because "scratch"-ness is first and foremost about not trying to
+  // build subcontexts, and is captured by tainting the pointer value (pointer
+  // to the memory treated as context), but right now, we drop that info.
+  //
+  // We could consider relaxing the requirement of only one thread entering
+  // by holding a few context trees per entrypoint and then aggregating them
+  // (as explained above) at the end of the profile collection - it's a
+  // tradeoff between collection time and memory use: higher precision can be
+  // obtained with either fewer concurrent collections but more collection
+  // time, or with more concurrent collections (== more memory) and less
+  // collection time. Note that concurrent collection does happen for
+  // different entrypoints, regardless.
+  ::__sanitizer::StaticSpinMutex Taken;
+
+  // If (unlikely) StaticSpinMutex internals change, we need to modify the
+  // LLVM instrumentation lowering side because it is responsible for
+  // allocating and zero-initializing ContextRoots.
+  static_assert(sizeof(Taken) == 1);
+};
+
+/// This API is exposed for testing. See the APIs below about the contract
+/// with LLVM.
+inline bool isScratch(const void *Ctx) {
+  return (reinterpret_cast<uint64_t>(Ctx) & 1);
+}
+
+} // namespace __ctx_profile
+
+extern "C" {
+
+// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
+// Position 0 is used when the current context isn't scratch, 1 when it is.
+// They are volatile because of signal handlers - we mean to specifically
+// control when the data is loaded.
+//
+/// TLS where LLVM stores the pointer of the called value, as part of lowering
+/// a llvm.instrprof.callsite
+extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+/// TLS where LLVM stores the pointer inside a caller's subcontexts vector
+/// that corresponds to the callsite being lowered.
+extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];
+
+// __llvm_ctx_profile_current_context_root is exposed for unit testing,
+// otherwise it's only used internally by compiler-rt/ctx_profile.
+extern __thread __ctx_profile::ContextRoot
+    *volatile __llvm_ctx_profile_current_context_root;
+
+/// Called by LLVM in the entry BB of an "entry point" function. The returned
+/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
+ContextNode *__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
+                                              GUID Guid, uint32_t Counters,
+                                              uint32_t Callsites);
+
+/// Paired with __llvm_ctx_profile_start_context, and called at the exit of
+/// the entry point function.
+void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
+
+/// Called for any function other than entry points, in the entry BB of such
+/// a function.
+/// Same consideration about the LSB of the returned value as for
+/// __llvm_ctx_profile_start_context.
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites);
+
+/// Prepares for collection. Currently this resets counter values but
+/// preserves internal context tree structure.
+void __llvm_ctx_profile_start_collection();
+
+/// Completely free allocated memory.
+void __llvm_ctx_profile_free();
+
+/// Used to obtain the profile. The Writer is called for each root
+/// ContextNode, with the ContextRoot::Taken taken. The Writer is responsible
+/// for traversing the structure underneath.
+/// The Writer's first parameter plays the role of a closure for Writer, and
+/// is what the caller of __llvm_ctx_profile_fetch passes as the Data
+/// parameter. The second parameter is the root of a context tree.
+bool __llvm_ctx_profile_fetch(void *Data,
+                              bool (*Writer)(void *, const ContextNode &));
+}
+#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
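
For readers new to the runtime, here is a minimal sketch (not part of the commit) of how the Arena bump allocator above is meant to be used. The 4096-byte arena size, the Payload type and the allocatePayload helper are illustrative assumptions; only allocateNewArena, tryBumpAllocate and freeArenaList come from the header.

    #include "CtxInstrProfiling.h"
    #include <new>

    using namespace __ctx_profile;

    struct Payload { // hypothetical 8-byte-aligned payload
      uint64_t Counters[4];
    };

    Payload *allocatePayload(Arena *&Current) {
      // Try the current arena first.
      char *Mem = Current->tryBumpAllocate(sizeof(Payload));
      if (!Mem) {
        // Arena exhausted: chain a fresh one after it and retry. Its
        // allocatable space is zero-initialized (see ArenaTest_ZeroInit_Test).
        Current = Arena::allocateNewArena(4096, Current);
        Mem = Current->tryBumpAllocate(sizeof(Payload));
      }
      // Placement-new into the bump-allocated slot. Arenas are never freed
      // individually; the whole list goes away via Arena::freeArenaList.
      return new (Mem) Payload();
    }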
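The "tainted pointer" convention described in the ContextRoot comments can be made concrete with a short sketch. isScratch is the real helper from the header; the untag function is an assumption added for illustration, mirroring how a consumer would strip the LSB before dereferencing.

    #include "CtxInstrProfiling.h"

    using namespace llvm::ctx_profile;

    // Hypothetical helper: clear the taint bit so the pointer is usable.
    static ContextNode *untag(ContextNode *Ctx) {
      return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) &
                                             ~uint64_t(1));
    }

    void update(ContextNode *Ctx) {
      if (__ctx_profile::isScratch(Ctx)) {
        // Scratch context: memory-safe to write to, but the data is dropped
        // and no subcontexts are built under it.
        Ctx = untag(Ctx);
      }
      // ... bump counters through Ctx ...
    }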
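Finally, a sketch of the Writer contract for __llvm_ctx_profile_fetch: Data acts as a closure and is handed back as the Writer's first argument, and the Writer is expected to walk the tree under each root. The FILE*-based "encoder" below is a placeholder assumption, and it assumes the guid() accessor that ContextNode declares in CtxInstrContextNode.h.

    #include "CtxInstrProfiling.h"
    #include <cstdio>

    using namespace llvm::ctx_profile;

    static bool writeRoot(void *Data, const ContextNode &Root) {
      FILE *Out = static_cast<FILE *>(Data); // the closure we passed in
      // A real Writer would serialize Root and recurse into its subcontexts;
      // here we only record the root's GUID as a stand-in.
      fprintf(Out, "root guid: %llu\n", (unsigned long long)Root.guid());
      return true; // returning false aborts the fetch
    }

    bool dumpProfile(FILE *Out) {
      // The runtime invokes writeRoot once per root, holding that root's
      // Taken mutex for the duration of the call.
      return __llvm_ctx_profile_fetch(Out, writeRoot);
    }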