diff options
Diffstat (limited to 'lib/xray')
49 files changed, 4877 insertions, 1206 deletions
diff --git a/lib/xray/CMakeLists.txt b/lib/xray/CMakeLists.txt index 5547600b943a..8e18f55658f8 100644 --- a/lib/xray/CMakeLists.txt +++ b/lib/xray/CMakeLists.txt @@ -1,16 +1,29 @@ -# Build for the XRay runtime support library. +# Build for all components of the XRay runtime support library. # XRay runtime library implementation files. set(XRAY_SOURCES - xray_inmemory_log.cc - xray_init.cc - xray_flags.cc - xray_interface.cc - xray_buffer_queue.cc - xray_log_interface.cc - xray_fdr_logging.cc - xray_utils.cc) + xray_init.cc + xray_flags.cc + xray_interface.cc + xray_log_interface.cc + xray_utils.cc) +# Implementation files for all XRay modes. +set(XRAY_FDR_MODE_SOURCES + xray_fdr_flags.cc + xray_buffer_queue.cc + xray_fdr_logging.cc) + +set(XRAY_BASIC_MODE_SOURCES + xray_basic_flags.cc + xray_basic_logging.cc) + +set(XRAY_PROFILING_MODE_SOURCES + xray_profile_collector.cc + xray_profiling.cc + xray_profiling_flags.cc) + +# Implementation files for all XRay architectures. set(x86_64_SOURCES xray_x86_64.cc xray_trampoline_x86_64.S) @@ -23,8 +36,8 @@ set(armhf_SOURCES ${arm_SOURCES}) set(aarch64_SOURCES - xray_AArch64.cc - xray_trampoline_AArch64.S) + xray_AArch64.cc + xray_trampoline_AArch64.S) set(mips_SOURCES xray_mips.cc @@ -47,11 +60,68 @@ set(powerpc64le_SOURCES xray_trampoline_powerpc64.cc xray_trampoline_powerpc64_asm.S) +set(XRAY_IMPL_HEADERS + xray_allocator.h + xray_basic_flags.h + xray_basic_flags.inc + xray_basic_logging.h + xray_buffer_queue.h + xray_defs.h + xray_fdr_flags.h + xray_fdr_flags.inc + xray_fdr_log_records.h + xray_fdr_logging.h + xray_flags.h + xray_flags.inc + xray_function_call_trie.h + xray_interface_internal.h + xray_powerpc64.inc + xray_profile_collector.h + xray_profiling_flags.h + xray_profiling_flags.inc + xray_recursion_guard.h + xray_segmented_array.h + xray_tsc.h + xray_utils.h + xray_x86_64.inc) + +# Create list of all source files for +# consumption by tests. +set(XRAY_ALL_SOURCE_FILES + ${XRAY_SOURCES} + ${XRAY_FDR_MODE_SOURCES} + ${XRAY_BASIC_MODE_SOURCES} + ${XRAY_PROFILING_MODE_SOURCES} + ${x86_64_SOURCES} + ${arm_SOURCES} + ${armhf_SOURCES} + ${mips_SOURCES} + ${mipsel_SOURCES} + ${mips64_SOURCES} + ${mips64el_SOURCES} + ${powerpc64le_SOURCES} + ${XRAY_IMPL_HEADERS} +) +list(REMOVE_DUPLICATES XRAY_ALL_SOURCE_FILES) +# Make list that uses absolute paths +set(XRAY_ALL_SOURCE_FILES_ABS_PATHS "") +foreach (src_file ${XRAY_ALL_SOURCE_FILES}) + list(APPEND + XRAY_ALL_SOURCE_FILES_ABS_PATHS + "${CMAKE_CURRENT_SOURCE_DIR}/${src_file}") +endforeach() + + +# Now put it all together... include_directories(..) include_directories(../../include) set(XRAY_CFLAGS ${SANITIZER_COMMON_CFLAGS}) set(XRAY_COMMON_DEFINITIONS XRAY_HAS_EXCEPTIONS=1) + +# We don't need RTTI in XRay, so turn that off. +append_rtti_flag(OFF XRAY_CFLAGS) + append_list_if( COMPILER_RT_HAS_XRAY_COMPILER_FLAG XRAY_SUPPORTED=1 XRAY_COMMON_DEFINITIONS) append_list_if( @@ -60,10 +130,13 @@ append_list_if( add_compiler_rt_component(xray) set(XRAY_COMMON_RUNTIME_OBJECT_LIBS - RTXray RTSanitizerCommon RTSanitizerCommonLibc) +if (TARGET cxx-headers OR HAVE_LIBCXX) + set(XRAY_DEPS cxx-headers) +endif() + if (APPLE) set(XRAY_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS}) add_asm_sources(XRAY_ASM_SOURCES xray_trampoline_x86_64.S) @@ -75,8 +148,34 @@ if (APPLE) OS ${XRAY_SUPPORTED_OS} ARCHS ${XRAY_SUPPORTED_ARCH} SOURCES ${x86_64_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayFDR + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + SOURCES ${XRAY_FDR_MODE_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS}) + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayBASIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + SOURCES ${XRAY_BASIC_MODE_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayPROFILING + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + SOURCES ${XRAY_PROFILING_MODE_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) # We only support running on osx for now. add_compiler_rt_runtime(clang_rt.xray @@ -91,24 +190,104 @@ if (APPLE) LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) -else() -foreach(arch ${XRAY_SUPPORTED_ARCH}) - if(CAN_TARGET_${arch}) + add_compiler_rt_runtime(clang_rt.xray-fdr + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayFDR + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-basic + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayBASIC + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayPROFILING + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) +else() # not Apple + foreach(arch ${XRAY_SUPPORTED_ARCH}) + if(NOT CAN_TARGET_${arch}) + continue() + endif() add_compiler_rt_object_libraries(RTXray ARCHS ${arch} - SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS}) + SOURCES ${XRAY_SOURCES} ${${arch}_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayFDR + ARCHS ${arch} + SOURCES ${XRAY_FDR_MODE_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayBASIC + ARCHS ${arch} + SOURCES ${XRAY_BASIC_MODE_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayPROFILING + ARCHS ${arch} + SOURCES ${XRAY_PROFILING_MODE_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + DEPS ${XRAY_DEPS}) + + # Common XRay archive for instrumented binaries. add_compiler_rt_runtime(clang_rt.xray STATIC ARCHS ${arch} - SOURCES ${${arch}_SOURCES} CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS ${XRAY_COMMON_RUNTIME_OBJECT_LIBS} + OBJECT_LIBS ${XRAY_COMMON_RUNTIME_OBJECT_LIBS} RTXray PARENT_TARGET xray) - endif() -endforeach() -endif() + # FDR mode runtime archive (addon for clang_rt.xray) + add_compiler_rt_runtime(clang_rt.xray-fdr + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayFDR + PARENT_TARGET xray) + # Basic mode runtime archive (addon for clang_rt.xray) + add_compiler_rt_runtime(clang_rt.xray-basic + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayBASIC + PARENT_TARGET xray) + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) + endforeach() +endif() # not Apple if(COMPILER_RT_INCLUDE_TESTS) add_subdirectory(tests) diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt index e54e63f27890..11f373167d24 100644 --- a/lib/xray/tests/CMakeLists.txt +++ b/lib/xray/tests/CMakeLists.txt @@ -3,6 +3,18 @@ include_directories(..) add_custom_target(XRayUnitTests) set_target_properties(XRayUnitTests PROPERTIES FOLDER "XRay unittests") +# Sanity check XRAY_ALL_SOURCE_FILES_ABS_PATHS +list(LENGTH XRAY_ALL_SOURCE_FILES_ABS_PATHS XASFAP_LENGTH) +if (${XASFAP_LENGTH} EQUAL 0) + message(FATAL_ERROR "XRAY_ALL_SOURCE_FILES_ABS_PATHS cannot be empty") +endif() +unset(XASFAP_LENGTH) +foreach (src_file ${XRAY_ALL_SOURCE_FILES_ABS_PATHS}) + if (NOT EXISTS "${src_file}") + message(FATAL_ERROR "Source file \"${src_file}\" does not exist") + endif() +endforeach() + set(XRAY_UNITTEST_CFLAGS ${XRAY_CFLAGS} ${COMPILER_RT_UNITTEST_CFLAGS} @@ -11,27 +23,77 @@ set(XRAY_UNITTEST_CFLAGS -I${COMPILER_RT_SOURCE_DIR}/lib/xray -I${COMPILER_RT_SOURCE_DIR}/lib) +function(add_xray_lib library) + add_library(${library} STATIC ${ARGN}) + set_target_properties(${library} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + FOLDER "Compiler-RT Runtime tests") +endfunction() + +function(get_xray_lib_for_arch arch lib) + if(APPLE) + set(tgt_name "RTXRay.test.osx") + else() + set(tgt_name "RTXRay.test.${arch}") + endif() + set(${lib} "${tgt_name}" PARENT_SCOPE) +endfunction() + set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH}) +set(XRAY_UNITTEST_LINK_FLAGS + ${CMAKE_THREAD_LIBS_INIT} + -l${SANITIZER_CXX_ABI_LIBRARY} + -fxray-instrument + ) +if (NOT APPLE) + append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS) + append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS) + append_list_if(COMPILER_RT_HAS_LIBDL -ldl XRAY_UNITTEST_LINK_FLAGS) + append_list_if(COMPILER_RT_HAS_LIBPTHREAD -pthread XRAY_UNITTEST_LINK_FLAGS) +endif() + macro(add_xray_unittest testname) cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN}) if(UNIX AND NOT APPLE) + set(CMAKE_DL_LIBS_INIT "") foreach(arch ${XRAY_TEST_ARCH}) set(TEST_OBJECTS) + get_xray_lib_for_arch(${arch} XRAY_RUNTIME_LIBS) generate_compiler_rt_tests(TEST_OBJECTS XRayUnitTests "${testname}-${arch}-Test" "${arch}" SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE} + # Note that any change in the implementations will cause all the unit + # tests to be re-built. This is by design, but may be cumbersome during + # the build/test cycle. + COMPILE_DEPS ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE} + ${XRAY_HEADERS} ${XRAY_ALL_SOURCE_FILES_ABS_PATHS} + RUNTIME "${XRAY_RUNTIME_LIBS}" DEPS gtest xray llvm-xray CFLAGS ${XRAY_UNITTEST_CFLAGS} - LINK_FLAGS -fxray-instrument - ${TARGET_LINK_FLAGS} - -lstdc++ -lm ${CMAKE_THREAD_LIBS_INIT} - -lpthread - -ldl -lrt) - set_target_properties(XRayUnitTests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS}) + set_target_properties(XRayUnitTests + PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endforeach() endif() endmacro() if(COMPILER_RT_CAN_EXECUTE_TESTS) + if (APPLE) + add_xray_lib("RTXRay.test.osx" + $<TARGET_OBJECTS:RTXray.osx> + $<TARGET_OBJECTS:RTXrayFDR.osx> + $<TARGET_OBJECTS:RTXrayPROFILING.osx> + $<TARGET_OBJECTS:RTSanitizerCommon.osx> + $<TARGET_OBJECTS:RTSanitizerCommonLibc.osx>) + else() + foreach(arch ${XRAY_SUPPORTED_ARCH}) + add_xray_lib("RTXRay.test.${arch}" + $<TARGET_OBJECTS:RTXray.${arch}> + $<TARGET_OBJECTS:RTXrayFDR.${arch}> + $<TARGET_OBJECTS:RTXrayPROFILING.${arch}> + $<TARGET_OBJECTS:RTSanitizerCommon.${arch}> + $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>) + endforeach() + endif() add_subdirectory(unit) endif() diff --git a/lib/xray/tests/unit/CMakeLists.txt b/lib/xray/tests/unit/CMakeLists.txt index 62d01f239581..b42eb50d0790 100644 --- a/lib/xray/tests/unit/CMakeLists.txt +++ b/lib/xray/tests/unit/CMakeLists.txt @@ -2,3 +2,11 @@ add_xray_unittest(XRayBufferQueueTest SOURCES buffer_queue_test.cc xray_unit_test_main.cc) add_xray_unittest(XRayFDRLoggingTest SOURCES fdr_logging_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayAllocatorTest SOURCES + allocator_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRaySegmentedArrayTest SOURCES + segmented_array_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayFunctionCallTrieTest SOURCES + function_call_trie_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayProfileCollectorTest SOURCES + profile_collector_test.cc xray_unit_test_main.cc) diff --git a/lib/xray/tests/unit/allocator_test.cc b/lib/xray/tests/unit/allocator_test.cc new file mode 100644 index 000000000000..be404160e417 --- /dev/null +++ b/lib/xray/tests/unit/allocator_test.cc @@ -0,0 +1,42 @@ +//===-- allocator_test.cc -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// + +#include "xray_allocator.h" +#include "gtest/gtest.h" + +namespace __xray { +namespace { + +struct TestData { + s64 First; + s64 Second; +}; + +TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11); } + +TEST(AllocatorTest, Allocate) { + Allocator<sizeof(TestData)> A(2 << 11); + auto B = A.Allocate(); + ASSERT_NE(B.Data, nullptr); +} + +TEST(AllocatorTest, OverAllocate) { + Allocator<sizeof(TestData)> A(sizeof(TestData)); + auto B1 = A.Allocate(); + (void)B1; + auto B2 = A.Allocate(); + ASSERT_EQ(B2.Data, nullptr); +} + +} // namespace +} // namespace __xray diff --git a/lib/xray/tests/unit/buffer_queue_test.cc b/lib/xray/tests/unit/buffer_queue_test.cc index 1ec7469ce187..c0d4ccb268d6 100644 --- a/lib/xray/tests/unit/buffer_queue_test.cc +++ b/lib/xray/tests/unit/buffer_queue_test.cc @@ -32,9 +32,9 @@ TEST(BufferQueueTest, GetAndRelease) { ASSERT_TRUE(Success); BufferQueue::Buffer Buf; ASSERT_EQ(Buffers.getBuffer(Buf), BufferQueue::ErrorCode::Ok); - ASSERT_NE(nullptr, Buf.Buffer); + ASSERT_NE(nullptr, Buf.Data); ASSERT_EQ(Buffers.releaseBuffer(Buf), BufferQueue::ErrorCode::Ok); - ASSERT_EQ(nullptr, Buf.Buffer); + ASSERT_EQ(nullptr, Buf.Data); } TEST(BufferQueueTest, GetUntilFailed) { @@ -53,7 +53,7 @@ TEST(BufferQueueTest, ReleaseUnknown) { BufferQueue Buffers(kSize, 1, Success); ASSERT_TRUE(Success); BufferQueue::Buffer Buf; - Buf.Buffer = reinterpret_cast<void *>(0xdeadbeef); + Buf.Data = reinterpret_cast<void *>(0xdeadbeef); Buf.Size = kSize; EXPECT_EQ(BufferQueue::ErrorCode::UnrecognizedBuffer, Buffers.releaseBuffer(Buf)); @@ -65,7 +65,7 @@ TEST(BufferQueueTest, ErrorsWhenFinalising) { ASSERT_TRUE(Success); BufferQueue::Buffer Buf; ASSERT_EQ(Buffers.getBuffer(Buf), BufferQueue::ErrorCode::Ok); - ASSERT_NE(nullptr, Buf.Buffer); + ASSERT_NE(nullptr, Buf.Data); ASSERT_EQ(Buffers.finalize(), BufferQueue::ErrorCode::Ok); BufferQueue::Buffer OtherBuf; ASSERT_EQ(BufferQueue::ErrorCode::QueueFinalizing, diff --git a/lib/xray/tests/unit/fdr_logging_test.cc b/lib/xray/tests/unit/fdr_logging_test.cc index 76738ea4dff3..b6961efbc351 100644 --- a/lib/xray/tests/unit/fdr_logging_test.cc +++ b/lib/xray/tests/unit/fdr_logging_test.cc @@ -10,6 +10,7 @@ // This file is a part of XRay, a function call tracing system. // //===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" #include "xray_fdr_logging.h" #include "gtest/gtest.h" @@ -86,7 +87,7 @@ TEST(FDRLoggingTest, Simple) { XRayFileHeader H; memcpy(&H, Contents, sizeof(XRayFileHeader)); - ASSERT_EQ(H.Version, 2); + ASSERT_EQ(H.Version, 3); ASSERT_EQ(H.Type, FileTypes::FDR_LOG); // We require one buffer at least to have the "extents" metadata record, @@ -131,7 +132,7 @@ TEST(FDRLoggingTest, Multiple) { XRayFileHeader H; memcpy(&H, Contents, sizeof(XRayFileHeader)); - ASSERT_EQ(H.Version, 2); + ASSERT_EQ(H.Version, 3); ASSERT_EQ(H.Type, FileTypes::FDR_LOG); MetadataRecord MDR0, MDR1; @@ -154,12 +155,12 @@ TEST(FDRLoggingTest, MultiThreadedCycling) { // Now we want to create one thread, do some logging, then create another one, // in succession and making sure that we're able to get thread records from // the latest thread (effectively being able to recycle buffers). - std::array<pid_t, 2> Threads; + std::array<tid_t, 2> Threads; for (uint64_t I = 0; I < 2; ++I) { std::thread t{[I, &Threads] { fdrLoggingHandleArg0(I + 1, XRayEntryType::ENTRY); fdrLoggingHandleArg0(I + 1, XRayEntryType::EXIT); - Threads[I] = syscall(SYS_gettid); + Threads[I] = GetTid(); }}; t.join(); } @@ -182,7 +183,7 @@ TEST(FDRLoggingTest, MultiThreadedCycling) { XRayFileHeader H; memcpy(&H, Contents, sizeof(XRayFileHeader)); - ASSERT_EQ(H.Version, 2); + ASSERT_EQ(H.Version, 3); ASSERT_EQ(H.Type, FileTypes::FDR_LOG); MetadataRecord MDR0, MDR1; @@ -192,9 +193,9 @@ TEST(FDRLoggingTest, MultiThreadedCycling) { ASSERT_EQ(MDR0.RecordKind, uint8_t(MetadataRecord::RecordKinds::BufferExtents)); ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer)); - pid_t Latest = 0; - memcpy(&Latest, MDR1.Data, sizeof(pid_t)); - ASSERT_EQ(Latest, Threads[1]); + int32_t Latest = 0; + memcpy(&Latest, MDR1.Data, sizeof(int32_t)); + ASSERT_EQ(Latest, static_cast<int32_t>(Threads[1])); } } // namespace diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc new file mode 100644 index 000000000000..049ecfb07e01 --- /dev/null +++ b/lib/xray/tests/unit/function_call_trie_test.cc @@ -0,0 +1,286 @@ +//===-- function_call_trie_test.cc ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#include "gtest/gtest.h" + +#include "xray_function_call_trie.h" + +namespace __xray { + +namespace { + +TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) { + profilingFlags()->setDefaults(); + FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); +} + +TEST(FunctionCallTrieTest, EnterAndExitFunction) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 1); + Trie.exitFunction(1, 2); + + // We need a way to pull the data out. At this point, until we get a data + // collection service implemented, we're going to export the data as a list of + // roots, and manually walk through the structure ourselves. + + const auto &R = Trie.getRoots(); + + ASSERT_EQ(R.size(), 1u); + ASSERT_EQ(R.front()->FId, 1); + ASSERT_EQ(R.front()->CallCount, 1); + ASSERT_EQ(R.front()->CumulativeLocalTime, 1u); +} + +TEST(FunctionCallTrieTest, MissingFunctionEntry) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.exitFunction(1, 1); + const auto &R = Trie.getRoots(); + + ASSERT_TRUE(R.empty()); +} + +TEST(FunctionCallTrieTest, NoMatchingEntersForExit) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.enterFunction(2, 1); + Trie.enterFunction(3, 3); + Trie.exitFunction(1, 5); + const auto &R = Trie.getRoots(); + + ASSERT_FALSE(R.empty()); + EXPECT_EQ(R.size(), size_t{1}); +} + +TEST(FunctionCallTrieTest, MissingFunctionExit) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.enterFunction(1, 1); + const auto &R = Trie.getRoots(); + + ASSERT_FALSE(R.empty()); + EXPECT_EQ(R.size(), size_t{1}); +} + +TEST(FunctionCallTrieTest, MultipleRoots) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + // Enter and exit FId = 1. + Trie.enterFunction(1, 1); + Trie.exitFunction(1, 2); + + // Enter and exit FId = 2. + Trie.enterFunction(2, 3); + Trie.exitFunction(2, 4); + + const auto &R = Trie.getRoots(); + ASSERT_FALSE(R.empty()); + ASSERT_EQ(R.size(), 2u); + + // Make sure the roots have different IDs. + const auto R0 = R[0]; + const auto R1 = R[1]; + ASSERT_NE(R0->FId, R1->FId); + + // Inspect the roots that they have the right data. + ASSERT_NE(R0, nullptr); + EXPECT_EQ(R0->CallCount, 1u); + EXPECT_EQ(R0->CumulativeLocalTime, 1u); + + ASSERT_NE(R1, nullptr); + EXPECT_EQ(R1->CallCount, 1u); + EXPECT_EQ(R1->CumulativeLocalTime, 1u); +} + +// While missing an intermediary entry may be rare in practice, we still enforce +// that we can handle the case where we've missed the entry event somehow, in +// between call entry/exits. To illustrate, imagine the following shadow call +// stack: +// +// f0@t0 -> f1@t1 -> f2@t2 +// +// If for whatever reason we see an exit for `f2` @ t3, followed by an exit for +// `f0` @ t4 (i.e. no `f1` exit in between) then we need to handle the case of +// accounting local time to `f2` from d = (t3 - t2), then local time to `f1` +// as d' = (t3 - t1) - d, and then local time to `f0` as d'' = (t3 - t0) - d'. +TEST(FunctionCallTrieTest, MissingIntermediaryExit) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 0); + Trie.enterFunction(2, 100); + Trie.enterFunction(3, 200); + Trie.exitFunction(3, 300); + Trie.exitFunction(1, 400); + + // What we should see at this point is all the functions in the trie in a + // specific order (1 -> 2 -> 3) with the appropriate count(s) and local + // latencies. + const auto &R = Trie.getRoots(); + ASSERT_FALSE(R.empty()); + ASSERT_EQ(R.size(), 1u); + + const auto &F1 = *R[0]; + ASSERT_EQ(F1.FId, 1); + ASSERT_FALSE(F1.Callees.empty()); + + const auto &F2 = *F1.Callees[0].NodePtr; + ASSERT_EQ(F2.FId, 2); + ASSERT_FALSE(F2.Callees.empty()); + + const auto &F3 = *F2.Callees[0].NodePtr; + ASSERT_EQ(F3.FId, 3); + ASSERT_TRUE(F3.Callees.empty()); + + // Now that we've established the preconditions, we check for specific aspects + // of the nodes. + EXPECT_EQ(F3.CallCount, 1); + EXPECT_EQ(F2.CallCount, 1); + EXPECT_EQ(F1.CallCount, 1); + EXPECT_EQ(F3.CumulativeLocalTime, 100); + EXPECT_EQ(F2.CumulativeLocalTime, 300); + EXPECT_EQ(F1.CumulativeLocalTime, 100); +} + +TEST(FunctionCallTrieTest, DeepCallStack) { + // Simulate a relatively deep call stack (32 levels) and ensure that we can + // properly pop all the way up the stack. + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + for (int i = 0; i < 32; ++i) + Trie.enterFunction(i + 1, i); + Trie.exitFunction(1, 33); + + // Here, validate that we have a 32-level deep function call path from the + // root (1) down to the leaf (33). + const auto &R = Trie.getRoots(); + ASSERT_EQ(R.size(), 1u); + auto F = R[0]; + for (int i = 0; i < 32; ++i) { + EXPECT_EQ(F->FId, i + 1); + EXPECT_EQ(F->CallCount, 1); + if (F->Callees.empty() && i != 31) + FAIL() << "Empty callees for FId " << F->FId; + if (i != 31) + F = F->Callees[0].NodePtr; + } +} + +// TODO: Test that we can handle cross-CPU migrations, where TSCs are not +// guaranteed to be synchronised. +TEST(FunctionCallTrieTest, DeepCopy) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 0); + Trie.enterFunction(2, 1); + Trie.exitFunction(2, 2); + Trie.enterFunction(3, 3); + Trie.exitFunction(3, 4); + Trie.exitFunction(1, 5); + + // We want to make a deep copy and compare notes. + auto B = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Copy(B); + Trie.deepCopyInto(Copy); + + ASSERT_NE(Trie.getRoots().size(), 0u); + ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size()); + const auto &R0Orig = *Trie.getRoots()[0]; + const auto &R0Copy = *Copy.getRoots()[0]; + EXPECT_EQ(R0Orig.FId, 1); + EXPECT_EQ(R0Orig.FId, R0Copy.FId); + + ASSERT_EQ(R0Orig.Callees.size(), 2u); + ASSERT_EQ(R0Copy.Callees.size(), 2u); + + const auto &F1Orig = + *R0Orig.Callees + .find_element( + [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; }) + ->NodePtr; + const auto &F1Copy = + *R0Copy.Callees + .find_element( + [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; }) + ->NodePtr; + EXPECT_EQ(&R0Orig, F1Orig.Parent); + EXPECT_EQ(&R0Copy, F1Copy.Parent); +} + +TEST(FunctionCallTrieTest, MergeInto) { + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie T0(A); + FunctionCallTrie T1(A); + + // 1 -> 2 -> 3 + T0.enterFunction(1, 0); + T0.enterFunction(2, 1); + T0.enterFunction(3, 2); + T0.exitFunction(3, 3); + T0.exitFunction(2, 4); + T0.exitFunction(1, 5); + + // 1 -> 2 -> 3 + T1.enterFunction(1, 0); + T1.enterFunction(2, 1); + T1.enterFunction(3, 2); + T1.exitFunction(3, 3); + T1.exitFunction(2, 4); + T1.exitFunction(1, 5); + + // We use a different allocator here to make sure that we're able to transfer + // data into a FunctionCallTrie which uses a different allocator. This + // reflects the inteded usage scenario for when we're collecting profiles that + // aggregate across threads. + auto B = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Merged(B); + + T0.mergeInto(Merged); + T1.mergeInto(Merged); + + ASSERT_EQ(Merged.getRoots().size(), 1u); + const auto &R0 = *Merged.getRoots()[0]; + EXPECT_EQ(R0.FId, 1); + EXPECT_EQ(R0.CallCount, 2); + EXPECT_EQ(R0.CumulativeLocalTime, 10); + EXPECT_EQ(R0.Callees.size(), 1u); + + const auto &F1 = *R0.Callees[0].NodePtr; + EXPECT_EQ(F1.FId, 2); + EXPECT_EQ(F1.CallCount, 2); + EXPECT_EQ(F1.CumulativeLocalTime, 6); + EXPECT_EQ(F1.Callees.size(), 1u); + + const auto &F2 = *F1.Callees[0].NodePtr; + EXPECT_EQ(F2.FId, 3); + EXPECT_EQ(F2.CallCount, 2); + EXPECT_EQ(F2.CumulativeLocalTime, 2); + EXPECT_EQ(F2.Callees.size(), 0u); +} + +} // namespace + +} // namespace __xray diff --git a/lib/xray/tests/unit/profile_collector_test.cc b/lib/xray/tests/unit/profile_collector_test.cc new file mode 100644 index 000000000000..b7dbe567312a --- /dev/null +++ b/lib/xray/tests/unit/profile_collector_test.cc @@ -0,0 +1,179 @@ +//===-- profile_collector_test.cc -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#include "gtest/gtest.h" + +#include "xray_profile_collector.h" +#include "xray_profiling_flags.h" +#include <cstdint> +#include <thread> +#include <utility> +#include <vector> + +namespace __xray { +namespace { + +static constexpr auto kHeaderSize = 16u; + +void ValidateBlock(XRayBuffer B) { + profilingFlags()->setDefaults(); + ASSERT_NE(static_cast<const void *>(B.Data), nullptr); + ASSERT_NE(B.Size, 0u); + ASSERT_GE(B.Size, kHeaderSize); + // We look at the block size, the block number, and the thread ID to ensure + // that none of them are zero (or that the header data is laid out as we + // expect). + char LocalBuffer[kHeaderSize] = {}; + internal_memcpy(LocalBuffer, B.Data, kHeaderSize); + u32 BlockSize = 0; + u32 BlockNumber = 0; + u64 ThreadId = 0; + internal_memcpy(&BlockSize, LocalBuffer, sizeof(u32)); + internal_memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32)); + internal_memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64)); + ASSERT_NE(BlockSize, 0u); + ASSERT_GE(BlockNumber, 0u); + ASSERT_NE(ThreadId, 0u); +} + +std::tuple<u32, u32, u64> ParseBlockHeader(XRayBuffer B) { + char LocalBuffer[kHeaderSize] = {}; + internal_memcpy(LocalBuffer, B.Data, kHeaderSize); + u32 BlockSize = 0; + u32 BlockNumber = 0; + u64 ThreadId = 0; + internal_memcpy(&BlockSize, LocalBuffer, sizeof(u32)); + internal_memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32)); + internal_memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64)); + return std::make_tuple(BlockSize, BlockNumber, ThreadId); +} + +struct Profile { + int64_t CallCount; + int64_t CumulativeLocalTime; + std::vector<int32_t> Path; +}; + +std::tuple<Profile, const char *> ParseProfile(const char *P) { + Profile Result; + // Read the path first, until we find a sentinel 0. + int32_t F; + do { + internal_memcpy(&F, P, sizeof(int32_t)); + P += sizeof(int32_t); + Result.Path.push_back(F); + } while (F != 0); + + // Then read the CallCount. + internal_memcpy(&Result.CallCount, P, sizeof(int64_t)); + P += sizeof(int64_t); + + // Then read the CumulativeLocalTime. + internal_memcpy(&Result.CumulativeLocalTime, P, sizeof(int64_t)); + P += sizeof(int64_t); + return std::make_tuple(std::move(Result), P); +} + +TEST(profileCollectorServiceTest, PostSerializeCollect) { + profilingFlags()->setDefaults(); + // The most basic use-case (the one we actually only care about) is the one + // where we ensure that we can post FunctionCallTrie instances, which are then + // destroyed but serialized properly. + // + // First, we initialise a set of allocators in the local scope. This ensures + // that we're able to copy the contents of the FunctionCallTrie that uses + // the local allocators. + auto Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie T(Allocators); + + // Then, we populate the trie with some data. + T.enterFunction(1, 1); + T.enterFunction(2, 2); + T.exitFunction(2, 3); + T.exitFunction(1, 4); + + // Then we post the data to the global profile collector service. + profileCollectorService::post(T, 1); + + // Then we serialize the data. + profileCollectorService::serialize(); + + // Then we go through a single buffer to see whether we're getting the data we + // expect. + auto B = profileCollectorService::nextBuffer({nullptr, 0}); + ValidateBlock(B); + u32 BlockSize; + u32 BlockNum; + u64 ThreadId; + std::tie(BlockSize, BlockNum, ThreadId) = ParseBlockHeader(B); + + // We look at the serialized buffer to see whether the Trie we're expecting + // to see is there. + auto DStart = static_cast<const char *>(B.Data) + kHeaderSize; + std::vector<char> D(DStart, DStart + BlockSize); + B = profileCollectorService::nextBuffer(B); + ASSERT_EQ(B.Data, nullptr); + ASSERT_EQ(B.Size, 0u); + + Profile Profile1, Profile2; + auto P = static_cast<const char *>(D.data()); + std::tie(Profile1, P) = ParseProfile(P); + std::tie(Profile2, P) = ParseProfile(P); + + ASSERT_NE(Profile1.Path.size(), Profile2.Path.size()); + auto &P1 = Profile1.Path.size() < Profile2.Path.size() ? Profile2 : Profile1; + auto &P2 = Profile1.Path.size() < Profile2.Path.size() ? Profile1 : Profile2; + std::vector<int32_t> P1Expected = {2, 1, 0}; + std::vector<int32_t> P2Expected = {1, 0}; + ASSERT_EQ(P1.Path.size(), P1Expected.size()); + ASSERT_EQ(P2.Path.size(), P2Expected.size()); + ASSERT_EQ(P1.Path, P1Expected); + ASSERT_EQ(P2.Path, P2Expected); +} + +// We break out a function that will be run in multiple threads, one that will +// use a thread local allocator, and will post the FunctionCallTrie to the +// profileCollectorService. This simulates what the threads being profiled would +// be doing anyway, but through the XRay logging implementation. +void threadProcessing() { + thread_local auto Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie T(Allocators); + + T.enterFunction(1, 1); + T.enterFunction(2, 2); + T.exitFunction(2, 3); + T.exitFunction(1, 4); + + profileCollectorService::post(T, GetTid()); +} + +TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) { + profilingFlags()->setDefaults(); + std::thread t1(threadProcessing); + std::thread t2(threadProcessing); + + t1.join(); + t2.join(); + + // At this point, t1 and t2 are already done with what they were doing. + profileCollectorService::serialize(); + + // Ensure that we see two buffers. + auto B = profileCollectorService::nextBuffer({nullptr, 0}); + ValidateBlock(B); + + B = profileCollectorService::nextBuffer(B); + ValidateBlock(B); +} + +} // namespace +} // namespace __xray diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc new file mode 100644 index 000000000000..035674ccfaf5 --- /dev/null +++ b/lib/xray/tests/unit/segmented_array_test.cc @@ -0,0 +1,200 @@ +#include "xray_segmented_array.h" +#include "gtest/gtest.h" + +namespace __xray { +namespace { + +struct TestData { + s64 First; + s64 Second; + + // Need a constructor for emplace operations. + TestData(s64 F, s64 S) : First(F), Second(S) {} +}; + +TEST(SegmentedArrayTest, ConstructWithAllocators) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> Data(A); + (void)Data; +} + +TEST(SegmentedArrayTest, ConstructAndPopulate) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> data(A); + ASSERT_NE(data.Append(TestData{0, 0}), nullptr); + ASSERT_NE(data.Append(TestData{1, 1}), nullptr); + ASSERT_EQ(data.size(), 2u); +} + +TEST(SegmentedArrayTest, ConstructPopulateAndLookup) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> data(A); + ASSERT_NE(data.Append(TestData{0, 1}), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_EQ(data[0].First, 0); + ASSERT_EQ(data[0].Second, 1); +} + +TEST(SegmentedArrayTest, PopulateWithMoreElements) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 24); + Array<TestData> data(A); + static const auto kMaxElements = 100u; + for (auto I = 0u; I < kMaxElements; ++I) { + ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr); + } + ASSERT_EQ(data.size(), kMaxElements); + for (auto I = 0u; I < kMaxElements; ++I) { + ASSERT_EQ(data[I].First, I); + ASSERT_EQ(data[I].Second, I + 1); + } +} + +TEST(SegmentedArrayTest, AppendEmplace) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> data(A); + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data[0].First, 1); + ASSERT_EQ(data[0].Second, 1); +} + +TEST(SegmentedArrayTest, AppendAndTrim) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> data(A); + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + data.trim(1); + ASSERT_EQ(data.size(), 0u); + ASSERT_TRUE(data.empty()); +} + +TEST(SegmentedArrayTest, IteratorAdvance) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> data(A); + ASSERT_TRUE(data.empty()); + ASSERT_EQ(data.begin(), data.end()); + auto I0 = data.begin(); + ASSERT_EQ(I0++, data.begin()); + ASSERT_NE(I0, data.begin()); + for (const auto &D : data) { + (void)D; + FAIL(); + } + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_NE(data.begin(), data.end()); + auto &D0 = *data.begin(); + ASSERT_EQ(D0.First, 1); + ASSERT_EQ(D0.Second, 1); +} + +TEST(SegmentedArrayTest, IteratorRetreat) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 4); + Array<TestData> data(A); + ASSERT_TRUE(data.empty()); + ASSERT_EQ(data.begin(), data.end()); + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_NE(data.begin(), data.end()); + auto &D0 = *data.begin(); + ASSERT_EQ(D0.First, 1); + ASSERT_EQ(D0.Second, 1); + + auto I0 = data.end(); + ASSERT_EQ(I0--, data.end()); + ASSERT_NE(I0, data.end()); + ASSERT_EQ(I0, data.begin()); + ASSERT_EQ(I0->First, 1); + ASSERT_EQ(I0->Second, 1); +} + +TEST(SegmentedArrayTest, IteratorTrimBehaviour) { + using AllocatorType = typename Array<TestData>::AllocatorType; + AllocatorType A(1 << 20); + Array<TestData> Data(A); + ASSERT_TRUE(Data.empty()); + auto I0Begin = Data.begin(), I0End = Data.end(); + // Add enough elements in Data to have more than one chunk. + constexpr auto Segment = Array<TestData>::SegmentSize; + constexpr auto SegmentX2 = Segment * 2; + for (auto i = SegmentX2; i > 0u; --i) { + Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i)); + } + ASSERT_EQ(Data.size(), SegmentX2); + { + auto &Back = Data.back(); + ASSERT_EQ(Back.First, 1); + ASSERT_EQ(Back.Second, 1); + } + + // Trim one chunk's elements worth. + Data.trim(Segment); + ASSERT_EQ(Data.size(), Segment); + + // Check that we are still able to access 'back' properly. + { + auto &Back = Data.back(); + ASSERT_EQ(Back.First, static_cast<s64>(Segment + 1)); + ASSERT_EQ(Back.Second, static_cast<s64>(Segment + 1)); + } + + // Then trim until it's empty. + Data.trim(Segment); + ASSERT_TRUE(Data.empty()); + + // Here our iterators should be the same. + auto I1Begin = Data.begin(), I1End = Data.end(); + EXPECT_EQ(I0Begin, I1Begin); + EXPECT_EQ(I0End, I1End); + + // Then we ensure that adding elements back works just fine. + for (auto i = SegmentX2; i > 0u; --i) { + Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i)); + } + EXPECT_EQ(Data.size(), SegmentX2); +} + +struct ShadowStackEntry { + uint64_t EntryTSC = 0; + uint64_t *NodePtr = nullptr; + ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {} +}; + +TEST(SegmentedArrayTest, SimulateStackBehaviour) { + using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType; + AllocatorType A(1 << 10); + Array<ShadowStackEntry> Data(A); + static uint64_t Dummy = 0; + constexpr uint64_t Max = 9; + + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data.Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data.back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. + auto Counter = Max; + ASSERT_EQ(Data.size(), size_t(Max)); + while (!Data.empty()) { + const auto &Top = Data.back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data.trim(1); + --Counter; + ASSERT_EQ(Data.size(), size_t(Counter)); + } +} + +} // namespace +} // namespace __xray diff --git a/lib/xray/xray_AArch64.cc b/lib/xray/xray_AArch64.cc index f26e77dd7fc1..096de009e83c 100644 --- a/lib/xray/xray_AArch64.cc +++ b/lib/xray/xray_AArch64.cc @@ -112,6 +112,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, return false; } +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in aarch64? + return false; +} + // FIXME: Maybe implement this better? bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h new file mode 100644 index 000000000000..8244815284a8 --- /dev/null +++ b/lib/xray/xray_allocator.h @@ -0,0 +1,129 @@ +//===-- xray_allocator.h ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the allocator interface for an arena allocator, used primarily for +// the profiling runtime. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_ALLOCATOR_H +#define XRAY_ALLOCATOR_H + +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" +#include "sanitizer_common/sanitizer_mutex.h" +#include "sanitizer_common/sanitizer_posix.h" +#include "xray_utils.h" +#include <sys/mman.h> +#include <cstddef> +#include <cstdint> + +#ifndef MAP_NORESERVE +// no-op on NetBSD (at least), unsupported flag on FreeBSD basically because unneeded +#define MAP_NORESERVE 0 +#endif + +namespace __xray { + +/// The Allocator type hands out fixed-sized chunks of memory that are +/// cache-line aligned and sized. This is useful for placement of +/// performance-sensitive data in memory that's frequently accessed. The +/// allocator also self-limits the peak memory usage to a dynamically defined +/// maximum. +/// +/// N is the lower-bound size of the block of memory to return from the +/// allocation function. N is used to compute the size of a block, which is +/// cache-line-size multiples worth of memory. We compute the size of a block by +/// determining how many cache lines worth of memory is required to subsume N. +/// +/// The Allocator instance will manage its own memory acquired through mmap. +/// This severely constrains the platforms on which this can be used to POSIX +/// systems where mmap semantics are well-defined. +/// +/// FIXME: Isolate the lower-level memory management to a different abstraction +/// that can be platform-specific. +template <size_t N> struct Allocator { + // The Allocator returns memory as Block instances. + struct Block { + /// Compute the minimum cache-line size multiple that is >= N. + static constexpr auto Size = nearest_boundary(N, kCacheLineSize); + void *Data; + }; + +private: + const size_t MaxMemory{0}; + void *BackingStore = nullptr; + void *AlignedNextBlock = nullptr; + size_t AllocatedBlocks = 0; + SpinMutex Mutex{}; + + void *Alloc() { + SpinMutexLock Lock(&Mutex); + if (UNLIKELY(BackingStore == nullptr)) { + BackingStore = reinterpret_cast<void *>( + internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0)); + if (BackingStore == MAP_FAILED) { + BackingStore = nullptr; + if (Verbosity()) + Report("XRay Profiling: Failed to allocate memory for allocator.\n"); + return nullptr; + } + + AlignedNextBlock = BackingStore; + + // Ensure that NextBlock is aligned appropriately. + auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore); + auto AlignedNextBlockNum = nearest_boundary( + reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize); + if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) { + munmap(BackingStore, MaxMemory); + AlignedNextBlock = BackingStore = nullptr; + if (Verbosity()) + Report("XRay Profiling: Cannot obtain enough memory from " + "preallocated region.\n"); + return nullptr; + } + + AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum); + + // Assert that AlignedNextBlock is cache-line aligned. + DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize, + 0); + } + + if ((AllocatedBlocks * Block::Size) >= MaxMemory) + return nullptr; + + // Align the pointer we'd like to return to an appropriate alignment, then + // advance the pointer from where to start allocations. + void *Result = AlignedNextBlock; + AlignedNextBlock = reinterpret_cast<void *>( + reinterpret_cast<char *>(AlignedNextBlock) + N); + ++AllocatedBlocks; + return Result; + } + +public: + explicit Allocator(size_t M) + : MaxMemory(nearest_boundary(M, kCacheLineSize)) {} + + Block Allocate() { return {Alloc()}; } + + ~Allocator() NOEXCEPT { + if (BackingStore != nullptr) { + internal_munmap(BackingStore, MaxMemory); + } + } +}; + +} // namespace __xray + +#endif // XRAY_ALLOCATOR_H diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc index da4efcdd2b17..5b828287e3f6 100644 --- a/lib/xray/xray_arm.cc +++ b/lib/xray/xray_arm.cc @@ -149,6 +149,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, return false; } +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in arm? + return false; +} + // FIXME: Maybe implement this better? bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } diff --git a/lib/xray/xray_basic_flags.cc b/lib/xray/xray_basic_flags.cc new file mode 100644 index 000000000000..14d805c71a88 --- /dev/null +++ b/lib/xray/xray_basic_flags.cc @@ -0,0 +1,50 @@ +//===-- xray_basic_flags.cc -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay Basic flag parsing logic. +//===----------------------------------------------------------------------===// + +#include "xray_basic_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +using namespace __sanitizer; + +namespace __xray { + +/// Use via basicFlags(). +BasicFlags xray_basic_flags_dont_use_directly; + +void BasicFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_basic_flags.inc" +#undef XRAY_FLAG +} + +void registerXRayBasicFlags(FlagParser *P, + BasicFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_basic_flags.inc" +#undef XRAY_FLAG +} + +const char *useCompilerDefinedBasicFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_BASIC_OPTIONS + return SANITIZER_STRINGIFY(XRAY_BASIC_OPTIONS); +#else + return ""; +#endif +} + +} // namespace __xray diff --git a/lib/xray/xray_basic_flags.h b/lib/xray/xray_basic_flags.h new file mode 100644 index 000000000000..041578f0663c --- /dev/null +++ b/lib/xray/xray_basic_flags.h @@ -0,0 +1,38 @@ +//===-- xray_basic_flags.h -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instruementation system. +// +// XRay Basic Mode runtime flags. +//===----------------------------------------------------------------------===// + +#ifndef XRAY_BASIC_FLAGS_H +#define XRAY_BASIC_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct BasicFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_basic_flags.inc" +#undef XRAY_FLAG + + void setDefaults(); +}; + +extern BasicFlags xray_basic_flags_dont_use_directly; +extern void registerXRayBasicFlags(FlagParser *P, BasicFlags *F); +const char *useCompilerDefinedBasicFlags(); +inline BasicFlags *basicFlags() { return &xray_basic_flags_dont_use_directly; } + +} // namespace __xray + +#endif // XRAY_BASIC_FLAGS_H diff --git a/lib/xray/xray_basic_flags.inc b/lib/xray/xray_basic_flags.inc new file mode 100644 index 000000000000..327735b51055 --- /dev/null +++ b/lib/xray/xray_basic_flags.inc @@ -0,0 +1,24 @@ +//===-- xray_basic_flags.inc ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// XRay runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +XRAY_FLAG(int, func_duration_threshold_us, 5, + "Basic logging will try to skip functions that execute for fewer " + "microseconds than this threshold.") +XRAY_FLAG(int, max_stack_depth, 64, + "Basic logging will keep track of at most this deep a call stack, " + "any more and the recordings will be dropped.") +XRAY_FLAG(int, thread_buffer_size, 1024, + "The number of entries to keep on a per-thread buffer.") diff --git a/lib/xray/xray_inmemory_log.cc b/lib/xray/xray_basic_logging.cc index a27ffbcbd12e..585ca641cd0c 100644 --- a/lib/xray/xray_inmemory_log.cc +++ b/lib/xray/xray_basic_logging.cc @@ -1,4 +1,4 @@ -//===-- xray_inmemory_log.cc ------------------------------------*- C++ -*-===// +//===-- xray_basic_logging.cc -----------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,8 +15,6 @@ // //===----------------------------------------------------------------------===// -#include <cassert> -#include <cstring> #include <errno.h> #include <fcntl.h> #include <pthread.h> @@ -29,16 +27,18 @@ #include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_libc.h" #include "xray/xray_records.h" +#include "xray_recursion_guard.h" +#include "xray_basic_flags.h" +#include "xray_basic_logging.h" #include "xray_defs.h" #include "xray_flags.h" -#include "xray_inmemory_log.h" #include "xray_interface_internal.h" #include "xray_tsc.h" #include "xray_utils.h" namespace __xray { -__sanitizer::SpinMutex LogMutex; +SpinMutex LogMutex; // We use elements of this type to record the entry TSC of every function ID we // see as we're tracing a particular thread's execution. @@ -60,43 +60,41 @@ struct alignas(64) ThreadLocalData { size_t StackSize = 0; size_t StackEntries = 0; int Fd = -1; - pid_t TID = 0; }; static pthread_key_t PThreadKey; -static __sanitizer::atomic_uint8_t BasicInitialized{0}; +static atomic_uint8_t BasicInitialized{0}; BasicLoggingOptions GlobalOptions; -thread_local volatile bool RecursionGuard = false; +thread_local atomic_uint8_t Guard{0}; -static uint64_t thresholdTicks() XRAY_NEVER_INSTRUMENT { - static uint64_t TicksPerSec = probeRequiredCPUFeatures() - ? getTSCFrequency() - : __xray::NanosecondsPerSecond; - static const uint64_t ThresholdTicks = - TicksPerSec * GlobalOptions.DurationFilterMicros / 1000000; - return ThresholdTicks; -} +static atomic_uint8_t UseRealTSC{0}; +static atomic_uint64_t ThresholdTicks{0}; +static atomic_uint64_t TicksPerSec{0}; +static atomic_uint64_t CycleFrequency{NanosecondsPerSecond}; static int openLogFile() XRAY_NEVER_INSTRUMENT { int F = getLogFD(); if (F == -1) return -1; - // Test for required CPU features and cache the cycle frequency - static bool TSCSupported = probeRequiredCPUFeatures(); - static uint64_t CycleFrequency = - TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond; + static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT; + pthread_once(&DetectOnce, +[] { + if (atomic_load(&UseRealTSC, memory_order_acquire)) + atomic_store(&CycleFrequency, getTSCFrequency(), memory_order_release); + }); // Since we're here, we get to write the header. We set it up so that the // header will only be written once, at the start, and let the threads // logging do writes which just append. XRayFileHeader Header; - Header.Version = 2; // Version 2 includes tail exit records. + // Version 2 includes tail exit records. + // Version 3 includes pid inside records. + Header.Version = 3; Header.Type = FileTypes::NAIVE_LOG; - Header.CycleFrequency = CycleFrequency; + Header.CycleFrequency = atomic_load(&CycleFrequency, memory_order_acquire); // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' // before setting the values in the header. @@ -107,20 +105,21 @@ static int openLogFile() XRAY_NEVER_INSTRUMENT { return F; } -int getGlobalFd() XRAY_NEVER_INSTRUMENT { - static int Fd = openLogFile(); +static int getGlobalFd() XRAY_NEVER_INSTRUMENT { + static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; + static int Fd = 0; + pthread_once(&OnceInit, +[] { Fd = openLogFile(); }); return Fd; } -ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { +static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { thread_local ThreadLocalData TLD; thread_local bool UNUSED TOnce = [] { if (GlobalOptions.ThreadBufferSize == 0) { - if (__sanitizer::Verbosity()) + if (Verbosity()) Report("Not initializing TLD since ThreadBufferSize == 0.\n"); return false; } - TLD.TID = __sanitizer::GetTid(); pthread_setspecific(PThreadKey, &TLD); TLD.Fd = getGlobalFd(); TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>( @@ -129,7 +128,7 @@ ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { TLD.BufferSize = GlobalOptions.ThreadBufferSize; TLD.BufferOffset = 0; if (GlobalOptions.MaxStackDepth == 0) { - if (__sanitizer::Verbosity()) + if (Verbosity()) Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n"); TLD.StackSize = 0; TLD.StackEntries = 0; @@ -141,13 +140,6 @@ ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { alignof(StackEntry))); TLD.StackSize = GlobalOptions.MaxStackDepth; TLD.StackEntries = 0; - if (__sanitizer::Verbosity() >= 2) { - static auto UNUSED Once = [] { - auto ticks = thresholdTicks(); - Report("Ticks threshold: %d\n", ticks); - return false; - }(); - } return false; }(); return TLD; @@ -157,7 +149,6 @@ template <class RDTSC> void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { auto &TLD = getThreadLocalData(); - auto &InMemoryBuffer = TLD.InMemoryBuffer; int Fd = getGlobalFd(); if (Fd == -1) return; @@ -165,10 +156,9 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, // Use a simple recursion guard, to handle cases where we're already logging // and for one reason or another, this function gets called again in the same // thread. - if (RecursionGuard) + RecursionGuard G(Guard); + if (!G) return; - RecursionGuard = true; - auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; }); uint8_t CPU = 0; uint64_t TSC = ReadTSC(CPU); @@ -189,7 +179,7 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, E.TSC = TSC; auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) + (sizeof(StackEntry) * (TLD.StackEntries - 1)); - __sanitizer::internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry)); + internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry)); break; } case XRayEntryType::EXIT: @@ -213,12 +203,12 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, StackEntry StackTop; auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) + (sizeof(StackEntry) * TLD.StackEntries); - __sanitizer::internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry)); + internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry)); if (StackTop.FuncId == FuncId && StackTop.CPU == CPU && StackTop.TSC < TSC) { auto Delta = TSC - StackTop.TSC; - if (Delta < thresholdTicks()) { - assert(TLD.BufferOffset > 0); + if (Delta < atomic_load(&ThresholdTicks, memory_order_relaxed)) { + DCHECK(TLD.BufferOffset > 0); TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2; return; } @@ -227,27 +217,26 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, } default: // Should be unreachable. - assert(false && "Unsupported XRayEntryType encountered."); + DCHECK(false && "Unsupported XRayEntryType encountered."); break; } // First determine whether the delta between the function's enter record and // the exit record is higher than the threshold. - __xray::XRayRecord R; + XRayRecord R; R.RecordType = RecordTypes::NORMAL; R.CPU = CPU; R.TSC = TSC; - R.TId = TLD.TID; + R.TId = GetTid(); + R.PId = internal_getpid(); R.Type = Type; R.FuncId = FuncId; - auto EntryPtr = static_cast<char *>(InMemoryBuffer) + - (sizeof(__xray::XRayRecord) * TLD.BufferOffset); - __sanitizer::internal_memcpy(EntryPtr, &R, sizeof(R)); + auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer); + internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); if (++TLD.BufferOffset == TLD.BufferSize) { - __sanitizer::SpinMutexLock L(&LogMutex); - auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer); - retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer), - reinterpret_cast<char *>(RecordBuffer + TLD.BufferOffset)); + SpinMutexLock L(&LogMutex); + retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -257,8 +246,8 @@ template <class RDTSC> void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { auto &TLD = getThreadLocalData(); - auto &InMemoryBuffer = TLD.InMemoryBuffer; - auto &Offset = TLD.BufferOffset; + auto FirstEntry = + reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer); const auto &BuffLen = TLD.BufferSize; int Fd = getGlobalFd(); if (Fd == -1) @@ -267,45 +256,41 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, // First we check whether there's enough space to write the data consecutively // in the thread-local buffer. If not, we first flush the buffer before // attempting to write the two records that must be consecutive. - if (Offset + 2 > BuffLen) { - __sanitizer::SpinMutexLock L(&LogMutex); - auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer); - retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer), - reinterpret_cast<char *>(RecordBuffer + Offset)); - Offset = 0; + if (TLD.BufferOffset + 2 > BuffLen) { + SpinMutexLock L(&LogMutex); + retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + TLD.BufferOffset = 0; TLD.StackEntries = 0; } // Then we write the "we have an argument" record. InMemoryRawLog(FuncId, Type, ReadTSC); - if (RecursionGuard) + RecursionGuard G(Guard); + if (!G) return; - RecursionGuard = true; - auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; }); - // And from here on write the arg payload. - __xray::XRayArgPayload R; + // And, from here on write the arg payload. + XRayArgPayload R; R.RecordType = RecordTypes::ARG_PAYLOAD; R.FuncId = FuncId; - R.TId = TLD.TID; + R.TId = GetTid(); + R.PId = internal_getpid(); R.Arg = Arg1; - auto EntryPtr = - &reinterpret_cast<__xray::XRayArgPayload *>(&InMemoryBuffer)[Offset]; - std::memcpy(EntryPtr, &R, sizeof(R)); - if (++Offset == BuffLen) { - __sanitizer::SpinMutexLock L(&LogMutex); - auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer); - retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer), - reinterpret_cast<char *>(RecordBuffer + Offset)); - Offset = 0; + internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); + if (++TLD.BufferOffset == BuffLen) { + SpinMutexLock L(&LogMutex); + retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + TLD.BufferOffset = 0; TLD.StackEntries = 0; } } void basicLoggingHandleArg0RealTSC(int32_t FuncId, XRayEntryType Type) XRAY_NEVER_INSTRUMENT { - InMemoryRawLog(FuncId, Type, __xray::readTSC); + InMemoryRawLog(FuncId, Type, readTSC); } void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type) @@ -318,13 +303,13 @@ void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type) TS = {0, 0}; } CPU = 0; - return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; + return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; }); } void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type, uint64_t Arg1) XRAY_NEVER_INSTRUMENT { - InMemoryRawLogWithArg(FuncId, Type, Arg1, __xray::readTSC); + InMemoryRawLogWithArg(FuncId, Type, Arg1, readTSC); } void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type, @@ -338,34 +323,34 @@ void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type, TS = {0, 0}; } CPU = 0; - return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; + return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; }); } static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT { ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P); - auto ExitGuard = __sanitizer::at_scope_exit([&TLD] { + auto ExitGuard = at_scope_exit([&TLD] { // Clean up dynamic resources. if (TLD.InMemoryBuffer) InternalFree(TLD.InMemoryBuffer); if (TLD.ShadowStack) InternalFree(TLD.ShadowStack); - if (__sanitizer::Verbosity()) - Report("Cleaned up log for TID: %d\n", TLD.TID); + if (Verbosity()) + Report("Cleaned up log for TID: %d\n", GetTid()); }); if (TLD.Fd == -1 || TLD.BufferOffset == 0) { - if (__sanitizer::Verbosity()) - Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", TLD.TID, + if (Verbosity()) + Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", GetTid(), TLD.Fd, TLD.BufferOffset); return; } { - __sanitizer::SpinMutexLock L(&LogMutex); + SpinMutexLock L(&LogMutex); retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer), reinterpret_cast<char *>(TLD.InMemoryBuffer) + - (sizeof(__xray::XRayRecord) * TLD.BufferOffset)); + (sizeof(XRayRecord) * TLD.BufferOffset)); } // Because this thread's exit could be the last one trying to write to @@ -378,45 +363,89 @@ static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT { XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { - static bool UNUSED Once = [] { - pthread_key_create(&PThreadKey, TLDDestructor); - return false; - }(); - uint8_t Expected = 0; - if (!__sanitizer::atomic_compare_exchange_strong( - &BasicInitialized, &Expected, 1, __sanitizer::memory_order_acq_rel)) { - if (__sanitizer::Verbosity()) + if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1, + memory_order_acq_rel)) { + if (Verbosity()) Report("Basic logging already initialized.\n"); return XRayLogInitStatus::XRAY_LOG_INITIALIZED; } - if (OptionsSize != sizeof(BasicLoggingOptions)) { + static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; + pthread_once(&OnceInit, +[] { + pthread_key_create(&PThreadKey, TLDDestructor); + atomic_store(&UseRealTSC, probeRequiredCPUFeatures(), memory_order_release); + // Initialize the global TicksPerSec value. + atomic_store(&TicksPerSec, + probeRequiredCPUFeatures() ? getTSCFrequency() + : NanosecondsPerSecond, + memory_order_release); + if (!atomic_load(&UseRealTSC, memory_order_relaxed) && Verbosity()) + Report("WARNING: Required CPU features missing for XRay instrumentation, " + "using emulation instead.\n"); + }); + + if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) { + FlagParser P; + BasicFlags F; + F.setDefaults(); + registerXRayBasicFlags(&P, &F); + P.ParseString(useCompilerDefinedBasicFlags()); + auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS"); + if (EnvOpts == nullptr) + EnvOpts = ""; + + P.ParseString(EnvOpts); + + // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options + // set through XRAY_OPTIONS instead. + if (internal_strlen(EnvOpts) == 0) { + F.func_duration_threshold_us = + flags()->xray_naive_log_func_duration_threshold_us; + F.max_stack_depth = flags()->xray_naive_log_max_stack_depth; + F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size; + } + + P.ParseString(static_cast<const char *>(Options)); + GlobalOptions.ThreadBufferSize = F.thread_buffer_size; + GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us; + GlobalOptions.MaxStackDepth = F.max_stack_depth; + *basicFlags() = F; + } else if (OptionsSize != sizeof(BasicLoggingOptions)) { Report("Invalid options size, potential ABI mismatch; expected %d got %d", sizeof(BasicLoggingOptions), OptionsSize); return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } else { + if (Verbosity()) + Report("XRay Basic: struct-based init is deprecated, please use " + "string-based configuration instead.\n"); + GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options); } - static auto UseRealTSC = probeRequiredCPUFeatures(); - if (!UseRealTSC && __sanitizer::Verbosity()) - Report("WARNING: Required CPU features missing for XRay instrumentation, " - "using emulation instead.\n"); - - GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options); - __xray_set_handler_arg1(UseRealTSC ? basicLoggingHandleArg1RealTSC - : basicLoggingHandleArg1EmulateTSC); - __xray_set_handler(UseRealTSC ? basicLoggingHandleArg0RealTSC - : basicLoggingHandleArg0EmulateTSC); + atomic_store(&ThresholdTicks, + atomic_load(&TicksPerSec, memory_order_acquire) * + GlobalOptions.DurationFilterMicros / 1000000, + memory_order_release); + __xray_set_handler_arg1(atomic_load(&UseRealTSC, memory_order_acquire) + ? basicLoggingHandleArg1RealTSC + : basicLoggingHandleArg1EmulateTSC); + __xray_set_handler(atomic_load(&UseRealTSC, memory_order_acquire) + ? basicLoggingHandleArg0RealTSC + : basicLoggingHandleArg0EmulateTSC); + + // TODO: Implement custom event and typed event handling support in Basic + // Mode. __xray_remove_customevent_handler(); + __xray_remove_typedevent_handler(); return XRayLogInitStatus::XRAY_LOG_INITIALIZED; } XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT { uint8_t Expected = 0; - if (!__sanitizer::atomic_compare_exchange_strong( - &BasicInitialized, &Expected, 0, __sanitizer::memory_order_acq_rel) && - __sanitizer::Verbosity()) + if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 0, + memory_order_acq_rel) && + Verbosity()) Report("Basic logging already finalized.\n"); // Nothing really to do aside from marking state of the global to be @@ -444,24 +473,41 @@ bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { }; auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl); if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && - __sanitizer::Verbosity()) + Verbosity()) Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n", RegistrationResult); if (flags()->xray_naive_log || - !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-basic")) { - __xray_set_log_impl(Impl); - BasicLoggingOptions Options; - Options.DurationFilterMicros = - flags()->xray_naive_log_func_duration_threshold_us; - Options.MaxStackDepth = flags()->xray_naive_log_max_stack_depth; - Options.ThreadBufferSize = flags()->xray_naive_log_thread_buffer_size; - __xray_log_init(flags()->xray_naive_log_thread_buffer_size, 0, &Options, - sizeof(BasicLoggingOptions)); - static auto UNUSED Once = [] { - static auto UNUSED &TLD = getThreadLocalData(); - __sanitizer::Atexit(+[] { TLDDestructor(&TLD); }); + !internal_strcmp(flags()->xray_mode, "xray-basic")) { + auto SelectResult = __xray_log_select_mode("xray-basic"); + if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) { + if (Verbosity()) + Report("Failed selecting XRay Basic Mode; error = %d\n", SelectResult); return false; - }(); + } + + // We initialize the implementation using the data we get from the + // XRAY_BASIC_OPTIONS environment variable, at this point of the + // implementation. + auto *Env = GetEnv("XRAY_BASIC_OPTIONS"); + auto InitResult = + __xray_log_init_mode("xray-basic", Env == nullptr ? "" : Env); + if (InitResult != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { + if (Verbosity()) + Report("Failed initializing XRay Basic Mode; error = %d\n", InitResult); + return false; + } + + // At this point we know that we've successfully initialized Basic mode + // tracing, and the only chance we're going to get for the current thread to + // clean-up may be at thread/program exit. To ensure that we're going to get + // the cleanup even without calling the finalization routines, we're + // registering a program exit function that will do the cleanup. + static pthread_once_t DynamicOnce = PTHREAD_ONCE_INIT; + pthread_once(&DynamicOnce, +[] { + static void *FakeTLD = nullptr; + FakeTLD = &getThreadLocalData(); + Atexit(+[] { TLDDestructor(FakeTLD); }); + }); } return true; } diff --git a/lib/xray/xray_inmemory_log.h b/lib/xray/xray_basic_logging.h index e4fcb8ca5ffd..1639b96d91a1 100644 --- a/lib/xray/xray_inmemory_log.h +++ b/lib/xray/xray_basic_logging.h @@ -1,5 +1,4 @@ -//===-- xray_inmemory_log.h -//------------------------------------------------===// +//===-- xray_basic_logging.h ----------------------------------------------===// // // The LLVM Compiler Infrastructure // diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc index a0018f6b0cba..8dfcc23540b1 100644 --- a/lib/xray/xray_buffer_queue.cc +++ b/lib/xray/xray_buffer_queue.cc @@ -16,14 +16,37 @@ #include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_libc.h" +#include <memory> using namespace __xray; using namespace __sanitizer; +template <class T> static T *initArray(size_t N) { + auto A = reinterpret_cast<T *>( + InternalAlloc(N * sizeof(T), nullptr, kCacheLineSize)); + if (A != nullptr) + while (N > 0) + new (A + (--N)) T(); + return A; +} + BufferQueue::BufferQueue(size_t B, size_t N, bool &Success) - : BufferSize(B), Buffers(new BufferRep[N]()), BufferCount(N), Finalizing{0}, - OwnedBuffers(new void *[N]()), Next(Buffers), First(Buffers), - LiveBuffers(0) { + : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)), + BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)), + Next(Buffers), First(Buffers), LiveBuffers(0) { + if (Buffers == nullptr) { + Success = false; + return; + } + if (OwnedBuffers == nullptr) { + // Clean up the buffers we've already allocated. + for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) + B->~BufferRep(); + InternalFree(Buffers); + Success = false; + return; + }; + for (size_t i = 0; i < N; ++i) { auto &T = Buffers[i]; void *Tmp = InternalAlloc(BufferSize, nullptr, 64); @@ -37,7 +60,7 @@ BufferQueue::BufferQueue(size_t B, size_t N, bool &Success) return; } auto &Buf = T.Buff; - Buf.Buffer = Tmp; + Buf.Data = Tmp; Buf.Size = B; Buf.Extents = reinterpret_cast<BufferExtents *>(Extents); OwnedBuffers[i] = Tmp; @@ -46,9 +69,9 @@ BufferQueue::BufferQueue(size_t B, size_t N, bool &Success) } BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) { - if (__sanitizer::atomic_load(&Finalizing, __sanitizer::memory_order_acquire)) + if (atomic_load(&Finalizing, memory_order_acquire)) return ErrorCode::QueueFinalizing; - __sanitizer::SpinMutexLock Guard(&Mutex); + SpinMutexLock Guard(&Mutex); if (LiveBuffers == BufferCount) return ErrorCode::NotEnoughMemory; @@ -68,7 +91,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { // Blitz through the buffers array to find the buffer. bool Found = false; for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) { - if (*I == Buf.Buffer) { + if (*I == Buf.Data) { Found = true; break; } @@ -76,7 +99,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { if (!Found) return ErrorCode::UnrecognizedBuffer; - __sanitizer::SpinMutexLock Guard(&Mutex); + SpinMutexLock Guard(&Mutex); // This points to a semantic bug, we really ought to not be releasing more // buffers than we actually get. @@ -86,7 +109,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { // Now that the buffer has been released, we mark it as "used". First->Buff = Buf; First->Used = true; - Buf.Buffer = nullptr; + Buf.Data = nullptr; Buf.Size = 0; --LiveBuffers; if (++First == (Buffers + BufferCount)) @@ -96,8 +119,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { } BufferQueue::ErrorCode BufferQueue::finalize() { - if (__sanitizer::atomic_exchange(&Finalizing, 1, - __sanitizer::memory_order_acq_rel)) + if (atomic_exchange(&Finalizing, 1, memory_order_acq_rel)) return ErrorCode::QueueFinalizing; return ErrorCode::Ok; } @@ -106,9 +128,11 @@ BufferQueue::~BufferQueue() { for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) { auto &T = *I; auto &Buf = T.Buff; - InternalFree(Buf.Buffer); + InternalFree(Buf.Data); InternalFree(Buf.Extents); } - delete[] Buffers; - delete[] OwnedBuffers; + for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) + B->~BufferRep(); + InternalFree(Buffers); + InternalFree(OwnedBuffers); } diff --git a/lib/xray/xray_buffer_queue.h b/lib/xray/xray_buffer_queue.h index 1ceb58274616..e76fa7983c90 100644 --- a/lib/xray/xray_buffer_queue.h +++ b/lib/xray/xray_buffer_queue.h @@ -15,9 +15,10 @@ #ifndef XRAY_BUFFER_QUEUE_H #define XRAY_BUFFER_QUEUE_H -#include <cstddef> #include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_mutex.h" +#include <cstddef> namespace __xray { @@ -27,18 +28,17 @@ namespace __xray { /// the "flight data recorder" (FDR) mode to support ongoing XRay function call /// trace collection. class BufferQueue { - public: +public: struct alignas(64) BufferExtents { - __sanitizer::atomic_uint64_t Size; + atomic_uint64_t Size; }; struct Buffer { - void *Buffer = nullptr; + void *Data = nullptr; size_t Size = 0; - BufferExtents* Extents; + BufferExtents *Extents; }; - private: struct BufferRep { // The managed buffer. Buffer Buff; @@ -48,14 +48,72 @@ class BufferQueue { bool Used = false; }; +private: + // This models a ForwardIterator. |T| Must be either a `Buffer` or `const + // Buffer`. Note that we only advance to the "used" buffers, when + // incrementing, so that at dereference we're always at a valid point. + template <class T> class Iterator { + public: + BufferRep *Buffers = nullptr; + size_t Offset = 0; + size_t Max = 0; + + Iterator &operator++() { + DCHECK_NE(Offset, Max); + do { + ++Offset; + } while (!Buffers[Offset].Used && Offset != Max); + return *this; + } + + Iterator operator++(int) { + Iterator C = *this; + ++(*this); + return C; + } + + T &operator*() const { return Buffers[Offset].Buff; } + + T *operator->() const { return &(Buffers[Offset].Buff); } + + Iterator(BufferRep *Root, size_t O, size_t M) + : Buffers(Root), Offset(O), Max(M) { + // We want to advance to the first Offset where the 'Used' property is + // true, or to the end of the list/queue. + while (!Buffers[Offset].Used && Offset != Max) { + ++Offset; + } + } + + Iterator() = default; + Iterator(const Iterator &) = default; + Iterator(Iterator &&) = default; + Iterator &operator=(const Iterator &) = default; + Iterator &operator=(Iterator &&) = default; + ~Iterator() = default; + + template <class V> + friend bool operator==(const Iterator &L, const Iterator<V> &R) { + DCHECK_EQ(L.Max, R.Max); + return L.Buffers == R.Buffers && L.Offset == R.Offset; + } + + template <class V> + friend bool operator!=(const Iterator &L, const Iterator<V> &R) { + return !(L == R); + } + }; + // Size of each individual Buffer. size_t BufferSize; BufferRep *Buffers; + + // Amount of pre-allocated buffers. size_t BufferCount; - __sanitizer::SpinMutex Mutex; - __sanitizer::atomic_uint8_t Finalizing; + SpinMutex Mutex; + atomic_uint8_t Finalizing; // Pointers to buffers managed/owned by the BufferQueue. void **OwnedBuffers; @@ -70,7 +128,7 @@ class BufferQueue { // Count of buffers that have been handed out through 'getBuffer'. size_t LiveBuffers; - public: +public: enum class ErrorCode : unsigned { Ok, NotEnoughMemory, @@ -81,16 +139,16 @@ class BufferQueue { static const char *getErrorString(ErrorCode E) { switch (E) { - case ErrorCode::Ok: - return "(none)"; - case ErrorCode::NotEnoughMemory: - return "no available buffers in the queue"; - case ErrorCode::QueueFinalizing: - return "queue already finalizing"; - case ErrorCode::UnrecognizedBuffer: - return "buffer being returned not owned by buffer queue"; - case ErrorCode::AlreadyFinalized: - return "queue already finalized"; + case ErrorCode::Ok: + return "(none)"; + case ErrorCode::NotEnoughMemory: + return "no available buffers in the queue"; + case ErrorCode::QueueFinalizing: + return "queue already finalizing"; + case ErrorCode::UnrecognizedBuffer: + return "buffer being returned not owned by buffer queue"; + case ErrorCode::AlreadyFinalized: + return "queue already finalized"; } return "unknown error"; } @@ -122,8 +180,7 @@ class BufferQueue { ErrorCode releaseBuffer(Buffer &Buf); bool finalizing() const { - return __sanitizer::atomic_load(&Finalizing, - __sanitizer::memory_order_acquire); + return atomic_load(&Finalizing, memory_order_acquire); } /// Returns the configured size of the buffers in the buffer queue. @@ -141,19 +198,29 @@ class BufferQueue { /// Applies the provided function F to each Buffer in the queue, only if the /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a /// releaseBuffer(...) operation). - template <class F> - void apply(F Fn) { - __sanitizer::SpinMutexLock G(&Mutex); - for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) { - const auto &T = *I; - if (T.Used) Fn(T.Buff); - } + template <class F> void apply(F Fn) { + SpinMutexLock G(&Mutex); + for (auto I = begin(), E = end(); I != E; ++I) + Fn(*I); + } + + using const_iterator = Iterator<const Buffer>; + using iterator = Iterator<Buffer>; + + /// Provides iterator access to the raw Buffer instances. + iterator begin() const { return iterator(Buffers, 0, BufferCount); } + const_iterator cbegin() const { + return const_iterator(Buffers, 0, BufferCount); + } + iterator end() const { return iterator(Buffers, BufferCount, BufferCount); } + const_iterator cend() const { + return const_iterator(Buffers, BufferCount, BufferCount); } // Cleans up allocated buffers. ~BufferQueue(); }; -} // namespace __xray +} // namespace __xray -#endif // XRAY_BUFFER_QUEUE_H +#endif // XRAY_BUFFER_QUEUE_H diff --git a/lib/xray/xray_fdr_flags.cc b/lib/xray/xray_fdr_flags.cc new file mode 100644 index 000000000000..a14851b1b616 --- /dev/null +++ b/lib/xray/xray_fdr_flags.cc @@ -0,0 +1,48 @@ +//===-- xray_fdr_flags.cc ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay FDR flag parsing logic. +//===----------------------------------------------------------------------===// + +#include "xray_fdr_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +using namespace __sanitizer; + +namespace __xray { + +FDRFlags xray_fdr_flags_dont_use_directly; // use via fdrFlags(). + +void FDRFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_fdr_flags.inc" +#undef XRAY_FLAG +} + +void registerXRayFDRFlags(FlagParser *P, FDRFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_fdr_flags.inc" +#undef XRAY_FLAG +} + +const char *useCompilerDefinedFDRFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_FDR_OPTIONS + return SANITIZER_STRINGIFY(XRAY_FDR_OPTIONS); +#else + return ""; +#endif +} + +} // namespace __xray diff --git a/lib/xray/xray_fdr_flags.h b/lib/xray/xray_fdr_flags.h new file mode 100644 index 000000000000..9c953f1cabcf --- /dev/null +++ b/lib/xray/xray_fdr_flags.h @@ -0,0 +1,38 @@ +//===-- xray_fdr_flags.h ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the flags for the flight-data-recorder mode implementation. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FDR_FLAGS_H +#define XRAY_FDR_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct FDRFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_fdr_flags.inc" +#undef XRAY_FLAG + + void setDefaults(); +}; + +extern FDRFlags xray_fdr_flags_dont_use_directly; +extern void registerXRayFDRFlags(FlagParser *P, FDRFlags *F); +const char *useCompilerDefinedFDRFlags(); +inline FDRFlags *fdrFlags() { return &xray_fdr_flags_dont_use_directly; } + +} // namespace __xray + +#endif // XRAY_FDR_FLAGS_H diff --git a/lib/xray/xray_fdr_flags.inc b/lib/xray/xray_fdr_flags.inc new file mode 100644 index 000000000000..d8721ad12cbe --- /dev/null +++ b/lib/xray/xray_fdr_flags.inc @@ -0,0 +1,29 @@ +//===-- xray_fdr_flags.inc --------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// XRay FDR Mode runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +// FDR (Flight Data Recorder) Mode logging options. +XRAY_FLAG(int, func_duration_threshold_us, 5, + "FDR logging will try to skip functions that execute for fewer " + "microseconds than this threshold.") +XRAY_FLAG(int, grace_period_ms, 100, + "FDR logging will wait this much time in milliseconds before " + "actually flushing the log; this gives a chance for threads to " + "notice that the log has been finalized and clean up.") +XRAY_FLAG(int, buffer_size, 16384, + "Size of buffers in the circular buffer queue.") +XRAY_FLAG(int, buffer_max, 100, "Maximum number of buffers in the queue.") +XRAY_FLAG(bool, no_file_flush, false, + "Set to true to not write log files by default.") diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h index 324208db82ca..87096d4fc29e 100644 --- a/lib/xray/xray_fdr_log_records.h +++ b/lib/xray/xray_fdr_log_records.h @@ -32,6 +32,8 @@ struct alignas(16) MetadataRecord { CustomEventMarker, CallArgument, BufferExtents, + TypedEventMarker, + Pid, }; // Use 7 bits to identify this record type. diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc index 1bfa10c21f5c..6cb2dfa0c658 100644 --- a/lib/xray/xray_fdr_logging.cc +++ b/lib/xray/xray_fdr_logging.cc @@ -15,64 +15,836 @@ // //===----------------------------------------------------------------------===// #include "xray_fdr_logging.h" +#include <cassert> #include <errno.h> +#include <limits> +#include <memory> +#include <pthread.h> #include <sys/syscall.h> #include <sys/time.h> #include <time.h> #include <unistd.h> +#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "xray/xray_interface.h" #include "xray/xray_records.h" #include "xray_buffer_queue.h" #include "xray_defs.h" -#include "xray_fdr_logging_impl.h" +#include "xray_fdr_flags.h" #include "xray_flags.h" +#include "xray_recursion_guard.h" #include "xray_tsc.h" #include "xray_utils.h" namespace __xray { +atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +// Group together thread-local-data in a struct, then hide it behind a function +// call so that it can be initialized on first use instead of as a global. We +// force the alignment to 64-bytes for x86 cache line alignment, as this +// structure is used in the hot path of implementation. +struct alignas(64) ThreadLocalData { + BufferQueue::Buffer Buffer; + char *RecordPtr = nullptr; + // The number of FunctionEntry records immediately preceding RecordPtr. + uint8_t NumConsecutiveFnEnters = 0; + + // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit + // records preceding RecordPtr. + uint8_t NumTailCalls = 0; + + // We use a thread_local variable to keep track of which CPUs we've already + // run, and the TSC times for these CPUs. This allows us to stop repeating the + // CPU field in the function records. + // + // We assume that we'll support only 65536 CPUs for x86_64. + uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); + uint64_t LastTSC = 0; + uint64_t LastFunctionEntryTSC = 0; + + // Make sure a thread that's ever called handleArg0 has a thread-local + // live reference to the buffer queue for this particular instance of + // FDRLogging, and that we're going to clean it up when the thread exits. + BufferQueue *BQ = nullptr; +}; + +static_assert(std::is_trivially_destructible<ThreadLocalData>::value, + "ThreadLocalData must be trivially destructible"); + +static constexpr auto MetadataRecSize = sizeof(MetadataRecord); +static constexpr auto FunctionRecSize = sizeof(FunctionRecord); + +// Use a global pthread key to identify thread-local data for logging. +static pthread_key_t Key; + // Global BufferQueue. -BufferQueue *BQ = nullptr; +static BufferQueue *BQ = nullptr; -__sanitizer::atomic_sint32_t LogFlushStatus = { +static atomic_sint32_t LogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; -FDRLoggingOptions FDROptions; +static FDRLoggingOptions FDROptions; + +static SpinMutex FDROptionsMutex; + +// This function will initialize the thread-local data structure used by the FDR +// logging implementation and return a reference to it. The implementation +// details require a bit of care to maintain. +// +// First, some requirements on the implementation in general: +// +// - XRay handlers should not call any memory allocation routines that may +// delegate to an instrumented implementation. This means functions like +// malloc() and free() should not be called while instrumenting. +// +// - We would like to use some thread-local data initialized on first-use of +// the XRay instrumentation. These allow us to implement unsynchronized +// routines that access resources associated with the thread. +// +// The implementation here uses a few mechanisms that allow us to provide both +// the requirements listed above. We do this by: +// +// 1. Using a thread-local aligned storage buffer for representing the +// ThreadLocalData struct. This data will be uninitialized memory by +// design. +// +// 2. Not requiring a thread exit handler/implementation, keeping the +// thread-local as purely a collection of references/data that do not +// require cleanup. +// +// We're doing this to avoid using a `thread_local` object that has a +// non-trivial destructor, because the C++ runtime might call std::malloc(...) +// to register calls to destructors. Deadlocks may arise when, for example, an +// externally provided malloc implementation is XRay instrumented, and +// initializing the thread-locals involves calling into malloc. A malloc +// implementation that does global synchronization might be holding a lock for a +// critical section, calling a function that might be XRay instrumented (and +// thus in turn calling into malloc by virtue of registration of the +// thread_local's destructor). +static_assert(alignof(ThreadLocalData) >= 64, + "ThreadLocalData must be cache line aligned."); +static ThreadLocalData &getThreadLocalData() { + thread_local typename std::aligned_storage< + sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{}; + + if (pthread_getspecific(Key) == NULL) { + new (reinterpret_cast<ThreadLocalData *>(&TLDStorage)) ThreadLocalData{}; + pthread_setspecific(Key, &TLDStorage); + } + + return *reinterpret_cast<ThreadLocalData *>(&TLDStorage); +} + +static void writeNewBufferPreamble(tid_t Tid, timespec TS, + pid_t Pid) XRAY_NEVER_INSTRUMENT { + static constexpr int InitRecordsCount = 3; + auto &TLD = getThreadLocalData(); + MetadataRecord Metadata[InitRecordsCount]; + { + // Write out a MetadataRecord to signify that this is the start of a new + // buffer, associated with a particular thread, with a new CPU. For the + // data, we have 15 bytes to squeeze as much information as we can. At this + // point we only write down the following bytes: + // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes) + auto &NewBuffer = Metadata[0]; + NewBuffer.Type = uint8_t(RecordType::Metadata); + NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer); + int32_t tid = static_cast<int32_t>(Tid); + internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid)); + } + + // Also write the WalltimeMarker record. + { + static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes"); + auto &WalltimeMarker = Metadata[1]; + WalltimeMarker.Type = uint8_t(RecordType::Metadata); + WalltimeMarker.RecordKind = + uint8_t(MetadataRecord::RecordKinds::WalltimeMarker); + + // We only really need microsecond precision here, and enforce across + // platforms that we need 64-bit seconds and 32-bit microseconds encoded in + // the Metadata record. + int32_t Micros = TS.tv_nsec / 1000; + int64_t Seconds = TS.tv_sec; + internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds)); + internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, + sizeof(Micros)); + } + + // Also write the Pid record. + { + // Write out a MetadataRecord that contains the current pid + auto &PidMetadata = Metadata[2]; + PidMetadata.Type = uint8_t(RecordType::Metadata); + PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid); + int32_t pid = static_cast<int32_t>(Pid); + internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid)); + } + + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + if (TLD.BQ == nullptr || TLD.BQ->finalizing()) + return; + internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata)); + TLD.RecordPtr += sizeof(Metadata); + // Since we write out the extents as the first metadata record of the + // buffer, we need to write out the extents including the extents record. + atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata), + memory_order_release); +} + +static void setupNewBuffer(int (*wall_clock_reader)( + clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + auto &B = TLD.Buffer; + TLD.RecordPtr = static_cast<char *>(B.Data); + tid_t Tid = GetTid(); + timespec TS{0, 0}; + pid_t Pid = internal_getpid(); + // This is typically clock_gettime, but callers have injection ability. + wall_clock_reader(CLOCK_MONOTONIC, &TS); + writeNewBufferPreamble(Tid, TS, Pid); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; +} + +static void incrementExtents(size_t Add) { + auto &TLD = getThreadLocalData(); + atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel); +} + +static void decrementExtents(size_t Subtract) { + auto &TLD = getThreadLocalData(); + atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel); +} + +static void writeNewCPUIdMetadata(uint16_t CPU, + uint64_t TSC) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + MetadataRecord NewCPUId; + NewCPUId.Type = uint8_t(RecordType::Metadata); + NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); + + // The data for the New CPU will contain the following bytes: + // - CPU ID (uint16_t, 2 bytes) + // - Full TSC (uint64_t, 8 bytes) + // Total = 10 bytes. + internal_memcpy(&NewCPUId.Data, &CPU, sizeof(CPU)); + internal_memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); + internal_memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + incrementExtents(sizeof(MetadataRecord)); +} + +static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + MetadataRecord TSCWrap; + TSCWrap.Type = uint8_t(RecordType::Metadata); + TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); + + // The data for the TSCWrap record contains the following bytes: + // - Full TSC (uint64_t, 8 bytes) + // Total = 8 bytes. + internal_memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); + internal_memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + incrementExtents(sizeof(MetadataRecord)); +} + +// Call Argument metadata records store the arguments to a function in the +// order of their appearance; holes are not supported by the buffer format. +static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + MetadataRecord CallArg; + CallArg.Type = uint8_t(RecordType::Metadata); + CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument); + + internal_memcpy(CallArg.Data, &A, sizeof(A)); + internal_memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); + incrementExtents(sizeof(MetadataRecord)); +} + +static void writeFunctionRecord(int FuncId, uint32_t TSCDelta, + XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT { + FunctionRecord FuncRecord; + FuncRecord.Type = uint8_t(RecordType::Function); + // Only take 28 bits of the function id. + FuncRecord.FuncId = FuncId & ~(0x0F << 28); + FuncRecord.TSCDelta = TSCDelta; + + auto &TLD = getThreadLocalData(); + switch (EntryType) { + case XRayEntryType::ENTRY: + ++TLD.NumConsecutiveFnEnters; + FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); + break; + case XRayEntryType::LOG_ARGS_ENTRY: + // We should not rewind functions with logged args. + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); + break; + case XRayEntryType::EXIT: + // If we've decided to log the function exit, we will never erase the log + // before it. + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); + break; + case XRayEntryType::TAIL: + // If we just entered the function we're tail exiting from or erased every + // invocation since then, this function entry tail pair is a candidate to + // be erased when the child function exits. + if (TLD.NumConsecutiveFnEnters > 0) { + ++TLD.NumTailCalls; + TLD.NumConsecutiveFnEnters = 0; + } else { + // We will never be able to erase this tail call since we have logged + // something in between the function entry and tail exit. + TLD.NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + } + FuncRecord.RecordKind = + uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); + break; + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static atomic_uint8_t ErrorLatch{0}; + if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d\n", + FuncId); + return; + } + case XRayEntryType::TYPED_EVENT: { + static atomic_uint8_t ErrorLatch{0}; + if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) + Report("Internal error: patched an XRay typed event call as a function; " + "func id = %d\n", + FuncId); + return; + } + } + + internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord)); + TLD.RecordPtr += sizeof(FunctionRecord); + incrementExtents(sizeof(FunctionRecord)); +} + +static atomic_uint64_t TicksPerSec{0}; +static atomic_uint64_t ThresholdTicks{0}; + +// Re-point the thread local pointer into this thread's Buffer before the recent +// "Function Entry" record and any "Tail Call Exit" records after that. +static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, + uint64_t &LastFunctionEntryTSC, int32_t FuncId) { + auto &TLD = getThreadLocalData(); + TLD.RecordPtr -= FunctionRecSize; + decrementExtents(FunctionRecSize); + FunctionRecord FuncRecord; + internal_memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize); + DCHECK(FuncRecord.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && + "Expected to find function entry recording when rewinding."); + DCHECK(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && + "Expected matching function id when rewinding Exit"); + --TLD.NumConsecutiveFnEnters; + LastTSC -= FuncRecord.TSCDelta; + + // We unwound one call. Update the state and return without writing a log. + if (TLD.NumConsecutiveFnEnters != 0) { + LastFunctionEntryTSC -= FuncRecord.TSCDelta; + return; + } + + // Otherwise we've rewound the stack of all function entries, we might be + // able to rewind further by erasing tail call functions that are being + // exited from via this exit. + LastFunctionEntryTSC = 0; + auto RewindingTSC = LastTSC; + auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize; + while (TLD.NumTailCalls > 0) { + // Rewind the TSC back over the TAIL EXIT record. + FunctionRecord ExpectedTailExit; + internal_memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize); + + DCHECK(ExpectedTailExit.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) && + "Expected to find tail exit when rewinding."); + RewindingRecordPtr -= FunctionRecSize; + RewindingTSC -= ExpectedTailExit.TSCDelta; + FunctionRecord ExpectedFunctionEntry; + internal_memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, + FunctionRecSize); + DCHECK(ExpectedFunctionEntry.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && + "Expected to find function entry when rewinding tail call."); + DCHECK(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId && + "Expected funcids to match when rewinding tail call."); + + // This tail call exceeded the threshold duration. It will not be erased. + if ((TSC - RewindingTSC) >= atomic_load_relaxed(&ThresholdTicks)) { + TLD.NumTailCalls = 0; + return; + } + + // We can erase a tail exit pair that we're exiting through since + // its duration is under threshold. + --TLD.NumTailCalls; + RewindingRecordPtr -= FunctionRecSize; + RewindingTSC -= ExpectedFunctionEntry.TSCDelta; + TLD.RecordPtr -= 2 * FunctionRecSize; + LastTSC = RewindingTSC; + decrementExtents(2 * FunctionRecSize); + } +} + +static bool releaseThreadLocalBuffer(BufferQueue &BQArg) { + auto &TLD = getThreadLocalData(); + auto EC = BQArg.releaseBuffer(TLD.Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Data, + BufferQueue::getErrorString(EC)); + return false; + } + return true; +} + +static bool prepareBuffer(uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, + struct timespec *), + size_t MaxSize) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + char *BufferStart = static_cast<char *>(TLD.Buffer.Data); + if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) { + if (!releaseThreadLocalBuffer(*TLD.BQ)) + return false; + auto EC = TLD.BQ->getBuffer(TLD.Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + Report("Failed to prepare a buffer; error = '%s'\n", + BufferQueue::getErrorString(EC)); + return false; + } + setupNewBuffer(wall_clock_reader); -__sanitizer::SpinMutex FDROptionsMutex; + // Always write the CPU metadata as the first record in the buffer. + writeNewCPUIdMetadata(CPU, TSC); + } + return true; +} + +static bool +isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *)) + XRAY_NEVER_INSTRUMENT { + // Bail out right away if logging is not initialized yet. + // We should take the opportunity to release the buffer though. + auto Status = atomic_load(&LoggingStatus, memory_order_acquire); + auto &TLD = getThreadLocalData(); + if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { + if (TLD.RecordPtr != nullptr && + (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { + if (!releaseThreadLocalBuffer(*LBQ)) + return false; + TLD.RecordPtr = nullptr; + return false; + } + return false; + } + + if (atomic_load(&LoggingStatus, memory_order_acquire) != + XRayLogInitStatus::XRAY_LOG_INITIALIZED || + LBQ->finalizing()) { + if (!releaseThreadLocalBuffer(*LBQ)) + return false; + TLD.RecordPtr = nullptr; + } + + if (TLD.Buffer.Data == nullptr) { + auto EC = LBQ->getBuffer(TLD.Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + auto LS = atomic_load(&LoggingStatus, memory_order_acquire); + if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING && + LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) + Report("Failed to acquire a buffer; error = '%s'\n", + BufferQueue::getErrorString(EC)); + return false; + } + + setupNewBuffer(wall_clock_reader); + + // Always write the CPU metadata as the first record in the buffer. + writeNewCPUIdMetadata(CPU, TSC); + } + + if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) { + // This means this is the first CPU this thread has ever run on. We set + // the current CPU and record this as the first TSC we've seen. + TLD.CurrentCPU = CPU; + writeNewCPUIdMetadata(CPU, TSC); + } + + return true; +} + +// Compute the TSC difference between the time of measurement and the previous +// event. There are a few interesting situations we need to account for: +// +// - The thread has migrated to a different CPU. If this is the case, then +// we write down the following records: +// +// 1. A 'NewCPUId' Metadata record. +// 2. A FunctionRecord with a 0 for the TSCDelta field. +// +// - The TSC delta is greater than the 32 bits we can store in a +// FunctionRecord. In this case we write down the following records: +// +// 1. A 'TSCWrap' Metadata record. +// 2. A FunctionRecord with a 0 for the TSCDelta field. +// +// - The TSC delta is representable within the 32 bits we can store in a +// FunctionRecord. In this case we write down just a FunctionRecord with +// the correct TSC delta. +static uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC, + uint8_t CPU) { + if (CPU != TLD.CurrentCPU) { + // We've moved to a new CPU. + writeNewCPUIdMetadata(CPU, TSC); + return 0; + } + // If the delta is greater than the range for a uint32_t, then we write out + // the TSC wrap metadata entry with the full TSC, and the TSC for the + // function record be 0. + uint64_t Delta = TSC - TLD.LastTSC; + if (Delta <= std::numeric_limits<uint32_t>::max()) + return Delta; + + writeTSCWrapMetadata(TSC); + return 0; +} + +static void endBufferIfFull() XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + auto BufferStart = static_cast<char *>(TLD.Buffer.Data); + if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <= + ptrdiff_t{MetadataRecSize}) { + if (!releaseThreadLocalBuffer(*TLD.BQ)) + return; + TLD.RecordPtr = nullptr; + } +} + +thread_local atomic_uint8_t Running{0}; + +/// Here's where the meat of the processing happens. The writer captures +/// function entry, exit and tail exit points with a time and will create +/// TSCWrap, NewCPUId and Function records as necessary. The writer might +/// walk backward through its buffer and erase trivial functions to avoid +/// polluting the log and may use the buffer queue to obtain or release a +/// buffer. +static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, + uint64_t TSC, unsigned char CPU, uint64_t Arg1, + int (*wall_clock_reader)(clockid_t, + struct timespec *)) + XRAY_NEVER_INSTRUMENT { + __asm volatile("# LLVM-MCA-BEGIN processFunctionHook"); + // Prevent signal handler recursion, so in case we're already in a log writing + // mode and the signal handler comes in (and is also instrumented) then we + // don't want to be clobbering potentially partial writes already happening in + // the thread. We use a simple thread_local latch to only allow one on-going + // handleArg0 to happen at any given time. + RecursionGuard Guard{Running}; + if (!Guard) { + DCHECK(atomic_load_relaxed(&Running) && "RecursionGuard is buggy!"); + return; + } + + auto &TLD = getThreadLocalData(); + + if (TLD.BQ == nullptr) + TLD.BQ = BQ; + + if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader)) + return; + + // Before we go setting up writing new function entries, we need to be really + // careful about the pointer math we're doing. This means we need to ensure + // that the record we are about to write is going to fit into the buffer, + // without overflowing the buffer. + // + // To do this properly, we use the following assumptions: + // + // - The least number of bytes we will ever write is 8 + // (sizeof(FunctionRecord)) only if the delta between the previous entry + // and this entry is within 32 bits. + // - The most number of bytes we will ever write is 8 + 16 + 16 = 40. + // This is computed by: + // + // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord) + // + // These arise in the following cases: + // + // 1. When the delta between the TSC we get and the previous TSC for the + // same CPU is outside of the uint32_t range, we end up having to + // write a MetadataRecord to indicate a "tsc wrap" before the actual + // FunctionRecord. + // 2. When we learn that we've moved CPUs, we need to write a + // MetadataRecord to indicate a "cpu change", and thus write out the + // current TSC for that CPU before writing out the actual + // FunctionRecord. + // 3. When we learn about a new CPU ID, we need to write down a "new cpu + // id" MetadataRecord before writing out the actual FunctionRecord. + // 4. The second MetadataRecord is the optional function call argument. + // + // So the math we need to do is to determine whether writing 40 bytes past the + // current pointer exceeds the buffer's maximum size. If we don't have enough + // space to write 40 bytes in the buffer, we need get a new Buffer, set it up + // properly before doing any further writing. + size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize; + if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) { + TLD.BQ = nullptr; + return; + } + + // By this point, we are now ready to write up to 40 bytes (explained above). + DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >= + static_cast<ptrdiff_t>(MetadataRecSize) && + "Misconfigured BufferQueue provided; Buffer size not large enough."); + + auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); + TLD.LastTSC = TSC; + TLD.CurrentCPU = CPU; + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + // Update the thread local state for the next invocation. + TLD.LastFunctionEntryTSC = TSC; + break; + case XRayEntryType::TAIL: + case XRayEntryType::EXIT: + // Break out and write the exit record if we can't erase any functions. + if (TLD.NumConsecutiveFnEnters == 0 || + (TSC - TLD.LastFunctionEntryTSC) >= + atomic_load_relaxed(&ThresholdTicks)) + break; + rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); + return; // without writing log. + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static atomic_uint8_t ErrorLatch{0}; + if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d\n", + FuncId); + return; + } + case XRayEntryType::TYPED_EVENT: { + static atomic_uint8_t ErrorLatch{0}; + if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) + Report("Internal error: patched an XRay typed event call as a function; " + "func id = %d\n", + FuncId); + return; + } + } + + writeFunctionRecord(FuncId, RecordTSCDelta, Entry); + if (Entry == XRayEntryType::LOG_ARGS_ENTRY) + writeCallArgumentMetadata(Arg1); + + // If we've exhausted the buffer by this time, we then release the buffer to + // make sure that other threads may start using this buffer. + endBufferIfFull(); + __asm volatile("# LLVM-MCA-END"); +} + +static XRayFileHeader &fdrCommonHeaderInfo() { + static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage; + static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; + static bool TSCSupported = true; + static uint64_t CycleFrequency = NanosecondsPerSecond; + pthread_once(&OnceInit, +[] { + XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage); + // Version 2 of the log writes the extents of the buffer, instead of + // relying on an end-of-buffer record. + // Version 3 includes PID metadata record + H.Version = 3; + H.Type = FileTypes::FDR_LOG; + + // Test for required CPU features and cache the cycle frequency + TSCSupported = probeRequiredCPUFeatures(); + if (TSCSupported) + CycleFrequency = getTSCFrequency(); + H.CycleFrequency = CycleFrequency; + + // FIXME: Actually check whether we have 'constant_tsc' and + // 'nonstop_tsc' before setting the values in the header. + H.ConstantTSC = 1; + H.NonstopTSC = 1; + }); + return reinterpret_cast<XRayFileHeader &>(HStorage); +} + +// This is the iterator implementation, which knows how to handle FDR-mode +// specific buffers. This is used as an implementation of the iterator function +// needed by __xray_set_buffer_iterator(...). It maintains a global state of the +// buffer iteration for the currently installed FDR mode buffers. In particular: +// +// - If the argument represents the initial state of XRayBuffer ({nullptr, 0}) +// then the iterator returns the header information. +// - If the argument represents the header information ({address of header +// info, size of the header info}) then it returns the first FDR buffer's +// address and extents. +// - It will keep returning the next buffer and extents as there are more +// buffers to process. When the input represents the last buffer, it will +// return the initial state to signal completion ({nullptr, 0}). +// +// See xray/xray_log_interface.h for more details on the requirements for the +// implementations of __xray_set_buffer_iterator(...) and +// __xray_log_process_buffers(...). +XRayBuffer fdrIterator(const XRayBuffer B) { + DCHECK(internal_strcmp(__xray_log_get_current_mode(), "xray-fdr") == 0); + DCHECK(BQ->finalizing()); + + if (BQ == nullptr || !BQ->finalizing()) { + if (Verbosity()) + Report( + "XRay FDR: Failed global buffer queue is null or not finalizing!\n"); + return {nullptr, 0}; + } + + // We use a global scratch-pad for the header information, which only gets + // initialized the first time this function is called. We'll update one part + // of this information with some relevant data (in particular the number of + // buffers to expect). + static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage; + static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT; + pthread_once(&HeaderOnce, +[] { + reinterpret_cast<XRayFileHeader &>(HeaderStorage) = fdrCommonHeaderInfo(); + }); + + // We use a convenience alias for code referring to Header from here on out. + auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage); + if (B.Data == nullptr && B.Size == 0) { + Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; + return XRayBuffer{static_cast<void *>(&Header), sizeof(Header)}; + } + + static BufferQueue::const_iterator It{}; + static BufferQueue::const_iterator End{}; + static void *CurrentBuffer{nullptr}; + if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) { + // From this point on, we provide raw access to the raw buffer we're getting + // from the BufferQueue. We're relying on the iterators from the current + // Buffer queue. + It = BQ->cbegin(); + End = BQ->cend(); + } + + if (CurrentBuffer != nullptr) { + InternalFree(CurrentBuffer); + CurrentBuffer = nullptr; + } + + if (It == End) + return {nullptr, 0}; + + // Set up the current buffer to contain the extents like we would when writing + // out to disk. The difference here would be that we still write "empty" + // buffers, or at least go through the iterators faithfully to let the + // handlers see the empty buffers in the queue. + auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire); + auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord); + CurrentBuffer = InternalAlloc(SerializedBufferSize); + if (CurrentBuffer == nullptr) + return {nullptr, 0}; + + // Write out the extents as a Metadata Record into the CurrentBuffer. + MetadataRecord ExtentsRecord; + ExtentsRecord.Type = uint8_t(RecordType::Metadata); + ExtentsRecord.RecordKind = + uint8_t(MetadataRecord::RecordKinds::BufferExtents); + internal_memcpy(ExtentsRecord.Data, &BufferSize, sizeof(BufferSize)); + auto AfterExtents = + static_cast<char *>(internal_memcpy(CurrentBuffer, &ExtentsRecord, + sizeof(MetadataRecord))) + + sizeof(MetadataRecord); + internal_memcpy(AfterExtents, It->Data, BufferSize); + + XRayBuffer Result; + Result.Data = CurrentBuffer; + Result.Size = SerializedBufferSize; + ++It; + return Result; +} // Must finalize before flushing. XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { - if (__sanitizer::atomic_load(&LoggingStatus, - __sanitizer::memory_order_acquire) != + if (atomic_load(&LoggingStatus, memory_order_acquire) != XRayLogInitStatus::XRAY_LOG_FINALIZED) { - if (__sanitizer::Verbosity()) + if (Verbosity()) Report("Not flushing log, implementation is not finalized.\n"); return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; - if (!__sanitizer::atomic_compare_exchange_strong( - &LogFlushStatus, &Result, XRayLogFlushStatus::XRAY_LOG_FLUSHING, - __sanitizer::memory_order_release)) { - - if (__sanitizer::Verbosity()) + if (!atomic_compare_exchange_strong(&LogFlushStatus, &Result, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + memory_order_release)) { + if (Verbosity()) Report("Not flushing log, implementation is still finalizing.\n"); return static_cast<XRayLogFlushStatus>(Result); } if (BQ == nullptr) { - if (__sanitizer::Verbosity()) + if (Verbosity()) Report("Cannot flush when global buffer queue is null.\n"); return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } // We wait a number of milliseconds to allow threads to see that we've // finalised before attempting to flush the log. - __sanitizer::SleepForMillis(flags()->xray_fdr_log_grace_period_ms); + SleepForMillis(fdrFlags()->grace_period_ms); + + // At this point, we're going to uninstall the iterator implementation, before + // we decide to do anything further with the global buffer queue. + __xray_log_remove_buffer_iterator(); + + // Once flushed, we should set the global status of the logging implementation + // to "uninitialized" to allow for FDR-logging multiple runs. + auto ResetToUnitialized = at_scope_exit([] { + atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + }); + + auto CleanupBuffers = at_scope_exit([] { + if (BQ != nullptr) { + auto &TLD = getThreadLocalData(); + if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr) + releaseThreadLocalBuffer(*TLD.BQ); + BQ->~BufferQueue(); + InternalFree(BQ); + BQ = nullptr; + } + }); + + if (fdrFlags()->no_file_flush) { + if (Verbosity()) + Report("XRay FDR: Not flushing to file, 'no_file_flush=true'.\n"); + + atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; + } // We write out the file in the following format: // @@ -85,35 +857,20 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // int Fd = -1; { - __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); + // FIXME: Remove this section of the code, when we remove the struct-based + // configuration API. + SpinMutexLock Guard(&FDROptionsMutex); Fd = FDROptions.Fd; } if (Fd == -1) Fd = getLogFD(); if (Fd == -1) { auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; - __sanitizer::atomic_store(&LogFlushStatus, Result, - __sanitizer::memory_order_release); + atomic_store(&LogFlushStatus, Result, memory_order_release); return Result; } - // Test for required CPU features and cache the cycle frequency - static bool TSCSupported = probeRequiredCPUFeatures(); - static uint64_t CycleFrequency = - TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond; - - XRayFileHeader Header; - - // Version 2 of the log writes the extents of the buffer, instead of relying - // on an end-of-buffer record. - Header.Version = 2; - Header.Type = FileTypes::FDR_LOG; - Header.CycleFrequency = CycleFrequency; - - // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' - // before setting the values in the header. - Header.ConstantTSC = 1; - Header.NonstopTSC = 1; + XRayFileHeader Header = fdrCommonHeaderInfo(); Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; retryingWriteAll(Fd, reinterpret_cast<char *>(&Header), reinterpret_cast<char *>(&Header) + sizeof(Header)); @@ -121,39 +878,36 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { BQ->apply([&](const BufferQueue::Buffer &B) { // Starting at version 2 of the FDR logging implementation, we only write // the records identified by the extents of the buffer. We use the Extents - // from the Buffer and write that out as the first record in the buffer. - // We still use a Metadata record, but fill in the extents instead for the + // from the Buffer and write that out as the first record in the buffer. We + // still use a Metadata record, but fill in the extents instead for the // data. MetadataRecord ExtentsRecord; - auto BufferExtents = __sanitizer::atomic_load( - &B.Extents->Size, __sanitizer::memory_order_acquire); - assert(BufferExtents <= B.Size); + auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire); + DCHECK(BufferExtents <= B.Size); ExtentsRecord.Type = uint8_t(RecordType::Metadata); ExtentsRecord.RecordKind = uint8_t(MetadataRecord::RecordKinds::BufferExtents); - std::memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents)); + internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents)); if (BufferExtents > 0) { retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord), reinterpret_cast<char *>(&ExtentsRecord) + sizeof(MetadataRecord)); - retryingWriteAll(Fd, reinterpret_cast<char *>(B.Buffer), - reinterpret_cast<char *>(B.Buffer) + BufferExtents); + retryingWriteAll(Fd, reinterpret_cast<char *>(B.Data), + reinterpret_cast<char *>(B.Data) + BufferExtents); } }); - __sanitizer::atomic_store(&LogFlushStatus, - XRayLogFlushStatus::XRAY_LOG_FLUSHED, - __sanitizer::memory_order_release); + atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); return XRayLogFlushStatus::XRAY_LOG_FLUSHED; } XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED; - if (!__sanitizer::atomic_compare_exchange_strong( - &LoggingStatus, &CurrentStatus, - XRayLogInitStatus::XRAY_LOG_FINALIZING, - __sanitizer::memory_order_release)) { - if (__sanitizer::Verbosity()) + if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_FINALIZING, + memory_order_release)) { + if (Verbosity()) Report("Cannot finalize log, implementation not initialized.\n"); return static_cast<XRayLogInitStatus>(CurrentStatus); } @@ -162,39 +916,11 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { // operations to be performed until re-initialized. BQ->finalize(); - __sanitizer::atomic_store(&LoggingStatus, - XRayLogInitStatus::XRAY_LOG_FINALIZED, - __sanitizer::memory_order_release); + atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED, + memory_order_release); return XRayLogInitStatus::XRAY_LOG_FINALIZED; } -XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT { - s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_FINALIZED; - if (__sanitizer::atomic_compare_exchange_strong( - &LoggingStatus, &CurrentStatus, - XRayLogInitStatus::XRAY_LOG_INITIALIZED, - __sanitizer::memory_order_release)) - return static_cast<XRayLogInitStatus>(CurrentStatus); - - // Release the in-memory buffer queue. - delete BQ; - BQ = nullptr; - - // Spin until the flushing status is flushed. - s32 CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED; - while (__sanitizer::atomic_compare_exchange_weak( - &LogFlushStatus, &CurrentFlushingStatus, - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING, - __sanitizer::memory_order_release)) { - if (CurrentFlushingStatus == XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING) - break; - CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED; - } - - // At this point, we know that the status is flushed, and that we can assume - return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; -} - struct TSCAndCPU { uint64_t TSC = 0; unsigned char CPU = 0; @@ -202,12 +928,14 @@ struct TSCAndCPU { static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { // We want to get the TSC as early as possible, so that we can check whether - // we've seen this CPU before. We also do it before we load anything else, to - // allow for forward progress with the scheduling. + // we've seen this CPU before. We also do it before we load anything else, + // to allow for forward progress with the scheduling. TSCAndCPU Result; // Test once for required CPU features - static bool TSCSupported = probeRequiredCPUFeatures(); + static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT; + static bool TSCSupported = true; + pthread_once(&OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); }); if (TSCSupported) { Result.TSC = __xray::readTSC(Result.CPU); @@ -228,20 +956,17 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { auto TC = getTimestamp(); - __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, - clock_gettime, BQ); + processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, clock_gettime); } void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, uint64_t Arg) XRAY_NEVER_INSTRUMENT { auto TC = getTimestamp(); - __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, - clock_gettime, BQ); + processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, clock_gettime); } void fdrLoggingHandleCustomEvent(void *Event, std::size_t EventSize) XRAY_NEVER_INSTRUMENT { - using namespace __xray_fdr_internal; auto TC = getTimestamp(); auto &TSC = TC.TSC; auto &CPU = TC.CPU; @@ -249,13 +974,8 @@ void fdrLoggingHandleCustomEvent(void *Event, if (!Guard) return; if (EventSize > std::numeric_limits<int32_t>::max()) { - using Empty = struct {}; - static Empty Once = [&] { - Report("Event size too large = %zu ; > max = %d\n", EventSize, - std::numeric_limits<int32_t>::max()); - return Empty(); - }(); - (void)Once; + static pthread_once_t Once = PTHREAD_ONCE_INIT; + pthread_once(&Once, +[] { Report("Event size too large.\n"); }); } int32_t ReducedEventSize = static_cast<int32_t>(EventSize); auto &TLD = getThreadLocalData(); @@ -264,8 +984,8 @@ void fdrLoggingHandleCustomEvent(void *Event, // Here we need to prepare the log to handle: // - The metadata record we're going to write. (16 bytes) - // - The additional data we're going to write. Currently, that's the size of - // the event we're going to dump into the log as free-form bytes. + // - The additional data we're going to write. Currently, that's the size + // of the event we're going to dump into the log as free-form bytes. if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { TLD.BQ = nullptr; return; @@ -280,90 +1000,207 @@ void fdrLoggingHandleCustomEvent(void *Event, CustomEvent.RecordKind = uint8_t(MetadataRecord::RecordKinds::CustomEventMarker); constexpr auto TSCSize = sizeof(TC.TSC); - std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); - std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - std::memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent)); + internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); + internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); + internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent)); TLD.RecordPtr += sizeof(CustomEvent); - std::memcpy(TLD.RecordPtr, Event, ReducedEventSize); + internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize); + incrementExtents(MetadataRecSize + EventSize); + endBufferIfFull(); +} + +void fdrLoggingHandleTypedEvent( + uint16_t EventType, const void *Event, + std::size_t EventSize) noexcept XRAY_NEVER_INSTRUMENT { + auto TC = getTimestamp(); + auto &TSC = TC.TSC; + auto &CPU = TC.CPU; + RecursionGuard Guard{Running}; + if (!Guard) + return; + if (EventSize > std::numeric_limits<int32_t>::max()) { + static pthread_once_t Once = PTHREAD_ONCE_INIT; + pthread_once(&Once, +[] { Report("Event size too large.\n"); }); + } + int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + auto &TLD = getThreadLocalData(); + if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime)) + return; + + // Here we need to prepare the log to handle: + // - The metadata record we're going to write. (16 bytes) + // - The additional data we're going to write. Currently, that's the size + // of the event we're going to dump into the log as free-form bytes. + if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { + TLD.BQ = nullptr; + return; + } + // Write the custom event metadata record, which consists of the following + // information: + // - 8 bytes (64-bits) for the full TSC when the event started. + // - 4 bytes (32-bits) for the length of the data. + // - 2 bytes (16-bits) for the event type. 3 bytes remain since one of the + // bytes has the record type (Metadata Record) and kind (TypedEvent). + // We'll log the error if the event type is greater than 2 bytes. + // Event types are generated sequentially, so 2^16 is enough. + MetadataRecord TypedEvent; + TypedEvent.Type = uint8_t(RecordType::Metadata); + TypedEvent.RecordKind = + uint8_t(MetadataRecord::RecordKinds::TypedEventMarker); + constexpr auto TSCSize = sizeof(TC.TSC); + internal_memcpy(&TypedEvent.Data, &ReducedEventSize, sizeof(int32_t)); + internal_memcpy(&TypedEvent.Data[sizeof(int32_t)], &TSC, TSCSize); + internal_memcpy(&TypedEvent.Data[sizeof(int32_t) + TSCSize], &EventType, + sizeof(EventType)); + internal_memcpy(TLD.RecordPtr, &TypedEvent, sizeof(TypedEvent)); + + TLD.RecordPtr += sizeof(TypedEvent); + internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize); incrementExtents(MetadataRecSize + EventSize); endBufferIfFull(); } -XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, +XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { - if (OptionsSize != sizeof(FDRLoggingOptions)) { - if (__sanitizer::Verbosity()) - Report("Cannot initialize FDR logging; wrong size for options: %d\n", - OptionsSize); - return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load( - &LoggingStatus, __sanitizer::memory_order_acquire)); - } + if (Options == nullptr) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - if (!__sanitizer::atomic_compare_exchange_strong( - &LoggingStatus, &CurrentStatus, - XRayLogInitStatus::XRAY_LOG_INITIALIZING, - __sanitizer::memory_order_release)) { - if (__sanitizer::Verbosity()) + if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZING, + memory_order_release)) { + if (Verbosity()) Report("Cannot initialize already initialized implementation.\n"); return static_cast<XRayLogInitStatus>(CurrentStatus); } - { - __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); - memcpy(&FDROptions, Options, OptionsSize); + // Because of __xray_log_init_mode(...) which guarantees that this will be + // called with BufferSize == 0 and BufferMax == 0 we parse the configuration + // provided in the Options pointer as a string instead. + if (BufferSize == 0 && BufferMax == 0) { + if (Verbosity()) + Report("Initializing FDR mode with options: %s\n", + static_cast<const char *>(Options)); + + // TODO: Factor out the flags specific to the FDR mode implementation. For + // now, use the global/single definition of the flags, since the FDR mode + // flags are already defined there. + FlagParser FDRParser; + FDRFlags FDRFlags; + registerXRayFDRFlags(&FDRParser, &FDRFlags); + FDRFlags.setDefaults(); + + // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided + // options until we migrate everyone to use the XRAY_FDR_OPTIONS + // compiler-provided options. + FDRParser.ParseString(useCompilerDefinedFlags()); + FDRParser.ParseString(useCompilerDefinedFDRFlags()); + auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS"); + if (EnvOpts == nullptr) + EnvOpts = ""; + FDRParser.ParseString(EnvOpts); + + // FIXME: Remove this when we fully remove the deprecated flags. + if (internal_strlen(EnvOpts) == 0) { + FDRFlags.func_duration_threshold_us = + flags()->xray_fdr_log_func_duration_threshold_us; + FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms; + } + + // The provided options should always override the compiler-provided and + // environment-variable defined options. + FDRParser.ParseString(static_cast<const char *>(Options)); + *fdrFlags() = FDRFlags; + BufferSize = FDRFlags.buffer_size; + BufferMax = FDRFlags.buffer_max; + SpinMutexLock Guard(&FDROptionsMutex); + FDROptions.Fd = -1; + FDROptions.ReportErrors = true; + } else if (OptionsSize != sizeof(FDRLoggingOptions)) { + // FIXME: This is deprecated, and should really be removed. + // At this point we use the flag parser specific to the FDR mode + // implementation. + if (Verbosity()) + Report("Cannot initialize FDR logging; wrong size for options: %d\n", + OptionsSize); + return static_cast<XRayLogInitStatus>( + atomic_load(&LoggingStatus, memory_order_acquire)); + } else { + if (Verbosity()) + Report("XRay FDR: struct-based init is deprecated, please use " + "string-based configuration instead.\n"); + SpinMutexLock Guard(&FDROptionsMutex); + internal_memcpy(&FDROptions, Options, OptionsSize); } bool Success = false; if (BQ != nullptr) { - delete BQ; + BQ->~BufferQueue(); + InternalFree(BQ); BQ = nullptr; } - if (BQ == nullptr) - BQ = new BufferQueue(BufferSize, BufferMax, Success); + if (BQ == nullptr) { + BQ = reinterpret_cast<BufferQueue *>( + InternalAlloc(sizeof(BufferQueue), nullptr, 64)); + new (BQ) BufferQueue(BufferSize, BufferMax, Success); + } if (!Success) { Report("BufferQueue init failed.\n"); if (BQ != nullptr) { - delete BQ; + BQ->~BufferQueue(); + InternalFree(BQ); BQ = nullptr; } return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } - static bool UNUSED Once = [] { - pthread_key_create(&__xray_fdr_internal::Key, +[](void *) { - auto &TLD = __xray_fdr_internal::getThreadLocalData(); + static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; + pthread_once(&OnceInit, +[] { + atomic_store(&TicksPerSec, + probeRequiredCPUFeatures() ? getTSCFrequency() + : __xray::NanosecondsPerSecond, + memory_order_release); + pthread_key_create(&Key, +[](void *TLDPtr) { + if (TLDPtr == nullptr) + return; + auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr); if (TLD.BQ == nullptr) return; auto EC = TLD.BQ->releaseBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) Report("At thread exit, failed to release buffer at %p; error=%s\n", - TLD.Buffer.Buffer, BufferQueue::getErrorString(EC)); + TLD.Buffer.Data, BufferQueue::getErrorString(EC)); }); - return false; - }(); + }); + atomic_store(&ThresholdTicks, + atomic_load_relaxed(&TicksPerSec) * + fdrFlags()->func_duration_threshold_us / 1000000, + memory_order_release); // Arg1 handler should go in first to avoid concurrent code accidentally // falling back to arg0 when it should have ran arg1. __xray_set_handler_arg1(fdrLoggingHandleArg1); // Install the actual handleArg0 handler after initialising the buffers. __xray_set_handler(fdrLoggingHandleArg0); __xray_set_customevent_handler(fdrLoggingHandleCustomEvent); + __xray_set_typedevent_handler(fdrLoggingHandleTypedEvent); + + // Install the buffer iterator implementation. + __xray_log_set_buffer_iterator(fdrIterator); - __sanitizer::atomic_store(&LoggingStatus, - XRayLogInitStatus::XRAY_LOG_INITIALIZED, - __sanitizer::memory_order_release); + atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED, + memory_order_release); - if (__sanitizer::Verbosity()) + if (Verbosity()) Report("XRay FDR init successful.\n"); return XRayLogInitStatus::XRAY_LOG_INITIALIZED; } bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { - using namespace __xray; XRayLogImpl Impl{ fdrLoggingInit, fdrLoggingFinalize, @@ -372,11 +1209,10 @@ bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { }; auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl); if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && - __sanitizer::Verbosity()) + Verbosity()) Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n", RegistrationResult); - if (flags()->xray_fdr_log || - !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-fdr")) + if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr")) __xray_set_log_impl(Impl); return true; } diff --git a/lib/xray/xray_fdr_logging_impl.h b/lib/xray/xray_fdr_logging_impl.h deleted file mode 100644 index 59eab55b2573..000000000000 --- a/lib/xray/xray_fdr_logging_impl.h +++ /dev/null @@ -1,705 +0,0 @@ -//===-- xray_fdr_logging_impl.h ---------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file is a part of XRay, a dynamic runtime instrumentation system. -// -// Here we implement the thread local state management and record i/o for Flight -// Data Recorder mode for XRay, where we use compact structures to store records -// in memory as well as when writing out the data to files. -// -//===----------------------------------------------------------------------===// -#ifndef XRAY_XRAY_FDR_LOGGING_IMPL_H -#define XRAY_XRAY_FDR_LOGGING_IMPL_H - -#include <cassert> -#include <cstddef> -#include <cstring> -#include <limits> -#include <pthread.h> -#include <sys/syscall.h> -#include <time.h> -#include <type_traits> -#include <unistd.h> - -#include "sanitizer_common/sanitizer_common.h" -#include "xray/xray_log_interface.h" -#include "xray_buffer_queue.h" -#include "xray_defs.h" -#include "xray_fdr_log_records.h" -#include "xray_flags.h" -#include "xray_tsc.h" - -namespace __xray { - -__sanitizer::atomic_sint32_t LoggingStatus = { - XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; - -/// We expose some of the state transitions when FDR logging mode is operating -/// such that we can simulate a series of log events that may occur without -/// and test with determinism without worrying about the real CPU time. -/// -/// Because the code uses thread_local allocation extensively as part of its -/// design, callers that wish to test events occuring on different threads -/// will actually have to run them on different threads. -/// -/// This also means that it is possible to break invariants maintained by -/// cooperation with xray_fdr_logging class, so be careful and think twice. -namespace __xray_fdr_internal { - -/// Writes the new buffer record and wallclock time that begin a buffer for the -/// current thread. -static void writeNewBufferPreamble(pid_t Tid, timespec TS); - -/// Writes a Function Record to the buffer associated with the current thread. -static void writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType); - -/// Sets up a new buffer in thread_local storage and writes a preamble. The -/// wall_clock_reader function is used to populate the WallTimeRecord entry. -static void setupNewBuffer(int (*wall_clock_reader)(clockid_t, - struct timespec *)); - -/// TSC Wrap records are written when a TSC delta encoding scheme overflows. -static void writeTSCWrapMetadata(uint64_t TSC); - -// Group together thread-local-data in a struct, then hide it behind a function -// call so that it can be initialized on first use instead of as a global. We -// force the alignment to 64-bytes for x86 cache line alignment, as this -// structure is used in the hot path of implementation. -struct alignas(64) ThreadLocalData { - BufferQueue::Buffer Buffer; - char *RecordPtr = nullptr; - // The number of FunctionEntry records immediately preceding RecordPtr. - uint8_t NumConsecutiveFnEnters = 0; - - // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit - // records preceding RecordPtr. - uint8_t NumTailCalls = 0; - - // We use a thread_local variable to keep track of which CPUs we've already - // run, and the TSC times for these CPUs. This allows us to stop repeating the - // CPU field in the function records. - // - // We assume that we'll support only 65536 CPUs for x86_64. - uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); - uint64_t LastTSC = 0; - uint64_t LastFunctionEntryTSC = 0; - - // Make sure a thread that's ever called handleArg0 has a thread-local - // live reference to the buffer queue for this particular instance of - // FDRLogging, and that we're going to clean it up when the thread exits. - BufferQueue *BQ = nullptr; -}; - -static_assert(std::is_trivially_destructible<ThreadLocalData>::value, - "ThreadLocalData must be trivially destructible"); - -static constexpr auto MetadataRecSize = sizeof(MetadataRecord); -static constexpr auto FunctionRecSize = sizeof(FunctionRecord); - -// Use a global pthread key to identify thread-local data for logging. -static pthread_key_t Key; - -// This function will initialize the thread-local data structure used by the FDR -// logging implementation and return a reference to it. The implementation -// details require a bit of care to maintain. -// -// First, some requirements on the implementation in general: -// -// - XRay handlers should not call any memory allocation routines that may -// delegate to an instrumented implementation. This means functions like -// malloc() and free() should not be called while instrumenting. -// -// - We would like to use some thread-local data initialized on first-use of -// the XRay instrumentation. These allow us to implement unsynchronized -// routines that access resources associated with the thread. -// -// The implementation here uses a few mechanisms that allow us to provide both -// the requirements listed above. We do this by: -// -// 1. Using a thread-local aligned storage buffer for representing the -// ThreadLocalData struct. This data will be uninitialized memory by -// design. -// -// 2. Not requiring a thread exit handler/implementation, keeping the -// thread-local as purely a collection of references/data that do not -// require cleanup. -// -// We're doing this to avoid using a `thread_local` object that has a -// non-trivial destructor, because the C++ runtime might call std::malloc(...) -// to register calls to destructors. Deadlocks may arise when, for example, an -// externally provided malloc implementation is XRay instrumented, and -// initializing the thread-locals involves calling into malloc. A malloc -// implementation that does global synchronization might be holding a lock for a -// critical section, calling a function that might be XRay instrumented (and -// thus in turn calling into malloc by virtue of registration of the -// thread_local's destructor). -static ThreadLocalData &getThreadLocalData() { - static_assert(alignof(ThreadLocalData) >= 64, - "ThreadLocalData must be cache line aligned."); - thread_local ThreadLocalData TLD; - thread_local bool UNUSED ThreadOnce = [] { - pthread_setspecific(Key, &TLD); - return false; - }(); - return TLD; -} - -//-----------------------------------------------------------------------------| -// The rest of the file is implementation. | -//-----------------------------------------------------------------------------| -// Functions are implemented in the header for inlining since we don't want | -// to grow the stack when we've hijacked the binary for logging. | -//-----------------------------------------------------------------------------| - -namespace { - -class RecursionGuard { - volatile bool &Running; - const bool Valid; - -public: - explicit RecursionGuard(volatile bool &R) : Running(R), Valid(!R) { - if (Valid) - Running = true; - } - - RecursionGuard(const RecursionGuard &) = delete; - RecursionGuard(RecursionGuard &&) = delete; - RecursionGuard &operator=(const RecursionGuard &) = delete; - RecursionGuard &operator=(RecursionGuard &&) = delete; - - explicit operator bool() const { return Valid; } - - ~RecursionGuard() noexcept { - if (Valid) - Running = false; - } -}; - -} // namespace - -static void writeNewBufferPreamble(pid_t Tid, - timespec TS) XRAY_NEVER_INSTRUMENT { - static constexpr int InitRecordsCount = 2; - auto &TLD = getThreadLocalData(); - MetadataRecord Metadata[InitRecordsCount]; - { - // Write out a MetadataRecord to signify that this is the start of a new - // buffer, associated with a particular thread, with a new CPU. For the - // data, we have 15 bytes to squeeze as much information as we can. At this - // point we only write down the following bytes: - // - Thread ID (pid_t, 4 bytes) - auto &NewBuffer = Metadata[0]; - NewBuffer.Type = uint8_t(RecordType::Metadata); - NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer); - std::memcpy(&NewBuffer.Data, &Tid, sizeof(pid_t)); - } - - // Also write the WalltimeMarker record. - { - static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes"); - auto &WalltimeMarker = Metadata[1]; - WalltimeMarker.Type = uint8_t(RecordType::Metadata); - WalltimeMarker.RecordKind = - uint8_t(MetadataRecord::RecordKinds::WalltimeMarker); - - // We only really need microsecond precision here, and enforce across - // platforms that we need 64-bit seconds and 32-bit microseconds encoded in - // the Metadata record. - int32_t Micros = TS.tv_nsec / 1000; - int64_t Seconds = TS.tv_sec; - std::memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds)); - std::memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, sizeof(Micros)); - } - - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - if (TLD.BQ == nullptr || TLD.BQ->finalizing()) - return; - std::memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata)); - TLD.RecordPtr += sizeof(Metadata); - // Since we write out the extents as the first metadata record of the - // buffer, we need to write out the extents including the extents record. - __sanitizer::atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata), - __sanitizer::memory_order_release); -} - -inline void setupNewBuffer(int (*wall_clock_reader)( - clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto &B = TLD.Buffer; - TLD.RecordPtr = static_cast<char *>(B.Buffer); - pid_t Tid = syscall(SYS_gettid); - timespec TS{0, 0}; - // This is typically clock_gettime, but callers have injection ability. - wall_clock_reader(CLOCK_MONOTONIC, &TS); - writeNewBufferPreamble(Tid, TS); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; -} - -static void incrementExtents(size_t Add) { - auto &TLD = getThreadLocalData(); - __sanitizer::atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, - __sanitizer::memory_order_acq_rel); -} - -static void decrementExtents(size_t Subtract) { - auto &TLD = getThreadLocalData(); - __sanitizer::atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, - __sanitizer::memory_order_acq_rel); -} - -inline void writeNewCPUIdMetadata(uint16_t CPU, - uint64_t TSC) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord NewCPUId; - NewCPUId.Type = uint8_t(RecordType::Metadata); - NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); - - // The data for the New CPU will contain the following bytes: - // - CPU ID (uint16_t, 2 bytes) - // - Full TSC (uint64_t, 8 bytes) - // Total = 10 bytes. - std::memcpy(&NewCPUId.Data, &CPU, sizeof(CPU)); - std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); - std::memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - incrementExtents(sizeof(MetadataRecord)); -} - -inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord TSCWrap; - TSCWrap.Type = uint8_t(RecordType::Metadata); - TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); - - // The data for the TSCWrap record contains the following bytes: - // - Full TSC (uint64_t, 8 bytes) - // Total = 8 bytes. - std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); - std::memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - incrementExtents(sizeof(MetadataRecord)); -} - -// Call Argument metadata records store the arguments to a function in the -// order of their appearance; holes are not supported by the buffer format. -static inline void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord CallArg; - CallArg.Type = uint8_t(RecordType::Metadata); - CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument); - - std::memcpy(CallArg.Data, &A, sizeof(A)); - std::memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - incrementExtents(sizeof(MetadataRecord)); -} - -static inline void -writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT { - FunctionRecord FuncRecord; - FuncRecord.Type = uint8_t(RecordType::Function); - // Only take 28 bits of the function id. - FuncRecord.FuncId = FuncId & ~(0x0F << 28); - FuncRecord.TSCDelta = TSCDelta; - - auto &TLD = getThreadLocalData(); - switch (EntryType) { - case XRayEntryType::ENTRY: - ++TLD.NumConsecutiveFnEnters; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); - break; - case XRayEntryType::LOG_ARGS_ENTRY: - // We should not rewind functions with logged args. - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); - break; - case XRayEntryType::EXIT: - // If we've decided to log the function exit, we will never erase the log - // before it. - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); - break; - case XRayEntryType::TAIL: - // If we just entered the function we're tail exiting from or erased every - // invocation since then, this function entry tail pair is a candidate to - // be erased when the child function exits. - if (TLD.NumConsecutiveFnEnters > 0) { - ++TLD.NumTailCalls; - TLD.NumConsecutiveFnEnters = 0; - } else { - // We will never be able to erase this tail call since we have logged - // something in between the function entry and tail exit. - TLD.NumTailCalls = 0; - TLD.NumConsecutiveFnEnters = 0; - } - FuncRecord.RecordKind = - uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); - break; - case XRayEntryType::CUSTOM_EVENT: { - // This is a bug in patching, so we'll report it once and move on. - static bool Once = [&] { - Report("Internal error: patched an XRay custom event call as a function; " - "func id = %d\n", - FuncId); - return true; - }(); - (void)Once; - return; - } - } - - std::memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord)); - TLD.RecordPtr += sizeof(FunctionRecord); - incrementExtents(sizeof(FunctionRecord)); -} - -static uint64_t thresholdTicks() { - static uint64_t TicksPerSec = probeRequiredCPUFeatures() - ? getTSCFrequency() - : __xray::NanosecondsPerSecond; - static const uint64_t ThresholdTicks = - TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000; - return ThresholdTicks; -} - -// Re-point the thread local pointer into this thread's Buffer before the recent -// "Function Entry" record and any "Tail Call Exit" records after that. -static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, - uint64_t &LastFunctionEntryTSC, int32_t FuncId) { - auto &TLD = getThreadLocalData(); - TLD.RecordPtr -= FunctionRecSize; - decrementExtents(FunctionRecSize); - FunctionRecord FuncRecord; - std::memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize); - assert(FuncRecord.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && - "Expected to find function entry recording when rewinding."); - assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && - "Expected matching function id when rewinding Exit"); - --TLD.NumConsecutiveFnEnters; - LastTSC -= FuncRecord.TSCDelta; - - // We unwound one call. Update the state and return without writing a log. - if (TLD.NumConsecutiveFnEnters != 0) { - LastFunctionEntryTSC -= FuncRecord.TSCDelta; - return; - } - - // Otherwise we've rewound the stack of all function entries, we might be - // able to rewind further by erasing tail call functions that are being - // exited from via this exit. - LastFunctionEntryTSC = 0; - auto RewindingTSC = LastTSC; - auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize; - while (TLD.NumTailCalls > 0) { - // Rewind the TSC back over the TAIL EXIT record. - FunctionRecord ExpectedTailExit; - std::memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize); - - assert(ExpectedTailExit.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) && - "Expected to find tail exit when rewinding."); - RewindingRecordPtr -= FunctionRecSize; - RewindingTSC -= ExpectedTailExit.TSCDelta; - FunctionRecord ExpectedFunctionEntry; - std::memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, FunctionRecSize); - assert(ExpectedFunctionEntry.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && - "Expected to find function entry when rewinding tail call."); - assert(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId && - "Expected funcids to match when rewinding tail call."); - - // This tail call exceeded the threshold duration. It will not be erased. - if ((TSC - RewindingTSC) >= thresholdTicks()) { - TLD.NumTailCalls = 0; - return; - } - - // We can erase a tail exit pair that we're exiting through since - // its duration is under threshold. - --TLD.NumTailCalls; - RewindingRecordPtr -= FunctionRecSize; - RewindingTSC -= ExpectedFunctionEntry.TSCDelta; - TLD.RecordPtr -= 2 * FunctionRecSize; - LastTSC = RewindingTSC; - decrementExtents(2 * FunctionRecSize); - } -} - -inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) { - auto &TLD = getThreadLocalData(); - auto EC = BQArg.releaseBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Buffer, - BufferQueue::getErrorString(EC)); - return false; - } - return true; -} - -inline bool prepareBuffer(uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, - struct timespec *), - size_t MaxSize) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - char *BufferStart = static_cast<char *>(TLD.Buffer.Buffer); - if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) { - if (!releaseThreadLocalBuffer(*TLD.BQ)) - return false; - auto EC = TLD.BQ->getBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to acquire a buffer; error=%s\n", - BufferQueue::getErrorString(EC)); - return false; - } - setupNewBuffer(wall_clock_reader); - - // Always write the CPU metadata as the first record in the buffer. - writeNewCPUIdMetadata(CPU, TSC); - } - return true; -} - -inline bool -isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, struct timespec *)) - XRAY_NEVER_INSTRUMENT { - // Bail out right away if logging is not initialized yet. - // We should take the opportunity to release the buffer though. - auto Status = __sanitizer::atomic_load(&LoggingStatus, - __sanitizer::memory_order_acquire); - auto &TLD = getThreadLocalData(); - if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { - if (TLD.RecordPtr != nullptr && - (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || - Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { - if (!releaseThreadLocalBuffer(*LBQ)) - return false; - TLD.RecordPtr = nullptr; - return false; - } - return false; - } - - if (__sanitizer::atomic_load(&LoggingStatus, - __sanitizer::memory_order_acquire) != - XRayLogInitStatus::XRAY_LOG_INITIALIZED || - LBQ->finalizing()) { - if (!releaseThreadLocalBuffer(*LBQ)) - return false; - TLD.RecordPtr = nullptr; - } - - if (TLD.Buffer.Buffer == nullptr) { - auto EC = LBQ->getBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - auto LS = __sanitizer::atomic_load(&LoggingStatus, - __sanitizer::memory_order_acquire); - if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING && - LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) - Report("Failed to acquire a buffer; error=%s\n", - BufferQueue::getErrorString(EC)); - return false; - } - - setupNewBuffer(wall_clock_reader); - - // Always write the CPU metadata as the first record in the buffer. - writeNewCPUIdMetadata(CPU, TSC); - } - - if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) { - // This means this is the first CPU this thread has ever run on. We set - // the current CPU and record this as the first TSC we've seen. - TLD.CurrentCPU = CPU; - writeNewCPUIdMetadata(CPU, TSC); - } - - return true; -} // namespace __xray_fdr_internal - -// Compute the TSC difference between the time of measurement and the previous -// event. There are a few interesting situations we need to account for: -// -// - The thread has migrated to a different CPU. If this is the case, then -// we write down the following records: -// -// 1. A 'NewCPUId' Metadata record. -// 2. A FunctionRecord with a 0 for the TSCDelta field. -// -// - The TSC delta is greater than the 32 bits we can store in a -// FunctionRecord. In this case we write down the following records: -// -// 1. A 'TSCWrap' Metadata record. -// 2. A FunctionRecord with a 0 for the TSCDelta field. -// -// - The TSC delta is representable within the 32 bits we can store in a -// FunctionRecord. In this case we write down just a FunctionRecord with -// the correct TSC delta. -inline uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC, - uint8_t CPU) { - if (CPU != TLD.CurrentCPU) { - // We've moved to a new CPU. - writeNewCPUIdMetadata(CPU, TSC); - return 0; - } - // If the delta is greater than the range for a uint32_t, then we write out - // the TSC wrap metadata entry with the full TSC, and the TSC for the - // function record be 0. - uint64_t Delta = TSC - TLD.LastTSC; - if (Delta <= std::numeric_limits<uint32_t>::max()) - return Delta; - - writeTSCWrapMetadata(TSC); - return 0; -} - -inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto BufferStart = static_cast<char *>(TLD.Buffer.Buffer); - if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <= - ptrdiff_t{MetadataRecSize}) { - if (!releaseThreadLocalBuffer(*TLD.BQ)) - return; - TLD.RecordPtr = nullptr; - } -} - -thread_local volatile bool Running = false; - -/// Here's where the meat of the processing happens. The writer captures -/// function entry, exit and tail exit points with a time and will create -/// TSCWrap, NewCPUId and Function records as necessary. The writer might -/// walk backward through its buffer and erase trivial functions to avoid -/// polluting the log and may use the buffer queue to obtain or release a -/// buffer. -inline void processFunctionHook(int32_t FuncId, XRayEntryType Entry, - uint64_t TSC, unsigned char CPU, uint64_t Arg1, - int (*wall_clock_reader)(clockid_t, - struct timespec *), - BufferQueue *BQ) XRAY_NEVER_INSTRUMENT { - // Prevent signal handler recursion, so in case we're already in a log writing - // mode and the signal handler comes in (and is also instrumented) then we - // don't want to be clobbering potentially partial writes already happening in - // the thread. We use a simple thread_local latch to only allow one on-going - // handleArg0 to happen at any given time. - RecursionGuard Guard{Running}; - if (!Guard) { - assert(Running == true && "RecursionGuard is buggy!"); - return; - } - - auto &TLD = getThreadLocalData(); - - // In case the reference has been cleaned up before, we make sure we - // initialize it to the provided BufferQueue. - if (TLD.BQ == nullptr) - TLD.BQ = BQ; - - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader)) - return; - - // Before we go setting up writing new function entries, we need to be really - // careful about the pointer math we're doing. This means we need to ensure - // that the record we are about to write is going to fit into the buffer, - // without overflowing the buffer. - // - // To do this properly, we use the following assumptions: - // - // - The least number of bytes we will ever write is 8 - // (sizeof(FunctionRecord)) only if the delta between the previous entry - // and this entry is within 32 bits. - // - The most number of bytes we will ever write is 8 + 16 + 16 = 40. - // This is computed by: - // - // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord) - // - // These arise in the following cases: - // - // 1. When the delta between the TSC we get and the previous TSC for the - // same CPU is outside of the uint32_t range, we end up having to - // write a MetadataRecord to indicate a "tsc wrap" before the actual - // FunctionRecord. - // 2. When we learn that we've moved CPUs, we need to write a - // MetadataRecord to indicate a "cpu change", and thus write out the - // current TSC for that CPU before writing out the actual - // FunctionRecord. - // 3. When we learn about a new CPU ID, we need to write down a "new cpu - // id" MetadataRecord before writing out the actual FunctionRecord. - // 4. The second MetadataRecord is the optional function call argument. - // - // So the math we need to do is to determine whether writing 40 bytes past the - // current pointer exceeds the buffer's maximum size. If we don't have enough - // space to write 40 bytes in the buffer, we need get a new Buffer, set it up - // properly before doing any further writing. - size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize; - if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) { - TLD.BQ = nullptr; - return; - } - - // By this point, we are now ready to write up to 40 bytes (explained above). - assert((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Buffer) >= - static_cast<ptrdiff_t>(MetadataRecSize) && - "Misconfigured BufferQueue provided; Buffer size not large enough."); - - auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); - TLD.LastTSC = TSC; - TLD.CurrentCPU = CPU; - switch (Entry) { - case XRayEntryType::ENTRY: - case XRayEntryType::LOG_ARGS_ENTRY: - // Update the thread local state for the next invocation. - TLD.LastFunctionEntryTSC = TSC; - break; - case XRayEntryType::TAIL: - case XRayEntryType::EXIT: - // Break out and write the exit record if we can't erase any functions. - if (TLD.NumConsecutiveFnEnters == 0 || - (TSC - TLD.LastFunctionEntryTSC) >= thresholdTicks()) - break; - rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); - return; // without writing log. - case XRayEntryType::CUSTOM_EVENT: { - // This is a bug in patching, so we'll report it once and move on. - static bool Once = [&] { - Report("Internal error: patched an XRay custom event call as a function; " - "func id = %d", - FuncId); - return true; - }(); - (void)Once; - return; - } - } - - writeFunctionRecord(FuncId, RecordTSCDelta, Entry); - if (Entry == XRayEntryType::LOG_ARGS_ENTRY) - writeCallArgumentMetadata(Arg1); - - // If we've exhausted the buffer by this time, we then release the buffer to - // make sure that other threads may start using this buffer. - endBufferIfFull(); -} - -} // namespace __xray_fdr_internal -} // namespace __xray - -#endif // XRAY_XRAY_FDR_LOGGING_IMPL_H diff --git a/lib/xray/xray_flags.cc b/lib/xray/xray_flags.cc index 1ee4d10d753c..b50b68666d80 100644 --- a/lib/xray/xray_flags.cc +++ b/lib/xray/xray_flags.cc @@ -30,7 +30,7 @@ void Flags::setDefaults() XRAY_NEVER_INSTRUMENT { #undef XRAY_FLAG } -static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT { +void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT { #define XRAY_FLAG(Type, Name, DefaultValue, Description) \ RegisterFlag(P, #Name, Description, &F->Name); #include "xray_flags.inc" @@ -42,15 +42,14 @@ static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT { // options that control XRay. This means users/deployments can tweak the // defaults that override the hard-coded defaults in the xray_flags.inc at // compile-time using the XRAY_DEFAULT_OPTIONS macro. -static const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { +const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { #ifdef XRAY_DEFAULT_OPTIONS -// Do the double-layered string conversion to prevent badly crafted strings -// provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues (or -// changing the semantics of the implementation through the macro). This ensures -// that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a string literal. -#define XRAY_STRINGIZE(x) #x -#define XRAY_STRINGIZE_OPTIONS(options) XRAY_STRINGIZE(options) - return XRAY_STRINGIZE_OPTIONS(XRAY_DEFAULT_OPTIONS); + // Do the double-layered string conversion to prevent badly crafted strings + // provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues + // (or changing the semantics of the implementation through the macro). This + // ensures that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a + // string literal. + return SANITIZER_STRINGIFY(XRAY_DEFAULT_OPTIONS); #else return ""; #endif diff --git a/lib/xray/xray_flags.h b/lib/xray/xray_flags.h index 3ed5b8844cb4..7c1ba9458856 100644 --- a/lib/xray/xray_flags.h +++ b/lib/xray/xray_flags.h @@ -29,6 +29,8 @@ struct Flags { }; extern Flags xray_flags_dont_use_directly; +extern void registerXRayFlags(FlagParser *P, Flags *F); +const char *useCompilerDefinedFlags(); inline Flags *flags() { return &xray_flags_dont_use_directly; } void initializeFlags(); diff --git a/lib/xray/xray_flags.inc b/lib/xray/xray_flags.inc index 29f1fce7d7f4..c87903963a36 100644 --- a/lib/xray/xray_flags.inc +++ b/lib/xray/xray_flags.inc @@ -27,23 +27,24 @@ XRAY_FLAG(uptr, xray_page_size_override, 0, XRAY_FLAG(bool, xray_naive_log, false, "DEPRECATED: Use xray_mode=xray-basic instead.") XRAY_FLAG(int, xray_naive_log_func_duration_threshold_us, 5, - "Naive logging will try to skip functions that execute for fewer " - "microseconds than this threshold.") + "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set " + "func_duration_threshold_us instead.") XRAY_FLAG(int, xray_naive_log_max_stack_depth, 64, - "Naive logging will keep track of at most this deep a call stack, " - "any more and the recordings will be droppped.") + "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set " + "max_stack_depth instead.") XRAY_FLAG(int, xray_naive_log_thread_buffer_size, 1024, - "The number of entries to keep on a per-thread buffer.") + "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set " + "thread_buffer_size instead.") // FDR (Flight Data Recorder) Mode logging options. XRAY_FLAG(bool, xray_fdr_log, false, "DEPRECATED: Use xray_mode=xray-fdr instead.") XRAY_FLAG(int, xray_fdr_log_func_duration_threshold_us, 5, - "FDR logging will try to skip functions that execute for fewer " - "microseconds than this threshold.") + "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set " + "func_duration_threshold_us instead.") XRAY_FLAG(int, xray_fdr_log_grace_period_us, 0, - "DEPRECATED: use xray_fdr_log_grace_period_ms instead.") + "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set " + "grace_period_ms instead.") XRAY_FLAG(int, xray_fdr_log_grace_period_ms, 100, - "FDR logging will wait this much time in microseconds before " - "actually flushing the log; this gives a chance for threads to " - "notice that the log has been finalized and clean up.") + "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set " + "grace_period_ms instead.") diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h new file mode 100644 index 000000000000..2acf14aa5625 --- /dev/null +++ b/lib/xray/xray_function_call_trie.h @@ -0,0 +1,455 @@ +//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a function call trie. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FUNCTION_CALL_TRIE_H +#define XRAY_FUNCTION_CALL_TRIE_H + +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "xray_profiling_flags.h" +#include "xray_segmented_array.h" +#include <memory> // For placement new. +#include <utility> + +namespace __xray { + +/// A FunctionCallTrie represents the stack traces of XRay instrumented +/// functions that we've encountered, where a node corresponds to a function and +/// the path from the root to the node its stack trace. Each node in the trie +/// will contain some useful values, including: +/// +/// * The cumulative amount of time spent in this particular node/stack. +/// * The number of times this stack has appeared. +/// * A histogram of latencies for that particular node. +/// +/// Each node in the trie will also contain a list of callees, represented using +/// a Array<NodeIdPair> -- each NodeIdPair instance will contain the function +/// ID of the callee, and a pointer to the node. +/// +/// If we visualise this data structure, we'll find the following potential +/// representation: +/// +/// [function id node] -> [callees] [cumulative time] +/// [call counter] [latency histogram] +/// +/// As an example, when we have a function in this pseudocode: +/// +/// func f(N) { +/// g() +/// h() +/// for i := 1..N { j() } +/// } +/// +/// We may end up with a trie of the following form: +/// +/// f -> [ g, h, j ] [...] [1] [...] +/// g -> [ ... ] [...] [1] [...] +/// h -> [ ... ] [...] [1] [...] +/// j -> [ ... ] [...] [N] [...] +/// +/// If for instance the function g() called j() like so: +/// +/// func g() { +/// for i := 1..10 { j() } +/// } +/// +/// We'll find the following updated trie: +/// +/// f -> [ g, h, j ] [...] [1] [...] +/// g -> [ j' ] [...] [1] [...] +/// h -> [ ... ] [...] [1] [...] +/// j -> [ ... ] [...] [N] [...] +/// j' -> [ ... ] [...] [10] [...] +/// +/// Note that we'll have a new node representing the path `f -> g -> j'` with +/// isolated data. This isolation gives us a means of representing the stack +/// traces as a path, as opposed to a key in a table. The alternative +/// implementation here would be to use a separate table for the path, and use +/// hashes of the path as an identifier to accumulate the information. We've +/// moved away from this approach as it takes a lot of time to compute the hash +/// every time we need to update a function's call information as we're handling +/// the entry and exit events. +/// +/// This approach allows us to maintain a shadow stack, which represents the +/// currently executing path, and on function exits quickly compute the amount +/// of time elapsed from the entry, then update the counters for the node +/// already represented in the trie. This necessitates an efficient +/// representation of the various data structures (the list of callees must be +/// cache-aware and efficient to look up, and the histogram must be compact and +/// quick to update) to enable us to keep the overheads of this implementation +/// to the minimum. +class FunctionCallTrie { +public: + struct Node; + + // We use a NodeIdPair type instead of a std::pair<...> to not rely on the + // standard library types in this header. + struct NodeIdPair { + Node *NodePtr; + int32_t FId; + + // Constructor for inplace-construction. + NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} + }; + + using NodeIdPairArray = Array<NodeIdPair>; + using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType; + + // A Node in the FunctionCallTrie gives us a list of callees, the cumulative + // number of times this node actually appeared, the cumulative amount of time + // for this particular node including its children call times, and just the + // local time spent on this node. Each Node will have the ID of the XRay + // instrumented function that it is associated to. + struct Node { + Node *Parent; + NodeIdPairArray Callees; + int64_t CallCount; + int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. + int32_t FId; + + // We add a constructor here to allow us to inplace-construct through + // Array<...>'s AppendEmplace. + Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT, + int32_t F) + : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT), + FId(F) {} + + // TODO: Include the compact histogram. + }; + +private: + struct ShadowStackEntry { + uint64_t EntryTSC; + Node *NodePtr; + + // We add a constructor here to allow us to inplace-construct through + // Array<...>'s AppendEmplace. + ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {} + }; + + using NodeArray = Array<Node>; + using RootArray = Array<Node *>; + using ShadowStackArray = Array<ShadowStackEntry>; + +public: + // We collate the allocators we need into a single struct, as a convenience to + // allow us to initialize these as a group. + struct Allocators { + using NodeAllocatorType = NodeArray::AllocatorType; + using RootAllocatorType = RootArray::AllocatorType; + using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + + NodeAllocatorType *NodeAllocator = nullptr; + RootAllocatorType *RootAllocator = nullptr; + ShadowStackAllocatorType *ShadowStackAllocator = nullptr; + NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + + Allocators() {} + Allocators(const Allocators &) = delete; + Allocators &operator=(const Allocators &) = delete; + + Allocators(Allocators &&O) + : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator), + ShadowStackAllocator(O.ShadowStackAllocator), + NodeIdPairAllocator(O.NodeIdPairAllocator) { + O.NodeAllocator = nullptr; + O.RootAllocator = nullptr; + O.ShadowStackAllocator = nullptr; + O.NodeIdPairAllocator = nullptr; + } + + Allocators &operator=(Allocators &&O) { + { + auto Tmp = O.NodeAllocator; + O.NodeAllocator = this->NodeAllocator; + this->NodeAllocator = Tmp; + } + { + auto Tmp = O.RootAllocator; + O.RootAllocator = this->RootAllocator; + this->RootAllocator = Tmp; + } + { + auto Tmp = O.ShadowStackAllocator; + O.ShadowStackAllocator = this->ShadowStackAllocator; + this->ShadowStackAllocator = Tmp; + } + { + auto Tmp = O.NodeIdPairAllocator; + O.NodeIdPairAllocator = this->NodeIdPairAllocator; + this->NodeIdPairAllocator = Tmp; + } + return *this; + } + + ~Allocators() { + // Note that we cannot use delete on these pointers, as they need to be + // returned to the sanitizer_common library's internal memory tracking + // system. + if (NodeAllocator != nullptr) { + NodeAllocator->~NodeAllocatorType(); + InternalFree(NodeAllocator); + NodeAllocator = nullptr; + } + if (RootAllocator != nullptr) { + RootAllocator->~RootAllocatorType(); + InternalFree(RootAllocator); + RootAllocator = nullptr; + } + if (ShadowStackAllocator != nullptr) { + ShadowStackAllocator->~ShadowStackAllocatorType(); + InternalFree(ShadowStackAllocator); + ShadowStackAllocator = nullptr; + } + if (NodeIdPairAllocator != nullptr) { + NodeIdPairAllocator->~NodeIdPairAllocatorType(); + InternalFree(NodeIdPairAllocator); + NodeIdPairAllocator = nullptr; + } + } + }; + + // TODO: Support configuration of options through the arguments. + static Allocators InitAllocators() { + return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); + } + + static Allocators InitAllocatorsCustom(uptr Max) { + Allocators A; + auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>( + InternalAlloc(sizeof(Allocators::NodeAllocatorType))); + new (NodeAllocator) Allocators::NodeAllocatorType(Max); + A.NodeAllocator = NodeAllocator; + + auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>( + InternalAlloc(sizeof(Allocators::RootAllocatorType))); + new (RootAllocator) Allocators::RootAllocatorType(Max); + A.RootAllocator = RootAllocator; + + auto ShadowStackAllocator = + reinterpret_cast<Allocators::ShadowStackAllocatorType *>( + InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); + new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max); + A.ShadowStackAllocator = ShadowStackAllocator; + + auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + InternalAlloc(sizeof(NodeIdPairAllocatorType))); + new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max); + A.NodeIdPairAllocator = NodeIdPairAllocator; + return A; + } + +private: + NodeArray Nodes; + RootArray Roots; + ShadowStackArray ShadowStack; + NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + +public: + explicit FunctionCallTrie(const Allocators &A) + : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), + ShadowStack(*A.ShadowStackAllocator), + NodeIdPairAllocator(A.NodeIdPairAllocator) {} + + void enterFunction(const int32_t FId, uint64_t TSC) { + DCHECK_NE(FId, 0); + // This function primarily deals with ensuring that the ShadowStack is + // consistent and ready for when an exit event is encountered. + if (UNLIKELY(ShadowStack.empty())) { + auto NewRoot = + Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId); + if (UNLIKELY(NewRoot == nullptr)) + return; + Roots.Append(NewRoot); + ShadowStack.AppendEmplace(TSC, NewRoot); + return; + } + + auto &Top = ShadowStack.back(); + auto TopNode = Top.NodePtr; + DCHECK_NE(TopNode, nullptr); + + // If we've seen this callee before, then we just access that node and place + // that on the top of the stack. + auto Callee = TopNode->Callees.find_element( + [FId](const NodeIdPair &NR) { return NR.FId == FId; }); + if (Callee != nullptr) { + CHECK_NE(Callee->NodePtr, nullptr); + ShadowStack.AppendEmplace(TSC, Callee->NodePtr); + return; + } + + // This means we've never seen this stack before, create a new node here. + auto NewNode = + Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId); + if (UNLIKELY(NewNode == nullptr)) + return; + DCHECK_NE(NewNode, nullptr); + TopNode->Callees.AppendEmplace(NewNode, FId); + ShadowStack.AppendEmplace(TSC, NewNode); + DCHECK_NE(ShadowStack.back().NodePtr, nullptr); + return; + } + + void exitFunction(int32_t FId, uint64_t TSC) { + // When we exit a function, we look up the ShadowStack to see whether we've + // entered this function before. We do as little processing here as we can, + // since most of the hard work would have already been done at function + // entry. + uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { + const auto &Top = ShadowStack.back(); + auto TopNode = Top.NodePtr; + DCHECK_NE(TopNode, nullptr); + auto LocalTime = TSC - Top.EntryTSC; + TopNode->CallCount++; + TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; + CumulativeTreeTime += LocalTime; + ShadowStack.trim(1); + + // TODO: Update the histogram for the node. + if (TopNode->FId == FId) + break; + } + } + + const RootArray &getRoots() const { return Roots; } + + // The deepCopyInto operation will update the provided FunctionCallTrie by + // re-creating the contents of this particular FunctionCallTrie in the other + // FunctionCallTrie. It will do this using a Depth First Traversal from the + // roots, and while doing so recreating the traversal in the provided + // FunctionCallTrie. + // + // This operation will *not* destroy the state in `O`, and thus may cause some + // duplicate entries in `O` if it is not empty. + // + // This function is *not* thread-safe, and may require external + // synchronisation of both "this" and |O|. + // + // This function must *not* be called with a non-empty FunctionCallTrie |O|. + void deepCopyInto(FunctionCallTrie &O) const { + DCHECK(O.getRoots().empty()); + + // We then push the root into a stack, to use as the parent marker for new + // nodes we push in as we're traversing depth-first down the call tree. + struct NodeAndParent { + FunctionCallTrie::Node *Node; + FunctionCallTrie::Node *NewNode; + }; + using Stack = Array<NodeAndParent>; + + typename Stack::AllocatorType StackAllocator( + profilingFlags()->stack_allocator_max); + Stack DFSStack(StackAllocator); + + for (const auto Root : getRoots()) { + // Add a node in O for this root. + auto NewRoot = O.Nodes.AppendEmplace( + nullptr, *O.NodeIdPairAllocator, Root->CallCount, + Root->CumulativeLocalTime, Root->FId); + + // Because we cannot allocate more memory we should bail out right away. + if (UNLIKELY(NewRoot == nullptr)) + return; + + O.Roots.Append(NewRoot); + + // TODO: Figure out what to do if we fail to allocate any more stack + // space. Maybe warn or report once? + DFSStack.AppendEmplace(Root, NewRoot); + while (!DFSStack.empty()) { + NodeAndParent NP = DFSStack.back(); + DCHECK_NE(NP.Node, nullptr); + DCHECK_NE(NP.NewNode, nullptr); + DFSStack.trim(1); + for (const auto Callee : NP.Node->Callees) { + auto NewNode = O.Nodes.AppendEmplace( + NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, + Callee.NodePtr->CumulativeLocalTime, Callee.FId); + if (UNLIKELY(NewNode == nullptr)) + return; + NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); + DFSStack.AppendEmplace(Callee.NodePtr, NewNode); + } + } + } + } + + // The mergeInto operation will update the provided FunctionCallTrie by + // traversing the current trie's roots and updating (i.e. merging) the data in + // the nodes with the data in the target's nodes. If the node doesn't exist in + // the provided trie, we add a new one in the right position, and inherit the + // data from the original (current) trie, along with all its callees. + // + // This function is *not* thread-safe, and may require external + // synchronisation of both "this" and |O|. + void mergeInto(FunctionCallTrie &O) const { + struct NodeAndTarget { + FunctionCallTrie::Node *OrigNode; + FunctionCallTrie::Node *TargetNode; + }; + using Stack = Array<NodeAndTarget>; + typename Stack::AllocatorType StackAllocator( + profilingFlags()->stack_allocator_max); + Stack DFSStack(StackAllocator); + + for (const auto Root : getRoots()) { + Node *TargetRoot = nullptr; + auto R = O.Roots.find_element( + [&](const Node *Node) { return Node->FId == Root->FId; }); + if (R == nullptr) { + TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0, + 0, Root->FId); + if (UNLIKELY(TargetRoot == nullptr)) + return; + + O.Roots.Append(TargetRoot); + } else { + TargetRoot = *R; + } + + DFSStack.Append(NodeAndTarget{Root, TargetRoot}); + while (!DFSStack.empty()) { + NodeAndTarget NT = DFSStack.back(); + DCHECK_NE(NT.OrigNode, nullptr); + DCHECK_NE(NT.TargetNode, nullptr); + DFSStack.trim(1); + // TODO: Update the histogram as well when we have it ready. + NT.TargetNode->CallCount += NT.OrigNode->CallCount; + NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime; + for (const auto Callee : NT.OrigNode->Callees) { + auto TargetCallee = NT.TargetNode->Callees.find_element( + [&](const FunctionCallTrie::NodeIdPair &C) { + return C.FId == Callee.FId; + }); + if (TargetCallee == nullptr) { + auto NewTargetNode = O.Nodes.AppendEmplace( + NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId); + + if (UNLIKELY(NewTargetNode == nullptr)) + return; + + TargetCallee = + NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId); + } + DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr); + } + } + } + } +}; + +} // namespace __xray + +#endif // XRAY_FUNCTION_CALL_TRIE_H diff --git a/lib/xray/xray_init.cc b/lib/xray/xray_init.cc index 11892cb8b7a3..b4e069795195 100644 --- a/lib/xray/xray_init.cc +++ b/lib/xray/xray_init.cc @@ -38,32 +38,29 @@ using namespace __xray; // // FIXME: Support DSO instrumentation maps too. The current solution only works // for statically linked executables. -__sanitizer::atomic_uint8_t XRayInitialized{0}; +atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. -__sanitizer::SpinMutex XRayInstrMapMutex; +SpinMutex XRayInstrMapMutex; XRaySledMap XRayInstrMap; // Global flag to determine whether the flags have been initialized. -__sanitizer::atomic_uint8_t XRayFlagsInitialized{0}; +atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. -__sanitizer::SpinMutex XRayInitMutex; +SpinMutex XRayInitMutex; // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. void __xray_init() XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayInitMutex); + SpinMutexLock Guard(&XRayInitMutex); // Short-circuit if we've already initialized XRay before. - if (__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) + if (atomic_load(&XRayInitialized, memory_order_acquire)) return; - if (!__sanitizer::atomic_load(&XRayFlagsInitialized, - __sanitizer::memory_order_acquire)) { + if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) { initializeFlags(); - __sanitizer::atomic_store(&XRayFlagsInitialized, true, - __sanitizer::memory_order_release); + atomic_store(&XRayFlagsInitialized, true, memory_order_release); } if (__start_xray_instr_map == nullptr) { @@ -73,14 +70,13 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { } { - __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + SpinMutexLock Guard(&XRayInstrMapMutex); XRayInstrMap.Sleds = __start_xray_instr_map; XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; XRayInstrMap.SledsIndex = __start_xray_fn_idx; XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; } - __sanitizer::atomic_store(&XRayInitialized, true, - __sanitizer::memory_order_release); + atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT if (flags()->patch_premain) @@ -88,7 +84,13 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } -#if !defined(XRAY_NO_PREINIT) && SANITIZER_CAN_USE_PREINIT_ARRAY +// FIXME: Make check-xray tests work on FreeBSD without +// SANITIZER_CAN_USE_PREINIT_ARRAY. +// See sanitizer_internal_defs.h where the macro is defined. +// Calling unresolved PLT functions in .preinit_array can lead to deadlock on +// FreeBSD but here it seems benign. +#if !defined(XRAY_NO_PREINIT) && \ + (SANITIZER_CAN_USE_PREINIT_ARRAY || SANITIZER_FREEBSD) // Only add the preinit array initialization if the sanitizers can. __attribute__((section(".preinit_array"), used)) void (*__local_xray_preinit)(void) = __xray_init; diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc index 766313e85c58..01bf6ddc607e 100644 --- a/lib/xray/xray_interface.cc +++ b/lib/xray/xray_interface.cc @@ -19,9 +19,12 @@ #include <cstdio> #include <errno.h> #include <limits> +#include <string.h> #include <sys/mman.h> +#include "sanitizer_common/sanitizer_addrhashmap.h" #include "sanitizer_common/sanitizer_common.h" + #include "xray_defs.h" #include "xray_flags.h" @@ -48,26 +51,40 @@ static const int16_t cSledLength = 8; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -__sanitizer::atomic_uintptr_t XRayPatchedFunction{0}; +atomic_uintptr_t XRayPatchedFunction{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -__sanitizer::atomic_uintptr_t XRayArgLogger{0}; +atomic_uintptr_t XRayArgLogger{0}; // This is the function to call when we encounter a custom event log call. -__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0}; +atomic_uintptr_t XRayPatchedCustomEvent{0}; + +// This is the function to call when we encounter a typed event log call. +atomic_uintptr_t XRayPatchedTypedEvent{0}; // This is the global status to determine whether we are currently // patching/unpatching. -__sanitizer::atomic_uint8_t XRayPatching{0}; +atomic_uint8_t XRayPatching{0}; + +struct TypeDescription { + uint32_t type_id; + std::size_t description_string_length; +}; + +using TypeDescriptorMapType = AddrHashMap<TypeDescription, 11>; +// An address map from immutable descriptors to type ids. +TypeDescriptorMapType TypeDescriptorAddressMap{}; -// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo -// any successful mprotect(...) changes. This is used to make a page writeable -// and executable, and upon destruction if it was successful in doing so returns -// the page into a read-only and executable page. +atomic_uint32_t TypeEventDescriptorCounter{0}; + +// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will +// undo any successful mprotect(...) changes. This is used to make a page +// writeable and executable, and upon destruction if it was successful in +// doing so returns the page into a read-only and executable page. // // This is only used specifically for runtime-patching of the XRay -// instrumentation points. This assumes that the executable pages are originally -// read-and-execute only. +// instrumentation points. This assumes that the executable pages are +// originally read-and-execute only. class MProtectHelper { void *PageAlignedAddr; std::size_t MProtectLen; @@ -116,6 +133,9 @@ bool patchSled(const XRaySledEntry &Sled, bool Enable, case XRayEntryType::CUSTOM_EVENT: Success = patchCustomEvent(Enable, FuncId, Sled); break; + case XRayEntryType::TYPED_EVENT: + Success = patchTypedEvent(Enable, FuncId, Sled); + break; default: Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind)); return false; @@ -125,19 +145,19 @@ bool patchSled(const XRaySledEntry &Sled, bool Enable, XRayPatchingStatus patchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { - if (!__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; - if (!__sanitizer::atomic_compare_exchange_strong( - &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel)) + if (!atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, memory_order_acq_rel)) return XRayPatchingStatus::ONGOING; // Already patching. // Next, we look for the function index. XRaySledMap InstrMap; { - __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + SpinMutexLock Guard(&XRayInstrMapMutex); InstrMap = XRayInstrMap; } @@ -161,8 +181,8 @@ XRayPatchingStatus patchFunction(int32_t FuncId, while (f != e) SucceedOnce |= patchSled(*f++, Enable, FuncId); - __sanitizer::atomic_store(&XRayPatching, false, - __sanitizer::memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -176,26 +196,26 @@ XRayPatchingStatus patchFunction(int32_t FuncId, // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; - if (!__sanitizer::atomic_compare_exchange_strong( - &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel)) + if (!atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, memory_order_acq_rel)) return XRayPatchingStatus::ONGOING; // Already patching. uint8_t PatchingSuccess = false; auto XRayPatchingStatusResetter = - __sanitizer::at_scope_exit([&PatchingSuccess] { + at_scope_exit([&PatchingSuccess] { if (!PatchingSuccess) - __sanitizer::atomic_store(&XRayPatching, false, - __sanitizer::memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); }); XRaySledMap InstrMap; { - __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + SpinMutexLock Guard(&XRayInstrMapMutex); InstrMap = XRayInstrMap; } if (InstrMap.Entries == 0) @@ -251,8 +271,8 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { } patchSled(Sled, Enable, FuncId); } - __sanitizer::atomic_store(&XRayPatching, false, - __sanitizer::memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); PatchingSuccess = true; return XRayPatchingStatus::SUCCESS; } @@ -261,7 +281,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { - __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + SpinMutexLock Guard(&XRayInstrMapMutex); InstrMap = XRayInstrMap; } @@ -318,12 +338,12 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { - __sanitizer::atomic_store(&__xray::XRayPatchedFunction, + atomic_store(&__xray::XRayPatchedFunction, reinterpret_cast<uintptr_t>(entry), - __sanitizer::memory_order_release); + memory_order_release); return 1; } return 0; @@ -331,11 +351,23 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) { - __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent, + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { + atomic_store(&__xray::XRayPatchedCustomEvent, + reinterpret_cast<uintptr_t>(entry), + memory_order_release); + return 1; + } + return 0; +} + +int __xray_set_typedevent_handler(void (*entry)( + uint16_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { + atomic_store(&__xray::XRayPatchedTypedEvent, reinterpret_cast<uintptr_t>(entry), - __sanitizer::memory_order_release); + memory_order_release); return 1; } return 0; @@ -349,6 +381,21 @@ int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT { return __xray_set_customevent_handler(nullptr); } +int __xray_remove_typedevent_handler() XRAY_NEVER_INSTRUMENT { + return __xray_set_typedevent_handler(nullptr); +} + +uint16_t __xray_register_event_type( + const char *const event_type) XRAY_NEVER_INSTRUMENT { + TypeDescriptorMapType::Handle h(&TypeDescriptorAddressMap, (uptr)event_type); + if (h.created()) { + h->type_id = atomic_fetch_add( + &TypeEventDescriptorCounter, 1, memory_order_acq_rel); + h->description_string_length = strnlen(event_type, 1024); + } + return h->type_id; +} + XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } @@ -367,22 +414,22 @@ __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { } int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { - if (!__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. - __sanitizer::atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry), - __sanitizer::memory_order_release); + atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry), + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + SpinMutexLock Guard(&XRayInstrMapMutex); if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions) return 0; return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function @@ -396,6 +443,6 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + SpinMutexLock Guard(&XRayInstrMapMutex); return XRayInstrMap.Functions; } diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h index 5811e2b7300a..8ca87457437e 100644 --- a/lib/xray/xray_interface_internal.h +++ b/lib/xray/xray_interface_internal.h @@ -43,8 +43,8 @@ struct XRaySledEntry { }; struct XRayFunctionSledIndex { - const XRaySledEntry* Begin; - const XRaySledEntry* End; + const XRaySledEntry *Begin; + const XRaySledEntry *End; }; } @@ -57,12 +57,13 @@ struct XRaySledMap { size_t Functions; }; -bool patchFunctionEntry(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled, void (*Trampoline)()); +bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray @@ -74,6 +75,7 @@ extern void __xray_FunctionExit(); extern void __xray_FunctionTailExit(); extern void __xray_ArgLoggerEntry(); extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); } #endif diff --git a/lib/xray/xray_log_interface.cc b/lib/xray/xray_log_interface.cc index 783f004d292a..0886fd0d1210 100644 --- a/lib/xray/xray_log_interface.cc +++ b/lib/xray/xray_log_interface.cc @@ -18,9 +18,20 @@ #include "xray/xray_interface.h" #include "xray_defs.h" -__sanitizer::SpinMutex XRayImplMutex; -XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr}; -XRayLogImpl *GlobalXRayImpl = nullptr; +namespace __xray { +static SpinMutex XRayImplMutex; +static XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr}; +static XRayLogImpl *GlobalXRayImpl = nullptr; + +// This is the default implementation of a buffer iterator, which always yields +// a null buffer. +XRayBuffer NullBufferIterator(XRayBuffer) XRAY_NEVER_INSTRUMENT { + return {nullptr, 0}; +} + +// This is the global function responsible for iterating through given buffers. +atomic_uintptr_t XRayBufferIterator{ + reinterpret_cast<uintptr_t>(&NullBufferIterator)}; // We use a linked list of Mode to XRayLogImpl mappings. This is a linked list // when it should be a map because we're avoiding having to depend on C++ @@ -31,9 +42,24 @@ struct ModeImpl { XRayLogImpl Impl; }; -ModeImpl SentinelModeImpl{ +static ModeImpl SentinelModeImpl{ nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}}; -ModeImpl *ModeImpls = &SentinelModeImpl; +static ModeImpl *ModeImpls = &SentinelModeImpl; +static const ModeImpl *CurrentMode = nullptr; + +} // namespace __xray + +using namespace __xray; + +void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer)) + XRAY_NEVER_INSTRUMENT { + atomic_store(&__xray::XRayBufferIterator, + reinterpret_cast<uintptr_t>(Iterator), memory_order_release); +} + +void __xray_log_remove_buffer_iterator() XRAY_NEVER_INSTRUMENT { + __xray_log_set_buffer_iterator(&NullBufferIterator); +} XRayLogRegisterStatus __xray_log_register_mode(const char *Mode, @@ -42,16 +68,15 @@ __xray_log_register_mode(const char *Mode, Impl.log_finalize == nullptr || Impl.log_init == nullptr) return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL; - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); // First, look for whether the mode already has a registered implementation. for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) { - if (!__sanitizer::internal_strcmp(Mode, it->Mode)) + if (!internal_strcmp(Mode, it->Mode)) return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE; } - auto *NewModeImpl = - static_cast<ModeImpl *>(__sanitizer::InternalAlloc(sizeof(ModeImpl))); + auto *NewModeImpl = static_cast<ModeImpl *>(InternalAlloc(sizeof(ModeImpl))); NewModeImpl->Next = ModeImpls; - NewModeImpl->Mode = __sanitizer::internal_strdup(Mode); + NewModeImpl->Mode = internal_strdup(Mode); NewModeImpl->Impl = Impl; ModeImpls = NewModeImpl; return XRayLogRegisterStatus::XRAY_REGISTRATION_OK; @@ -59,9 +84,10 @@ __xray_log_register_mode(const char *Mode, XRayLogRegisterStatus __xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) { - if (!__sanitizer::internal_strcmp(Mode, it->Mode)) { + if (!internal_strcmp(Mode, it->Mode)) { + CurrentMode = it; CurrentXRayImpl = it->Impl; GlobalXRayImpl = &CurrentXRayImpl; __xray_set_handler(it->Impl.handle_arg0); @@ -71,24 +97,32 @@ __xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT { return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND; } +const char *__xray_log_get_current_mode() XRAY_NEVER_INSTRUMENT { + SpinMutexLock Guard(&XRayImplMutex); + if (CurrentMode != nullptr) + return CurrentMode->Mode; + return nullptr; +} + void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT { if (Impl.log_init == nullptr || Impl.log_finalize == nullptr || Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) { - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); GlobalXRayImpl = nullptr; + CurrentMode = nullptr; __xray_remove_handler(); __xray_remove_handler_arg1(); return; } - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); CurrentXRayImpl = Impl; GlobalXRayImpl = &CurrentXRayImpl; __xray_set_handler(Impl.handle_arg0); } void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); GlobalXRayImpl = nullptr; __xray_remove_handler(); __xray_remove_handler_arg1(); @@ -97,22 +131,80 @@ void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT { XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers, void *Args, size_t ArgsSize) XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); if (!GlobalXRayImpl) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; return GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize); } +XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config) + XRAY_NEVER_INSTRUMENT { + SpinMutexLock Guard(&XRayImplMutex); + if (!GlobalXRayImpl) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + + if (Config == nullptr) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + + // Check first whether the current mode is the same as what we expect. + if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + + // Here we do some work to coerce the pointer we're provided, so that + // the implementations that still take void* pointers can handle the + // data provided in the Config argument. + return GlobalXRayImpl->log_init( + 0, 0, const_cast<void *>(static_cast<const void *>(Config)), 0); +} + +XRayLogInitStatus +__xray_log_init_mode_bin(const char *Mode, const char *Config, + size_t ConfigSize) XRAY_NEVER_INSTRUMENT { + SpinMutexLock Guard(&XRayImplMutex); + if (!GlobalXRayImpl) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + + if (Config == nullptr) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + + // Check first whether the current mode is the same as what we expect. + if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + + // Here we do some work to coerce the pointer we're provided, so that + // the implementations that still take void* pointers can handle the + // data provided in the Config argument. + return GlobalXRayImpl->log_init( + 0, 0, const_cast<void *>(static_cast<const void *>(Config)), ConfigSize); +} + XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); if (!GlobalXRayImpl) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; return GlobalXRayImpl->log_finalize(); } XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT { - __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + SpinMutexLock Guard(&XRayImplMutex); if (!GlobalXRayImpl) return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; return GlobalXRayImpl->flush_log(); } + +XRayLogFlushStatus __xray_log_process_buffers( + void (*Processor)(const char *, XRayBuffer)) XRAY_NEVER_INSTRUMENT { + // We want to make sure that there will be no changes to the global state for + // the log by synchronising on the XRayBufferIteratorMutex. + if (!GlobalXRayImpl) + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + auto Iterator = reinterpret_cast<XRayBuffer (*)(XRayBuffer)>( + atomic_load(&XRayBufferIterator, memory_order_acquire)); + auto Buffer = (*Iterator)(XRayBuffer{nullptr, 0}); + auto Mode = CurrentMode ? CurrentMode->Mode : nullptr; + while (Buffer.Data != nullptr) { + (*Processor)(Mode, Buffer); + Buffer = (*Iterator)(Buffer); + } + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} diff --git a/lib/xray/xray_mips.cc b/lib/xray/xray_mips.cc index cd863304db29..6f8243828668 100644 --- a/lib/xray/xray_mips.cc +++ b/lib/xray/xray_mips.cc @@ -158,6 +158,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, return false; } +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips? + return false; +} + } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_mips64.cc b/lib/xray/xray_mips64.cc index fa8fdd5abccc..f1bdf1d7d22d 100644 --- a/lib/xray/xray_mips64.cc +++ b/lib/xray/xray_mips64.cc @@ -166,6 +166,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, // FIXME: Implement in mips64? return false; } + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips64? + return false; +} } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_powerpc64.cc b/lib/xray/xray_powerpc64.cc index ab03cb10042f..5e4938361c0c 100644 --- a/lib/xray/xray_powerpc64.cc +++ b/lib/xray/xray_powerpc64.cc @@ -99,6 +99,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, return false; } +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in powerpc64? + return false; +} + } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc new file mode 100644 index 000000000000..a43744d9a0cb --- /dev/null +++ b/lib/xray/xray_profile_collector.cc @@ -0,0 +1,318 @@ +//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the interface for the profileCollectorService. +// +//===----------------------------------------------------------------------===// +#include "xray_profile_collector.h" +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_vector.h" +#include "xray_profiling_flags.h" +#include <memory> +#include <pthread.h> +#include <utility> + +namespace __xray { +namespace profileCollectorService { + +namespace { + +SpinMutex GlobalMutex; +struct ThreadTrie { + tid_t TId; + FunctionCallTrie *Trie; +}; + +struct ProfileBuffer { + void *Data; + size_t Size; +}; + +struct BlockHeader { + u32 BlockSize; + u32 BlockNum; + u64 ThreadId; +}; + +// These need to be pointers that point to heap/internal-allocator-allocated +// objects because these are accessed even at program exit. +Vector<ThreadTrie> *ThreadTries = nullptr; +Vector<ProfileBuffer> *ProfileBuffers = nullptr; +FunctionCallTrie::Allocators *GlobalAllocators = nullptr; + +} // namespace + +void post(const FunctionCallTrie &T, tid_t TId) { + static pthread_once_t Once = PTHREAD_ONCE_INIT; + pthread_once(&Once, +[] { + SpinMutexLock Lock(&GlobalMutex); + GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (GlobalAllocators) FunctionCallTrie::Allocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom( + profilingFlags()->global_allocator_max); + ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( + InternalAlloc(sizeof(Vector<ThreadTrie>))); + new (ThreadTries) Vector<ThreadTrie>(); + ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( + InternalAlloc(sizeof(Vector<ProfileBuffer>))); + new (ProfileBuffers) Vector<ProfileBuffer>(); + }); + DCHECK_NE(GlobalAllocators, nullptr); + DCHECK_NE(ThreadTries, nullptr); + DCHECK_NE(ProfileBuffers, nullptr); + + ThreadTrie *Item = nullptr; + { + SpinMutexLock Lock(&GlobalMutex); + if (GlobalAllocators == nullptr) + return; + + Item = ThreadTries->PushBack(); + Item->TId = TId; + + // Here we're using the internal allocator instead of the managed allocator + // because: + // + // 1) We're not using the segmented array data structure to host + // FunctionCallTrie objects. We're using a Vector (from sanitizer_common) + // which works like a std::vector<...> keeping elements contiguous in + // memory. The segmented array data structure assumes that elements are + // trivially destructible, where FunctionCallTrie isn't. + // + // 2) Using a managed allocator means we need to manage that separately, + // which complicates the nature of this code. To get around that, we're + // using the internal allocator instead, which has its own global state + // and is decoupled from the lifetime management required by the managed + // allocator we have in XRay. + // + Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc( + sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie))); + DCHECK_NE(Item->Trie, nullptr); + new (Item->Trie) FunctionCallTrie(*GlobalAllocators); + } + + T.deepCopyInto(*Item->Trie); +} + +// A PathArray represents the function id's representing a stack trace. In this +// context a path is almost always represented from the leaf function in a call +// stack to a root of the call trie. +using PathArray = Array<int32_t>; + +struct ProfileRecord { + using PathAllocator = typename PathArray::AllocatorType; + + // The Path in this record is the function id's from the leaf to the root of + // the function call stack as represented from a FunctionCallTrie. + PathArray *Path = nullptr; + const FunctionCallTrie::Node *Node = nullptr; + + // Constructor for in-place construction. + ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N) + : Path([&] { + auto P = + reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray))); + new (P) PathArray(A); + return P; + }()), + Node(N) {} +}; + +namespace { + +using ProfileRecordArray = Array<ProfileRecord>; + +// Walk a depth-first traversal of each root of the FunctionCallTrie to generate +// the path(s) and the data associated with the path. +static void populateRecords(ProfileRecordArray &PRs, + ProfileRecord::PathAllocator &PA, + const FunctionCallTrie &Trie) { + using StackArray = Array<const FunctionCallTrie::Node *>; + using StackAllocator = typename StackArray::AllocatorType; + StackAllocator StackAlloc(profilingFlags()->stack_allocator_max); + StackArray DFSStack(StackAlloc); + for (const auto R : Trie.getRoots()) { + DFSStack.Append(R); + while (!DFSStack.empty()) { + auto Node = DFSStack.back(); + DFSStack.trim(1); + auto Record = PRs.AppendEmplace(PA, Node); + if (Record == nullptr) + return; + DCHECK_NE(Record, nullptr); + + // Traverse the Node's parents and as we're doing so, get the FIds in + // the order they appear. + for (auto N = Node; N != nullptr; N = N->Parent) + Record->Path->Append(N->FId); + DCHECK(!Record->Path->empty()); + + for (const auto C : Node->Callees) + DFSStack.Append(C.NodePtr); + } + } +} + +static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, + const ProfileRecordArray &ProfileRecords) { + auto NextPtr = static_cast<char *>( + internal_memcpy(Buffer->Data, &Header, sizeof(Header))) + + sizeof(Header); + for (const auto &Record : ProfileRecords) { + // List of IDs follow: + for (const auto FId : *Record.Path) + NextPtr = + static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) + + sizeof(FId); + + // Add the sentinel here. + constexpr int32_t SentinelFId = 0; + NextPtr = static_cast<char *>( + internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) + + sizeof(SentinelFId); + + // Add the node data here. + NextPtr = + static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount, + sizeof(Record.Node->CallCount))) + + sizeof(Record.Node->CallCount); + NextPtr = static_cast<char *>( + internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime, + sizeof(Record.Node->CumulativeLocalTime))) + + sizeof(Record.Node->CumulativeLocalTime); + } + + DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size); +} + +} // namespace + +void serialize() { + SpinMutexLock Lock(&GlobalMutex); + + // Clear out the global ProfileBuffers. + for (uptr I = 0; I < ProfileBuffers->Size(); ++I) + InternalFree((*ProfileBuffers)[I].Data); + ProfileBuffers->Reset(); + + if (ThreadTries->Size() == 0) + return; + + // Then repopulate the global ProfileBuffers. + for (u32 I = 0; I < ThreadTries->Size(); ++I) { + using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; + ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max); + ProfileRecord::PathAllocator PathAlloc( + profilingFlags()->global_allocator_max); + ProfileRecordArray ProfileRecords(PRAlloc); + + // First, we want to compute the amount of space we're going to need. We'll + // use a local allocator and an __xray::Array<...> to store the intermediary + // data, then compute the size as we're going along. Then we'll allocate the + // contiguous space to contain the thread buffer data. + const auto &Trie = *(*ThreadTries)[I].Trie; + if (Trie.getRoots().empty()) + continue; + populateRecords(ProfileRecords, PathAlloc, Trie); + DCHECK(!Trie.getRoots().empty()); + DCHECK(!ProfileRecords.empty()); + + // Go through each record, to compute the sizes. + // + // header size = block size (4 bytes) + // + block number (4 bytes) + // + thread id (8 bytes) + // record size = path ids (4 bytes * number of ids + sentinel 4 bytes) + // + call count (8 bytes) + // + local time (8 bytes) + // + end of record (8 bytes) + u32 CumulativeSizes = 0; + for (const auto &Record : ProfileRecords) + CumulativeSizes += 20 + (4 * Record.Path->size()); + + BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId}; + auto Buffer = ProfileBuffers->PushBack(); + Buffer->Size = sizeof(Header) + CumulativeSizes; + Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64); + DCHECK_NE(Buffer->Data, nullptr); + serializeRecords(Buffer, Header, ProfileRecords); + + // Now clean up the ProfileRecords array, one at a time. + for (auto &Record : ProfileRecords) { + Record.Path->~PathArray(); + InternalFree(Record.Path); + } + } +} + +void reset() { + SpinMutexLock Lock(&GlobalMutex); + if (ProfileBuffers != nullptr) { + // Clear out the profile buffers that have been serialized. + for (uptr I = 0; I < ProfileBuffers->Size(); ++I) + InternalFree((*ProfileBuffers)[I].Data); + ProfileBuffers->Reset(); + InternalFree(ProfileBuffers); + ProfileBuffers = nullptr; + } + + if (ThreadTries != nullptr) { + // Clear out the function call tries per thread. + for (uptr I = 0; I < ThreadTries->Size(); ++I) { + auto &T = (*ThreadTries)[I]; + T.Trie->~FunctionCallTrie(); + InternalFree(T.Trie); + } + ThreadTries->Reset(); + InternalFree(ThreadTries); + ThreadTries = nullptr; + } + + // Reset the global allocators. + if (GlobalAllocators != nullptr) { + GlobalAllocators->~Allocators(); + InternalFree(GlobalAllocators); + GlobalAllocators = nullptr; + } + GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (GlobalAllocators) FunctionCallTrie::Allocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocators(); + ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( + InternalAlloc(sizeof(Vector<ThreadTrie>))); + new (ThreadTries) Vector<ThreadTrie>(); + ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( + InternalAlloc(sizeof(Vector<ProfileBuffer>))); + new (ProfileBuffers) Vector<ProfileBuffer>(); +} + +XRayBuffer nextBuffer(XRayBuffer B) { + SpinMutexLock Lock(&GlobalMutex); + + if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0) + return {nullptr, 0}; + + if (B.Data == nullptr) + return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size}; + + BlockHeader Header; + internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); + auto NextBlock = Header.BlockNum + 1; + if (NextBlock < ProfileBuffers->Size()) + return {(*ProfileBuffers)[NextBlock].Data, + (*ProfileBuffers)[NextBlock].Size}; + return {nullptr, 0}; +} + +} // namespace profileCollectorService +} // namespace __xray diff --git a/lib/xray/xray_profile_collector.h b/lib/xray/xray_profile_collector.h new file mode 100644 index 000000000000..335043db9526 --- /dev/null +++ b/lib/xray/xray_profile_collector.h @@ -0,0 +1,88 @@ +//===-- xray_profile_collector.h -------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a data collection service, for XRay +// profiling. What we implement here is an in-process service where +// FunctionCallTrie instances can be handed off by threads, to be +// consolidated/collected. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H +#define XRAY_XRAY_PROFILE_COLLECTOR_H + +#include "xray_function_call_trie.h" + +#include "xray/xray_log_interface.h" + +namespace __xray { + +/// The ProfileCollectorService implements a centralised mechanism for +/// collecting FunctionCallTrie instances, indexed by thread ID. On demand, the +/// ProfileCollectorService can be queried for the most recent state of the +/// data, in a form that allows traversal. +namespace profileCollectorService { + +/// Posts the FunctionCallTrie associated with a specific Thread ID. This +/// will: +/// +/// - Make a copy of the FunctionCallTrie and store that against the Thread +/// ID. This will use the global allocator for the service-managed +/// FunctionCallTrie instances. +/// - Queue up a pointer to the FunctionCallTrie. +/// - If the queue is long enough (longer than some arbitrary threshold) we +/// then pre-calculate a single FunctionCallTrie for the whole process. +/// +/// +/// We are making a copy of the FunctionCallTrie because the intent is to have +/// this function be called at thread exit, or soon after the profiling +/// handler is finalized through the XRay APIs. By letting threads each +/// process their own thread-local FunctionCallTrie instances, we're removing +/// the need for synchronisation across threads while we're profiling. +/// However, once we're done profiling, we can then collect copies of these +/// FunctionCallTrie instances and pay the cost of the copy. +/// +/// NOTE: In the future, if this turns out to be more costly than "moving" the +/// FunctionCallTrie instances from the owning thread to the collector +/// service, then we can change the implementation to do it this way (moving) +/// instead. +void post(const FunctionCallTrie &T, tid_t TId); + +/// The serialize will process all FunctionCallTrie instances in memory, and +/// turn those into specifically formatted blocks, each describing the +/// function call trie's contents in a compact form. In memory, this looks +/// like the following layout: +/// +/// - block size (32 bits) +/// - block number (32 bits) +/// - thread id (64 bits) +/// - list of records: +/// - function ids in leaf to root order, terminated by +/// 0 (32 bits per function id) +/// - call count (64 bit) +/// - cumulative local time (64 bit) +/// - record delimiter (64 bit, 0x0) +/// +void serialize(); + +/// The reset function will clear out any internal memory held by the +/// service. The intent is to have the resetting be done in calls to the +/// initialization routine, or explicitly through the flush log API. +void reset(); + +/// This nextBuffer function is meant to implement the iterator functionality, +/// provided in the XRay API. +XRayBuffer nextBuffer(XRayBuffer B); + +} // namespace profileCollectorService + +} // namespace __xray + +#endif // XRAY_XRAY_PROFILE_COLLECTOR_H diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc new file mode 100644 index 000000000000..786084c77226 --- /dev/null +++ b/lib/xray/xray_profiling.cc @@ -0,0 +1,372 @@ +//===-- xray_profiling.cc ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This is the implementation of a profiling handler. +// +//===----------------------------------------------------------------------===// +#include <memory> +#include <time.h> + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_flags.h" +#include "xray/xray_interface.h" +#include "xray/xray_log_interface.h" + +#include "xray_flags.h" +#include "xray_profile_collector.h" +#include "xray_profiling_flags.h" +#include "xray_recursion_guard.h" +#include "xray_tsc.h" +#include "xray_utils.h" +#include <pthread.h> + +namespace __xray { + +namespace { + +constexpr uptr XRayProfilingVersion = 0x20180424; + +struct XRayProfilingFileHeader { + const u64 MagicBytes = 0x7872617970726f66; // Identifier for XRay profiling + // files 'xrayprof' in hex. + const uptr Version = XRayProfilingVersion; + uptr Timestamp = 0; // System time in nanoseconds. + uptr PID = 0; // Process ID. +}; + +atomic_sint32_t ProfilerLogFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +SpinMutex ProfilerOptionsMutex; + +struct alignas(64) ProfilingData { + FunctionCallTrie::Allocators *Allocators = nullptr; + FunctionCallTrie *FCT = nullptr; +}; + +static pthread_key_t ProfilingKey; + +thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{}; +static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local auto ThreadOnce = [] { + new (&ThreadStorage) ProfilingData{}; + pthread_setspecific(ProfilingKey, &ThreadStorage); + return false; + }(); + (void)ThreadOnce; + + auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); + + // We need to check whether the global flag to finalizing/finalized has been + // switched. If it is, then we ought to not actually initialise the data. + auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); + if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) + return TLD; + + // If we're live, then we re-initialize TLD if the pointers are not null. + if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) { + TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (TLD.Allocators) FunctionCallTrie::Allocators(); + *TLD.Allocators = FunctionCallTrie::InitAllocators(); + TLD.FCT = reinterpret_cast<FunctionCallTrie *>( + InternalAlloc(sizeof(FunctionCallTrie))); + new (TLD.FCT) FunctionCallTrie(*TLD.Allocators); + } + + return TLD; +} + +static void cleanupTLD() XRAY_NEVER_INSTRUMENT { + auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); + if (TLD.Allocators != nullptr && TLD.FCT != nullptr) { + TLD.FCT->~FunctionCallTrie(); + TLD.Allocators->~Allocators(); + InternalFree(TLD.FCT); + InternalFree(TLD.Allocators); + TLD.FCT = nullptr; + TLD.Allocators = nullptr; + } +} + +} // namespace + +const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_PROFILER_DEFAULT_OPTIONS + return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS); +#else + return ""; +#endif +} + +atomic_sint32_t ProfileFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { + if (atomic_load(&ProfilerLogStatus, memory_order_acquire) != + XRayLogInitStatus::XRAY_LOG_FINALIZED) { + if (Verbosity()) + Report("Not flushing profiles, profiling not been finalized.\n"); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } + + s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + memory_order_acq_rel)) { + if (Verbosity()) + Report("Not flushing profiles, implementation still finalizing.\n"); + } + + // At this point, we'll create the file that will contain the profile, but + // only if the options say so. + if (!profilingFlags()->no_flush) { + // First check whether we have data in the profile collector service + // before we try and write anything down. + XRayBuffer B = profileCollectorService::nextBuffer({nullptr, 0}); + if (B.Data == nullptr) { + if (Verbosity()) + Report("profiling: No data to flush.\n"); + } else { + int Fd = getLogFD(); + if (Fd == -1) { + if (Verbosity()) + Report("profiling: Failed to flush to file, dropping data.\n"); + } else { + XRayProfilingFileHeader Header; + Header.Timestamp = NanoTime(); + Header.PID = internal_getpid(); + retryingWriteAll(Fd, reinterpret_cast<const char *>(&Header), + reinterpret_cast<const char *>(&Header) + + sizeof(Header)); + + // Now for each of the threads, write out the profile data as we would + // see it in memory, verbatim. + while (B.Data != nullptr && B.Size != 0) { + retryingWriteAll(Fd, reinterpret_cast<const char *>(B.Data), + reinterpret_cast<const char *>(B.Data) + B.Size); + B = profileCollectorService::nextBuffer(B); + } + // Then we close out the file. + internal_close(Fd); + } + } + } + + profileCollectorService::reset(); + + // Flush the current thread's local data structures as well. + cleanupTLD(); + + atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); + + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +namespace { + +thread_local atomic_uint8_t ReentranceGuard{0}; + +static void postCurrentThreadFCT(ProfilingData &TLD) { + if (TLD.Allocators == nullptr || TLD.FCT == nullptr) + return; + + profileCollectorService::post(*TLD.FCT, GetTid()); + cleanupTLD(); +} + +} // namespace + +void profilingHandleArg0(int32_t FuncId, + XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { + unsigned char CPU; + auto TSC = readTSC(CPU); + RecursionGuard G(ReentranceGuard); + if (!G) + return; + + auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); + auto &TLD = getThreadLocalData(); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { + postCurrentThreadFCT(TLD); + return; + } + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.FCT->enterFunction(FuncId, TSC); + break; + case XRayEntryType::EXIT: + case XRayEntryType::TAIL: + TLD.FCT->exitFunction(FuncId, TSC); + break; + default: + // FIXME: Handle bugs. + break; + } +} + +void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry, + uint64_t) XRAY_NEVER_INSTRUMENT { + return profilingHandleArg0(FuncId, Entry); +} + +XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED; + if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_FINALIZING, + memory_order_release)) { + if (Verbosity()) + Report("Cannot finalize profile, the profiling is not initialized.\n"); + return static_cast<XRayLogInitStatus>(CurrentStatus); + } + + // Wait a grace period to allow threads to see that we're finalizing. + SleepForMillis(profilingFlags()->grace_period_ms); + + // We also want to make sure that the current thread's data is cleaned up, + // if we have any. + auto &TLD = getThreadLocalData(); + postCurrentThreadFCT(TLD); + + // Then we force serialize the log data. + profileCollectorService::serialize(); + + atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_FINALIZED; +} + +XRayLogInitStatus +profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, + size_t OptionsSize) XRAY_NEVER_INSTRUMENT { + if (BufferSize != 0 || BufferMax != 0) { + if (Verbosity()) + Report("__xray_log_init() being used, and is unsupported. Use " + "__xray_log_init_mode(...) instead. Bailing out."); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZING, + memory_order_release)) { + if (Verbosity()) + Report("Cannot initialize already initialised profiling " + "implementation.\n"); + return static_cast<XRayLogInitStatus>(CurrentStatus); + } + + { + SpinMutexLock Lock(&ProfilerOptionsMutex); + FlagParser ConfigParser; + ProfilerFlags Flags; + Flags.setDefaults(); + registerProfilerFlags(&ConfigParser, &Flags); + ConfigParser.ParseString(profilingCompilerDefinedFlags()); + const char *Env = GetEnv("XRAY_PROFILING_OPTIONS"); + if (Env == nullptr) + Env = ""; + ConfigParser.ParseString(Env); + + // Then parse the configuration string provided. + ConfigParser.ParseString(static_cast<const char *>(Options)); + if (Verbosity()) + ReportUnrecognizedFlags(); + *profilingFlags() = Flags; + } + + // We need to reset the profile data collection implementation now. + profileCollectorService::reset(); + + // We need to set up the exit handlers. + static pthread_once_t Once = PTHREAD_ONCE_INIT; + pthread_once(&Once, +[] { + pthread_key_create(&ProfilingKey, +[](void *P) { + // This is the thread-exit handler. + auto &TLD = *reinterpret_cast<ProfilingData *>(P); + if (TLD.Allocators == nullptr && TLD.FCT == nullptr) + return; + + postCurrentThreadFCT(TLD); + }); + + // We also need to set up an exit handler, so that we can get the profile + // information at exit time. We use the C API to do this, to not rely on C++ + // ABI functions for registering exit handlers. + Atexit(+[] { + // Finalize and flush. + if (profilingFinalize() != XRAY_LOG_FINALIZED) { + cleanupTLD(); + return; + } + if (profilingFlush() != XRAY_LOG_FLUSHED) { + cleanupTLD(); + return; + } + if (Verbosity()) + Report("XRay Profile flushed at exit."); + }); + }); + + __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); + __xray_set_handler(profilingHandleArg0); + __xray_set_handler_arg1(profilingHandleArg1); + + atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED, + memory_order_release); + if (Verbosity()) + Report("XRay Profiling init successful.\n"); + + return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT { + // Set up the flag defaults from the static defaults and the + // compiler-provided defaults. + { + SpinMutexLock Lock(&ProfilerOptionsMutex); + auto *F = profilingFlags(); + F->setDefaults(); + FlagParser ProfilingParser; + registerProfilerFlags(&ProfilingParser, F); + ProfilingParser.ParseString(profilingCompilerDefinedFlags()); + } + + XRayLogImpl Impl{ + profilingLoggingInit, + profilingFinalize, + profilingHandleArg0, + profilingFlush, + }; + auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl); + if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) { + if (Verbosity()) + Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = " + "%d\n", + RegistrationResult); + return false; + } + + if (!internal_strcmp(flags()->xray_mode, "xray-profiling")) + __xray_log_select_mode("xray_profiling"); + return true; +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::profilingDynamicInitializer(); diff --git a/lib/xray/xray_profiling_flags.cc b/lib/xray/xray_profiling_flags.cc new file mode 100644 index 000000000000..593e66a78ad2 --- /dev/null +++ b/lib/xray/xray_profiling_flags.cc @@ -0,0 +1,40 @@ +//===-- xray_flags.h -------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay runtime flags. +//===----------------------------------------------------------------------===// + +#include "xray_profiling_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +namespace __xray { + +// Storage for the profiling flags. +ProfilerFlags xray_profiling_flags_dont_use_directly; + +void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_profiling_flags.inc" +#undef XRAY_FLAG +} + +void registerProfilerFlags(FlagParser *P, + ProfilerFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_profiling_flags.inc" +#undef XRAY_FLAG +} + +} // namespace __xray diff --git a/lib/xray/xray_profiling_flags.h b/lib/xray/xray_profiling_flags.h new file mode 100644 index 000000000000..2f9a7514799a --- /dev/null +++ b/lib/xray/xray_profiling_flags.h @@ -0,0 +1,39 @@ +//===-- xray_profiling_flags.h ----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay profiling runtime flags. +//===----------------------------------------------------------------------===// + +#ifndef XRAY_PROFILER_FLAGS_H +#define XRAY_PROFILER_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct ProfilerFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_profiling_flags.inc" +#undef XRAY_FLAG + + void setDefaults(); +}; + +extern ProfilerFlags xray_profiling_flags_dont_use_directly; +inline ProfilerFlags *profilingFlags() { + return &xray_profiling_flags_dont_use_directly; +} +void registerProfilerFlags(FlagParser *P, ProfilerFlags *F); + +} // namespace __xray + +#endif // XRAY_PROFILER_FLAGS_H diff --git a/lib/xray/xray_profiling_flags.inc b/lib/xray/xray_profiling_flags.inc new file mode 100644 index 000000000000..e9230ae64187 --- /dev/null +++ b/lib/xray/xray_profiling_flags.inc @@ -0,0 +1,29 @@ +//===-- xray_profiling_flags.inc --------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// XRay profiling runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20, + "Maximum size of any single per-thread allocator.") +XRAY_FLAG(uptr, global_allocator_max, 2 << 24, + "Maximum size of the global allocator for profile storage.") +XRAY_FLAG(uptr, stack_allocator_max, 2 << 20, + "Maximum size of the traversal stack allocator.") +XRAY_FLAG(int, grace_period_ms, 1, + "Profile collection will wait this much time in milliseconds before " + "resetting the global state. This gives a chance to threads to " + "notice that the profiler has been finalized and clean up.") +XRAY_FLAG(bool, no_flush, false, + "Set to true if we want the profiling implementation to not write " + "out files.") diff --git a/lib/xray/xray_recursion_guard.h b/lib/xray/xray_recursion_guard.h new file mode 100644 index 000000000000..6edadea563bc --- /dev/null +++ b/lib/xray/xray_recursion_guard.h @@ -0,0 +1,57 @@ +//===-- xray_recursion_guard.h ---------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_RECURSION_GUARD_H +#define XRAY_XRAY_RECURSION_GUARD_H + +#include "sanitizer_common/sanitizer_atomic.h" + +namespace __xray { + +/// The RecursionGuard is useful for guarding against signal handlers which are +/// also potentially calling XRay-instrumented functions. To use the +/// RecursionGuard, you'll typically need a thread_local atomic_uint8_t: +/// +/// thread_local atomic_uint8_t Guard{0}; +/// +/// // In a handler function: +/// void handleArg0(int32_t F, XRayEntryType T) { +/// RecursionGuard G(Guard); +/// if (!G) +/// return; // Failed to acquire the guard. +/// ... +/// } +/// +class RecursionGuard { + atomic_uint8_t &Running; + const bool Valid; + +public: + explicit inline RecursionGuard(atomic_uint8_t &R) + : Running(R), Valid(!atomic_exchange(&R, 1, memory_order_acq_rel)) {} + + inline RecursionGuard(const RecursionGuard &) = delete; + inline RecursionGuard(RecursionGuard &&) = delete; + inline RecursionGuard &operator=(const RecursionGuard &) = delete; + inline RecursionGuard &operator=(RecursionGuard &&) = delete; + + explicit inline operator bool() const { return Valid; } + + inline ~RecursionGuard() noexcept { + if (Valid) + atomic_store(&Running, 0, memory_order_release); + } +}; + +} // namespace __xray + +#endif // XRAY_XRAY_RECURSION_GUARD_H diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h new file mode 100644 index 000000000000..11dd794fa520 --- /dev/null +++ b/lib/xray/xray_segmented_array.h @@ -0,0 +1,375 @@ +//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the implementation of a segmented array, with fixed-size segments +// backing the segments. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_SEGMENTED_ARRAY_H +#define XRAY_SEGMENTED_ARRAY_H + +#include "sanitizer_common/sanitizer_allocator.h" +#include "xray_allocator.h" +#include "xray_utils.h" +#include <cassert> +#include <type_traits> +#include <utility> + +namespace __xray { + +/// The Array type provides an interface similar to std::vector<...> but does +/// not shrink in size. Once constructed, elements can be appended but cannot be +/// removed. The implementation is heavily dependent on the contract provided by +/// the Allocator type, in that all memory will be released when the Allocator +/// is destroyed. When an Array is destroyed, it will destroy elements in the +/// backing store but will not free the memory. +template <class T> class Array { + struct SegmentBase { + SegmentBase *Prev; + SegmentBase *Next; + }; + + // We want each segment of the array to be cache-line aligned, and elements of + // the array be offset from the beginning of the segment. + struct Segment : SegmentBase { + char Data[1]; + }; + +public: + // Each segment of the array will be laid out with the following assumptions: + // + // - Each segment will be on a cache-line address boundary (kCacheLineSize + // aligned). + // + // - The elements will be accessed through an aligned pointer, dependent on + // the alignment of T. + // + // - Each element is at least two-pointers worth from the beginning of the + // Segment, aligned properly, and the rest of the elements are accessed + // through appropriate alignment. + // + // We then compute the size of the segment to follow this logic: + // + // - Compute the number of elements that can fit within + // kCacheLineSize-multiple segments, minus the size of two pointers. + // + // - Request cacheline-multiple sized elements from the allocator. + static constexpr size_t AlignedElementStorageSize = + sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type); + + static constexpr size_t SegmentSize = + nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize); + + using AllocatorType = Allocator<SegmentSize>; + + static constexpr size_t ElementsPerSegment = + (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T)); + + static_assert(ElementsPerSegment > 0, + "Must have at least 1 element per segment."); + + static SegmentBase SentinelSegment; + +private: + AllocatorType *Alloc; + SegmentBase *Head = &SentinelSegment; + SegmentBase *Tail = &SentinelSegment; + size_t Size = 0; + + // Here we keep track of segments in the freelist, to allow us to re-use + // segments when elements are trimmed off the end. + SegmentBase *Freelist = &SentinelSegment; + + Segment *NewSegment() { + // We need to handle the case in which enough elements have been trimmed to + // allow us to re-use segments we've allocated before. For this we look into + // the Freelist, to see whether we need to actually allocate new blocks or + // just re-use blocks we've already seen before. + if (Freelist != &SentinelSegment) { + auto *FreeSegment = Freelist; + Freelist = FreeSegment->Next; + FreeSegment->Next = &SentinelSegment; + Freelist->Prev = &SentinelSegment; + return static_cast<Segment *>(FreeSegment); + } + + auto SegmentBlock = Alloc->Allocate(); + if (SegmentBlock.Data == nullptr) + return nullptr; + + // Placement-new the Segment element at the beginning of the SegmentBlock. + auto S = reinterpret_cast<Segment *>(SegmentBlock.Data); + new (S) SegmentBase{&SentinelSegment, &SentinelSegment}; + return S; + } + + Segment *InitHeadAndTail() { + DCHECK_EQ(Head, &SentinelSegment); + DCHECK_EQ(Tail, &SentinelSegment); + auto Segment = NewSegment(); + if (Segment == nullptr) + return nullptr; + DCHECK_EQ(Segment->Next, &SentinelSegment); + DCHECK_EQ(Segment->Prev, &SentinelSegment); + Head = Tail = static_cast<SegmentBase *>(Segment); + return Segment; + } + + Segment *AppendNewSegment() { + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_NE(Tail, &SentinelSegment); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_EQ(S->Next, &SentinelSegment); + Tail->Next = S; + S->Prev = Tail; + Tail = S; + return static_cast<Segment *>(Tail); + } + + // This Iterator models a BidirectionalIterator. + template <class U> class Iterator { + SegmentBase *S = &SentinelSegment; + size_t Offset = 0; + size_t Size = 0; + + public: + Iterator(SegmentBase *IS, size_t Off, size_t S) + : S(IS), Offset(Off), Size(S) {} + Iterator(const Iterator &) noexcept = default; + Iterator() noexcept = default; + Iterator(Iterator &&) noexcept = default; + Iterator &operator=(const Iterator &) = default; + Iterator &operator=(Iterator &&) = default; + ~Iterator() = default; + + Iterator &operator++() { + if (++Offset % ElementsPerSegment || Offset == Size) + return *this; + + // At this point, we know that Offset % N == 0, so we must advance the + // segment pointer. + DCHECK_EQ(Offset % ElementsPerSegment, 0); + DCHECK_NE(Offset, Size); + DCHECK_NE(S, &SentinelSegment); + DCHECK_NE(S->Next, &SentinelSegment); + S = S->Next; + DCHECK_NE(S, &SentinelSegment); + return *this; + } + + Iterator &operator--() { + DCHECK_NE(S, &SentinelSegment); + DCHECK_GT(Offset, 0); + + auto PreviousOffset = Offset--; + if (PreviousOffset != Size && PreviousOffset % ElementsPerSegment == 0) { + DCHECK_NE(S->Prev, &SentinelSegment); + S = S->Prev; + } + + return *this; + } + + Iterator operator++(int) { + Iterator Copy(*this); + ++(*this); + return Copy; + } + + Iterator operator--(int) { + Iterator Copy(*this); + --(*this); + return Copy; + } + + template <class V, class W> + friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) { + return L.S == R.S && L.Offset == R.Offset; + } + + template <class V, class W> + friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) { + return !(L == R); + } + + U &operator*() const { + DCHECK_NE(S, &SentinelSegment); + auto RelOff = Offset % ElementsPerSegment; + + // We need to compute the character-aligned pointer, offset from the + // segment's Data location to get the element in the position of Offset. + auto Base = static_cast<Segment *>(S)->Data; + auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize); + return *reinterpret_cast<U *>(AlignedOffset); + } + + U *operator->() const { return &(**this); } + }; + +public: + explicit Array(AllocatorType &A) : Alloc(&A) {} + + Array(const Array &) = delete; + Array(Array &&O) NOEXCEPT : Alloc(O.Alloc), + Head(O.Head), + Tail(O.Tail), + Size(O.Size) { + O.Head = &SentinelSegment; + O.Tail = &SentinelSegment; + O.Size = 0; + } + + bool empty() const { return Size == 0; } + + AllocatorType &allocator() const { + DCHECK_NE(Alloc, nullptr); + return *Alloc; + } + + size_t size() const { return Size; } + + T *Append(const T &E) { + if (UNLIKELY(Head == &SentinelSegment)) + if (InitHeadAndTail() == nullptr) + return nullptr; + + auto Offset = Size % ElementsPerSegment; + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewSegment() == nullptr) + return nullptr; + + auto Base = static_cast<Segment *>(Tail)->Data; + auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); + auto Position = reinterpret_cast<T *>(AlignedOffset); + *Position = E; + ++Size; + return Position; + } + + template <class... Args> T *AppendEmplace(Args &&... args) { + if (UNLIKELY(Head == &SentinelSegment)) + if (InitHeadAndTail() == nullptr) + return nullptr; + + auto Offset = Size % ElementsPerSegment; + auto *LatestSegment = Tail; + if (UNLIKELY(Size != 0 && Offset == 0)) { + LatestSegment = AppendNewSegment(); + if (LatestSegment == nullptr) + return nullptr; + } + + DCHECK_NE(Tail, &SentinelSegment); + auto Base = static_cast<Segment *>(LatestSegment)->Data; + auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); + auto Position = reinterpret_cast<T *>(AlignedOffset); + + // In-place construct at Position. + new (Position) T{std::forward<Args>(args)...}; + ++Size; + return reinterpret_cast<T *>(Position); + } + + T &operator[](size_t Offset) const { + DCHECK_LE(Offset, Size); + // We need to traverse the array enough times to find the element at Offset. + auto S = Head; + while (Offset >= ElementsPerSegment) { + S = S->Next; + Offset -= ElementsPerSegment; + DCHECK_NE(S, &SentinelSegment); + } + auto Base = static_cast<Segment *>(S)->Data; + auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); + auto Position = reinterpret_cast<T *>(AlignedOffset); + return *reinterpret_cast<T *>(Position); + } + + T &front() const { + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Size, 0u); + return *begin(); + } + + T &back() const { + DCHECK_NE(Tail, &SentinelSegment); + DCHECK_NE(Size, 0u); + auto It = end(); + --It; + return *It; + } + + template <class Predicate> T *find_element(Predicate P) const { + if (empty()) + return nullptr; + + auto E = end(); + for (auto I = begin(); I != E; ++I) + if (P(*I)) + return &(*I); + + return nullptr; + } + + /// Remove N Elements from the end. This leaves the blocks behind, and not + /// require allocation of new blocks for new elements added after trimming. + void trim(size_t Elements) { + DCHECK_LE(Elements, Size); + DCHECK_GT(Size, 0); + auto OldSize = Size; + Size -= Elements; + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); + + for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) - + nearest_boundary(Size, ElementsPerSegment)) / + ElementsPerSegment; + SegmentsToTrim > 0; --SegmentsToTrim) { + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); + // Put the tail into the Freelist. + auto *FreeSegment = Tail; + Tail = Tail->Prev; + if (Tail == &SentinelSegment) + Head = Tail; + else + Tail->Next = &SentinelSegment; + + DCHECK_EQ(Tail->Next, &SentinelSegment); + FreeSegment->Next = Freelist; + FreeSegment->Prev = &SentinelSegment; + if (Freelist != &SentinelSegment) + Freelist->Prev = FreeSegment; + Freelist = FreeSegment; + } + } + + // Provide iterators. + Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); } + Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); } + Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); } + Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); } +}; + +// We need to have this storage definition out-of-line so that the compiler can +// ensure that storage for the SentinelSegment is defined and has a single +// address. +template <class T> +typename Array<T>::SegmentBase Array<T>::SentinelSegment{ + &Array<T>::SentinelSegment, &Array<T>::SentinelSegment}; + +} // namespace __xray + +#endif // XRAY_SEGMENTED_ARRAY_H diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S index 350afd9265fd..99ad3966ee3a 100644 --- a/lib/xray/xray_trampoline_x86_64.S +++ b/lib/xray/xray_trampoline_x86_64.S @@ -19,47 +19,56 @@ .macro SAVE_REGISTERS - subq $192, %rsp - CFI_DEF_CFA_OFFSET(200) - // At this point, the stack pointer should be aligned to an 8-byte boundary, - // because any call instructions that come after this will add another 8 - // bytes and therefore align it to 16-bytes. - movq %rbp, 184(%rsp) - movupd %xmm0, 168(%rsp) - movupd %xmm1, 152(%rsp) - movupd %xmm2, 136(%rsp) - movupd %xmm3, 120(%rsp) - movupd %xmm4, 104(%rsp) - movupd %xmm5, 88(%rsp) - movupd %xmm6, 72(%rsp) - movupd %xmm7, 56(%rsp) - movq %rdi, 48(%rsp) - movq %rax, 40(%rsp) - movq %rdx, 32(%rsp) - movq %rsi, 24(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 8(%rsp) - movq %r9, 0(%rsp) + subq $240, %rsp + CFI_DEF_CFA_OFFSET(248) + movq %rbp, 232(%rsp) + movupd %xmm0, 216(%rsp) + movupd %xmm1, 200(%rsp) + movupd %xmm2, 184(%rsp) + movupd %xmm3, 168(%rsp) + movupd %xmm4, 152(%rsp) + movupd %xmm5, 136(%rsp) + movupd %xmm6, 120(%rsp) + movupd %xmm7, 104(%rsp) + movq %rdi, 96(%rsp) + movq %rax, 88(%rsp) + movq %rdx, 80(%rsp) + movq %rsi, 72(%rsp) + movq %rcx, 64(%rsp) + movq %r8, 56(%rsp) + movq %r9, 48(%rsp) + movq %r10, 40(%rsp) + movq %r11, 32(%rsp) + movq %r12, 24(%rsp) + movq %r13, 16(%rsp) + movq %r14, 8(%rsp) + movq %r15, 0(%rsp) .endm .macro RESTORE_REGISTERS - movq 184(%rsp), %rbp - movupd 168(%rsp), %xmm0 - movupd 152(%rsp), %xmm1 - movupd 136(%rsp), %xmm2 - movupd 120(%rsp), %xmm3 - movupd 104(%rsp), %xmm4 - movupd 88(%rsp), %xmm5 - movupd 72(%rsp) , %xmm6 - movupd 56(%rsp) , %xmm7 - movq 48(%rsp), %rdi - movq 40(%rsp), %rax - movq 32(%rsp), %rdx - movq 24(%rsp), %rsi - movq 16(%rsp), %rcx - movq 8(%rsp), %r8 - movq 0(%rsp), %r9 - addq $192, %rsp + movq 232(%rsp), %rbp + movupd 216(%rsp), %xmm0 + movupd 200(%rsp), %xmm1 + movupd 184(%rsp), %xmm2 + movupd 168(%rsp), %xmm3 + movupd 152(%rsp), %xmm4 + movupd 136(%rsp), %xmm5 + movupd 120(%rsp) , %xmm6 + movupd 104(%rsp) , %xmm7 + movq 96(%rsp), %rdi + movq 88(%rsp), %rax + movq 80(%rsp), %rdx + movq 72(%rsp), %rsi + movq 64(%rsp), %rcx + movq 56(%rsp), %r8 + movq 48(%rsp), %r9 + movq 40(%rsp), %r10 + movq 32(%rsp), %r11 + movq 24(%rsp), %r12 + movq 16(%rsp), %r13 + movq 8(%rsp), %r14 + movq 0(%rsp), %r15 + addq $240, %rsp CFI_DEF_CFA_OFFSET(8) .endm @@ -90,6 +99,7 @@ .globl ASM_SYMBOL(__xray_FunctionEntry) .align 16, 0x90 ASM_TYPE_FUNCTION(__xray_FunctionEntry) +# LLVM-MCA-BEGIN __xray_FunctionEntry ASM_SYMBOL(__xray_FunctionEntry): CFI_STARTPROC SAVE_REGISTERS @@ -100,7 +110,7 @@ ASM_SYMBOL(__xray_FunctionEntry): testq %rax, %rax je .Ltmp0 - // The patched function prolog puts its xray_instr_map index into %r10d. + // The patched function prologue puts its xray_instr_map index into %r10d. movl %r10d, %edi xor %esi,%esi ALIGNED_CALL_RAX @@ -108,6 +118,7 @@ ASM_SYMBOL(__xray_FunctionEntry): .Ltmp0: RESTORE_REGISTERS retq +# LLVM-MCA-END ASM_SIZE(__xray_FunctionEntry) CFI_ENDPROC @@ -116,6 +127,7 @@ ASM_SYMBOL(__xray_FunctionEntry): .globl ASM_SYMBOL(__xray_FunctionExit) .align 16, 0x90 ASM_TYPE_FUNCTION(__xray_FunctionExit) +# LLVM-MCA-BEGIN __xray_FunctionExit ASM_SYMBOL(__xray_FunctionExit): CFI_STARTPROC // Save the important registers first. Since we're assuming that this @@ -146,6 +158,7 @@ ASM_SYMBOL(__xray_FunctionExit): addq $56, %rsp CFI_DEF_CFA_OFFSET(8) retq +# LLVM-MCA-END ASM_SIZE(__xray_FunctionExit) CFI_ENDPROC @@ -154,6 +167,7 @@ ASM_SYMBOL(__xray_FunctionExit): .globl ASM_SYMBOL(__xray_FunctionTailExit) .align 16, 0x90 ASM_TYPE_FUNCTION(__xray_FunctionTailExit) +# LLVM-MCA-BEGIN __xray_FunctionTailExit ASM_SYMBOL(__xray_FunctionTailExit): CFI_STARTPROC SAVE_REGISTERS @@ -170,6 +184,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): .Ltmp4: RESTORE_REGISTERS retq +# LLVM-MCA-END ASM_SIZE(__xray_FunctionTailExit) CFI_ENDPROC @@ -178,6 +193,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): .globl ASM_SYMBOL(__xray_ArgLoggerEntry) .align 16, 0x90 ASM_TYPE_FUNCTION(__xray_ArgLoggerEntry) +# LLVM-MCA-BEGIN __xray_ArgLoggerEntry ASM_SYMBOL(__xray_ArgLoggerEntry): CFI_STARTPROC SAVE_REGISTERS @@ -207,6 +223,7 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): .Larg1entryFail: RESTORE_REGISTERS retq +# LLVM-MCA-END ASM_SIZE(__xray_ArgLoggerEntry) CFI_ENDPROC @@ -215,13 +232,13 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): .global ASM_SYMBOL(__xray_CustomEvent) .align 16, 0x90 ASM_TYPE_FUNCTION(__xray_CustomEvent) +# LLVM-MCA-BEGIN __xray_CustomEvent ASM_SYMBOL(__xray_CustomEvent): CFI_STARTPROC SAVE_REGISTERS // We take two arguments to this trampoline, which should be in rdi and rsi - // already. We also make sure that we stash %rax because we use that register - // to call the logging handler. + // already. movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax testq %rax,%rax je .LcustomEventCleanup @@ -231,7 +248,35 @@ ASM_SYMBOL(__xray_CustomEvent): .LcustomEventCleanup: RESTORE_REGISTERS retq +# LLVM-MCA-END ASM_SIZE(__xray_CustomEvent) CFI_ENDPROC +//===----------------------------------------------------------------------===// + + .global ASM_SYMBOL(__xray_TypedEvent) + .align 16, 0x90 + ASM_TYPE_FUNCTION(__xray_TypedEvent) +# LLVM-MCA-BEGIN __xray_TypedEvent +ASM_SYMBOL(__xray_TypedEvent): + CFI_STARTPROC + SAVE_REGISTERS + + // We pass three arguments to this trampoline, which should be in rdi, rsi + // and rdx without our intervention. + movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax + testq %rax,%rax + je .LtypedEventCleanup + + ALIGNED_CALL_RAX + +.LtypedEventCleanup: + RESTORE_REGISTERS + retq +# LLVM-MCA-END + ASM_SIZE(__xray_TypedEvent) + CFI_ENDPROC + +//===----------------------------------------------------------------------===// + NO_EXEC_STACK_DIRECTIVE diff --git a/lib/xray/xray_utils.cc b/lib/xray/xray_utils.cc index cf800d3aeaf8..68f4e8c1094c 100644 --- a/lib/xray/xray_utils.cc +++ b/lib/xray/xray_utils.cc @@ -15,11 +15,11 @@ #include "sanitizer_common/sanitizer_common.h" #include "xray_defs.h" #include "xray_flags.h" -#include <stdlib.h> #include <cstdio> #include <errno.h> #include <fcntl.h> #include <iterator> +#include <stdlib.h> #include <sys/types.h> #include <tuple> #include <unistd.h> @@ -31,7 +31,7 @@ void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT { fprintf(stderr, "%s", Buffer); } -void retryingWriteAll(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT { +void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { if (Begin == End) return; auto TotalBytes = std::distance(Begin, End); @@ -82,7 +82,7 @@ bool readValueFromFile(const char *Filename, if (!Success) return false; close(Fd); - char *End = nullptr; + const char *End = nullptr; long long Tmp = internal_simple_strtoll(Line, &End, 10); bool Result = false; if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) { @@ -94,10 +94,10 @@ bool readValueFromFile(const char *Filename, int getLogFD() XRAY_NEVER_INSTRUMENT { // Open a temporary file once for the log. - static char TmpFilename[256] = {}; - static char TmpWildcardPattern[] = "XXXXXX"; - auto Argv = GetArgv(); - const char *Progname = Argv[0] == nullptr ? "(unknown)" : Argv[0]; + char TmpFilename[256] = {}; + char TmpWildcardPattern[] = "XXXXXX"; + auto **Argv = GetArgv(); + const char *Progname = !Argv ? "(unknown)" : Argv[0]; const char *LastSlash = internal_strrchr(Progname, '/'); if (LastSlash != nullptr) @@ -117,7 +117,7 @@ int getLogFD() XRAY_NEVER_INSTRUMENT { TmpFilename); return -1; } - if (__sanitizer::Verbosity()) + if (Verbosity()) Report("XRay: Log file in '%s'\n", TmpFilename); return Fd; diff --git a/lib/xray/xray_utils.h b/lib/xray/xray_utils.h index 1ecc74a2dce8..eafa16e1a9d5 100644 --- a/lib/xray/xray_utils.h +++ b/lib/xray/xray_utils.h @@ -15,6 +15,8 @@ #ifndef XRAY_UTILS_H #define XRAY_UTILS_H +#include <cstddef> +#include <cstdint> #include <sys/types.h> #include <utility> @@ -24,7 +26,7 @@ namespace __xray { void printToStdErr(const char *Buffer); // EINTR-safe write routine, provided a file descriptor and a character range. -void retryingWriteAll(int Fd, char *Begin, char *End); +void retryingWriteAll(int Fd, const char *Begin, const char *End); // Reads a long long value from a provided file. bool readValueFromFile(const char *Filename, long long *Value); @@ -36,6 +38,32 @@ std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End); // file. int getLogFD(); +constexpr size_t gcd(size_t a, size_t b) { + return (b == 0) ? a : gcd(b, a % b); +} + +constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); } + +constexpr size_t nearest_boundary(size_t number, size_t multiple) { + return multiple * ((number / multiple) + (number % multiple ? 1 : 0)); +} + +constexpr size_t next_pow2_helper(size_t num, size_t acc) { + return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1); +} + +constexpr size_t next_pow2(size_t number) { + return next_pow2_helper(number, 1); +} + +template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; } + +template <class T> constexpr T &min(T &A, T &B) { return A <= B ? A : B; } + +constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) { + return max(A, B) - min(A, B); +} + } // namespace __xray #endif // XRAY_UTILS_H diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc index e17f00ac3a62..51dc4ce43b1c 100644 --- a/lib/xray/xray_x86_64.cc +++ b/lib/xray/xray_x86_64.cc @@ -3,6 +3,15 @@ #include "xray_defs.h" #include "xray_interface_internal.h" +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD +#include <sys/types.h> +#if SANITIZER_OPENBSD +#include <sys/time.h> +#include <machine/cpu.h> +#endif +#include <sys/sysctl.h> +#endif + #include <atomic> #include <cstdint> #include <errno.h> @@ -14,6 +23,7 @@ namespace __xray { +#if SANITIZER_LINUX static std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT { auto BytesToRead = std::distance(Begin, End); @@ -47,7 +57,7 @@ static bool readValueFromFile(const char *Filename, close(Fd); if (!Success) return false; - char *End = nullptr; + const char *End = nullptr; long long Tmp = internal_simple_strtoll(Line, &End, 10); bool Result = false; if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) { @@ -71,6 +81,31 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { } return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency); } +#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD +uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + long long TSCFrequency = -1; + size_t tscfreqsz = sizeof(TSCFrequency); +#if SANITIZER_OPENBSD + int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ }; + if (sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) { + +#else + if (sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz, + NULL, 0) != -1) { +#endif + return static_cast<uint64_t>(TSCFrequency); + } else { + Report("Unable to determine CPU frequency for TSC accounting.\n"); + } + + return 0; +} +#else +uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + /* Not supported */ + return 0; +} +#endif static constexpr uint8_t CallOpCode = 0xe8; static constexpr uint16_t MovR10Seq = 0xba41; @@ -184,8 +219,8 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, reinterpret_cast<int64_t>(__xray_FunctionTailExit) - (static_cast<int64_t>(Sled.Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { - Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address)); + Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", + __xray_FunctionTailExit, reinterpret_cast<void *>(Sled.Address)); return false; } if (Enable) { @@ -251,6 +286,37 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, return false; } +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // Here we do the dance of replacing the following sled: + // + // xray_sled_n: + // jmp +20 // 2 byte instruction + // ... + // + // With the following: + // + // nopw // 2 bytes + // ... + // + // + // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'. + // The 20 byte sled stashes three argument registers, calls the trampoline, + // unstashes the registers and returns. If the arguments are already in + // the correct registers, the stashing and unstashing become equivalently + // sized nops. + if (Enable) { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq, + std::memory_order_release); + } else { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq, + std::memory_order_release); + } + return false; +} + // We determine whether the CPU we're running on has the correct features we // need. In x86_64 this will be rdtscp support. bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { @@ -259,7 +325,8 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { // We check whether rdtscp support is enabled. According to the x86_64 manual, // level should be set at 0x80000001, and we should have a look at bit 27 in // EDX. That's 0x8000000 (or 1u << 27). - __get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX); + __asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX) + : "0"(0x80000001)); if (!(EDX & (1u << 27))) { Report("Missing rdtscp support.\n"); return false; diff --git a/lib/xray/xray_x86_64.inc b/lib/xray/xray_x86_64.inc index 4ad3f9810946..b3c475f9110c 100644 --- a/lib/xray/xray_x86_64.inc +++ b/lib/xray/xray_x86_64.inc @@ -21,9 +21,10 @@ namespace __xray { ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { unsigned LongCPU; - uint64_t TSC = __rdtscp(&LongCPU); + unsigned long Rax, Rdx; + __asm__ __volatile__("rdtscp\n" : "=a"(Rax), "=d"(Rdx), "=c"(LongCPU) ::); CPU = LongCPU; - return TSC; + return (Rdx << 32) + Rax; } uint64_t getTSCFrequency(); |