Diffstat (limited to 'lib/xray')
-rw-r--r--  lib/xray/CMakeLists.txt | 225
-rw-r--r--  lib/xray/tests/CMakeLists.txt | 74
-rw-r--r--  lib/xray/tests/unit/CMakeLists.txt | 8
-rw-r--r--  lib/xray/tests/unit/allocator_test.cc | 42
-rw-r--r--  lib/xray/tests/unit/buffer_queue_test.cc | 8
-rw-r--r--  lib/xray/tests/unit/fdr_logging_test.cc | 17
-rw-r--r--  lib/xray/tests/unit/function_call_trie_test.cc | 286
-rw-r--r--  lib/xray/tests/unit/profile_collector_test.cc | 179
-rw-r--r--  lib/xray/tests/unit/segmented_array_test.cc | 200
-rw-r--r--  lib/xray/xray_AArch64.cc | 6
-rw-r--r--  lib/xray/xray_allocator.h | 129
-rw-r--r--  lib/xray/xray_arm.cc | 6
-rw-r--r--  lib/xray/xray_basic_flags.cc | 50
-rw-r--r--  lib/xray/xray_basic_flags.h | 38
-rw-r--r--  lib/xray/xray_basic_flags.inc | 24
-rw-r--r--  lib/xray/xray_basic_logging.cc (renamed from lib/xray/xray_inmemory_log.cc) | 292
-rw-r--r--  lib/xray/xray_basic_logging.h (renamed from lib/xray/xray_inmemory_log.h) | 3
-rw-r--r--  lib/xray/xray_buffer_queue.cc | 52
-rw-r--r--  lib/xray/xray_buffer_queue.h | 127
-rw-r--r--  lib/xray/xray_fdr_flags.cc | 48
-rw-r--r--  lib/xray/xray_fdr_flags.h | 38
-rw-r--r--  lib/xray/xray_fdr_flags.inc | 29
-rw-r--r--  lib/xray/xray_fdr_log_records.h | 2
-rw-r--r--  lib/xray/xray_fdr_logging.cc | 1108
-rw-r--r--  lib/xray/xray_fdr_logging_impl.h | 705
-rw-r--r--  lib/xray/xray_flags.cc | 17
-rw-r--r--  lib/xray/xray_flags.h | 2
-rw-r--r--  lib/xray/xray_flags.inc | 23
-rw-r--r--  lib/xray/xray_function_call_trie.h | 455
-rw-r--r--  lib/xray/xray_init.cc | 32
-rw-r--r--  lib/xray/xray_interface.cc | 131
-rw-r--r--  lib/xray/xray_interface_internal.h | 10
-rw-r--r--  lib/xray/xray_log_interface.cc | 128
-rw-r--r--  lib/xray/xray_mips.cc | 6
-rw-r--r--  lib/xray/xray_mips64.cc | 6
-rw-r--r--  lib/xray/xray_powerpc64.cc | 6
-rw-r--r--  lib/xray/xray_profile_collector.cc | 318
-rw-r--r--  lib/xray/xray_profile_collector.h | 88
-rw-r--r--  lib/xray/xray_profiling.cc | 372
-rw-r--r--  lib/xray/xray_profiling_flags.cc | 40
-rw-r--r--  lib/xray/xray_profiling_flags.h | 39
-rw-r--r--  lib/xray/xray_profiling_flags.inc | 29
-rw-r--r--  lib/xray/xray_recursion_guard.h | 57
-rw-r--r--  lib/xray/xray_segmented_array.h | 375
-rw-r--r--  lib/xray/xray_trampoline_x86_64.S | 127
-rw-r--r--  lib/xray/xray_utils.cc | 16
-rw-r--r--  lib/xray/xray_utils.h | 30
-rw-r--r--  lib/xray/xray_x86_64.cc | 75
-rw-r--r--  lib/xray/xray_x86_64.inc | 5
49 files changed, 4877 insertions(+), 1206 deletions(-)
diff --git a/lib/xray/CMakeLists.txt b/lib/xray/CMakeLists.txt
index 5547600b943a..8e18f55658f8 100644
--- a/lib/xray/CMakeLists.txt
+++ b/lib/xray/CMakeLists.txt
@@ -1,16 +1,29 @@
-# Build for the XRay runtime support library.
+# Build for all components of the XRay runtime support library.
# XRay runtime library implementation files.
set(XRAY_SOURCES
- xray_inmemory_log.cc
- xray_init.cc
- xray_flags.cc
- xray_interface.cc
- xray_buffer_queue.cc
- xray_log_interface.cc
- xray_fdr_logging.cc
- xray_utils.cc)
+ xray_init.cc
+ xray_flags.cc
+ xray_interface.cc
+ xray_log_interface.cc
+ xray_utils.cc)
+# Implementation files for all XRay modes.
+set(XRAY_FDR_MODE_SOURCES
+ xray_fdr_flags.cc
+ xray_buffer_queue.cc
+ xray_fdr_logging.cc)
+
+set(XRAY_BASIC_MODE_SOURCES
+ xray_basic_flags.cc
+ xray_basic_logging.cc)
+
+set(XRAY_PROFILING_MODE_SOURCES
+ xray_profile_collector.cc
+ xray_profiling.cc
+ xray_profiling_flags.cc)
+
+# Implementation files for all XRay architectures.
set(x86_64_SOURCES
xray_x86_64.cc
xray_trampoline_x86_64.S)
@@ -23,8 +36,8 @@ set(armhf_SOURCES
${arm_SOURCES})
set(aarch64_SOURCES
- xray_AArch64.cc
- xray_trampoline_AArch64.S)
+ xray_AArch64.cc
+ xray_trampoline_AArch64.S)
set(mips_SOURCES
xray_mips.cc
@@ -47,11 +60,68 @@ set(powerpc64le_SOURCES
xray_trampoline_powerpc64.cc
xray_trampoline_powerpc64_asm.S)
+set(XRAY_IMPL_HEADERS
+ xray_allocator.h
+ xray_basic_flags.h
+ xray_basic_flags.inc
+ xray_basic_logging.h
+ xray_buffer_queue.h
+ xray_defs.h
+ xray_fdr_flags.h
+ xray_fdr_flags.inc
+ xray_fdr_log_records.h
+ xray_fdr_logging.h
+ xray_flags.h
+ xray_flags.inc
+ xray_function_call_trie.h
+ xray_interface_internal.h
+ xray_powerpc64.inc
+ xray_profile_collector.h
+ xray_profiling_flags.h
+ xray_profiling_flags.inc
+ xray_recursion_guard.h
+ xray_segmented_array.h
+ xray_tsc.h
+ xray_utils.h
+ xray_x86_64.inc)
+
+# Create list of all source files for
+# consumption by tests.
+set(XRAY_ALL_SOURCE_FILES
+ ${XRAY_SOURCES}
+ ${XRAY_FDR_MODE_SOURCES}
+ ${XRAY_BASIC_MODE_SOURCES}
+ ${XRAY_PROFILING_MODE_SOURCES}
+ ${x86_64_SOURCES}
+ ${arm_SOURCES}
+ ${armhf_SOURCES}
+ ${mips_SOURCES}
+ ${mipsel_SOURCES}
+ ${mips64_SOURCES}
+ ${mips64el_SOURCES}
+ ${powerpc64le_SOURCES}
+ ${XRAY_IMPL_HEADERS}
+)
+list(REMOVE_DUPLICATES XRAY_ALL_SOURCE_FILES)
+# Make list that uses absolute paths
+set(XRAY_ALL_SOURCE_FILES_ABS_PATHS "")
+foreach (src_file ${XRAY_ALL_SOURCE_FILES})
+ list(APPEND
+ XRAY_ALL_SOURCE_FILES_ABS_PATHS
+ "${CMAKE_CURRENT_SOURCE_DIR}/${src_file}")
+endforeach()
+
+
+# Now put it all together...
include_directories(..)
include_directories(../../include)
set(XRAY_CFLAGS ${SANITIZER_COMMON_CFLAGS})
set(XRAY_COMMON_DEFINITIONS XRAY_HAS_EXCEPTIONS=1)
+
+# We don't need RTTI in XRay, so turn that off.
+append_rtti_flag(OFF XRAY_CFLAGS)
+
append_list_if(
COMPILER_RT_HAS_XRAY_COMPILER_FLAG XRAY_SUPPORTED=1 XRAY_COMMON_DEFINITIONS)
append_list_if(
@@ -60,10 +130,13 @@ append_list_if(
add_compiler_rt_component(xray)
set(XRAY_COMMON_RUNTIME_OBJECT_LIBS
- RTXray
RTSanitizerCommon
RTSanitizerCommonLibc)
+if (TARGET cxx-headers OR HAVE_LIBCXX)
+ set(XRAY_DEPS cxx-headers)
+endif()
+
if (APPLE)
set(XRAY_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
add_asm_sources(XRAY_ASM_SOURCES xray_trampoline_x86_64.S)
@@ -75,8 +148,34 @@ if (APPLE)
OS ${XRAY_SUPPORTED_OS}
ARCHS ${XRAY_SUPPORTED_ARCH}
SOURCES ${x86_64_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+ add_compiler_rt_object_libraries(RTXrayFDR
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ SOURCES ${XRAY_FDR_MODE_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
CFLAGS ${XRAY_CFLAGS}
- DEFS ${XRAY_COMMON_DEFINITIONS})
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+ add_compiler_rt_object_libraries(RTXrayBASIC
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ SOURCES ${XRAY_BASIC_MODE_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+ add_compiler_rt_object_libraries(RTXrayPROFILING
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ SOURCES ${XRAY_PROFILING_MODE_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
# We only support running on osx for now.
add_compiler_rt_runtime(clang_rt.xray
@@ -91,24 +190,104 @@ if (APPLE)
LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
LINK_LIBS ${XRAY_LINK_LIBS}
PARENT_TARGET xray)
-else()
-foreach(arch ${XRAY_SUPPORTED_ARCH})
- if(CAN_TARGET_${arch})
+ add_compiler_rt_runtime(clang_rt.xray-fdr
+ STATIC
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ OBJECT_LIBS RTXrayFDR
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+ LINK_LIBS ${XRAY_LINK_LIBS}
+ PARENT_TARGET xray)
+ add_compiler_rt_runtime(clang_rt.xray-basic
+ STATIC
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ OBJECT_LIBS RTXrayBASIC
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+ LINK_LIBS ${XRAY_LINK_LIBS}
+ PARENT_TARGET xray)
+ add_compiler_rt_runtime(clang_rt.xray-profiling
+ STATIC
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ OBJECT_LIBS RTXrayPROFILING
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+ LINK_LIBS ${XRAY_LINK_LIBS}
+ PARENT_TARGET xray)
+else() # not Apple
+ foreach(arch ${XRAY_SUPPORTED_ARCH})
+ if(NOT CAN_TARGET_${arch})
+ continue()
+ endif()
add_compiler_rt_object_libraries(RTXray
ARCHS ${arch}
- SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS}
- DEFS ${XRAY_COMMON_DEFINITIONS})
+ SOURCES ${XRAY_SOURCES} ${${arch}_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+ add_compiler_rt_object_libraries(RTXrayFDR
+ ARCHS ${arch}
+ SOURCES ${XRAY_FDR_MODE_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+ add_compiler_rt_object_libraries(RTXrayBASIC
+ ARCHS ${arch}
+ SOURCES ${XRAY_BASIC_MODE_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+ add_compiler_rt_object_libraries(RTXrayPROFILING
+ ARCHS ${arch}
+ SOURCES ${XRAY_PROFILING_MODE_SOURCES}
+ ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ DEPS ${XRAY_DEPS})
+
+ # Common XRay archive for instrumented binaries.
add_compiler_rt_runtime(clang_rt.xray
STATIC
ARCHS ${arch}
- SOURCES ${${arch}_SOURCES}
CFLAGS ${XRAY_CFLAGS}
DEFS ${XRAY_COMMON_DEFINITIONS}
- OBJECT_LIBS ${XRAY_COMMON_RUNTIME_OBJECT_LIBS}
+ OBJECT_LIBS ${XRAY_COMMON_RUNTIME_OBJECT_LIBS} RTXray
PARENT_TARGET xray)
- endif()
-endforeach()
-endif()
+ # FDR mode runtime archive (addon for clang_rt.xray)
+ add_compiler_rt_runtime(clang_rt.xray-fdr
+ STATIC
+ ARCHS ${arch}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ OBJECT_LIBS RTXrayFDR
+ PARENT_TARGET xray)
+ # Basic mode runtime archive (addon for clang_rt.xray)
+ add_compiler_rt_runtime(clang_rt.xray-basic
+ STATIC
+ ARCHS ${arch}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ OBJECT_LIBS RTXrayBASIC
+ PARENT_TARGET xray)
+ # Profiler Mode runtime
+ add_compiler_rt_runtime(clang_rt.xray-profiling
+ STATIC
+ ARCHS ${arch}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ OBJECT_LIBS RTXrayPROFILING
+ PARENT_TARGET xray)
+ endforeach()
+endif() # not Apple
if(COMPILER_RT_INCLUDE_TESTS)
add_subdirectory(tests)
diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt
index e54e63f27890..11f373167d24 100644
--- a/lib/xray/tests/CMakeLists.txt
+++ b/lib/xray/tests/CMakeLists.txt
@@ -3,6 +3,18 @@ include_directories(..)
add_custom_target(XRayUnitTests)
set_target_properties(XRayUnitTests PROPERTIES FOLDER "XRay unittests")
+# Sanity check XRAY_ALL_SOURCE_FILES_ABS_PATHS
+list(LENGTH XRAY_ALL_SOURCE_FILES_ABS_PATHS XASFAP_LENGTH)
+if (${XASFAP_LENGTH} EQUAL 0)
+ message(FATAL_ERROR "XRAY_ALL_SOURCE_FILES_ABS_PATHS cannot be empty")
+endif()
+unset(XASFAP_LENGTH)
+foreach (src_file ${XRAY_ALL_SOURCE_FILES_ABS_PATHS})
+ if (NOT EXISTS "${src_file}")
+ message(FATAL_ERROR "Source file \"${src_file}\" does not exist")
+ endif()
+endforeach()
+
set(XRAY_UNITTEST_CFLAGS
${XRAY_CFLAGS}
${COMPILER_RT_UNITTEST_CFLAGS}
@@ -11,27 +23,77 @@ set(XRAY_UNITTEST_CFLAGS
-I${COMPILER_RT_SOURCE_DIR}/lib/xray
-I${COMPILER_RT_SOURCE_DIR}/lib)
+function(add_xray_lib library)
+ add_library(${library} STATIC ${ARGN})
+ set_target_properties(${library} PROPERTIES
+ ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ FOLDER "Compiler-RT Runtime tests")
+endfunction()
+
+function(get_xray_lib_for_arch arch lib)
+ if(APPLE)
+ set(tgt_name "RTXRay.test.osx")
+ else()
+ set(tgt_name "RTXRay.test.${arch}")
+ endif()
+ set(${lib} "${tgt_name}" PARENT_SCOPE)
+endfunction()
+
set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
+set(XRAY_UNITTEST_LINK_FLAGS
+ ${CMAKE_THREAD_LIBS_INIT}
+ -l${SANITIZER_CXX_ABI_LIBRARY}
+ -fxray-instrument
+ )
+if (NOT APPLE)
+ append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS)
+ append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS)
+ append_list_if(COMPILER_RT_HAS_LIBDL -ldl XRAY_UNITTEST_LINK_FLAGS)
+ append_list_if(COMPILER_RT_HAS_LIBPTHREAD -pthread XRAY_UNITTEST_LINK_FLAGS)
+endif()
+
macro(add_xray_unittest testname)
cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN})
if(UNIX AND NOT APPLE)
+ set(CMAKE_DL_LIBS_INIT "")
foreach(arch ${XRAY_TEST_ARCH})
set(TEST_OBJECTS)
+ get_xray_lib_for_arch(${arch} XRAY_RUNTIME_LIBS)
generate_compiler_rt_tests(TEST_OBJECTS
XRayUnitTests "${testname}-${arch}-Test" "${arch}"
SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
+ # Note that any change in the implementations will cause all the unit
+ # tests to be re-built. This is by design, but may be cumbersome during
+ # the build/test cycle.
+ COMPILE_DEPS ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
+ ${XRAY_HEADERS} ${XRAY_ALL_SOURCE_FILES_ABS_PATHS}
+ RUNTIME "${XRAY_RUNTIME_LIBS}"
DEPS gtest xray llvm-xray
CFLAGS ${XRAY_UNITTEST_CFLAGS}
- LINK_FLAGS -fxray-instrument
- ${TARGET_LINK_FLAGS}
- -lstdc++ -lm ${CMAKE_THREAD_LIBS_INIT}
- -lpthread
- -ldl -lrt)
- set_target_properties(XRayUnitTests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+ LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS})
+ set_target_properties(XRayUnitTests
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endforeach()
endif()
endmacro()
if(COMPILER_RT_CAN_EXECUTE_TESTS)
+ if (APPLE)
+ add_xray_lib("RTXRay.test.osx"
+ $<TARGET_OBJECTS:RTXray.osx>
+ $<TARGET_OBJECTS:RTXrayFDR.osx>
+ $<TARGET_OBJECTS:RTXrayPROFILING.osx>
+ $<TARGET_OBJECTS:RTSanitizerCommon.osx>
+ $<TARGET_OBJECTS:RTSanitizerCommonLibc.osx>)
+ else()
+ foreach(arch ${XRAY_SUPPORTED_ARCH})
+ add_xray_lib("RTXRay.test.${arch}"
+ $<TARGET_OBJECTS:RTXray.${arch}>
+ $<TARGET_OBJECTS:RTXrayFDR.${arch}>
+ $<TARGET_OBJECTS:RTXrayPROFILING.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
+ endforeach()
+ endif()
add_subdirectory(unit)
endif()
diff --git a/lib/xray/tests/unit/CMakeLists.txt b/lib/xray/tests/unit/CMakeLists.txt
index 62d01f239581..b42eb50d0790 100644
--- a/lib/xray/tests/unit/CMakeLists.txt
+++ b/lib/xray/tests/unit/CMakeLists.txt
@@ -2,3 +2,11 @@ add_xray_unittest(XRayBufferQueueTest SOURCES
buffer_queue_test.cc xray_unit_test_main.cc)
add_xray_unittest(XRayFDRLoggingTest SOURCES
fdr_logging_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayAllocatorTest SOURCES
+ allocator_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRaySegmentedArrayTest SOURCES
+ segmented_array_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayFunctionCallTrieTest SOURCES
+ function_call_trie_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayProfileCollectorTest SOURCES
+ profile_collector_test.cc xray_unit_test_main.cc)
diff --git a/lib/xray/tests/unit/allocator_test.cc b/lib/xray/tests/unit/allocator_test.cc
new file mode 100644
index 000000000000..be404160e417
--- /dev/null
+++ b/lib/xray/tests/unit/allocator_test.cc
@@ -0,0 +1,42 @@
+//===-- allocator_test.cc -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+
+#include "xray_allocator.h"
+#include "gtest/gtest.h"
+
+namespace __xray {
+namespace {
+
+struct TestData {
+ s64 First;
+ s64 Second;
+};
+
+TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11); }
+
+TEST(AllocatorTest, Allocate) {
+ Allocator<sizeof(TestData)> A(2 << 11);
+ auto B = A.Allocate();
+ ASSERT_NE(B.Data, nullptr);
+}
+
+TEST(AllocatorTest, OverAllocate) {
+ Allocator<sizeof(TestData)> A(sizeof(TestData));
+ auto B1 = A.Allocate();
+ (void)B1;
+ auto B2 = A.Allocate();
+ ASSERT_EQ(B2.Data, nullptr);
+}
+
+} // namespace
+} // namespace __xray
diff --git a/lib/xray/tests/unit/buffer_queue_test.cc b/lib/xray/tests/unit/buffer_queue_test.cc
index 1ec7469ce187..c0d4ccb268d6 100644
--- a/lib/xray/tests/unit/buffer_queue_test.cc
+++ b/lib/xray/tests/unit/buffer_queue_test.cc
@@ -32,9 +32,9 @@ TEST(BufferQueueTest, GetAndRelease) {
ASSERT_TRUE(Success);
BufferQueue::Buffer Buf;
ASSERT_EQ(Buffers.getBuffer(Buf), BufferQueue::ErrorCode::Ok);
- ASSERT_NE(nullptr, Buf.Buffer);
+ ASSERT_NE(nullptr, Buf.Data);
ASSERT_EQ(Buffers.releaseBuffer(Buf), BufferQueue::ErrorCode::Ok);
- ASSERT_EQ(nullptr, Buf.Buffer);
+ ASSERT_EQ(nullptr, Buf.Data);
}
TEST(BufferQueueTest, GetUntilFailed) {
@@ -53,7 +53,7 @@ TEST(BufferQueueTest, ReleaseUnknown) {
BufferQueue Buffers(kSize, 1, Success);
ASSERT_TRUE(Success);
BufferQueue::Buffer Buf;
- Buf.Buffer = reinterpret_cast<void *>(0xdeadbeef);
+ Buf.Data = reinterpret_cast<void *>(0xdeadbeef);
Buf.Size = kSize;
EXPECT_EQ(BufferQueue::ErrorCode::UnrecognizedBuffer,
Buffers.releaseBuffer(Buf));
@@ -65,7 +65,7 @@ TEST(BufferQueueTest, ErrorsWhenFinalising) {
ASSERT_TRUE(Success);
BufferQueue::Buffer Buf;
ASSERT_EQ(Buffers.getBuffer(Buf), BufferQueue::ErrorCode::Ok);
- ASSERT_NE(nullptr, Buf.Buffer);
+ ASSERT_NE(nullptr, Buf.Data);
ASSERT_EQ(Buffers.finalize(), BufferQueue::ErrorCode::Ok);
BufferQueue::Buffer OtherBuf;
ASSERT_EQ(BufferQueue::ErrorCode::QueueFinalizing,
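
For readers skimming the rename above (Buf.Buffer becoming Buf.Data), here is a minimal sketch of the BufferQueue lifecycle these tests exercise. It is illustration only, not part of the change; it uses only the calls visible in the hunks (the constructor, getBuffer, releaseBuffer, finalize), and the /*BufferSize=*/ and /*BufferCount=*/ labels are guesses at the meaning of the test's positional arguments.

#include "xray_buffer_queue.h"

using __xray::BufferQueue;

void bufferQueueLifecycleSketch() {
  bool Success = false;
  BufferQueue Buffers(/*BufferSize=*/4096, /*BufferCount=*/2, Success);
  if (!Success)
    return;
  BufferQueue::Buffer Buf;
  if (Buffers.getBuffer(Buf) != BufferQueue::ErrorCode::Ok)
    return;
  // ... write records into Buf.Data (the field renamed from Buf.Buffer) ...
  Buffers.releaseBuffer(Buf);  // On success, Buf.Data is reset to nullptr.
  // After finalize(), further getBuffer() calls report QueueFinalizing.
  Buffers.finalize();
}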
diff --git a/lib/xray/tests/unit/fdr_logging_test.cc b/lib/xray/tests/unit/fdr_logging_test.cc
index 76738ea4dff3..b6961efbc351 100644
--- a/lib/xray/tests/unit/fdr_logging_test.cc
+++ b/lib/xray/tests/unit/fdr_logging_test.cc
@@ -10,6 +10,7 @@
// This file is a part of XRay, a function call tracing system.
//
//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
#include "xray_fdr_logging.h"
#include "gtest/gtest.h"
@@ -86,7 +87,7 @@ TEST(FDRLoggingTest, Simple) {
XRayFileHeader H;
memcpy(&H, Contents, sizeof(XRayFileHeader));
- ASSERT_EQ(H.Version, 2);
+ ASSERT_EQ(H.Version, 3);
ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
// We require one buffer at least to have the "extents" metadata record,
@@ -131,7 +132,7 @@ TEST(FDRLoggingTest, Multiple) {
XRayFileHeader H;
memcpy(&H, Contents, sizeof(XRayFileHeader));
- ASSERT_EQ(H.Version, 2);
+ ASSERT_EQ(H.Version, 3);
ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
MetadataRecord MDR0, MDR1;
@@ -154,12 +155,12 @@ TEST(FDRLoggingTest, MultiThreadedCycling) {
// Now we want to create one thread, do some logging, then create another one,
// in succession and making sure that we're able to get thread records from
// the latest thread (effectively being able to recycle buffers).
- std::array<pid_t, 2> Threads;
+ std::array<tid_t, 2> Threads;
for (uint64_t I = 0; I < 2; ++I) {
std::thread t{[I, &Threads] {
fdrLoggingHandleArg0(I + 1, XRayEntryType::ENTRY);
fdrLoggingHandleArg0(I + 1, XRayEntryType::EXIT);
- Threads[I] = syscall(SYS_gettid);
+ Threads[I] = GetTid();
}};
t.join();
}
@@ -182,7 +183,7 @@ TEST(FDRLoggingTest, MultiThreadedCycling) {
XRayFileHeader H;
memcpy(&H, Contents, sizeof(XRayFileHeader));
- ASSERT_EQ(H.Version, 2);
+ ASSERT_EQ(H.Version, 3);
ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
MetadataRecord MDR0, MDR1;
@@ -192,9 +193,9 @@ TEST(FDRLoggingTest, MultiThreadedCycling) {
ASSERT_EQ(MDR0.RecordKind,
uint8_t(MetadataRecord::RecordKinds::BufferExtents));
ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer));
- pid_t Latest = 0;
- memcpy(&Latest, MDR1.Data, sizeof(pid_t));
- ASSERT_EQ(Latest, Threads[1]);
+ int32_t Latest = 0;
+ memcpy(&Latest, MDR1.Data, sizeof(int32_t));
+ ASSERT_EQ(Latest, static_cast<int32_t>(Threads[1]));
}
} // namespace
diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc
new file mode 100644
index 000000000000..049ecfb07e01
--- /dev/null
+++ b/lib/xray/tests/unit/function_call_trie_test.cc
@@ -0,0 +1,286 @@
+//===-- function_call_trie_test.cc ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_function_call_trie.h"
+
+namespace __xray {
+
+namespace {
+
+TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
+ profilingFlags()->setDefaults();
+ FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(Allocators);
+}
+
+TEST(FunctionCallTrieTest, EnterAndExitFunction) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 1);
+ Trie.exitFunction(1, 2);
+
+ // We need a way to pull the data out. At this point, until we get a data
+ // collection service implemented, we're going to export the data as a list of
+ // roots, and manually walk through the structure ourselves.
+
+ const auto &R = Trie.getRoots();
+
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1);
+ ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.exitFunction(1, 1);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_TRUE(R.empty());
+}
+
+TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.enterFunction(2, 1);
+ Trie.enterFunction(3, 3);
+ Trie.exitFunction(1, 5);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_FALSE(R.empty());
+ EXPECT_EQ(R.size(), size_t{1});
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionExit) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.enterFunction(1, 1);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_FALSE(R.empty());
+ EXPECT_EQ(R.size(), size_t{1});
+}
+
+TEST(FunctionCallTrieTest, MultipleRoots) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ // Enter and exit FId = 1.
+ Trie.enterFunction(1, 1);
+ Trie.exitFunction(1, 2);
+
+ // Enter and exit FId = 2.
+ Trie.enterFunction(2, 3);
+ Trie.exitFunction(2, 4);
+
+ const auto &R = Trie.getRoots();
+ ASSERT_FALSE(R.empty());
+ ASSERT_EQ(R.size(), 2u);
+
+ // Make sure the roots have different IDs.
+ const auto R0 = R[0];
+ const auto R1 = R[1];
+ ASSERT_NE(R0->FId, R1->FId);
+
+ // Inspect the roots to check that they have the right data.
+ ASSERT_NE(R0, nullptr);
+ EXPECT_EQ(R0->CallCount, 1u);
+ EXPECT_EQ(R0->CumulativeLocalTime, 1u);
+
+ ASSERT_NE(R1, nullptr);
+ EXPECT_EQ(R1->CallCount, 1u);
+ EXPECT_EQ(R1->CumulativeLocalTime, 1u);
+}
+
+// While missing an intermediary exit may be rare in practice, we still enforce
+// that we can handle the case where we've missed an exit event for a function
+// in the middle of the stack. To illustrate, imagine the following shadow call
+// stack:
+//
+// f0@t0 -> f1@t1 -> f2@t2
+//
+// If for whatever reason we see an exit for `f2` @ t3, followed by an exit for
+// `f0` @ t4 (i.e. no `f1` exit in between) then we need to handle the case of
+// accounting local time to `f2` from d = (t3 - t2), then local time to `f1`
+// as d' = (t3 - t1) - d, and then local time to `f0` as d'' = (t3 - t0) - d'.
+TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 0);
+ Trie.enterFunction(2, 100);
+ Trie.enterFunction(3, 200);
+ Trie.exitFunction(3, 300);
+ Trie.exitFunction(1, 400);
+
+ // What we should see at this point is all the functions in the trie in a
+ // specific order (1 -> 2 -> 3) with the appropriate count(s) and local
+ // latencies.
+ const auto &R = Trie.getRoots();
+ ASSERT_FALSE(R.empty());
+ ASSERT_EQ(R.size(), 1u);
+
+ const auto &F1 = *R[0];
+ ASSERT_EQ(F1.FId, 1);
+ ASSERT_FALSE(F1.Callees.empty());
+
+ const auto &F2 = *F1.Callees[0].NodePtr;
+ ASSERT_EQ(F2.FId, 2);
+ ASSERT_FALSE(F2.Callees.empty());
+
+ const auto &F3 = *F2.Callees[0].NodePtr;
+ ASSERT_EQ(F3.FId, 3);
+ ASSERT_TRUE(F3.Callees.empty());
+
+ // Now that we've established the preconditions, we check for specific aspects
+ // of the nodes.
+ EXPECT_EQ(F3.CallCount, 1);
+ EXPECT_EQ(F2.CallCount, 1);
+ EXPECT_EQ(F1.CallCount, 1);
+ EXPECT_EQ(F3.CumulativeLocalTime, 100);
+ EXPECT_EQ(F2.CumulativeLocalTime, 300);
+ EXPECT_EQ(F1.CumulativeLocalTime, 100);
+}
+
+TEST(FunctionCallTrieTest, DeepCallStack) {
+ // Simulate a relatively deep call stack (32 levels) and ensure that we can
+ // properly pop all the way up the stack.
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ for (int i = 0; i < 32; ++i)
+ Trie.enterFunction(i + 1, i);
+ Trie.exitFunction(1, 33);
+
+ // Here, validate that we have a 32-level deep function call path from the
+ // root (FId 1) down to the leaf (FId 32).
+ const auto &R = Trie.getRoots();
+ ASSERT_EQ(R.size(), 1u);
+ auto F = R[0];
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(F->FId, i + 1);
+ EXPECT_EQ(F->CallCount, 1);
+ if (F->Callees.empty() && i != 31)
+ FAIL() << "Empty callees for FId " << F->FId;
+ if (i != 31)
+ F = F->Callees[0].NodePtr;
+ }
+}
+
+// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
+// guaranteed to be synchronised.
+TEST(FunctionCallTrieTest, DeepCopy) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 0);
+ Trie.enterFunction(2, 1);
+ Trie.exitFunction(2, 2);
+ Trie.enterFunction(3, 3);
+ Trie.exitFunction(3, 4);
+ Trie.exitFunction(1, 5);
+
+ // We want to make a deep copy and compare notes.
+ auto B = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Copy(B);
+ Trie.deepCopyInto(Copy);
+
+ ASSERT_NE(Trie.getRoots().size(), 0u);
+ ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size());
+ const auto &R0Orig = *Trie.getRoots()[0];
+ const auto &R0Copy = *Copy.getRoots()[0];
+ EXPECT_EQ(R0Orig.FId, 1);
+ EXPECT_EQ(R0Orig.FId, R0Copy.FId);
+
+ ASSERT_EQ(R0Orig.Callees.size(), 2u);
+ ASSERT_EQ(R0Copy.Callees.size(), 2u);
+
+ const auto &F1Orig =
+ *R0Orig.Callees
+ .find_element(
+ [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+ ->NodePtr;
+ const auto &F1Copy =
+ *R0Copy.Callees
+ .find_element(
+ [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+ ->NodePtr;
+ EXPECT_EQ(&R0Orig, F1Orig.Parent);
+ EXPECT_EQ(&R0Copy, F1Copy.Parent);
+}
+
+TEST(FunctionCallTrieTest, MergeInto) {
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie T0(A);
+ FunctionCallTrie T1(A);
+
+ // 1 -> 2 -> 3
+ T0.enterFunction(1, 0);
+ T0.enterFunction(2, 1);
+ T0.enterFunction(3, 2);
+ T0.exitFunction(3, 3);
+ T0.exitFunction(2, 4);
+ T0.exitFunction(1, 5);
+
+ // 1 -> 2 -> 3
+ T1.enterFunction(1, 0);
+ T1.enterFunction(2, 1);
+ T1.enterFunction(3, 2);
+ T1.exitFunction(3, 3);
+ T1.exitFunction(2, 4);
+ T1.exitFunction(1, 5);
+
+ // We use a different allocator here to make sure that we're able to transfer
+ // data into a FunctionCallTrie which uses a different allocator. This
+ // reflects the intended usage scenario for when we're collecting profiles that
+ // aggregate across threads.
+ auto B = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Merged(B);
+
+ T0.mergeInto(Merged);
+ T1.mergeInto(Merged);
+
+ ASSERT_EQ(Merged.getRoots().size(), 1u);
+ const auto &R0 = *Merged.getRoots()[0];
+ EXPECT_EQ(R0.FId, 1);
+ EXPECT_EQ(R0.CallCount, 2);
+ EXPECT_EQ(R0.CumulativeLocalTime, 10);
+ EXPECT_EQ(R0.Callees.size(), 1u);
+
+ const auto &F1 = *R0.Callees[0].NodePtr;
+ EXPECT_EQ(F1.FId, 2);
+ EXPECT_EQ(F1.CallCount, 2);
+ EXPECT_EQ(F1.CumulativeLocalTime, 6);
+ EXPECT_EQ(F1.Callees.size(), 1u);
+
+ const auto &F2 = *F1.Callees[0].NodePtr;
+ EXPECT_EQ(F2.FId, 3);
+ EXPECT_EQ(F2.CallCount, 2);
+ EXPECT_EQ(F2.CumulativeLocalTime, 2);
+ EXPECT_EQ(F2.Callees.size(), 0u);
+}
+
+} // namespace
+
+} // namespace __xray
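
The tests above walk the trie by hand through getRoots() and Callees[i].NodePtr. As a hedged illustration of that same traversal, here is a small recursive helper; it assumes the node type is exposed as FunctionCallTrie::Node (the tests only name NodeIdPair and otherwise use auto) and that Callees supports range-for like the other segmented arrays. It is a sketch, not part of the change.

#include "xray_function_call_trie.h"

// Sum CallCount over a node and all of its transitive callees, mirroring the
// manual walks in MissingIntermediaryExit and MergeInto above.
static uint64_t totalCallCount(const __xray::FunctionCallTrie::Node &N) {
  uint64_t Count = N.CallCount;
  for (const auto &C : N.Callees)   // NodeIdPair entries, as in the DeepCopy test
    Count += totalCallCount(*C.NodePtr);
  return Count;
}

// Against the MergeInto scenario above, totalCallCount(*Merged.getRoots()[0])
// would be 6: two calls each for FIds 1, 2, and 3.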
diff --git a/lib/xray/tests/unit/profile_collector_test.cc b/lib/xray/tests/unit/profile_collector_test.cc
new file mode 100644
index 000000000000..b7dbe567312a
--- /dev/null
+++ b/lib/xray/tests/unit/profile_collector_test.cc
@@ -0,0 +1,179 @@
+//===-- profile_collector_test.cc -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_profile_collector.h"
+#include "xray_profiling_flags.h"
+#include <cstdint>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace __xray {
+namespace {
+
+static constexpr auto kHeaderSize = 16u;
+
+void ValidateBlock(XRayBuffer B) {
+ profilingFlags()->setDefaults();
+ ASSERT_NE(static_cast<const void *>(B.Data), nullptr);
+ ASSERT_NE(B.Size, 0u);
+ ASSERT_GE(B.Size, kHeaderSize);
+ // We look at the block size, the block number, and the thread ID to ensure
+ // that none of them are zero (or that the header data is laid out as we
+ // expect).
+ char LocalBuffer[kHeaderSize] = {};
+ internal_memcpy(LocalBuffer, B.Data, kHeaderSize);
+ u32 BlockSize = 0;
+ u32 BlockNumber = 0;
+ u64 ThreadId = 0;
+ internal_memcpy(&BlockSize, LocalBuffer, sizeof(u32));
+ internal_memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32));
+ internal_memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64));
+ ASSERT_NE(BlockSize, 0u);
+ ASSERT_GE(BlockNumber, 0u);
+ ASSERT_NE(ThreadId, 0u);
+}
+
+std::tuple<u32, u32, u64> ParseBlockHeader(XRayBuffer B) {
+ char LocalBuffer[kHeaderSize] = {};
+ internal_memcpy(LocalBuffer, B.Data, kHeaderSize);
+ u32 BlockSize = 0;
+ u32 BlockNumber = 0;
+ u64 ThreadId = 0;
+ internal_memcpy(&BlockSize, LocalBuffer, sizeof(u32));
+ internal_memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32));
+ internal_memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64));
+ return std::make_tuple(BlockSize, BlockNumber, ThreadId);
+}
+
+struct Profile {
+ int64_t CallCount;
+ int64_t CumulativeLocalTime;
+ std::vector<int32_t> Path;
+};
+
+std::tuple<Profile, const char *> ParseProfile(const char *P) {
+ Profile Result;
+ // Read the path first, until we find a sentinel 0.
+ int32_t F;
+ do {
+ internal_memcpy(&F, P, sizeof(int32_t));
+ P += sizeof(int32_t);
+ Result.Path.push_back(F);
+ } while (F != 0);
+
+ // Then read the CallCount.
+ internal_memcpy(&Result.CallCount, P, sizeof(int64_t));
+ P += sizeof(int64_t);
+
+ // Then read the CumulativeLocalTime.
+ internal_memcpy(&Result.CumulativeLocalTime, P, sizeof(int64_t));
+ P += sizeof(int64_t);
+ return std::make_tuple(std::move(Result), P);
+}
+
+TEST(profileCollectorServiceTest, PostSerializeCollect) {
+ profilingFlags()->setDefaults();
+ // The most basic use-case (the only one we actually care about) is the one
+ // where we ensure that we can post FunctionCallTrie instances, which are then
+ // destroyed but serialized properly.
+ //
+ // First, we initialise a set of allocators in the local scope. This ensures
+ // that we're able to copy the contents of the FunctionCallTrie that uses
+ // the local allocators.
+ auto Allocators = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie T(Allocators);
+
+ // Then, we populate the trie with some data.
+ T.enterFunction(1, 1);
+ T.enterFunction(2, 2);
+ T.exitFunction(2, 3);
+ T.exitFunction(1, 4);
+
+ // Then we post the data to the global profile collector service.
+ profileCollectorService::post(T, 1);
+
+ // Then we serialize the data.
+ profileCollectorService::serialize();
+
+ // Then we go through a single buffer to see whether we're getting the data we
+ // expect.
+ auto B = profileCollectorService::nextBuffer({nullptr, 0});
+ ValidateBlock(B);
+ u32 BlockSize;
+ u32 BlockNum;
+ u64 ThreadId;
+ std::tie(BlockSize, BlockNum, ThreadId) = ParseBlockHeader(B);
+
+ // We look at the serialized buffer to see whether the Trie we're expecting
+ // to see is there.
+ auto DStart = static_cast<const char *>(B.Data) + kHeaderSize;
+ std::vector<char> D(DStart, DStart + BlockSize);
+ B = profileCollectorService::nextBuffer(B);
+ ASSERT_EQ(B.Data, nullptr);
+ ASSERT_EQ(B.Size, 0u);
+
+ Profile Profile1, Profile2;
+ auto P = static_cast<const char *>(D.data());
+ std::tie(Profile1, P) = ParseProfile(P);
+ std::tie(Profile2, P) = ParseProfile(P);
+
+ ASSERT_NE(Profile1.Path.size(), Profile2.Path.size());
+ auto &P1 = Profile1.Path.size() < Profile2.Path.size() ? Profile2 : Profile1;
+ auto &P2 = Profile1.Path.size() < Profile2.Path.size() ? Profile1 : Profile2;
+ std::vector<int32_t> P1Expected = {2, 1, 0};
+ std::vector<int32_t> P2Expected = {1, 0};
+ ASSERT_EQ(P1.Path.size(), P1Expected.size());
+ ASSERT_EQ(P2.Path.size(), P2Expected.size());
+ ASSERT_EQ(P1.Path, P1Expected);
+ ASSERT_EQ(P2.Path, P2Expected);
+}
+
+// We break out a function that will be run in multiple threads, one that will
+// use a thread local allocator, and will post the FunctionCallTrie to the
+// profileCollectorService. This simulates what the threads being profiled would
+// be doing anyway, but through the XRay logging implementation.
+void threadProcessing() {
+ thread_local auto Allocators = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie T(Allocators);
+
+ T.enterFunction(1, 1);
+ T.enterFunction(2, 2);
+ T.exitFunction(2, 3);
+ T.exitFunction(1, 4);
+
+ profileCollectorService::post(T, GetTid());
+}
+
+TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) {
+ profilingFlags()->setDefaults();
+ std::thread t1(threadProcessing);
+ std::thread t2(threadProcessing);
+
+ t1.join();
+ t2.join();
+
+ // At this point, t1 and t2 are already done with what they were doing.
+ profileCollectorService::serialize();
+
+ // Ensure that we see two buffers.
+ auto B = profileCollectorService::nextBuffer({nullptr, 0});
+ ValidateBlock(B);
+
+ B = profileCollectorService::nextBuffer(B);
+ ValidateBlock(B);
+}
+
+} // namespace
+} // namespace __xray
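
The block layout that ParseBlockHeader and ParseProfile decode above can be read back in a single pass: a 16-byte header (u32 block size, u32 block number, u64 thread id) followed by profiles, each a 0-terminated sequence of int32 function IDs plus two int64 counters. The sketch below is illustrative only; it assumes, as the test does, that BlockSize counts the bytes following the header and that the payload is densely packed with profiles.

#include <cstdint>
#include <cstring>
#include <vector>

struct ParsedProfile {
  std::vector<int32_t> Path;   // function IDs, 0-terminated on the wire
  int64_t CallCount = 0;
  int64_t CumulativeLocalTime = 0;
};

std::vector<ParsedProfile> parseBlock(const char *Block) {
  uint32_t BlockSize = 0;
  std::memcpy(&BlockSize, Block, sizeof(uint32_t));  // u32 size, u32 number, u64 tid
  const char *P = Block + 16;                        // skip the 16-byte header
  const char *End = P + BlockSize;
  std::vector<ParsedProfile> Profiles;
  while (P < End) {
    ParsedProfile Prof;
    int32_t F = 0;
    do {                                             // read the path until the 0 sentinel
      std::memcpy(&F, P, sizeof(int32_t));
      P += sizeof(int32_t);
      Prof.Path.push_back(F);
    } while (F != 0);
    std::memcpy(&Prof.CallCount, P, sizeof(int64_t));
    P += sizeof(int64_t);
    std::memcpy(&Prof.CumulativeLocalTime, P, sizeof(int64_t));
    P += sizeof(int64_t);
    Profiles.push_back(Prof);
  }
  return Profiles;
}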
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
new file mode 100644
index 000000000000..035674ccfaf5
--- /dev/null
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -0,0 +1,200 @@
+#include "xray_segmented_array.h"
+#include "gtest/gtest.h"
+
+namespace __xray {
+namespace {
+
+struct TestData {
+ s64 First;
+ s64 Second;
+
+ // Need a constructor for emplace operations.
+ TestData(s64 F, s64 S) : First(F), Second(S) {}
+};
+
+TEST(SegmentedArrayTest, ConstructWithAllocators) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> Data(A);
+ (void)Data;
+}
+
+TEST(SegmentedArrayTest, ConstructAndPopulate) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> data(A);
+ ASSERT_NE(data.Append(TestData{0, 0}), nullptr);
+ ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
+ ASSERT_EQ(data.size(), 2u);
+}
+
+TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> data(A);
+ ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
+ ASSERT_EQ(data.size(), 1u);
+ ASSERT_EQ(data[0].First, 0);
+ ASSERT_EQ(data[0].Second, 1);
+}
+
+TEST(SegmentedArrayTest, PopulateWithMoreElements) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 24);
+ Array<TestData> data(A);
+ static const auto kMaxElements = 100u;
+ for (auto I = 0u; I < kMaxElements; ++I) {
+ ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
+ }
+ ASSERT_EQ(data.size(), kMaxElements);
+ for (auto I = 0u; I < kMaxElements; ++I) {
+ ASSERT_EQ(data[I].First, I);
+ ASSERT_EQ(data[I].Second, I + 1);
+ }
+}
+
+TEST(SegmentedArrayTest, AppendEmplace) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> data(A);
+ ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+ ASSERT_EQ(data[0].First, 1);
+ ASSERT_EQ(data[0].Second, 1);
+}
+
+TEST(SegmentedArrayTest, AppendAndTrim) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> data(A);
+ ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+ ASSERT_EQ(data.size(), 1u);
+ data.trim(1);
+ ASSERT_EQ(data.size(), 0u);
+ ASSERT_TRUE(data.empty());
+}
+
+TEST(SegmentedArrayTest, IteratorAdvance) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> data(A);
+ ASSERT_TRUE(data.empty());
+ ASSERT_EQ(data.begin(), data.end());
+ auto I0 = data.begin();
+ ASSERT_EQ(I0++, data.begin());
+ ASSERT_NE(I0, data.begin());
+ for (const auto &D : data) {
+ (void)D;
+ FAIL();
+ }
+ ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+ ASSERT_EQ(data.size(), 1u);
+ ASSERT_NE(data.begin(), data.end());
+ auto &D0 = *data.begin();
+ ASSERT_EQ(D0.First, 1);
+ ASSERT_EQ(D0.Second, 1);
+}
+
+TEST(SegmentedArrayTest, IteratorRetreat) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ Array<TestData> data(A);
+ ASSERT_TRUE(data.empty());
+ ASSERT_EQ(data.begin(), data.end());
+ ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+ ASSERT_EQ(data.size(), 1u);
+ ASSERT_NE(data.begin(), data.end());
+ auto &D0 = *data.begin();
+ ASSERT_EQ(D0.First, 1);
+ ASSERT_EQ(D0.Second, 1);
+
+ auto I0 = data.end();
+ ASSERT_EQ(I0--, data.end());
+ ASSERT_NE(I0, data.end());
+ ASSERT_EQ(I0, data.begin());
+ ASSERT_EQ(I0->First, 1);
+ ASSERT_EQ(I0->Second, 1);
+}
+
+TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 20);
+ Array<TestData> Data(A);
+ ASSERT_TRUE(Data.empty());
+ auto I0Begin = Data.begin(), I0End = Data.end();
+ // Add enough elements in Data to have more than one chunk.
+ constexpr auto Segment = Array<TestData>::SegmentSize;
+ constexpr auto SegmentX2 = Segment * 2;
+ for (auto i = SegmentX2; i > 0u; --i) {
+ Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+ }
+ ASSERT_EQ(Data.size(), SegmentX2);
+ {
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.First, 1);
+ ASSERT_EQ(Back.Second, 1);
+ }
+
+ // Trim one chunk's elements worth.
+ Data.trim(Segment);
+ ASSERT_EQ(Data.size(), Segment);
+
+ // Check that we are still able to access 'back' properly.
+ {
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.First, static_cast<s64>(Segment + 1));
+ ASSERT_EQ(Back.Second, static_cast<s64>(Segment + 1));
+ }
+
+ // Then trim until it's empty.
+ Data.trim(Segment);
+ ASSERT_TRUE(Data.empty());
+
+ // Here our iterators should be the same.
+ auto I1Begin = Data.begin(), I1End = Data.end();
+ EXPECT_EQ(I0Begin, I1Begin);
+ EXPECT_EQ(I0End, I1End);
+
+ // Then we ensure that adding elements back works just fine.
+ for (auto i = SegmentX2; i > 0u; --i) {
+ Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+ }
+ EXPECT_EQ(Data.size(), SegmentX2);
+}
+
+struct ShadowStackEntry {
+ uint64_t EntryTSC = 0;
+ uint64_t *NodePtr = nullptr;
+ ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {}
+};
+
+TEST(SegmentedArrayTest, SimulateStackBehaviour) {
+ using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
+ AllocatorType A(1 << 10);
+ Array<ShadowStackEntry> Data(A);
+ static uint64_t Dummy = 0;
+ constexpr uint64_t Max = 9;
+
+ for (uint64_t i = 0; i < Max; ++i) {
+ auto P = Data.Append({i, &Dummy});
+ ASSERT_NE(P, nullptr);
+ ASSERT_EQ(P->NodePtr, &Dummy);
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.NodePtr, &Dummy);
+ ASSERT_EQ(Back.EntryTSC, i);
+ }
+
+ // Simulate a stack by checking the data from the end as we're trimming.
+ auto Counter = Max;
+ ASSERT_EQ(Data.size(), size_t(Max));
+ while (!Data.empty()) {
+ const auto &Top = Data.back();
+ uint64_t *TopNode = Top.NodePtr;
+ EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter;
+ Data.trim(1);
+ --Counter;
+ ASSERT_EQ(Data.size(), size_t(Counter));
+ }
+}
+
+} // namespace
+} // namespace __xray
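
SimulateStackBehaviour above treats Array<T> as a LIFO, which is also how the FunctionCallTrie shadow stack uses it. Below is a condensed, hedged sketch of that pattern using only operations the tests demonstrate (AppendEmplace, back, trim); the element type and the 1 << 10 allocator budget are placeholders, and the sketch is not part of the change.

#include "xray_segmented_array.h"
#include <cstdint>

struct StackFrame {
  int32_t FId;
  uint64_t EntryTSC;
  StackFrame(int32_t F, uint64_t T) : FId(F), EntryTSC(T) {}
};

void shadowStackSketch() {
  using AllocatorType = __xray::Array<StackFrame>::AllocatorType;
  AllocatorType A(1 << 10);
  __xray::Array<StackFrame> Stack(A);

  if (Stack.AppendEmplace(/*FId=*/1, /*TSC=*/100) == nullptr)
    return;                       // push failed: allocator budget exhausted
  const auto &Top = Stack.back(); // peek at the most recent frame
  (void)Top;
  Stack.trim(1);                  // pop
}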
diff --git a/lib/xray/xray_AArch64.cc b/lib/xray/xray_AArch64.cc
index f26e77dd7fc1..096de009e83c 100644
--- a/lib/xray/xray_AArch64.cc
+++ b/lib/xray/xray_AArch64.cc
@@ -112,6 +112,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
return false;
}
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+ // FIXME: Implement in aarch64?
+ return false;
+}
+
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
new file mode 100644
index 000000000000..8244815284a8
--- /dev/null
+++ b/lib/xray/xray_allocator.h
@@ -0,0 +1,129 @@
+//===-- xray_allocator.h ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Defines the allocator interface for an arena allocator, used primarily for
+// the profiling runtime.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_ALLOCATOR_H
+#define XRAY_ALLOCATOR_H
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_posix.h"
+#include "xray_utils.h"
+#include <sys/mman.h>
+#include <cstddef>
+#include <cstdint>
+
+#ifndef MAP_NORESERVE
+// MAP_NORESERVE is a no-op on NetBSD (at least) and an unsupported flag on FreeBSD, where it is not needed.
+#define MAP_NORESERVE 0
+#endif
+
+namespace __xray {
+
+/// The Allocator type hands out fixed-sized chunks of memory that are
+/// cache-line aligned and sized. This is useful for placement of
+/// performance-sensitive data in memory that's frequently accessed. The
+/// allocator also self-limits the peak memory usage to a dynamically defined
+/// maximum.
+///
+/// N is the lower-bound size of the block of memory to return from the
+/// allocation function. N is used to compute the size of a block, which is
+/// cache-line-size multiples worth of memory. We compute the size of a block by
+/// determining how many cache lines worth of memory is required to subsume N.
+///
+/// The Allocator instance will manage its own memory acquired through mmap.
+/// This severely constrains the platforms on which this can be used to POSIX
+/// systems where mmap semantics are well-defined.
+///
+/// FIXME: Isolate the lower-level memory management to a different abstraction
+/// that can be platform-specific.
+template <size_t N> struct Allocator {
+ // The Allocator returns memory as Block instances.
+ struct Block {
+ /// Compute the minimum cache-line size multiple that is >= N.
+ static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
+ void *Data;
+ };
+
+private:
+ const size_t MaxMemory{0};
+ void *BackingStore = nullptr;
+ void *AlignedNextBlock = nullptr;
+ size_t AllocatedBlocks = 0;
+ SpinMutex Mutex{};
+
+ void *Alloc() {
+ SpinMutexLock Lock(&Mutex);
+ if (UNLIKELY(BackingStore == nullptr)) {
+ BackingStore = reinterpret_cast<void *>(
+ internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0));
+ if (BackingStore == MAP_FAILED) {
+ BackingStore = nullptr;
+ if (Verbosity())
+ Report("XRay Profiling: Failed to allocate memory for allocator.\n");
+ return nullptr;
+ }
+
+ AlignedNextBlock = BackingStore;
+
+ // Ensure that NextBlock is aligned appropriately.
+ auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore);
+ auto AlignedNextBlockNum = nearest_boundary(
+ reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize);
+ if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
+ munmap(BackingStore, MaxMemory);
+ AlignedNextBlock = BackingStore = nullptr;
+ if (Verbosity())
+ Report("XRay Profiling: Cannot obtain enough memory from "
+ "preallocated region.\n");
+ return nullptr;
+ }
+
+ AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+
+ // Assert that AlignedNextBlock is cache-line aligned.
+ DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize,
+ 0);
+ }
+
+ if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+ return nullptr;
+
+ // Align the pointer we'd like to return to an appropriate alignment, then
+ // advance the pointer from where to start allocations.
+ void *Result = AlignedNextBlock;
+ AlignedNextBlock = reinterpret_cast<void *>(
+ reinterpret_cast<char *>(AlignedNextBlock) + N);
+ ++AllocatedBlocks;
+ return Result;
+ }
+
+public:
+ explicit Allocator(size_t M)
+ : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
+
+ Block Allocate() { return {Alloc()}; }
+
+ ~Allocator() NOEXCEPT {
+ if (BackingStore != nullptr) {
+ internal_munmap(BackingStore, MaxMemory);
+ }
+ }
+};
+
+} // namespace __xray
+
+#endif // XRAY_ALLOCATOR_H
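
As a quick reference for the interface above: Allocate() hands back Block instances whose Data pointer becomes nullptr once the configured budget is consumed, and Block::Size rounds the request up to a cache-line multiple (with the usual 64-byte line, Allocator<16> would report Block::Size == 64, i.e. nearest_boundary(16, 64)). A hedged usage sketch mirroring allocator_test.cc, with illustrative sizes only:

#include "xray_allocator.h"
#include <cstddef>

void allocatorSketch() {
  // Request 16-byte chunks from a roughly 4 KiB budget.
  __xray::Allocator<16> A(2 << 11);
  size_t Blocks = 0;
  while (A.Allocate().Data != nullptr)  // Data is nullptr once the budget is spent
    ++Blocks;
  (void)Blocks;                         // bounded by MaxMemory / Block::Size
}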
diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc
index da4efcdd2b17..5b828287e3f6 100644
--- a/lib/xray/xray_arm.cc
+++ b/lib/xray/xray_arm.cc
@@ -149,6 +149,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
return false;
}
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+ // FIXME: Implement in arm?
+ return false;
+}
+
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
diff --git a/lib/xray/xray_basic_flags.cc b/lib/xray/xray_basic_flags.cc
new file mode 100644
index 000000000000..14d805c71a88
--- /dev/null
+++ b/lib/xray/xray_basic_flags.cc
@@ -0,0 +1,50 @@
+//===-- xray_basic_flags.cc -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay Basic flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_basic_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+/// Use via basicFlags().
+BasicFlags xray_basic_flags_dont_use_directly;
+
+void BasicFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_basic_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerXRayBasicFlags(FlagParser *P,
+ BasicFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) \
+ RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_basic_flags.inc"
+#undef XRAY_FLAG
+}
+
+const char *useCompilerDefinedBasicFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_BASIC_OPTIONS
+ return SANITIZER_STRINGIFY(XRAY_BASIC_OPTIONS);
+#else
+ return "";
+#endif
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_basic_flags.h b/lib/xray/xray_basic_flags.h
new file mode 100644
index 000000000000..041578f0663c
--- /dev/null
+++ b/lib/xray/xray_basic_flags.h
@@ -0,0 +1,38 @@
+//===-- xray_basic_flags.h -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay Basic Mode runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_BASIC_FLAGS_H
+#define XRAY_BASIC_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct BasicFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_basic_flags.inc"
+#undef XRAY_FLAG
+
+ void setDefaults();
+};
+
+extern BasicFlags xray_basic_flags_dont_use_directly;
+extern void registerXRayBasicFlags(FlagParser *P, BasicFlags *F);
+const char *useCompilerDefinedBasicFlags();
+inline BasicFlags *basicFlags() { return &xray_basic_flags_dont_use_directly; }
+
+} // namespace __xray
+
+#endif // XRAY_BASIC_FLAGS_H
diff --git a/lib/xray/xray_basic_flags.inc b/lib/xray/xray_basic_flags.inc
new file mode 100644
index 000000000000..327735b51055
--- /dev/null
+++ b/lib/xray/xray_basic_flags.inc
@@ -0,0 +1,24 @@
+//===-- xray_basic_flags.inc ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+XRAY_FLAG(int, func_duration_threshold_us, 5,
+ "Basic logging will try to skip functions that execute for fewer "
+ "microseconds than this threshold.")
+XRAY_FLAG(int, max_stack_depth, 64,
+ "Basic logging will keep track of at most this deep a call stack, "
+ "any more and the recordings will be dropped.")
+XRAY_FLAG(int, thread_buffer_size, 1024,
+ "The number of entries to keep on a per-thread buffer.")
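
For readers unfamiliar with the X-macro pattern these .inc files rely on: each inclusion site defines XRAY_FLAG differently, so the three entries above expand into struct members, default assignments, and flag registrations. A condensed sketch combining the first two expansion sites (they correspond to xray_basic_flags.h and BasicFlags::setDefaults() in this change; the struct name here is hypothetical):

struct BasicFlagsSketch {
  // Expands to: int func_duration_threshold_us; int max_stack_depth; int thread_buffer_size;
#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
#include "xray_basic_flags.inc"
#undef XRAY_FLAG

  void setDefaults() {
    // Expands to: func_duration_threshold_us = 5; max_stack_depth = 64; thread_buffer_size = 1024;
#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
#include "xray_basic_flags.inc"
#undef XRAY_FLAG
  }
};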
diff --git a/lib/xray/xray_inmemory_log.cc b/lib/xray/xray_basic_logging.cc
index a27ffbcbd12e..585ca641cd0c 100644
--- a/lib/xray/xray_inmemory_log.cc
+++ b/lib/xray/xray_basic_logging.cc
@@ -1,4 +1,4 @@
-//===-- xray_inmemory_log.cc ------------------------------------*- C++ -*-===//
+//===-- xray_basic_logging.cc -----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,8 +15,6 @@
//
//===----------------------------------------------------------------------===//
-#include <cassert>
-#include <cstring>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
@@ -29,16 +27,18 @@
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "xray/xray_records.h"
+#include "xray_recursion_guard.h"
+#include "xray_basic_flags.h"
+#include "xray_basic_logging.h"
#include "xray_defs.h"
#include "xray_flags.h"
-#include "xray_inmemory_log.h"
#include "xray_interface_internal.h"
#include "xray_tsc.h"
#include "xray_utils.h"
namespace __xray {
-__sanitizer::SpinMutex LogMutex;
+SpinMutex LogMutex;
// We use elements of this type to record the entry TSC of every function ID we
// see as we're tracing a particular thread's execution.
@@ -60,43 +60,41 @@ struct alignas(64) ThreadLocalData {
size_t StackSize = 0;
size_t StackEntries = 0;
int Fd = -1;
- pid_t TID = 0;
};
static pthread_key_t PThreadKey;
-static __sanitizer::atomic_uint8_t BasicInitialized{0};
+static atomic_uint8_t BasicInitialized{0};
BasicLoggingOptions GlobalOptions;
-thread_local volatile bool RecursionGuard = false;
+thread_local atomic_uint8_t Guard{0};
-static uint64_t thresholdTicks() XRAY_NEVER_INSTRUMENT {
- static uint64_t TicksPerSec = probeRequiredCPUFeatures()
- ? getTSCFrequency()
- : __xray::NanosecondsPerSecond;
- static const uint64_t ThresholdTicks =
- TicksPerSec * GlobalOptions.DurationFilterMicros / 1000000;
- return ThresholdTicks;
-}
+static atomic_uint8_t UseRealTSC{0};
+static atomic_uint64_t ThresholdTicks{0};
+static atomic_uint64_t TicksPerSec{0};
+static atomic_uint64_t CycleFrequency{NanosecondsPerSecond};
static int openLogFile() XRAY_NEVER_INSTRUMENT {
int F = getLogFD();
if (F == -1)
return -1;
- // Test for required CPU features and cache the cycle frequency
- static bool TSCSupported = probeRequiredCPUFeatures();
- static uint64_t CycleFrequency =
- TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
+ static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT;
+ pthread_once(&DetectOnce, +[] {
+ if (atomic_load(&UseRealTSC, memory_order_acquire))
+ atomic_store(&CycleFrequency, getTSCFrequency(), memory_order_release);
+ });
// Since we're here, we get to write the header. We set it up so that the
// header will only be written once, at the start, and let the threads
// logging do writes which just append.
XRayFileHeader Header;
- Header.Version = 2; // Version 2 includes tail exit records.
+ // Version 2 includes tail exit records.
+ // Version 3 includes pid inside records.
+ Header.Version = 3;
Header.Type = FileTypes::NAIVE_LOG;
- Header.CycleFrequency = CycleFrequency;
+ Header.CycleFrequency = atomic_load(&CycleFrequency, memory_order_acquire);
// FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc'
// before setting the values in the header.
@@ -107,20 +105,21 @@ static int openLogFile() XRAY_NEVER_INSTRUMENT {
return F;
}
-int getGlobalFd() XRAY_NEVER_INSTRUMENT {
- static int Fd = openLogFile();
+static int getGlobalFd() XRAY_NEVER_INSTRUMENT {
+ static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+ static int Fd = 0;
+ pthread_once(&OnceInit, +[] { Fd = openLogFile(); });
return Fd;
}
-ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
thread_local ThreadLocalData TLD;
thread_local bool UNUSED TOnce = [] {
if (GlobalOptions.ThreadBufferSize == 0) {
- if (__sanitizer::Verbosity())
+ if (Verbosity())
Report("Not initializing TLD since ThreadBufferSize == 0.\n");
return false;
}
- TLD.TID = __sanitizer::GetTid();
pthread_setspecific(PThreadKey, &TLD);
TLD.Fd = getGlobalFd();
TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>(
@@ -129,7 +128,7 @@ ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
TLD.BufferSize = GlobalOptions.ThreadBufferSize;
TLD.BufferOffset = 0;
if (GlobalOptions.MaxStackDepth == 0) {
- if (__sanitizer::Verbosity())
+ if (Verbosity())
Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n");
TLD.StackSize = 0;
TLD.StackEntries = 0;
@@ -141,13 +140,6 @@ ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
alignof(StackEntry)));
TLD.StackSize = GlobalOptions.MaxStackDepth;
TLD.StackEntries = 0;
- if (__sanitizer::Verbosity() >= 2) {
- static auto UNUSED Once = [] {
- auto ticks = thresholdTicks();
- Report("Ticks threshold: %d\n", ticks);
- return false;
- }();
- }
return false;
}();
return TLD;
@@ -157,7 +149,6 @@ template <class RDTSC>
void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
auto &TLD = getThreadLocalData();
- auto &InMemoryBuffer = TLD.InMemoryBuffer;
int Fd = getGlobalFd();
if (Fd == -1)
return;
@@ -165,10 +156,9 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
// Use a simple recursion guard, to handle cases where we're already logging
// and for one reason or another, this function gets called again in the same
// thread.
- if (RecursionGuard)
+ RecursionGuard G(Guard);
+ if (!G)
return;
- RecursionGuard = true;
- auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; });
uint8_t CPU = 0;
uint64_t TSC = ReadTSC(CPU);
@@ -189,7 +179,7 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
E.TSC = TSC;
auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
(sizeof(StackEntry) * (TLD.StackEntries - 1));
- __sanitizer::internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry));
+ internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry));
break;
}
case XRayEntryType::EXIT:
@@ -213,12 +203,12 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
StackEntry StackTop;
auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
(sizeof(StackEntry) * TLD.StackEntries);
- __sanitizer::internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry));
+ internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry));
if (StackTop.FuncId == FuncId && StackTop.CPU == CPU &&
StackTop.TSC < TSC) {
auto Delta = TSC - StackTop.TSC;
- if (Delta < thresholdTicks()) {
- assert(TLD.BufferOffset > 0);
+ if (Delta < atomic_load(&ThresholdTicks, memory_order_relaxed)) {
+ DCHECK(TLD.BufferOffset > 0);
TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2;
return;
}
@@ -227,27 +217,26 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
}
default:
// Should be unreachable.
- assert(false && "Unsupported XRayEntryType encountered.");
+ DCHECK(false && "Unsupported XRayEntryType encountered.");
break;
}
// First determine whether the delta between the function's enter record and
// the exit record is higher than the threshold.
- __xray::XRayRecord R;
+ XRayRecord R;
R.RecordType = RecordTypes::NORMAL;
R.CPU = CPU;
R.TSC = TSC;
- R.TId = TLD.TID;
+ R.TId = GetTid();
+ R.PId = internal_getpid();
R.Type = Type;
R.FuncId = FuncId;
- auto EntryPtr = static_cast<char *>(InMemoryBuffer) +
- (sizeof(__xray::XRayRecord) * TLD.BufferOffset);
- __sanitizer::internal_memcpy(EntryPtr, &R, sizeof(R));
+ auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer);
+ internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
if (++TLD.BufferOffset == TLD.BufferSize) {
- __sanitizer::SpinMutexLock L(&LogMutex);
- auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer);
- retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer),
- reinterpret_cast<char *>(RecordBuffer + TLD.BufferOffset));
+ SpinMutexLock L(&LogMutex);
+ retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
+ reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
TLD.BufferOffset = 0;
TLD.StackEntries = 0;
}
@@ -257,8 +246,8 @@ template <class RDTSC>
void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
auto &TLD = getThreadLocalData();
- auto &InMemoryBuffer = TLD.InMemoryBuffer;
- auto &Offset = TLD.BufferOffset;
+ auto FirstEntry =
+ reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer);
const auto &BuffLen = TLD.BufferSize;
int Fd = getGlobalFd();
if (Fd == -1)
@@ -267,45 +256,41 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
// First we check whether there's enough space to write the data consecutively
// in the thread-local buffer. If not, we first flush the buffer before
// attempting to write the two records that must be consecutive.
- if (Offset + 2 > BuffLen) {
- __sanitizer::SpinMutexLock L(&LogMutex);
- auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer);
- retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer),
- reinterpret_cast<char *>(RecordBuffer + Offset));
- Offset = 0;
+ if (TLD.BufferOffset + 2 > BuffLen) {
+ SpinMutexLock L(&LogMutex);
+ retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
+ reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+ TLD.BufferOffset = 0;
TLD.StackEntries = 0;
}
// Then we write the "we have an argument" record.
InMemoryRawLog(FuncId, Type, ReadTSC);
- if (RecursionGuard)
+ RecursionGuard G(Guard);
+ if (!G)
return;
- RecursionGuard = true;
- auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; });
- // And from here on write the arg payload.
- __xray::XRayArgPayload R;
+  // And from here on, write the arg payload.
+ XRayArgPayload R;
R.RecordType = RecordTypes::ARG_PAYLOAD;
R.FuncId = FuncId;
- R.TId = TLD.TID;
+ R.TId = GetTid();
+ R.PId = internal_getpid();
R.Arg = Arg1;
- auto EntryPtr =
- &reinterpret_cast<__xray::XRayArgPayload *>(&InMemoryBuffer)[Offset];
- std::memcpy(EntryPtr, &R, sizeof(R));
- if (++Offset == BuffLen) {
- __sanitizer::SpinMutexLock L(&LogMutex);
- auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer);
- retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer),
- reinterpret_cast<char *>(RecordBuffer + Offset));
- Offset = 0;
+ internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
+ if (++TLD.BufferOffset == BuffLen) {
+ SpinMutexLock L(&LogMutex);
+ retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
+ reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+ TLD.BufferOffset = 0;
TLD.StackEntries = 0;
}
}
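The arg-payload path above reserves room for two records up front, so that the function entry record and its argument payload always land adjacently in the thread-local buffer before a flush can separate them. A minimal sketch of that reserve-then-write idea, using a hypothetical fixed-size Record and flush() helper rather than the real buffer bookkeeping:

  #include <cstddef>

  // Sketch only: Record, Buffer, and flush() are hypothetical stand-ins.
  struct Record { unsigned char Bytes[32]; };

  struct Buffer {
    Record Data[1024];
    size_t Offset = 0;

    void flush() {
      // Write Data[0..Offset) to the backing store, then reset.
      Offset = 0;
    }

    // Write N records that must stay consecutive: flush first if they would
    // not all fit, then append them back to back.
    template <size_t N> void writeConsecutive(const Record (&Rs)[N]) {
      if (Offset + N > 1024)
        flush();
      for (const Record &R : Rs)
        Data[Offset++] = R;
    }
  };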
void basicLoggingHandleArg0RealTSC(int32_t FuncId,
XRayEntryType Type) XRAY_NEVER_INSTRUMENT {
- InMemoryRawLog(FuncId, Type, __xray::readTSC);
+ InMemoryRawLog(FuncId, Type, readTSC);
}
void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type)
@@ -318,13 +303,13 @@ void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type)
TS = {0, 0};
}
CPU = 0;
- return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
+ return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
});
}
void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type,
uint64_t Arg1) XRAY_NEVER_INSTRUMENT {
- InMemoryRawLogWithArg(FuncId, Type, Arg1, __xray::readTSC);
+ InMemoryRawLogWithArg(FuncId, Type, Arg1, readTSC);
}
void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type,
@@ -338,34 +323,34 @@ void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type,
TS = {0, 0};
}
CPU = 0;
- return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
+ return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
});
}
static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P);
- auto ExitGuard = __sanitizer::at_scope_exit([&TLD] {
+ auto ExitGuard = at_scope_exit([&TLD] {
// Clean up dynamic resources.
if (TLD.InMemoryBuffer)
InternalFree(TLD.InMemoryBuffer);
if (TLD.ShadowStack)
InternalFree(TLD.ShadowStack);
- if (__sanitizer::Verbosity())
- Report("Cleaned up log for TID: %d\n", TLD.TID);
+ if (Verbosity())
+ Report("Cleaned up log for TID: %d\n", GetTid());
});
if (TLD.Fd == -1 || TLD.BufferOffset == 0) {
- if (__sanitizer::Verbosity())
- Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", TLD.TID,
+ if (Verbosity())
+ Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", GetTid(),
TLD.Fd, TLD.BufferOffset);
return;
}
{
- __sanitizer::SpinMutexLock L(&LogMutex);
+ SpinMutexLock L(&LogMutex);
retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer),
reinterpret_cast<char *>(TLD.InMemoryBuffer) +
- (sizeof(__xray::XRayRecord) * TLD.BufferOffset));
+ (sizeof(XRayRecord) * TLD.BufferOffset));
}
// Because this thread's exit could be the last one trying to write to
@@ -378,45 +363,89 @@ static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax,
void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
- static bool UNUSED Once = [] {
- pthread_key_create(&PThreadKey, TLDDestructor);
- return false;
- }();
-
uint8_t Expected = 0;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &BasicInitialized, &Expected, 1, __sanitizer::memory_order_acq_rel)) {
- if (__sanitizer::Verbosity())
+ if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1,
+ memory_order_acq_rel)) {
+ if (Verbosity())
Report("Basic logging already initialized.\n");
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
- if (OptionsSize != sizeof(BasicLoggingOptions)) {
+ static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+ pthread_once(&OnceInit, +[] {
+ pthread_key_create(&PThreadKey, TLDDestructor);
+ atomic_store(&UseRealTSC, probeRequiredCPUFeatures(), memory_order_release);
+ // Initialize the global TicksPerSec value.
+ atomic_store(&TicksPerSec,
+ probeRequiredCPUFeatures() ? getTSCFrequency()
+ : NanosecondsPerSecond,
+ memory_order_release);
+ if (!atomic_load(&UseRealTSC, memory_order_relaxed) && Verbosity())
+ Report("WARNING: Required CPU features missing for XRay instrumentation, "
+ "using emulation instead.\n");
+ });
+
+ if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) {
+ FlagParser P;
+ BasicFlags F;
+ F.setDefaults();
+ registerXRayBasicFlags(&P, &F);
+ P.ParseString(useCompilerDefinedBasicFlags());
+ auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
+ if (EnvOpts == nullptr)
+ EnvOpts = "";
+
+ P.ParseString(EnvOpts);
+
+ // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
+ // set through XRAY_OPTIONS instead.
+ if (internal_strlen(EnvOpts) == 0) {
+ F.func_duration_threshold_us =
+ flags()->xray_naive_log_func_duration_threshold_us;
+ F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
+ F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
+ }
+
+ P.ParseString(static_cast<const char *>(Options));
+ GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
+ GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
+ GlobalOptions.MaxStackDepth = F.max_stack_depth;
+ *basicFlags() = F;
+ } else if (OptionsSize != sizeof(BasicLoggingOptions)) {
Report("Invalid options size, potential ABI mismatch; expected %d got %d",
sizeof(BasicLoggingOptions), OptionsSize);
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ } else {
+ if (Verbosity())
+ Report("XRay Basic: struct-based init is deprecated, please use "
+ "string-based configuration instead.\n");
+ GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options);
}
- static auto UseRealTSC = probeRequiredCPUFeatures();
- if (!UseRealTSC && __sanitizer::Verbosity())
- Report("WARNING: Required CPU features missing for XRay instrumentation, "
- "using emulation instead.\n");
-
- GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options);
- __xray_set_handler_arg1(UseRealTSC ? basicLoggingHandleArg1RealTSC
- : basicLoggingHandleArg1EmulateTSC);
- __xray_set_handler(UseRealTSC ? basicLoggingHandleArg0RealTSC
- : basicLoggingHandleArg0EmulateTSC);
+ atomic_store(&ThresholdTicks,
+ atomic_load(&TicksPerSec, memory_order_acquire) *
+ GlobalOptions.DurationFilterMicros / 1000000,
+ memory_order_release);
+ __xray_set_handler_arg1(atomic_load(&UseRealTSC, memory_order_acquire)
+ ? basicLoggingHandleArg1RealTSC
+ : basicLoggingHandleArg1EmulateTSC);
+ __xray_set_handler(atomic_load(&UseRealTSC, memory_order_acquire)
+ ? basicLoggingHandleArg0RealTSC
+ : basicLoggingHandleArg0EmulateTSC);
+
+ // TODO: Implement custom event and typed event handling support in Basic
+ // Mode.
__xray_remove_customevent_handler();
+ __xray_remove_typedevent_handler();
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
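The threshold conversion above happens once at initialization: the microsecond filter is scaled by the measured (or emulated) tick rate. For example, assuming a 3 GHz TSC and the default 5 microsecond filter:

  ThresholdTicks = TicksPerSec * DurationFilterMicros / 1000000
                 = 3,000,000,000 * 5 / 1,000,000
                 = 15,000 ticks

Entry/exit pairs whose TSC delta falls below this many ticks are rewound out of the in-memory buffer rather than written out.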
XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT {
uint8_t Expected = 0;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &BasicInitialized, &Expected, 0, __sanitizer::memory_order_acq_rel) &&
- __sanitizer::Verbosity())
+ if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 0,
+ memory_order_acq_rel) &&
+ Verbosity())
Report("Basic logging already finalized.\n");
// Nothing really to do aside from marking state of the global to be
@@ -444,24 +473,41 @@ bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
};
auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl);
if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
- __sanitizer::Verbosity())
+ Verbosity())
Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n",
RegistrationResult);
if (flags()->xray_naive_log ||
- !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-basic")) {
- __xray_set_log_impl(Impl);
- BasicLoggingOptions Options;
- Options.DurationFilterMicros =
- flags()->xray_naive_log_func_duration_threshold_us;
- Options.MaxStackDepth = flags()->xray_naive_log_max_stack_depth;
- Options.ThreadBufferSize = flags()->xray_naive_log_thread_buffer_size;
- __xray_log_init(flags()->xray_naive_log_thread_buffer_size, 0, &Options,
- sizeof(BasicLoggingOptions));
- static auto UNUSED Once = [] {
- static auto UNUSED &TLD = getThreadLocalData();
- __sanitizer::Atexit(+[] { TLDDestructor(&TLD); });
+ !internal_strcmp(flags()->xray_mode, "xray-basic")) {
+ auto SelectResult = __xray_log_select_mode("xray-basic");
+ if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+ if (Verbosity())
+ Report("Failed selecting XRay Basic Mode; error = %d\n", SelectResult);
return false;
- }();
+ }
+
+    // At this point we initialize the implementation using the data we get
+    // from the XRAY_BASIC_OPTIONS environment variable.
+ auto *Env = GetEnv("XRAY_BASIC_OPTIONS");
+ auto InitResult =
+ __xray_log_init_mode("xray-basic", Env == nullptr ? "" : Env);
+ if (InitResult != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
+ if (Verbosity())
+ Report("Failed initializing XRay Basic Mode; error = %d\n", InitResult);
+ return false;
+ }
+
+    // At this point we know that we've successfully initialized Basic mode
+    // tracing, and the only chance the current thread will get to clean up
+    // may be at thread/program exit. To ensure the cleanup happens even
+    // without calling the finalization routines, we register a program exit
+    // function that will do the cleanup.
+ static pthread_once_t DynamicOnce = PTHREAD_ONCE_INIT;
+ pthread_once(&DynamicOnce, +[] {
+ static void *FakeTLD = nullptr;
+ FakeTLD = &getThreadLocalData();
+ Atexit(+[] { TLDDestructor(FakeTLD); });
+ });
}
return true;
}
diff --git a/lib/xray/xray_inmemory_log.h b/lib/xray/xray_basic_logging.h
index e4fcb8ca5ffd..1639b96d91a1 100644
--- a/lib/xray/xray_inmemory_log.h
+++ b/lib/xray/xray_basic_logging.h
@@ -1,5 +1,4 @@
-//===-- xray_inmemory_log.h
-//------------------------------------------------===//
+//===-- xray_basic_logging.h ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc
index a0018f6b0cba..8dfcc23540b1 100644
--- a/lib/xray/xray_buffer_queue.cc
+++ b/lib/xray/xray_buffer_queue.cc
@@ -16,14 +16,37 @@
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_libc.h"
+#include <memory>
using namespace __xray;
using namespace __sanitizer;
+template <class T> static T *initArray(size_t N) {
+ auto A = reinterpret_cast<T *>(
+ InternalAlloc(N * sizeof(T), nullptr, kCacheLineSize));
+ if (A != nullptr)
+ while (N > 0)
+ new (A + (--N)) T();
+ return A;
+}
+
BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
- : BufferSize(B), Buffers(new BufferRep[N]()), BufferCount(N), Finalizing{0},
- OwnedBuffers(new void *[N]()), Next(Buffers), First(Buffers),
- LiveBuffers(0) {
+ : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)),
+ BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)),
+ Next(Buffers), First(Buffers), LiveBuffers(0) {
+ if (Buffers == nullptr) {
+ Success = false;
+ return;
+ }
+ if (OwnedBuffers == nullptr) {
+ // Clean up the buffers we've already allocated.
+ for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
+ B->~BufferRep();
+ InternalFree(Buffers);
+ Success = false;
+ return;
+  }
+
for (size_t i = 0; i < N; ++i) {
auto &T = Buffers[i];
void *Tmp = InternalAlloc(BufferSize, nullptr, 64);
@@ -37,7 +60,7 @@ BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
return;
}
auto &Buf = T.Buff;
- Buf.Buffer = Tmp;
+ Buf.Data = Tmp;
Buf.Size = B;
Buf.Extents = reinterpret_cast<BufferExtents *>(Extents);
OwnedBuffers[i] = Tmp;
@@ -46,9 +69,9 @@ BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
}
BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
- if (__sanitizer::atomic_load(&Finalizing, __sanitizer::memory_order_acquire))
+ if (atomic_load(&Finalizing, memory_order_acquire))
return ErrorCode::QueueFinalizing;
- __sanitizer::SpinMutexLock Guard(&Mutex);
+ SpinMutexLock Guard(&Mutex);
if (LiveBuffers == BufferCount)
return ErrorCode::NotEnoughMemory;
@@ -68,7 +91,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
// Blitz through the buffers array to find the buffer.
bool Found = false;
for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) {
- if (*I == Buf.Buffer) {
+ if (*I == Buf.Data) {
Found = true;
break;
}
@@ -76,7 +99,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
if (!Found)
return ErrorCode::UnrecognizedBuffer;
- __sanitizer::SpinMutexLock Guard(&Mutex);
+ SpinMutexLock Guard(&Mutex);
// This points to a semantic bug, we really ought to not be releasing more
// buffers than we actually get.
@@ -86,7 +109,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
// Now that the buffer has been released, we mark it as "used".
First->Buff = Buf;
First->Used = true;
- Buf.Buffer = nullptr;
+ Buf.Data = nullptr;
Buf.Size = 0;
--LiveBuffers;
if (++First == (Buffers + BufferCount))
@@ -96,8 +119,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
}
BufferQueue::ErrorCode BufferQueue::finalize() {
- if (__sanitizer::atomic_exchange(&Finalizing, 1,
- __sanitizer::memory_order_acq_rel))
+ if (atomic_exchange(&Finalizing, 1, memory_order_acq_rel))
return ErrorCode::QueueFinalizing;
return ErrorCode::Ok;
}
@@ -106,9 +128,11 @@ BufferQueue::~BufferQueue() {
for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
auto &T = *I;
auto &Buf = T.Buff;
- InternalFree(Buf.Buffer);
+ InternalFree(Buf.Data);
InternalFree(Buf.Extents);
}
- delete[] Buffers;
- delete[] OwnedBuffers;
+ for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
+ B->~BufferRep();
+ InternalFree(Buffers);
+ InternalFree(OwnedBuffers);
}
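Because the backing arrays come from InternalAlloc with placement-new construction (via initArray above), tear-down has to be explicit as well: run the destructors, then free the raw block. A hypothetical destroyArray helper mirroring initArray would look like this, though the destructor above simply does the same work inline:

  template <class T> static void destroyArray(T *A, size_t N) {
    if (A == nullptr)
      return;
    // Destroy elements in reverse order of construction, then release the
    // raw allocation obtained from InternalAlloc.
    while (N > 0)
      A[--N].~T();
    InternalFree(A);
  }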
diff --git a/lib/xray/xray_buffer_queue.h b/lib/xray/xray_buffer_queue.h
index 1ceb58274616..e76fa7983c90 100644
--- a/lib/xray/xray_buffer_queue.h
+++ b/lib/xray/xray_buffer_queue.h
@@ -15,9 +15,10 @@
#ifndef XRAY_BUFFER_QUEUE_H
#define XRAY_BUFFER_QUEUE_H
-#include <cstddef>
#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#include <cstddef>
namespace __xray {
@@ -27,18 +28,17 @@ namespace __xray {
/// the "flight data recorder" (FDR) mode to support ongoing XRay function call
/// trace collection.
class BufferQueue {
- public:
+public:
struct alignas(64) BufferExtents {
- __sanitizer::atomic_uint64_t Size;
+ atomic_uint64_t Size;
};
struct Buffer {
- void *Buffer = nullptr;
+ void *Data = nullptr;
size_t Size = 0;
- BufferExtents* Extents;
+ BufferExtents *Extents;
};
- private:
struct BufferRep {
// The managed buffer.
Buffer Buff;
@@ -48,14 +48,72 @@ class BufferQueue {
bool Used = false;
};
+private:
+  // This models a ForwardIterator. |T| must be either `Buffer` or `const
+  // Buffer`. Note that we only advance to "used" buffers when incrementing,
+  // so that at dereference we're always at a valid point.
+ template <class T> class Iterator {
+ public:
+ BufferRep *Buffers = nullptr;
+ size_t Offset = 0;
+ size_t Max = 0;
+
+ Iterator &operator++() {
+ DCHECK_NE(Offset, Max);
+ do {
+ ++Offset;
+      } while (Offset != Max && !Buffers[Offset].Used);
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ Iterator C = *this;
+ ++(*this);
+ return C;
+ }
+
+ T &operator*() const { return Buffers[Offset].Buff; }
+
+ T *operator->() const { return &(Buffers[Offset].Buff); }
+
+ Iterator(BufferRep *Root, size_t O, size_t M)
+ : Buffers(Root), Offset(O), Max(M) {
+ // We want to advance to the first Offset where the 'Used' property is
+ // true, or to the end of the list/queue.
+      while (Offset != Max && !Buffers[Offset].Used) {
+ ++Offset;
+ }
+ }
+
+ Iterator() = default;
+ Iterator(const Iterator &) = default;
+ Iterator(Iterator &&) = default;
+ Iterator &operator=(const Iterator &) = default;
+ Iterator &operator=(Iterator &&) = default;
+ ~Iterator() = default;
+
+ template <class V>
+ friend bool operator==(const Iterator &L, const Iterator<V> &R) {
+ DCHECK_EQ(L.Max, R.Max);
+ return L.Buffers == R.Buffers && L.Offset == R.Offset;
+ }
+
+ template <class V>
+ friend bool operator!=(const Iterator &L, const Iterator<V> &R) {
+ return !(L == R);
+ }
+ };
+
// Size of each individual Buffer.
size_t BufferSize;
BufferRep *Buffers;
+
+ // Amount of pre-allocated buffers.
size_t BufferCount;
- __sanitizer::SpinMutex Mutex;
- __sanitizer::atomic_uint8_t Finalizing;
+ SpinMutex Mutex;
+ atomic_uint8_t Finalizing;
// Pointers to buffers managed/owned by the BufferQueue.
void **OwnedBuffers;
@@ -70,7 +128,7 @@ class BufferQueue {
// Count of buffers that have been handed out through 'getBuffer'.
size_t LiveBuffers;
- public:
+public:
enum class ErrorCode : unsigned {
Ok,
NotEnoughMemory,
@@ -81,16 +139,16 @@ class BufferQueue {
static const char *getErrorString(ErrorCode E) {
switch (E) {
- case ErrorCode::Ok:
- return "(none)";
- case ErrorCode::NotEnoughMemory:
- return "no available buffers in the queue";
- case ErrorCode::QueueFinalizing:
- return "queue already finalizing";
- case ErrorCode::UnrecognizedBuffer:
- return "buffer being returned not owned by buffer queue";
- case ErrorCode::AlreadyFinalized:
- return "queue already finalized";
+ case ErrorCode::Ok:
+ return "(none)";
+ case ErrorCode::NotEnoughMemory:
+ return "no available buffers in the queue";
+ case ErrorCode::QueueFinalizing:
+ return "queue already finalizing";
+ case ErrorCode::UnrecognizedBuffer:
+ return "buffer being returned not owned by buffer queue";
+ case ErrorCode::AlreadyFinalized:
+ return "queue already finalized";
}
return "unknown error";
}
@@ -122,8 +180,7 @@ class BufferQueue {
ErrorCode releaseBuffer(Buffer &Buf);
bool finalizing() const {
- return __sanitizer::atomic_load(&Finalizing,
- __sanitizer::memory_order_acquire);
+ return atomic_load(&Finalizing, memory_order_acquire);
}
/// Returns the configured size of the buffers in the buffer queue.
@@ -141,19 +198,29 @@ class BufferQueue {
/// Applies the provided function F to each Buffer in the queue, only if the
/// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a
/// releaseBuffer(...) operation).
- template <class F>
- void apply(F Fn) {
- __sanitizer::SpinMutexLock G(&Mutex);
- for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
- const auto &T = *I;
- if (T.Used) Fn(T.Buff);
- }
+ template <class F> void apply(F Fn) {
+ SpinMutexLock G(&Mutex);
+ for (auto I = begin(), E = end(); I != E; ++I)
+ Fn(*I);
+ }
+
+ using const_iterator = Iterator<const Buffer>;
+ using iterator = Iterator<Buffer>;
+
+ /// Provides iterator access to the raw Buffer instances.
+ iterator begin() const { return iterator(Buffers, 0, BufferCount); }
+ const_iterator cbegin() const {
+ return const_iterator(Buffers, 0, BufferCount);
+ }
+ iterator end() const { return iterator(Buffers, BufferCount, BufferCount); }
+ const_iterator cend() const {
+ return const_iterator(Buffers, BufferCount, BufferCount);
}
// Cleans up allocated buffers.
~BufferQueue();
};
-} // namespace __xray
+} // namespace __xray
-#endif // XRAY_BUFFER_QUEUE_H
+#endif // XRAY_BUFFER_QUEUE_H
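Taken together, a producer thread acquires a buffer, fills it, and hands it back, while a reader later walks only the buffers that were actually used. A rough usage sketch against the interface above (error handling trimmed):

  void exampleRoundTrip() {
    bool Success = false;
    BufferQueue Q(/*BufferSize=*/4096, /*BufferCount=*/8, Success);
    if (!Success)
      return;

    BufferQueue::Buffer B;
    if (Q.getBuffer(B) == BufferQueue::ErrorCode::Ok) {
      // ... write records into B.Data and update B.Extents->Size ...
      Q.releaseBuffer(B);  // marks the buffer as "used"
    }

    Q.finalize();  // stop handing out fresh buffers
    Q.apply([](const BufferQueue::Buffer &Used) {
      // ... flush Used.Data for the extent recorded in Used.Extents ...
    });
  }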
diff --git a/lib/xray/xray_fdr_flags.cc b/lib/xray/xray_fdr_flags.cc
new file mode 100644
index 000000000000..a14851b1b616
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.cc
@@ -0,0 +1,48 @@
+//===-- xray_fdr_flags.cc ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay FDR flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_fdr_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+FDRFlags xray_fdr_flags_dont_use_directly; // use via fdrFlags().
+
+void FDRFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_fdr_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerXRayFDRFlags(FlagParser *P, FDRFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) \
+ RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_fdr_flags.inc"
+#undef XRAY_FLAG
+}
+
+const char *useCompilerDefinedFDRFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_FDR_OPTIONS
+ return SANITIZER_STRINGIFY(XRAY_FDR_OPTIONS);
+#else
+ return "";
+#endif
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_fdr_flags.h b/lib/xray/xray_fdr_flags.h
new file mode 100644
index 000000000000..9c953f1cabcf
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.h
@@ -0,0 +1,38 @@
+//===-- xray_fdr_flags.h ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the flags for the flight-data-recorder mode implementation.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FDR_FLAGS_H
+#define XRAY_FDR_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct FDRFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_fdr_flags.inc"
+#undef XRAY_FLAG
+
+ void setDefaults();
+};
+
+extern FDRFlags xray_fdr_flags_dont_use_directly;
+extern void registerXRayFDRFlags(FlagParser *P, FDRFlags *F);
+const char *useCompilerDefinedFDRFlags();
+inline FDRFlags *fdrFlags() { return &xray_fdr_flags_dont_use_directly; }
+
+} // namespace __xray
+
+#endif // XRAY_FDR_FLAGS_H
diff --git a/lib/xray/xray_fdr_flags.inc b/lib/xray/xray_fdr_flags.inc
new file mode 100644
index 000000000000..d8721ad12cbe
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.inc
@@ -0,0 +1,29 @@
+//===-- xray_fdr_flags.inc --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay FDR Mode runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+// FDR (Flight Data Recorder) Mode logging options.
+XRAY_FLAG(int, func_duration_threshold_us, 5,
+ "FDR logging will try to skip functions that execute for fewer "
+ "microseconds than this threshold.")
+XRAY_FLAG(int, grace_period_ms, 100,
+ "FDR logging will wait this much time in milliseconds before "
+ "actually flushing the log; this gives a chance for threads to "
+ "notice that the log has been finalized and clean up.")
+XRAY_FLAG(int, buffer_size, 16384,
+ "Size of buffers in the circular buffer queue.")
+XRAY_FLAG(int, buffer_max, 100, "Maximum number of buffers in the queue.")
+XRAY_FLAG(bool, no_file_flush, false,
+ "Set to true to not write log files by default.")
diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h
index 324208db82ca..87096d4fc29e 100644
--- a/lib/xray/xray_fdr_log_records.h
+++ b/lib/xray/xray_fdr_log_records.h
@@ -32,6 +32,8 @@ struct alignas(16) MetadataRecord {
CustomEventMarker,
CallArgument,
BufferExtents,
+ TypedEventMarker,
+ Pid,
};
// Use 7 bits to identify this record type.
diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc
index 1bfa10c21f5c..6cb2dfa0c658 100644
--- a/lib/xray/xray_fdr_logging.cc
+++ b/lib/xray/xray_fdr_logging.cc
@@ -15,64 +15,836 @@
//
//===----------------------------------------------------------------------===//
#include "xray_fdr_logging.h"
+#include <cassert>
#include <errno.h>
+#include <limits>
+#include <memory>
+#include <pthread.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
+#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray/xray_interface.h"
#include "xray/xray_records.h"
#include "xray_buffer_queue.h"
#include "xray_defs.h"
-#include "xray_fdr_logging_impl.h"
+#include "xray_fdr_flags.h"
#include "xray_flags.h"
+#include "xray_recursion_guard.h"
#include "xray_tsc.h"
#include "xray_utils.h"
namespace __xray {
+atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+// Group together thread-local data in a struct, then hide it behind a function
+// call so that it can be initialized on first use instead of as a global. We
+// force the alignment to 64 bytes for x86 cache line alignment, as this
+// structure is used in the hot path of the implementation.
+struct alignas(64) ThreadLocalData {
+ BufferQueue::Buffer Buffer;
+ char *RecordPtr = nullptr;
+ // The number of FunctionEntry records immediately preceding RecordPtr.
+ uint8_t NumConsecutiveFnEnters = 0;
+
+ // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
+ // records preceding RecordPtr.
+ uint8_t NumTailCalls = 0;
+
+  // We use a thread_local variable to keep track of which CPUs we've already
+  // run on, and the TSC times for those CPUs. This allows us to avoid
+  // repeating the CPU field in the function records.
+ //
+ // We assume that we'll support only 65536 CPUs for x86_64.
+ uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
+ uint64_t LastTSC = 0;
+ uint64_t LastFunctionEntryTSC = 0;
+
+ // Make sure a thread that's ever called handleArg0 has a thread-local
+ // live reference to the buffer queue for this particular instance of
+ // FDRLogging, and that we're going to clean it up when the thread exits.
+ BufferQueue *BQ = nullptr;
+};
+
+static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
+ "ThreadLocalData must be trivially destructible");
+
+static constexpr auto MetadataRecSize = sizeof(MetadataRecord);
+static constexpr auto FunctionRecSize = sizeof(FunctionRecord);
+
+// Use a global pthread key to identify thread-local data for logging.
+static pthread_key_t Key;
+
// Global BufferQueue.
-BufferQueue *BQ = nullptr;
+static BufferQueue *BQ = nullptr;
-__sanitizer::atomic_sint32_t LogFlushStatus = {
+static atomic_sint32_t LogFlushStatus = {
XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
-FDRLoggingOptions FDROptions;
+static FDRLoggingOptions FDROptions;
+
+static SpinMutex FDROptionsMutex;
+
+// This function will initialize the thread-local data structure used by the FDR
+// logging implementation and return a reference to it. The implementation
+// details require a bit of care to maintain.
+//
+// First, some requirements on the implementation in general:
+//
+// - XRay handlers should not call any memory allocation routines that may
+// delegate to an instrumented implementation. This means functions like
+// malloc() and free() should not be called while instrumenting.
+//
+// - We would like to use some thread-local data initialized on first-use of
+// the XRay instrumentation. These allow us to implement unsynchronized
+// routines that access resources associated with the thread.
+//
+// The implementation here uses a few mechanisms that allow us to provide both
+// the requirements listed above. We do this by:
+//
+// 1. Using a thread-local aligned storage buffer for representing the
+// ThreadLocalData struct. This data will be uninitialized memory by
+// design.
+//
+// 2. Not requiring a thread exit handler/implementation, keeping the
+// thread-local as purely a collection of references/data that do not
+// require cleanup.
+//
+// We're doing this to avoid using a `thread_local` object that has a
+// non-trivial destructor, because the C++ runtime might call std::malloc(...)
+// to register calls to destructors. Deadlocks may arise when, for example, an
+// externally provided malloc implementation is XRay instrumented, and
+// initializing the thread-locals involves calling into malloc. A malloc
+// implementation that does global synchronization might be holding a lock for a
+// critical section, calling a function that might be XRay instrumented (and
+// thus in turn calling into malloc by virtue of registration of the
+// thread_local's destructor).
+static_assert(alignof(ThreadLocalData) >= 64,
+ "ThreadLocalData must be cache line aligned.");
+static ThreadLocalData &getThreadLocalData() {
+ thread_local typename std::aligned_storage<
+ sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{};
+
+ if (pthread_getspecific(Key) == NULL) {
+ new (reinterpret_cast<ThreadLocalData *>(&TLDStorage)) ThreadLocalData{};
+ pthread_setspecific(Key, &TLDStorage);
+ }
+
+ return *reinterpret_cast<ThreadLocalData *>(&TLDStorage);
+}
+
+static void writeNewBufferPreamble(tid_t Tid, timespec TS,
+ pid_t Pid) XRAY_NEVER_INSTRUMENT {
+ static constexpr int InitRecordsCount = 3;
+ auto &TLD = getThreadLocalData();
+ MetadataRecord Metadata[InitRecordsCount];
+ {
+ // Write out a MetadataRecord to signify that this is the start of a new
+ // buffer, associated with a particular thread, with a new CPU. For the
+ // data, we have 15 bytes to squeeze as much information as we can. At this
+ // point we only write down the following bytes:
+    // - Thread ID (tid_t, cast to a 4-byte type since Darwin's tid_t is 8 bytes)
+ auto &NewBuffer = Metadata[0];
+ NewBuffer.Type = uint8_t(RecordType::Metadata);
+ NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer);
+ int32_t tid = static_cast<int32_t>(Tid);
+ internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid));
+ }
+
+ // Also write the WalltimeMarker record.
+ {
+ static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
+ auto &WalltimeMarker = Metadata[1];
+ WalltimeMarker.Type = uint8_t(RecordType::Metadata);
+ WalltimeMarker.RecordKind =
+ uint8_t(MetadataRecord::RecordKinds::WalltimeMarker);
+
+    // We only really need microsecond precision here, so we enforce across
+    // platforms that the Metadata record encodes 64-bit seconds and 32-bit
+    // microseconds.
+ int32_t Micros = TS.tv_nsec / 1000;
+ int64_t Seconds = TS.tv_sec;
+ internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds));
+ internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros,
+ sizeof(Micros));
+ }
+
+ // Also write the Pid record.
+ {
+ // Write out a MetadataRecord that contains the current pid
+ auto &PidMetadata = Metadata[2];
+ PidMetadata.Type = uint8_t(RecordType::Metadata);
+ PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid);
+ int32_t pid = static_cast<int32_t>(Pid);
+ internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid));
+ }
+
+ TLD.NumConsecutiveFnEnters = 0;
+ TLD.NumTailCalls = 0;
+ if (TLD.BQ == nullptr || TLD.BQ->finalizing())
+ return;
+ internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
+ TLD.RecordPtr += sizeof(Metadata);
+ // Since we write out the extents as the first metadata record of the
+ // buffer, we need to write out the extents including the extents record.
+ atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
+ memory_order_release);
+}
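Since a MetadataRecord is 16 bytes (one type/kind byte plus 15 data bytes), the three-record preamble occupies a fixed amount of space, and that is exactly the initial extent stored for the buffer:

  sizeof(Metadata) = InitRecordsCount * sizeof(MetadataRecord)
                   = 3 * 16
                   = 48 bytes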
+
+static void setupNewBuffer(int (*wall_clock_reader)(
+ clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
+ auto &TLD = getThreadLocalData();
+ auto &B = TLD.Buffer;
+ TLD.RecordPtr = static_cast<char *>(B.Data);
+ tid_t Tid = GetTid();
+ timespec TS{0, 0};
+ pid_t Pid = internal_getpid();
+ // This is typically clock_gettime, but callers have injection ability.
+ wall_clock_reader(CLOCK_MONOTONIC, &TS);
+ writeNewBufferPreamble(Tid, TS, Pid);
+ TLD.NumConsecutiveFnEnters = 0;
+ TLD.NumTailCalls = 0;
+}
+
+static void incrementExtents(size_t Add) {
+ auto &TLD = getThreadLocalData();
+ atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel);
+}
+
+static void decrementExtents(size_t Subtract) {
+ auto &TLD = getThreadLocalData();
+ atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel);
+}
+
+static void writeNewCPUIdMetadata(uint16_t CPU,
+ uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ auto &TLD = getThreadLocalData();
+ MetadataRecord NewCPUId;
+ NewCPUId.Type = uint8_t(RecordType::Metadata);
+ NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
+
+ // The data for the New CPU will contain the following bytes:
+ // - CPU ID (uint16_t, 2 bytes)
+ // - Full TSC (uint64_t, 8 bytes)
+ // Total = 10 bytes.
+ internal_memcpy(&NewCPUId.Data, &CPU, sizeof(CPU));
+ internal_memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC));
+ internal_memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord));
+ TLD.RecordPtr += sizeof(MetadataRecord);
+ TLD.NumConsecutiveFnEnters = 0;
+ TLD.NumTailCalls = 0;
+ incrementExtents(sizeof(MetadataRecord));
+}
+
+static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+ auto &TLD = getThreadLocalData();
+ MetadataRecord TSCWrap;
+ TSCWrap.Type = uint8_t(RecordType::Metadata);
+ TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
+
+ // The data for the TSCWrap record contains the following bytes:
+ // - Full TSC (uint64_t, 8 bytes)
+ // Total = 8 bytes.
+ internal_memcpy(&TSCWrap.Data, &TSC, sizeof(TSC));
+ internal_memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord));
+ TLD.RecordPtr += sizeof(MetadataRecord);
+ TLD.NumConsecutiveFnEnters = 0;
+ TLD.NumTailCalls = 0;
+ incrementExtents(sizeof(MetadataRecord));
+}
+
+// Call Argument metadata records store the arguments to a function in the
+// order of their appearance; holes are not supported by the buffer format.
+static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT {
+ auto &TLD = getThreadLocalData();
+ MetadataRecord CallArg;
+ CallArg.Type = uint8_t(RecordType::Metadata);
+ CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument);
+
+ internal_memcpy(CallArg.Data, &A, sizeof(A));
+ internal_memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord));
+ TLD.RecordPtr += sizeof(MetadataRecord);
+ incrementExtents(sizeof(MetadataRecord));
+}
+
+static void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
+ XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT {
+ FunctionRecord FuncRecord;
+ FuncRecord.Type = uint8_t(RecordType::Function);
+ // Only take 28 bits of the function id.
+ FuncRecord.FuncId = FuncId & ~(0x0F << 28);
+ FuncRecord.TSCDelta = TSCDelta;
+
+ auto &TLD = getThreadLocalData();
+ switch (EntryType) {
+ case XRayEntryType::ENTRY:
+ ++TLD.NumConsecutiveFnEnters;
+ FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
+ break;
+ case XRayEntryType::LOG_ARGS_ENTRY:
+ // We should not rewind functions with logged args.
+ TLD.NumConsecutiveFnEnters = 0;
+ TLD.NumTailCalls = 0;
+ FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
+ break;
+ case XRayEntryType::EXIT:
+ // If we've decided to log the function exit, we will never erase the log
+ // before it.
+ TLD.NumConsecutiveFnEnters = 0;
+ TLD.NumTailCalls = 0;
+ FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
+ break;
+ case XRayEntryType::TAIL:
+ // If we just entered the function we're tail exiting from or erased every
+ // invocation since then, this function entry tail pair is a candidate to
+ // be erased when the child function exits.
+ if (TLD.NumConsecutiveFnEnters > 0) {
+ ++TLD.NumTailCalls;
+ TLD.NumConsecutiveFnEnters = 0;
+ } else {
+ // We will never be able to erase this tail call since we have logged
+ // something in between the function entry and tail exit.
+ TLD.NumTailCalls = 0;
+ TLD.NumConsecutiveFnEnters = 0;
+ }
+ FuncRecord.RecordKind =
+ uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
+ break;
+ case XRayEntryType::CUSTOM_EVENT: {
+ // This is a bug in patching, so we'll report it once and move on.
+ static atomic_uint8_t ErrorLatch{0};
+ if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+ Report("Internal error: patched an XRay custom event call as a function; "
+ "func id = %d\n",
+ FuncId);
+ return;
+ }
+ case XRayEntryType::TYPED_EVENT: {
+ static atomic_uint8_t ErrorLatch{0};
+ if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+ Report("Internal error: patched an XRay typed event call as a function; "
+ "func id = %d\n",
+ FuncId);
+ return;
+ }
+ }
+
+ internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord));
+ TLD.RecordPtr += sizeof(FunctionRecord);
+ incrementExtents(sizeof(FunctionRecord));
+}
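The function-id masking above keeps only the low 28 bits of the id; the rest of the 8-byte FunctionRecord carries the record type and kind plus the 32-bit TSC delta. Written out:

  0x0F << 28     = 0xF0000000
  ~(0x0F << 28)  = 0x0FFFFFFF   // mask selecting the low 28 bits
  FuncId & ~(0x0F << 28)        // function id truncated to 28 bits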
+
+static atomic_uint64_t TicksPerSec{0};
+static atomic_uint64_t ThresholdTicks{0};
+
+// Re-point the thread local pointer into this thread's Buffer before the recent
+// "Function Entry" record and any "Tail Call Exit" records after that.
+static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
+ uint64_t &LastFunctionEntryTSC, int32_t FuncId) {
+ auto &TLD = getThreadLocalData();
+ TLD.RecordPtr -= FunctionRecSize;
+ decrementExtents(FunctionRecSize);
+ FunctionRecord FuncRecord;
+ internal_memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize);
+ DCHECK(FuncRecord.RecordKind ==
+ uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
+ "Expected to find function entry recording when rewinding.");
+ DCHECK(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) &&
+ "Expected matching function id when rewinding Exit");
+ --TLD.NumConsecutiveFnEnters;
+ LastTSC -= FuncRecord.TSCDelta;
+
+ // We unwound one call. Update the state and return without writing a log.
+ if (TLD.NumConsecutiveFnEnters != 0) {
+ LastFunctionEntryTSC -= FuncRecord.TSCDelta;
+ return;
+ }
+
+  // Otherwise we've rewound the stack of all function entries, so we might be
+  // able to rewind further by erasing tail call functions that are being
+  // exited from via this exit.
+ LastFunctionEntryTSC = 0;
+ auto RewindingTSC = LastTSC;
+ auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize;
+ while (TLD.NumTailCalls > 0) {
+ // Rewind the TSC back over the TAIL EXIT record.
+ FunctionRecord ExpectedTailExit;
+ internal_memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize);
+
+ DCHECK(ExpectedTailExit.RecordKind ==
+ uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) &&
+ "Expected to find tail exit when rewinding.");
+ RewindingRecordPtr -= FunctionRecSize;
+ RewindingTSC -= ExpectedTailExit.TSCDelta;
+ FunctionRecord ExpectedFunctionEntry;
+ internal_memcpy(&ExpectedFunctionEntry, RewindingRecordPtr,
+ FunctionRecSize);
+ DCHECK(ExpectedFunctionEntry.RecordKind ==
+ uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
+ "Expected to find function entry when rewinding tail call.");
+ DCHECK(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId &&
+ "Expected funcids to match when rewinding tail call.");
+
+ // This tail call exceeded the threshold duration. It will not be erased.
+ if ((TSC - RewindingTSC) >= atomic_load_relaxed(&ThresholdTicks)) {
+ TLD.NumTailCalls = 0;
+ return;
+ }
+
+ // We can erase a tail exit pair that we're exiting through since
+ // its duration is under threshold.
+ --TLD.NumTailCalls;
+ RewindingRecordPtr -= FunctionRecSize;
+ RewindingTSC -= ExpectedFunctionEntry.TSCDelta;
+ TLD.RecordPtr -= 2 * FunctionRecSize;
+ LastTSC = RewindingTSC;
+ decrementExtents(2 * FunctionRecSize);
+ }
+}
+
+static bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
+ auto &TLD = getThreadLocalData();
+ auto EC = BQArg.releaseBuffer(TLD.Buffer);
+ if (EC != BufferQueue::ErrorCode::Ok) {
+ Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Data,
+ BufferQueue::getErrorString(EC));
+ return false;
+ }
+ return true;
+}
+
+static bool prepareBuffer(uint64_t TSC, unsigned char CPU,
+ int (*wall_clock_reader)(clockid_t,
+ struct timespec *),
+ size_t MaxSize) XRAY_NEVER_INSTRUMENT {
+ auto &TLD = getThreadLocalData();
+ char *BufferStart = static_cast<char *>(TLD.Buffer.Data);
+ if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) {
+ if (!releaseThreadLocalBuffer(*TLD.BQ))
+ return false;
+ auto EC = TLD.BQ->getBuffer(TLD.Buffer);
+ if (EC != BufferQueue::ErrorCode::Ok) {
+ Report("Failed to prepare a buffer; error = '%s'\n",
+ BufferQueue::getErrorString(EC));
+ return false;
+ }
+ setupNewBuffer(wall_clock_reader);
-__sanitizer::SpinMutex FDROptionsMutex;
+ // Always write the CPU metadata as the first record in the buffer.
+ writeNewCPUIdMetadata(CPU, TSC);
+ }
+ return true;
+}
+
+static bool
+isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU,
+ int (*wall_clock_reader)(clockid_t, struct timespec *))
+ XRAY_NEVER_INSTRUMENT {
+ // Bail out right away if logging is not initialized yet.
+ // We should take the opportunity to release the buffer though.
+ auto Status = atomic_load(&LoggingStatus, memory_order_acquire);
+ auto &TLD = getThreadLocalData();
+ if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
+ if (TLD.RecordPtr != nullptr &&
+ (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
+ Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
+ if (!releaseThreadLocalBuffer(*LBQ))
+ return false;
+ TLD.RecordPtr = nullptr;
+ return false;
+ }
+ return false;
+ }
+
+ if (atomic_load(&LoggingStatus, memory_order_acquire) !=
+ XRayLogInitStatus::XRAY_LOG_INITIALIZED ||
+ LBQ->finalizing()) {
+ if (!releaseThreadLocalBuffer(*LBQ))
+ return false;
+ TLD.RecordPtr = nullptr;
+ }
+
+ if (TLD.Buffer.Data == nullptr) {
+ auto EC = LBQ->getBuffer(TLD.Buffer);
+ if (EC != BufferQueue::ErrorCode::Ok) {
+ auto LS = atomic_load(&LoggingStatus, memory_order_acquire);
+ if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING &&
+ LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
+ Report("Failed to acquire a buffer; error = '%s'\n",
+ BufferQueue::getErrorString(EC));
+ return false;
+ }
+
+ setupNewBuffer(wall_clock_reader);
+
+ // Always write the CPU metadata as the first record in the buffer.
+ writeNewCPUIdMetadata(CPU, TSC);
+ }
+
+ if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) {
+ // This means this is the first CPU this thread has ever run on. We set
+ // the current CPU and record this as the first TSC we've seen.
+ TLD.CurrentCPU = CPU;
+ writeNewCPUIdMetadata(CPU, TSC);
+ }
+
+ return true;
+}
+
+// Compute the TSC difference between the time of measurement and the previous
+// event. There are a few interesting situations we need to account for:
+//
+// - The thread has migrated to a different CPU. If this is the case, then
+// we write down the following records:
+//
+// 1. A 'NewCPUId' Metadata record.
+// 2. A FunctionRecord with a 0 for the TSCDelta field.
+//
+// - The TSC delta is greater than the 32 bits we can store in a
+// FunctionRecord. In this case we write down the following records:
+//
+// 1. A 'TSCWrap' Metadata record.
+// 2. A FunctionRecord with a 0 for the TSCDelta field.
+//
+// - The TSC delta is representable within the 32 bits we can store in a
+// FunctionRecord. In this case we write down just a FunctionRecord with
+// the correct TSC delta.
+static uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC,
+ uint8_t CPU) {
+ if (CPU != TLD.CurrentCPU) {
+ // We've moved to a new CPU.
+ writeNewCPUIdMetadata(CPU, TSC);
+ return 0;
+ }
+  // If the delta is greater than the range of a uint32_t, then we write out
+  // the TSC wrap metadata entry with the full TSC, and the TSC delta for the
+  // function record will be 0.
+ uint64_t Delta = TSC - TLD.LastTSC;
+ if (Delta <= std::numeric_limits<uint32_t>::max())
+ return Delta;
+
+ writeTSCWrapMetadata(TSC);
+ return 0;
+}
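To put the uint32_t limit in perspective, the delta field tops out at 4,294,967,295 ticks, so on a hypothetical 3 GHz TSC a thread that stays on one CPU can go roughly

  2^32 ticks / 3,000,000,000 ticks per second ≈ 1.43 seconds

between records before a TSCWrap metadata record (carrying the full 64-bit TSC) has to be emitted.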
+
+static void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
+ auto &TLD = getThreadLocalData();
+ auto BufferStart = static_cast<char *>(TLD.Buffer.Data);
+ if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <=
+ ptrdiff_t{MetadataRecSize}) {
+ if (!releaseThreadLocalBuffer(*TLD.BQ))
+ return;
+ TLD.RecordPtr = nullptr;
+ }
+}
+
+thread_local atomic_uint8_t Running{0};
+
+/// Here's where the meat of the processing happens. The writer captures
+/// function entry, exit and tail exit points with a time and will create
+/// TSCWrap, NewCPUId and Function records as necessary. The writer might
+/// walk backward through its buffer and erase trivial functions to avoid
+/// polluting the log and may use the buffer queue to obtain or release a
+/// buffer.
+static void processFunctionHook(int32_t FuncId, XRayEntryType Entry,
+ uint64_t TSC, unsigned char CPU, uint64_t Arg1,
+ int (*wall_clock_reader)(clockid_t,
+ struct timespec *))
+ XRAY_NEVER_INSTRUMENT {
+ __asm volatile("# LLVM-MCA-BEGIN processFunctionHook");
+  // Prevent signal handler recursion: if we're already in a log-writing mode
+  // and a signal handler comes in (and is also instrumented), we don't want it
+  // clobbering potentially partial writes already happening in the thread. We
+  // use a simple thread_local latch to only allow one on-going handleArg0 to
+  // happen at any given time.
+ RecursionGuard Guard{Running};
+ if (!Guard) {
+ DCHECK(atomic_load_relaxed(&Running) && "RecursionGuard is buggy!");
+ return;
+ }
+
+ auto &TLD = getThreadLocalData();
+
+ if (TLD.BQ == nullptr)
+ TLD.BQ = BQ;
+
+ if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
+ return;
+
+ // Before we go setting up writing new function entries, we need to be really
+ // careful about the pointer math we're doing. This means we need to ensure
+ // that the record we are about to write is going to fit into the buffer,
+ // without overflowing the buffer.
+ //
+ // To do this properly, we use the following assumptions:
+ //
+  // - The fewest bytes we will ever write is 8 (sizeof(FunctionRecord)),
+  //   which happens only if the delta between the previous entry and this
+  //   entry fits within 32 bits.
+  // - The most bytes we will ever write is 8 + 16 + 16 = 40. This is
+  //   computed by:
+  //
+  //     MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord)
+ //
+ // These arise in the following cases:
+ //
+ // 1. When the delta between the TSC we get and the previous TSC for the
+ // same CPU is outside of the uint32_t range, we end up having to
+ // write a MetadataRecord to indicate a "tsc wrap" before the actual
+ // FunctionRecord.
+ // 2. When we learn that we've moved CPUs, we need to write a
+ // MetadataRecord to indicate a "cpu change", and thus write out the
+ // current TSC for that CPU before writing out the actual
+ // FunctionRecord.
+ // 3. When we learn about a new CPU ID, we need to write down a "new cpu
+ // id" MetadataRecord before writing out the actual FunctionRecord.
+ // 4. The second MetadataRecord is the optional function call argument.
+ //
+  // So the math we need to do is to determine whether writing 40 bytes past
+  // the current pointer exceeds the buffer's maximum size. If we don't have
+  // enough space to write 40 bytes in the buffer, we need to get a new Buffer
+  // and set it up properly before doing any further writing.
+ size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize;
+ if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) {
+ TLD.BQ = nullptr;
+ return;
+ }
+
+ // By this point, we are now ready to write up to 40 bytes (explained above).
+ DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >=
+ static_cast<ptrdiff_t>(MetadataRecSize) &&
+ "Misconfigured BufferQueue provided; Buffer size not large enough.");
+
+ auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU);
+ TLD.LastTSC = TSC;
+ TLD.CurrentCPU = CPU;
+ switch (Entry) {
+ case XRayEntryType::ENTRY:
+ case XRayEntryType::LOG_ARGS_ENTRY:
+ // Update the thread local state for the next invocation.
+ TLD.LastFunctionEntryTSC = TSC;
+ break;
+ case XRayEntryType::TAIL:
+ case XRayEntryType::EXIT:
+ // Break out and write the exit record if we can't erase any functions.
+ if (TLD.NumConsecutiveFnEnters == 0 ||
+ (TSC - TLD.LastFunctionEntryTSC) >=
+ atomic_load_relaxed(&ThresholdTicks))
+ break;
+ rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId);
+ return; // without writing log.
+ case XRayEntryType::CUSTOM_EVENT: {
+ // This is a bug in patching, so we'll report it once and move on.
+ static atomic_uint8_t ErrorLatch{0};
+ if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+ Report("Internal error: patched an XRay custom event call as a function; "
+ "func id = %d\n",
+ FuncId);
+ return;
+ }
+ case XRayEntryType::TYPED_EVENT: {
+ static atomic_uint8_t ErrorLatch{0};
+ if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+ Report("Internal error: patched an XRay typed event call as a function; "
+ "func id = %d\n",
+ FuncId);
+ return;
+ }
+ }
+
+ writeFunctionRecord(FuncId, RecordTSCDelta, Entry);
+ if (Entry == XRayEntryType::LOG_ARGS_ENTRY)
+ writeCallArgumentMetadata(Arg1);
+
+ // If we've exhausted the buffer by this time, we then release the buffer to
+ // make sure that other threads may start using this buffer.
+ endBufferIfFull();
+ __asm volatile("# LLVM-MCA-END");
+}
+
+static XRayFileHeader &fdrCommonHeaderInfo() {
+ static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage;
+ static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+ static bool TSCSupported = true;
+ static uint64_t CycleFrequency = NanosecondsPerSecond;
+ pthread_once(&OnceInit, +[] {
+ XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage);
+ // Version 2 of the log writes the extents of the buffer, instead of
+ // relying on an end-of-buffer record.
+ // Version 3 includes PID metadata record
+ H.Version = 3;
+ H.Type = FileTypes::FDR_LOG;
+
+ // Test for required CPU features and cache the cycle frequency
+ TSCSupported = probeRequiredCPUFeatures();
+ if (TSCSupported)
+ CycleFrequency = getTSCFrequency();
+ H.CycleFrequency = CycleFrequency;
+
+ // FIXME: Actually check whether we have 'constant_tsc' and
+ // 'nonstop_tsc' before setting the values in the header.
+ H.ConstantTSC = 1;
+ H.NonstopTSC = 1;
+ });
+ return reinterpret_cast<XRayFileHeader &>(HStorage);
+}
+
+// This is the iterator implementation, which knows how to handle FDR-mode
+// specific buffers. This is used as an implementation of the iterator function
+// needed by __xray_set_buffer_iterator(...). It maintains a global state of the
+// buffer iteration for the currently installed FDR mode buffers. In particular:
+//
+// - If the argument represents the initial state of XRayBuffer ({nullptr, 0})
+// then the iterator returns the header information.
+// - If the argument represents the header information ({address of header
+// info, size of the header info}) then it returns the first FDR buffer's
+// address and extents.
+// - It will keep returning the next buffer and extents as there are more
+// buffers to process. When the input represents the last buffer, it will
+// return the initial state to signal completion ({nullptr, 0}).
+//
+// See xray/xray_log_interface.h for more details on the requirements for the
+// implementations of __xray_set_buffer_iterator(...) and
+// __xray_log_process_buffers(...).
+XRayBuffer fdrIterator(const XRayBuffer B) {
+ DCHECK(internal_strcmp(__xray_log_get_current_mode(), "xray-fdr") == 0);
+ DCHECK(BQ->finalizing());
+
+ if (BQ == nullptr || !BQ->finalizing()) {
+ if (Verbosity())
+ Report(
+ "XRay FDR: Failed global buffer queue is null or not finalizing!\n");
+ return {nullptr, 0};
+ }
+
+  // We use a global scratch-pad for the header information, which only gets
+  // initialized the first time this function is called. We'll update one part
+  // of this information with some relevant data (in particular the configured
+  // size of each buffer).
+ static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage;
+ static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT;
+ pthread_once(&HeaderOnce, +[] {
+ reinterpret_cast<XRayFileHeader &>(HeaderStorage) = fdrCommonHeaderInfo();
+ });
+
+ // We use a convenience alias for code referring to Header from here on out.
+ auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage);
+ if (B.Data == nullptr && B.Size == 0) {
+ Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
+ return XRayBuffer{static_cast<void *>(&Header), sizeof(Header)};
+ }
+
+ static BufferQueue::const_iterator It{};
+ static BufferQueue::const_iterator End{};
+ static void *CurrentBuffer{nullptr};
+ if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
+    // From this point on, we provide raw access to the buffers we're getting
+    // from the BufferQueue, relying on the iterators from the current buffer
+    // queue.
+ It = BQ->cbegin();
+ End = BQ->cend();
+ }
+
+ if (CurrentBuffer != nullptr) {
+ InternalFree(CurrentBuffer);
+ CurrentBuffer = nullptr;
+ }
+
+ if (It == End)
+ return {nullptr, 0};
+
+  // Set up the current buffer to lead with its extents, as we would when
+  // writing out to disk. The difference here is that we still present "empty"
+  // buffers, going through the iterators faithfully so the handlers see every
+  // buffer in the queue.
+ auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire);
+ auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+ CurrentBuffer = InternalAlloc(SerializedBufferSize);
+ if (CurrentBuffer == nullptr)
+ return {nullptr, 0};
+
+ // Write out the extents as a Metadata Record into the CurrentBuffer.
+ MetadataRecord ExtentsRecord;
+ ExtentsRecord.Type = uint8_t(RecordType::Metadata);
+ ExtentsRecord.RecordKind =
+ uint8_t(MetadataRecord::RecordKinds::BufferExtents);
+ internal_memcpy(ExtentsRecord.Data, &BufferSize, sizeof(BufferSize));
+ auto AfterExtents =
+ static_cast<char *>(internal_memcpy(CurrentBuffer, &ExtentsRecord,
+ sizeof(MetadataRecord))) +
+ sizeof(MetadataRecord);
+ internal_memcpy(AfterExtents, It->Data, BufferSize);
+
+ XRayBuffer Result;
+ Result.Data = CurrentBuffer;
+ Result.Size = SerializedBufferSize;
+ ++It;
+ return Result;
+}
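For orientation, here is a minimal sketch of a consumer for the iterator protocol implemented by fdrIterator above, driven through __xray_log_process_buffers(...). The callback name and what it does with each buffer are illustrative assumptions; only the protocol itself (header first, then one XRayBuffer per FDR buffer, terminated by the {nullptr, 0} sentinel) and the entry points come from the interface referenced above.

    #include "xray/xray_log_interface.h"

    // Hypothetical processor: receives the installed mode name and one
    // XRayBuffer at a time. The first buffer carries the XRayFileHeader; each
    // subsequent one starts with a BufferExtents metadata record, as
    // fdrIterator lays them out above.
    static void processTraceBuffer(const char *Mode, XRayBuffer Buffer) {
      if (Buffer.Data == nullptr || Buffer.Size == 0)
        return; // Defensive; iteration normally stops before the sentinel.
      // ... copy or decode Buffer.Size bytes starting at Buffer.Data ...
    }

    // Typically used after __xray_log_finalize(), since fdrIterator expects
    // the global buffer queue to be finalizing.
    void collectFdrTrace() {
      __xray_log_process_buffers(processTraceBuffer);
    }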
// Must finalize before flushing.
XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
- if (__sanitizer::atomic_load(&LoggingStatus,
- __sanitizer::memory_order_acquire) !=
+ if (atomic_load(&LoggingStatus, memory_order_acquire) !=
XRayLogInitStatus::XRAY_LOG_FINALIZED) {
- if (__sanitizer::Verbosity())
+ if (Verbosity())
Report("Not flushing log, implementation is not finalized.\n");
return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
}
s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &LogFlushStatus, &Result, XRayLogFlushStatus::XRAY_LOG_FLUSHING,
- __sanitizer::memory_order_release)) {
-
- if (__sanitizer::Verbosity())
+ if (!atomic_compare_exchange_strong(&LogFlushStatus, &Result,
+ XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+ memory_order_release)) {
+ if (Verbosity())
Report("Not flushing log, implementation is still finalizing.\n");
return static_cast<XRayLogFlushStatus>(Result);
}
if (BQ == nullptr) {
- if (__sanitizer::Verbosity())
+ if (Verbosity())
Report("Cannot flush when global buffer queue is null.\n");
return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
}
// We wait a number of milliseconds to allow threads to see that we've
// finalised before attempting to flush the log.
- __sanitizer::SleepForMillis(flags()->xray_fdr_log_grace_period_ms);
+ SleepForMillis(fdrFlags()->grace_period_ms);
+
+  // At this point, we uninstall the iterator implementation before doing
+  // anything further with the global buffer queue.
+ __xray_log_remove_buffer_iterator();
+
+  // Once flushed, we should set the global status of the logging
+  // implementation to "uninitialized" to allow for FDR-logging multiple runs.
+  auto ResetToUninitialized = at_scope_exit([] {
+ atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+ memory_order_release);
+ });
+
+ auto CleanupBuffers = at_scope_exit([] {
+ if (BQ != nullptr) {
+ auto &TLD = getThreadLocalData();
+ if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
+ releaseThreadLocalBuffer(*TLD.BQ);
+ BQ->~BufferQueue();
+ InternalFree(BQ);
+ BQ = nullptr;
+ }
+ });
+
+ if (fdrFlags()->no_file_flush) {
+ if (Verbosity())
+ Report("XRay FDR: Not flushing to file, 'no_file_flush=true'.\n");
+
+ atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+ memory_order_release);
+ return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+ }
// We write out the file in the following format:
//
@@ -85,35 +857,20 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
//
int Fd = -1;
{
- __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
+    // FIXME: Remove this section of the code when we remove the struct-based
+ // configuration API.
+ SpinMutexLock Guard(&FDROptionsMutex);
Fd = FDROptions.Fd;
}
if (Fd == -1)
Fd = getLogFD();
if (Fd == -1) {
auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
- __sanitizer::atomic_store(&LogFlushStatus, Result,
- __sanitizer::memory_order_release);
+ atomic_store(&LogFlushStatus, Result, memory_order_release);
return Result;
}
- // Test for required CPU features and cache the cycle frequency
- static bool TSCSupported = probeRequiredCPUFeatures();
- static uint64_t CycleFrequency =
- TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
-
- XRayFileHeader Header;
-
- // Version 2 of the log writes the extents of the buffer, instead of relying
- // on an end-of-buffer record.
- Header.Version = 2;
- Header.Type = FileTypes::FDR_LOG;
- Header.CycleFrequency = CycleFrequency;
-
- // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc'
- // before setting the values in the header.
- Header.ConstantTSC = 1;
- Header.NonstopTSC = 1;
+ XRayFileHeader Header = fdrCommonHeaderInfo();
Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
retryingWriteAll(Fd, reinterpret_cast<char *>(&Header),
reinterpret_cast<char *>(&Header) + sizeof(Header));
@@ -121,39 +878,36 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
BQ->apply([&](const BufferQueue::Buffer &B) {
// Starting at version 2 of the FDR logging implementation, we only write
// the records identified by the extents of the buffer. We use the Extents
- // from the Buffer and write that out as the first record in the buffer.
- // We still use a Metadata record, but fill in the extents instead for the
+ // from the Buffer and write that out as the first record in the buffer. We
+ // still use a Metadata record, but fill in the extents instead for the
// data.
MetadataRecord ExtentsRecord;
- auto BufferExtents = __sanitizer::atomic_load(
- &B.Extents->Size, __sanitizer::memory_order_acquire);
- assert(BufferExtents <= B.Size);
+ auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire);
+ DCHECK(BufferExtents <= B.Size);
ExtentsRecord.Type = uint8_t(RecordType::Metadata);
ExtentsRecord.RecordKind =
uint8_t(MetadataRecord::RecordKinds::BufferExtents);
- std::memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
+ internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
if (BufferExtents > 0) {
retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord),
reinterpret_cast<char *>(&ExtentsRecord) +
sizeof(MetadataRecord));
- retryingWriteAll(Fd, reinterpret_cast<char *>(B.Buffer),
- reinterpret_cast<char *>(B.Buffer) + BufferExtents);
+ retryingWriteAll(Fd, reinterpret_cast<char *>(B.Data),
+ reinterpret_cast<char *>(B.Data) + BufferExtents);
}
});
- __sanitizer::atomic_store(&LogFlushStatus,
- XRayLogFlushStatus::XRAY_LOG_FLUSHED,
- __sanitizer::memory_order_release);
+ atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+ memory_order_release);
return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
}
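A rough sketch of the on-disk layout produced by the writes in fdrLoggingFlush() above, summarizing the visible retryingWriteAll calls; this is a reader's summary, not a normative format definition:

    XRayFileHeader                        // version 3, FileTypes::FDR_LOG,
                                          // FdrData = configured buffer size
    for each buffer with a non-zero extent:
      MetadataRecord (BufferExtents)      // 16 bytes; Data holds the extent
      <extent bytes of that buffer's function/metadata record stream>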
XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &LoggingStatus, &CurrentStatus,
- XRayLogInitStatus::XRAY_LOG_FINALIZING,
- __sanitizer::memory_order_release)) {
- if (__sanitizer::Verbosity())
+ if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
+ XRayLogInitStatus::XRAY_LOG_FINALIZING,
+ memory_order_release)) {
+ if (Verbosity())
Report("Cannot finalize log, implementation not initialized.\n");
return static_cast<XRayLogInitStatus>(CurrentStatus);
}
@@ -162,39 +916,11 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
// operations to be performed until re-initialized.
BQ->finalize();
- __sanitizer::atomic_store(&LoggingStatus,
- XRayLogInitStatus::XRAY_LOG_FINALIZED,
- __sanitizer::memory_order_release);
+ atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
+ memory_order_release);
return XRayLogInitStatus::XRAY_LOG_FINALIZED;
}
-XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT {
- s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_FINALIZED;
- if (__sanitizer::atomic_compare_exchange_strong(
- &LoggingStatus, &CurrentStatus,
- XRayLogInitStatus::XRAY_LOG_INITIALIZED,
- __sanitizer::memory_order_release))
- return static_cast<XRayLogInitStatus>(CurrentStatus);
-
- // Release the in-memory buffer queue.
- delete BQ;
- BQ = nullptr;
-
- // Spin until the flushing status is flushed.
- s32 CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED;
- while (__sanitizer::atomic_compare_exchange_weak(
- &LogFlushStatus, &CurrentFlushingStatus,
- XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING,
- __sanitizer::memory_order_release)) {
- if (CurrentFlushingStatus == XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING)
- break;
- CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED;
- }
-
- // At this point, we know that the status is flushed, and that we can assume
- return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-}
-
struct TSCAndCPU {
uint64_t TSC = 0;
unsigned char CPU = 0;
@@ -202,12 +928,14 @@ struct TSCAndCPU {
static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
// We want to get the TSC as early as possible, so that we can check whether
- // we've seen this CPU before. We also do it before we load anything else, to
- // allow for forward progress with the scheduling.
+ // we've seen this CPU before. We also do it before we load anything else,
+ // to allow for forward progress with the scheduling.
TSCAndCPU Result;
// Test once for required CPU features
- static bool TSCSupported = probeRequiredCPUFeatures();
+ static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT;
+ static bool TSCSupported = true;
+ pthread_once(&OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); });
if (TSCSupported) {
Result.TSC = __xray::readTSC(Result.CPU);
@@ -228,20 +956,17 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
void fdrLoggingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
auto TC = getTimestamp();
- __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0,
- clock_gettime, BQ);
+ processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, clock_gettime);
}
void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry,
uint64_t Arg) XRAY_NEVER_INSTRUMENT {
auto TC = getTimestamp();
- __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg,
- clock_gettime, BQ);
+ processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, clock_gettime);
}
void fdrLoggingHandleCustomEvent(void *Event,
std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
- using namespace __xray_fdr_internal;
auto TC = getTimestamp();
auto &TSC = TC.TSC;
auto &CPU = TC.CPU;
@@ -249,13 +974,8 @@ void fdrLoggingHandleCustomEvent(void *Event,
if (!Guard)
return;
if (EventSize > std::numeric_limits<int32_t>::max()) {
- using Empty = struct {};
- static Empty Once = [&] {
- Report("Event size too large = %zu ; > max = %d\n", EventSize,
- std::numeric_limits<int32_t>::max());
- return Empty();
- }();
- (void)Once;
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ pthread_once(&Once, +[] { Report("Event size too large.\n"); });
}
int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
auto &TLD = getThreadLocalData();
@@ -264,8 +984,8 @@ void fdrLoggingHandleCustomEvent(void *Event,
// Here we need to prepare the log to handle:
// - The metadata record we're going to write. (16 bytes)
- // - The additional data we're going to write. Currently, that's the size of
- // the event we're going to dump into the log as free-form bytes.
+ // - The additional data we're going to write. Currently, that's the size
+ // of the event we're going to dump into the log as free-form bytes.
if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
TLD.BQ = nullptr;
return;
@@ -280,90 +1000,207 @@ void fdrLoggingHandleCustomEvent(void *Event,
CustomEvent.RecordKind =
uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
constexpr auto TSCSize = sizeof(TC.TSC);
- std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
- std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
- std::memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
+ internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
+ internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+ internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
TLD.RecordPtr += sizeof(CustomEvent);
- std::memcpy(TLD.RecordPtr, Event, ReducedEventSize);
+ internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
+ incrementExtents(MetadataRecSize + EventSize);
+ endBufferIfFull();
+}
+
+void fdrLoggingHandleTypedEvent(
+ uint16_t EventType, const void *Event,
+ std::size_t EventSize) noexcept XRAY_NEVER_INSTRUMENT {
+ auto TC = getTimestamp();
+ auto &TSC = TC.TSC;
+ auto &CPU = TC.CPU;
+ RecursionGuard Guard{Running};
+ if (!Guard)
+ return;
+ if (EventSize > std::numeric_limits<int32_t>::max()) {
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ pthread_once(&Once, +[] { Report("Event size too large.\n"); });
+ }
+ int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+ auto &TLD = getThreadLocalData();
+ if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime))
+ return;
+
+ // Here we need to prepare the log to handle:
+ // - The metadata record we're going to write. (16 bytes)
+ // - The additional data we're going to write. Currently, that's the size
+ // of the event we're going to dump into the log as free-form bytes.
+ if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
+ TLD.BQ = nullptr;
+ return;
+ }
+  // Write the typed event metadata record, which consists of the following
+  // information:
+  // - 8 bytes (64-bits) for the full TSC when the event started.
+  // - 4 bytes (32-bits) for the length of the data.
+  // - 2 bytes (16-bits) for the event type. Only 3 bytes remain after the TSC
+  //   and the length, since one byte of the record holds the record type
+  //   (Metadata Record) and kind (TypedEvent). Event types are generated
+  //   sequentially, so a 16-bit id is enough.
+ MetadataRecord TypedEvent;
+ TypedEvent.Type = uint8_t(RecordType::Metadata);
+ TypedEvent.RecordKind =
+ uint8_t(MetadataRecord::RecordKinds::TypedEventMarker);
+ constexpr auto TSCSize = sizeof(TC.TSC);
+ internal_memcpy(&TypedEvent.Data, &ReducedEventSize, sizeof(int32_t));
+ internal_memcpy(&TypedEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+ internal_memcpy(&TypedEvent.Data[sizeof(int32_t) + TSCSize], &EventType,
+ sizeof(EventType));
+ internal_memcpy(TLD.RecordPtr, &TypedEvent, sizeof(TypedEvent));
+
+ TLD.RecordPtr += sizeof(TypedEvent);
+ internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
incrementExtents(MetadataRecSize + EventSize);
endBufferIfFull();
}
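As a worked reading of the memcpy calls above, the 16-byte typed-event MetadataRecord ends up laid out as follows (offsets are into MetadataRecord::Data, the 15 bytes that follow the type/kind byte; this is derived from this function, not an independent format definition):

    Data[0..3]    int32_t  ReducedEventSize   // payload length
    Data[4..11]   uint64_t TSC                // full TSC at the event
    Data[12..13]  uint16_t EventType          // sequentially assigned id
    Data[14]      (unused)
    // The ReducedEventSize payload bytes follow the record in the buffer.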
-XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
+XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax,
void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
- if (OptionsSize != sizeof(FDRLoggingOptions)) {
- if (__sanitizer::Verbosity())
- Report("Cannot initialize FDR logging; wrong size for options: %d\n",
- OptionsSize);
- return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
- &LoggingStatus, __sanitizer::memory_order_acquire));
- }
+ if (Options == nullptr)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &LoggingStatus, &CurrentStatus,
- XRayLogInitStatus::XRAY_LOG_INITIALIZING,
- __sanitizer::memory_order_release)) {
- if (__sanitizer::Verbosity())
+ if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
+ XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+ memory_order_release)) {
+ if (Verbosity())
Report("Cannot initialize already initialized implementation.\n");
return static_cast<XRayLogInitStatus>(CurrentStatus);
}
- {
- __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
- memcpy(&FDROptions, Options, OptionsSize);
+  // __xray_log_init_mode(...) guarantees that this will be called with
+  // BufferSize == 0 and BufferMax == 0; in that case we parse the
+  // configuration provided through the Options pointer as a string instead.
+ if (BufferSize == 0 && BufferMax == 0) {
+ if (Verbosity())
+ Report("Initializing FDR mode with options: %s\n",
+ static_cast<const char *>(Options));
+
+ // TODO: Factor out the flags specific to the FDR mode implementation. For
+ // now, use the global/single definition of the flags, since the FDR mode
+ // flags are already defined there.
+ FlagParser FDRParser;
+ FDRFlags FDRFlags;
+ registerXRayFDRFlags(&FDRParser, &FDRFlags);
+ FDRFlags.setDefaults();
+
+ // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
+ // options until we migrate everyone to use the XRAY_FDR_OPTIONS
+ // compiler-provided options.
+ FDRParser.ParseString(useCompilerDefinedFlags());
+ FDRParser.ParseString(useCompilerDefinedFDRFlags());
+ auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
+ if (EnvOpts == nullptr)
+ EnvOpts = "";
+ FDRParser.ParseString(EnvOpts);
+
+ // FIXME: Remove this when we fully remove the deprecated flags.
+ if (internal_strlen(EnvOpts) == 0) {
+ FDRFlags.func_duration_threshold_us =
+ flags()->xray_fdr_log_func_duration_threshold_us;
+ FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
+ }
+
+ // The provided options should always override the compiler-provided and
+ // environment-variable defined options.
+ FDRParser.ParseString(static_cast<const char *>(Options));
+ *fdrFlags() = FDRFlags;
+ BufferSize = FDRFlags.buffer_size;
+ BufferMax = FDRFlags.buffer_max;
+ SpinMutexLock Guard(&FDROptionsMutex);
+ FDROptions.Fd = -1;
+ FDROptions.ReportErrors = true;
+ } else if (OptionsSize != sizeof(FDRLoggingOptions)) {
+ // FIXME: This is deprecated, and should really be removed.
+ // At this point we use the flag parser specific to the FDR mode
+ // implementation.
+ if (Verbosity())
+      Report("Cannot initialize FDR logging; wrong size for options: %zu\n",
+             OptionsSize);
+ return static_cast<XRayLogInitStatus>(
+ atomic_load(&LoggingStatus, memory_order_acquire));
+ } else {
+ if (Verbosity())
+ Report("XRay FDR: struct-based init is deprecated, please use "
+ "string-based configuration instead.\n");
+ SpinMutexLock Guard(&FDROptionsMutex);
+ internal_memcpy(&FDROptions, Options, OptionsSize);
}
bool Success = false;
if (BQ != nullptr) {
- delete BQ;
+ BQ->~BufferQueue();
+ InternalFree(BQ);
BQ = nullptr;
}
- if (BQ == nullptr)
- BQ = new BufferQueue(BufferSize, BufferMax, Success);
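+  // Note: the queue is constructed in sanitizer-internal memory via
+  // InternalAlloc and placement new (and torn down below with an explicit
+  // destructor call plus InternalFree), so the runtime does not go through
+  // the global operator new/delete.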
+ if (BQ == nullptr) {
+ BQ = reinterpret_cast<BufferQueue *>(
+ InternalAlloc(sizeof(BufferQueue), nullptr, 64));
+ new (BQ) BufferQueue(BufferSize, BufferMax, Success);
+ }
if (!Success) {
Report("BufferQueue init failed.\n");
if (BQ != nullptr) {
- delete BQ;
+ BQ->~BufferQueue();
+ InternalFree(BQ);
BQ = nullptr;
}
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
- static bool UNUSED Once = [] {
- pthread_key_create(&__xray_fdr_internal::Key, +[](void *) {
- auto &TLD = __xray_fdr_internal::getThreadLocalData();
+ static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+ pthread_once(&OnceInit, +[] {
+ atomic_store(&TicksPerSec,
+ probeRequiredCPUFeatures() ? getTSCFrequency()
+ : __xray::NanosecondsPerSecond,
+ memory_order_release);
+ pthread_key_create(&Key, +[](void *TLDPtr) {
+ if (TLDPtr == nullptr)
+ return;
+ auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
if (TLD.BQ == nullptr)
return;
auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
if (EC != BufferQueue::ErrorCode::Ok)
Report("At thread exit, failed to release buffer at %p; error=%s\n",
- TLD.Buffer.Buffer, BufferQueue::getErrorString(EC));
+ TLD.Buffer.Data, BufferQueue::getErrorString(EC));
});
- return false;
- }();
+ });
+ atomic_store(&ThresholdTicks,
+ atomic_load_relaxed(&TicksPerSec) *
+ fdrFlags()->func_duration_threshold_us / 1000000,
+ memory_order_release);
// Arg1 handler should go in first to avoid concurrent code accidentally
  // falling back to arg0 when it should have run arg1.
__xray_set_handler_arg1(fdrLoggingHandleArg1);
// Install the actual handleArg0 handler after initialising the buffers.
__xray_set_handler(fdrLoggingHandleArg0);
__xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
+ __xray_set_typedevent_handler(fdrLoggingHandleTypedEvent);
+
+ // Install the buffer iterator implementation.
+ __xray_log_set_buffer_iterator(fdrIterator);
- __sanitizer::atomic_store(&LoggingStatus,
- XRayLogInitStatus::XRAY_LOG_INITIALIZED,
- __sanitizer::memory_order_release);
+ atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+ memory_order_release);
- if (__sanitizer::Verbosity())
+ if (Verbosity())
Report("XRay FDR init successful.\n");
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
- using namespace __xray;
XRayLogImpl Impl{
fdrLoggingInit,
fdrLoggingFinalize,
@@ -372,11 +1209,10 @@ bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
};
auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl);
if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
- __sanitizer::Verbosity())
+ Verbosity())
Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n",
RegistrationResult);
- if (flags()->xray_fdr_log ||
- !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-fdr"))
+ if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr"))
__xray_set_log_impl(Impl);
return true;
}
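To tie the registration above to the string-based configuration parsed in fdrLoggingInit(), here is a minimal usage sketch from an instrumented program's point of view. It assumes the __xray_log_init_mode(...) entry point referenced in the comments above and the __xray_patch() interface; the option string and its values are illustrative, not recommended settings.

    #include "xray/xray_interface.h"
    #include "xray/xray_log_interface.h"

    // Illustrative: select FDR mode by name and hand it a flag string, which
    // fdrLoggingInit() parses when BufferSize == 0 and BufferMax == 0.
    bool startFdrTracing() {
      auto InitStatus = __xray_log_init_mode(
          "xray-fdr",
          "buffer_size=16384:buffer_max=10:func_duration_threshold_us=5");
      if (InitStatus != XRayLogInitStatus::XRAY_LOG_INITIALIZED)
        return false;
      return __xray_patch() == XRayPatchingStatus::SUCCESS;
    }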
diff --git a/lib/xray/xray_fdr_logging_impl.h b/lib/xray/xray_fdr_logging_impl.h
deleted file mode 100644
index 59eab55b2573..000000000000
--- a/lib/xray/xray_fdr_logging_impl.h
+++ /dev/null
@@ -1,705 +0,0 @@
-//===-- xray_fdr_logging_impl.h ---------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Here we implement the thread local state management and record i/o for Flight
-// Data Recorder mode for XRay, where we use compact structures to store records
-// in memory as well as when writing out the data to files.
-//
-//===----------------------------------------------------------------------===//
-#ifndef XRAY_XRAY_FDR_LOGGING_IMPL_H
-#define XRAY_XRAY_FDR_LOGGING_IMPL_H
-
-#include <cassert>
-#include <cstddef>
-#include <cstring>
-#include <limits>
-#include <pthread.h>
-#include <sys/syscall.h>
-#include <time.h>
-#include <type_traits>
-#include <unistd.h>
-
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray/xray_log_interface.h"
-#include "xray_buffer_queue.h"
-#include "xray_defs.h"
-#include "xray_fdr_log_records.h"
-#include "xray_flags.h"
-#include "xray_tsc.h"
-
-namespace __xray {
-
-__sanitizer::atomic_sint32_t LoggingStatus = {
- XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-
-/// We expose some of the state transitions when FDR logging mode is operating
-/// such that we can simulate a series of log events that may occur without
-/// and test with determinism without worrying about the real CPU time.
-///
-/// Because the code uses thread_local allocation extensively as part of its
-/// design, callers that wish to test events occurring on different threads
-/// will actually have to run them on different threads.
-///
-/// This also means that it is possible to break invariants maintained by
-/// cooperation with xray_fdr_logging class, so be careful and think twice.
-namespace __xray_fdr_internal {
-
-/// Writes the new buffer record and wallclock time that begin a buffer for the
-/// current thread.
-static void writeNewBufferPreamble(pid_t Tid, timespec TS);
-
-/// Writes a Function Record to the buffer associated with the current thread.
-static void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
- XRayEntryType EntryType);
-
-/// Sets up a new buffer in thread_local storage and writes a preamble. The
-/// wall_clock_reader function is used to populate the WallTimeRecord entry.
-static void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
- struct timespec *));
-
-/// TSC Wrap records are written when a TSC delta encoding scheme overflows.
-static void writeTSCWrapMetadata(uint64_t TSC);
-
-// Group together thread-local-data in a struct, then hide it behind a function
-// call so that it can be initialized on first use instead of as a global. We
-// force the alignment to 64-bytes for x86 cache line alignment, as this
-// structure is used in the hot path of implementation.
-struct alignas(64) ThreadLocalData {
- BufferQueue::Buffer Buffer;
- char *RecordPtr = nullptr;
- // The number of FunctionEntry records immediately preceding RecordPtr.
- uint8_t NumConsecutiveFnEnters = 0;
-
- // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
- // records preceding RecordPtr.
- uint8_t NumTailCalls = 0;
-
- // We use a thread_local variable to keep track of which CPUs we've already
- // run, and the TSC times for these CPUs. This allows us to stop repeating the
- // CPU field in the function records.
- //
- // We assume that we'll support only 65536 CPUs for x86_64.
- uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
- uint64_t LastTSC = 0;
- uint64_t LastFunctionEntryTSC = 0;
-
- // Make sure a thread that's ever called handleArg0 has a thread-local
- // live reference to the buffer queue for this particular instance of
- // FDRLogging, and that we're going to clean it up when the thread exits.
- BufferQueue *BQ = nullptr;
-};
-
-static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
- "ThreadLocalData must be trivially destructible");
-
-static constexpr auto MetadataRecSize = sizeof(MetadataRecord);
-static constexpr auto FunctionRecSize = sizeof(FunctionRecord);
-
-// Use a global pthread key to identify thread-local data for logging.
-static pthread_key_t Key;
-
-// This function will initialize the thread-local data structure used by the FDR
-// logging implementation and return a reference to it. The implementation
-// details require a bit of care to maintain.
-//
-// First, some requirements on the implementation in general:
-//
-// - XRay handlers should not call any memory allocation routines that may
-// delegate to an instrumented implementation. This means functions like
-// malloc() and free() should not be called while instrumenting.
-//
-// - We would like to use some thread-local data initialized on first-use of
-// the XRay instrumentation. These allow us to implement unsynchronized
-// routines that access resources associated with the thread.
-//
-// The implementation here uses a few mechanisms that allow us to provide both
-// the requirements listed above. We do this by:
-//
-// 1. Using a thread-local aligned storage buffer for representing the
-// ThreadLocalData struct. This data will be uninitialized memory by
-// design.
-//
-// 2. Not requiring a thread exit handler/implementation, keeping the
-// thread-local as purely a collection of references/data that do not
-// require cleanup.
-//
-// We're doing this to avoid using a `thread_local` object that has a
-// non-trivial destructor, because the C++ runtime might call std::malloc(...)
-// to register calls to destructors. Deadlocks may arise when, for example, an
-// externally provided malloc implementation is XRay instrumented, and
-// initializing the thread-locals involves calling into malloc. A malloc
-// implementation that does global synchronization might be holding a lock for a
-// critical section, calling a function that might be XRay instrumented (and
-// thus in turn calling into malloc by virtue of registration of the
-// thread_local's destructor).
-static ThreadLocalData &getThreadLocalData() {
- static_assert(alignof(ThreadLocalData) >= 64,
- "ThreadLocalData must be cache line aligned.");
- thread_local ThreadLocalData TLD;
- thread_local bool UNUSED ThreadOnce = [] {
- pthread_setspecific(Key, &TLD);
- return false;
- }();
- return TLD;
-}
-
-//-----------------------------------------------------------------------------|
-// The rest of the file is implementation. |
-//-----------------------------------------------------------------------------|
-// Functions are implemented in the header for inlining since we don't want |
-// to grow the stack when we've hijacked the binary for logging. |
-//-----------------------------------------------------------------------------|
-
-namespace {
-
-class RecursionGuard {
- volatile bool &Running;
- const bool Valid;
-
-public:
- explicit RecursionGuard(volatile bool &R) : Running(R), Valid(!R) {
- if (Valid)
- Running = true;
- }
-
- RecursionGuard(const RecursionGuard &) = delete;
- RecursionGuard(RecursionGuard &&) = delete;
- RecursionGuard &operator=(const RecursionGuard &) = delete;
- RecursionGuard &operator=(RecursionGuard &&) = delete;
-
- explicit operator bool() const { return Valid; }
-
- ~RecursionGuard() noexcept {
- if (Valid)
- Running = false;
- }
-};
-
-} // namespace
-
-static void writeNewBufferPreamble(pid_t Tid,
- timespec TS) XRAY_NEVER_INSTRUMENT {
- static constexpr int InitRecordsCount = 2;
- auto &TLD = getThreadLocalData();
- MetadataRecord Metadata[InitRecordsCount];
- {
- // Write out a MetadataRecord to signify that this is the start of a new
- // buffer, associated with a particular thread, with a new CPU. For the
- // data, we have 15 bytes to squeeze as much information as we can. At this
- // point we only write down the following bytes:
- // - Thread ID (pid_t, 4 bytes)
- auto &NewBuffer = Metadata[0];
- NewBuffer.Type = uint8_t(RecordType::Metadata);
- NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer);
- std::memcpy(&NewBuffer.Data, &Tid, sizeof(pid_t));
- }
-
- // Also write the WalltimeMarker record.
- {
- static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
- auto &WalltimeMarker = Metadata[1];
- WalltimeMarker.Type = uint8_t(RecordType::Metadata);
- WalltimeMarker.RecordKind =
- uint8_t(MetadataRecord::RecordKinds::WalltimeMarker);
-
- // We only really need microsecond precision here, and enforce across
- // platforms that we need 64-bit seconds and 32-bit microseconds encoded in
- // the Metadata record.
- int32_t Micros = TS.tv_nsec / 1000;
- int64_t Seconds = TS.tv_sec;
- std::memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds));
- std::memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, sizeof(Micros));
- }
-
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- if (TLD.BQ == nullptr || TLD.BQ->finalizing())
- return;
- std::memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
- TLD.RecordPtr += sizeof(Metadata);
- // Since we write out the extents as the first metadata record of the
- // buffer, we need to write out the extents including the extents record.
- __sanitizer::atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
- __sanitizer::memory_order_release);
-}
-
-inline void setupNewBuffer(int (*wall_clock_reader)(
- clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- auto &B = TLD.Buffer;
- TLD.RecordPtr = static_cast<char *>(B.Buffer);
- pid_t Tid = syscall(SYS_gettid);
- timespec TS{0, 0};
- // This is typically clock_gettime, but callers have injection ability.
- wall_clock_reader(CLOCK_MONOTONIC, &TS);
- writeNewBufferPreamble(Tid, TS);
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
-}
-
-static void incrementExtents(size_t Add) {
- auto &TLD = getThreadLocalData();
- __sanitizer::atomic_fetch_add(&TLD.Buffer.Extents->Size, Add,
- __sanitizer::memory_order_acq_rel);
-}
-
-static void decrementExtents(size_t Subtract) {
- auto &TLD = getThreadLocalData();
- __sanitizer::atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract,
- __sanitizer::memory_order_acq_rel);
-}
-
-inline void writeNewCPUIdMetadata(uint16_t CPU,
- uint64_t TSC) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- MetadataRecord NewCPUId;
- NewCPUId.Type = uint8_t(RecordType::Metadata);
- NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
-
- // The data for the New CPU will contain the following bytes:
- // - CPU ID (uint16_t, 2 bytes)
- // - Full TSC (uint64_t, 8 bytes)
- // Total = 10 bytes.
- std::memcpy(&NewCPUId.Data, &CPU, sizeof(CPU));
- std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC));
- std::memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord));
- TLD.RecordPtr += sizeof(MetadataRecord);
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- incrementExtents(sizeof(MetadataRecord));
-}
-
-inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- MetadataRecord TSCWrap;
- TSCWrap.Type = uint8_t(RecordType::Metadata);
- TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
-
- // The data for the TSCWrap record contains the following bytes:
- // - Full TSC (uint64_t, 8 bytes)
- // Total = 8 bytes.
- std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC));
- std::memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord));
- TLD.RecordPtr += sizeof(MetadataRecord);
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- incrementExtents(sizeof(MetadataRecord));
-}
-
-// Call Argument metadata records store the arguments to a function in the
-// order of their appearance; holes are not supported by the buffer format.
-static inline void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- MetadataRecord CallArg;
- CallArg.Type = uint8_t(RecordType::Metadata);
- CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument);
-
- std::memcpy(CallArg.Data, &A, sizeof(A));
- std::memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord));
- TLD.RecordPtr += sizeof(MetadataRecord);
- incrementExtents(sizeof(MetadataRecord));
-}
-
-static inline void
-writeFunctionRecord(int FuncId, uint32_t TSCDelta,
- XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT {
- FunctionRecord FuncRecord;
- FuncRecord.Type = uint8_t(RecordType::Function);
- // Only take 28 bits of the function id.
- FuncRecord.FuncId = FuncId & ~(0x0F << 28);
- FuncRecord.TSCDelta = TSCDelta;
-
- auto &TLD = getThreadLocalData();
- switch (EntryType) {
- case XRayEntryType::ENTRY:
- ++TLD.NumConsecutiveFnEnters;
- FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
- break;
- case XRayEntryType::LOG_ARGS_ENTRY:
- // We should not rewind functions with logged args.
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
- break;
- case XRayEntryType::EXIT:
- // If we've decided to log the function exit, we will never erase the log
- // before it.
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
- break;
- case XRayEntryType::TAIL:
- // If we just entered the function we're tail exiting from or erased every
- // invocation since then, this function entry tail pair is a candidate to
- // be erased when the child function exits.
- if (TLD.NumConsecutiveFnEnters > 0) {
- ++TLD.NumTailCalls;
- TLD.NumConsecutiveFnEnters = 0;
- } else {
- // We will never be able to erase this tail call since we have logged
- // something in between the function entry and tail exit.
- TLD.NumTailCalls = 0;
- TLD.NumConsecutiveFnEnters = 0;
- }
- FuncRecord.RecordKind =
- uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
- break;
- case XRayEntryType::CUSTOM_EVENT: {
- // This is a bug in patching, so we'll report it once and move on.
- static bool Once = [&] {
- Report("Internal error: patched an XRay custom event call as a function; "
- "func id = %d\n",
- FuncId);
- return true;
- }();
- (void)Once;
- return;
- }
- }
-
- std::memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord));
- TLD.RecordPtr += sizeof(FunctionRecord);
- incrementExtents(sizeof(FunctionRecord));
-}
-
-static uint64_t thresholdTicks() {
- static uint64_t TicksPerSec = probeRequiredCPUFeatures()
- ? getTSCFrequency()
- : __xray::NanosecondsPerSecond;
- static const uint64_t ThresholdTicks =
- TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000;
- return ThresholdTicks;
-}
-
-// Re-point the thread local pointer into this thread's Buffer before the recent
-// "Function Entry" record and any "Tail Call Exit" records after that.
-static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
- uint64_t &LastFunctionEntryTSC, int32_t FuncId) {
- auto &TLD = getThreadLocalData();
- TLD.RecordPtr -= FunctionRecSize;
- decrementExtents(FunctionRecSize);
- FunctionRecord FuncRecord;
- std::memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize);
- assert(FuncRecord.RecordKind ==
- uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
- "Expected to find function entry recording when rewinding.");
- assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) &&
- "Expected matching function id when rewinding Exit");
- --TLD.NumConsecutiveFnEnters;
- LastTSC -= FuncRecord.TSCDelta;
-
- // We unwound one call. Update the state and return without writing a log.
- if (TLD.NumConsecutiveFnEnters != 0) {
- LastFunctionEntryTSC -= FuncRecord.TSCDelta;
- return;
- }
-
- // Otherwise we've rewound the stack of all function entries, we might be
- // able to rewind further by erasing tail call functions that are being
- // exited from via this exit.
- LastFunctionEntryTSC = 0;
- auto RewindingTSC = LastTSC;
- auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize;
- while (TLD.NumTailCalls > 0) {
- // Rewind the TSC back over the TAIL EXIT record.
- FunctionRecord ExpectedTailExit;
- std::memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize);
-
- assert(ExpectedTailExit.RecordKind ==
- uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) &&
- "Expected to find tail exit when rewinding.");
- RewindingRecordPtr -= FunctionRecSize;
- RewindingTSC -= ExpectedTailExit.TSCDelta;
- FunctionRecord ExpectedFunctionEntry;
- std::memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, FunctionRecSize);
- assert(ExpectedFunctionEntry.RecordKind ==
- uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
- "Expected to find function entry when rewinding tail call.");
- assert(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId &&
- "Expected funcids to match when rewinding tail call.");
-
- // This tail call exceeded the threshold duration. It will not be erased.
- if ((TSC - RewindingTSC) >= thresholdTicks()) {
- TLD.NumTailCalls = 0;
- return;
- }
-
- // We can erase a tail exit pair that we're exiting through since
- // its duration is under threshold.
- --TLD.NumTailCalls;
- RewindingRecordPtr -= FunctionRecSize;
- RewindingTSC -= ExpectedFunctionEntry.TSCDelta;
- TLD.RecordPtr -= 2 * FunctionRecSize;
- LastTSC = RewindingTSC;
- decrementExtents(2 * FunctionRecSize);
- }
-}
-
-inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
- auto &TLD = getThreadLocalData();
- auto EC = BQArg.releaseBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok) {
- Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Buffer,
- BufferQueue::getErrorString(EC));
- return false;
- }
- return true;
-}
-
-inline bool prepareBuffer(uint64_t TSC, unsigned char CPU,
- int (*wall_clock_reader)(clockid_t,
- struct timespec *),
- size_t MaxSize) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- char *BufferStart = static_cast<char *>(TLD.Buffer.Buffer);
- if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) {
- if (!releaseThreadLocalBuffer(*TLD.BQ))
- return false;
- auto EC = TLD.BQ->getBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok) {
- Report("Failed to acquire a buffer; error=%s\n",
- BufferQueue::getErrorString(EC));
- return false;
- }
- setupNewBuffer(wall_clock_reader);
-
- // Always write the CPU metadata as the first record in the buffer.
- writeNewCPUIdMetadata(CPU, TSC);
- }
- return true;
-}
-
-inline bool
-isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU,
- int (*wall_clock_reader)(clockid_t, struct timespec *))
- XRAY_NEVER_INSTRUMENT {
- // Bail out right away if logging is not initialized yet.
- // We should take the opportunity to release the buffer though.
- auto Status = __sanitizer::atomic_load(&LoggingStatus,
- __sanitizer::memory_order_acquire);
- auto &TLD = getThreadLocalData();
- if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
- if (TLD.RecordPtr != nullptr &&
- (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
- Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
- if (!releaseThreadLocalBuffer(*LBQ))
- return false;
- TLD.RecordPtr = nullptr;
- return false;
- }
- return false;
- }
-
- if (__sanitizer::atomic_load(&LoggingStatus,
- __sanitizer::memory_order_acquire) !=
- XRayLogInitStatus::XRAY_LOG_INITIALIZED ||
- LBQ->finalizing()) {
- if (!releaseThreadLocalBuffer(*LBQ))
- return false;
- TLD.RecordPtr = nullptr;
- }
-
- if (TLD.Buffer.Buffer == nullptr) {
- auto EC = LBQ->getBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok) {
- auto LS = __sanitizer::atomic_load(&LoggingStatus,
- __sanitizer::memory_order_acquire);
- if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING &&
- LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
- Report("Failed to acquire a buffer; error=%s\n",
- BufferQueue::getErrorString(EC));
- return false;
- }
-
- setupNewBuffer(wall_clock_reader);
-
- // Always write the CPU metadata as the first record in the buffer.
- writeNewCPUIdMetadata(CPU, TSC);
- }
-
- if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) {
- // This means this is the first CPU this thread has ever run on. We set
- // the current CPU and record this as the first TSC we've seen.
- TLD.CurrentCPU = CPU;
- writeNewCPUIdMetadata(CPU, TSC);
- }
-
- return true;
-} // namespace __xray_fdr_internal
-
-// Compute the TSC difference between the time of measurement and the previous
-// event. There are a few interesting situations we need to account for:
-//
-// - The thread has migrated to a different CPU. If this is the case, then
-// we write down the following records:
-//
-// 1. A 'NewCPUId' Metadata record.
-// 2. A FunctionRecord with a 0 for the TSCDelta field.
-//
-// - The TSC delta is greater than the 32 bits we can store in a
-// FunctionRecord. In this case we write down the following records:
-//
-// 1. A 'TSCWrap' Metadata record.
-// 2. A FunctionRecord with a 0 for the TSCDelta field.
-//
-// - The TSC delta is representable within the 32 bits we can store in a
-// FunctionRecord. In this case we write down just a FunctionRecord with
-// the correct TSC delta.
-inline uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC,
- uint8_t CPU) {
- if (CPU != TLD.CurrentCPU) {
- // We've moved to a new CPU.
- writeNewCPUIdMetadata(CPU, TSC);
- return 0;
- }
- // If the delta is greater than the range for a uint32_t, then we write out
- // the TSC wrap metadata entry with the full TSC, and the TSC for the
- // function record be 0.
- uint64_t Delta = TSC - TLD.LastTSC;
- if (Delta <= std::numeric_limits<uint32_t>::max())
- return Delta;
-
- writeTSCWrapMetadata(TSC);
- return 0;
-}
-
-inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- auto BufferStart = static_cast<char *>(TLD.Buffer.Buffer);
- if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <=
- ptrdiff_t{MetadataRecSize}) {
- if (!releaseThreadLocalBuffer(*TLD.BQ))
- return;
- TLD.RecordPtr = nullptr;
- }
-}
-
-thread_local volatile bool Running = false;
-
-/// Here's where the meat of the processing happens. The writer captures
-/// function entry, exit and tail exit points with a time and will create
-/// TSCWrap, NewCPUId and Function records as necessary. The writer might
-/// walk backward through its buffer and erase trivial functions to avoid
-/// polluting the log and may use the buffer queue to obtain or release a
-/// buffer.
-inline void processFunctionHook(int32_t FuncId, XRayEntryType Entry,
- uint64_t TSC, unsigned char CPU, uint64_t Arg1,
- int (*wall_clock_reader)(clockid_t,
- struct timespec *),
- BufferQueue *BQ) XRAY_NEVER_INSTRUMENT {
- // Prevent signal handler recursion, so in case we're already in a log writing
- // mode and the signal handler comes in (and is also instrumented) then we
- // don't want to be clobbering potentially partial writes already happening in
- // the thread. We use a simple thread_local latch to only allow one on-going
- // handleArg0 to happen at any given time.
- RecursionGuard Guard{Running};
- if (!Guard) {
- assert(Running == true && "RecursionGuard is buggy!");
- return;
- }
-
- auto &TLD = getThreadLocalData();
-
- // In case the reference has been cleaned up before, we make sure we
- // initialize it to the provided BufferQueue.
- if (TLD.BQ == nullptr)
- TLD.BQ = BQ;
-
- if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
- return;
-
- // Before we go setting up writing new function entries, we need to be really
- // careful about the pointer math we're doing. This means we need to ensure
- // that the record we are about to write is going to fit into the buffer,
- // without overflowing the buffer.
- //
- // To do this properly, we use the following assumptions:
- //
- // - The least number of bytes we will ever write is 8
- // (sizeof(FunctionRecord)) only if the delta between the previous entry
- // and this entry is within 32 bits.
- // - The most number of bytes we will ever write is 8 + 16 + 16 = 40.
- // This is computed by:
- //
- // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord)
- //
- // These arise in the following cases:
- //
- // 1. When the delta between the TSC we get and the previous TSC for the
- // same CPU is outside of the uint32_t range, we end up having to
- // write a MetadataRecord to indicate a "tsc wrap" before the actual
- // FunctionRecord.
- // 2. When we learn that we've moved CPUs, we need to write a
- // MetadataRecord to indicate a "cpu change", and thus write out the
- // current TSC for that CPU before writing out the actual
- // FunctionRecord.
- // 3. When we learn about a new CPU ID, we need to write down a "new cpu
- // id" MetadataRecord before writing out the actual FunctionRecord.
- // 4. The second MetadataRecord is the optional function call argument.
- //
- // So the math we need to do is to determine whether writing 40 bytes past the
- // current pointer exceeds the buffer's maximum size. If we don't have enough
-// space to write 40 bytes in the buffer, we need to get a new Buffer, set it up
- // properly before doing any further writing.
- size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize;
- if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) {
- TLD.BQ = nullptr;
- return;
- }
-
- // By this point, we are now ready to write up to 40 bytes (explained above).
- assert((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Buffer) >=
- static_cast<ptrdiff_t>(MetadataRecSize) &&
- "Misconfigured BufferQueue provided; Buffer size not large enough.");
-
- auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU);
- TLD.LastTSC = TSC;
- TLD.CurrentCPU = CPU;
- switch (Entry) {
- case XRayEntryType::ENTRY:
- case XRayEntryType::LOG_ARGS_ENTRY:
- // Update the thread local state for the next invocation.
- TLD.LastFunctionEntryTSC = TSC;
- break;
- case XRayEntryType::TAIL:
- case XRayEntryType::EXIT:
- // Break out and write the exit record if we can't erase any functions.
- if (TLD.NumConsecutiveFnEnters == 0 ||
- (TSC - TLD.LastFunctionEntryTSC) >= thresholdTicks())
- break;
- rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId);
- return; // without writing log.
- case XRayEntryType::CUSTOM_EVENT: {
- // This is a bug in patching, so we'll report it once and move on.
- static bool Once = [&] {
- Report("Internal error: patched an XRay custom event call as a function; "
- "func id = %d",
- FuncId);
- return true;
- }();
- (void)Once;
- return;
- }
- }
-
- writeFunctionRecord(FuncId, RecordTSCDelta, Entry);
- if (Entry == XRayEntryType::LOG_ARGS_ENTRY)
- writeCallArgumentMetadata(Arg1);
-
- // If we've exhausted the buffer by this time, we then release the buffer to
- // make sure that other threads may start using this buffer.
- endBufferIfFull();
-}
-
-} // namespace __xray_fdr_internal
-} // namespace __xray
-
-#endif // XRAY_XRAY_FDR_LOGGING_IMPL_H
diff --git a/lib/xray/xray_flags.cc b/lib/xray/xray_flags.cc
index 1ee4d10d753c..b50b68666d80 100644
--- a/lib/xray/xray_flags.cc
+++ b/lib/xray/xray_flags.cc
@@ -30,7 +30,7 @@ void Flags::setDefaults() XRAY_NEVER_INSTRUMENT {
#undef XRAY_FLAG
}
-static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
+void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
#define XRAY_FLAG(Type, Name, DefaultValue, Description) \
RegisterFlag(P, #Name, Description, &F->Name);
#include "xray_flags.inc"
@@ -42,15 +42,14 @@ static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
// options that control XRay. This means users/deployments can tweak the
// defaults that override the hard-coded defaults in the xray_flags.inc at
// compile-time using the XRAY_DEFAULT_OPTIONS macro.
-static const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
#ifdef XRAY_DEFAULT_OPTIONS
-// Do the double-layered string conversion to prevent badly crafted strings
-// provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues (or
-// changing the semantics of the implementation through the macro). This ensures
-// that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a string literal.
-#define XRAY_STRINGIZE(x) #x
-#define XRAY_STRINGIZE_OPTIONS(options) XRAY_STRINGIZE(options)
- return XRAY_STRINGIZE_OPTIONS(XRAY_DEFAULT_OPTIONS);
+ // Do the double-layered string conversion to prevent badly crafted strings
+ // provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues
+ // (or changing the semantics of the implementation through the macro). This
+ // ensures that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a
+ // string literal.
+ return SANITIZER_STRINGIFY(XRAY_DEFAULT_OPTIONS);
#else
return "";
#endif
diff --git a/lib/xray/xray_flags.h b/lib/xray/xray_flags.h
index 3ed5b8844cb4..7c1ba9458856 100644
--- a/lib/xray/xray_flags.h
+++ b/lib/xray/xray_flags.h
@@ -29,6 +29,8 @@ struct Flags {
};
extern Flags xray_flags_dont_use_directly;
+extern void registerXRayFlags(FlagParser *P, Flags *F);
+const char *useCompilerDefinedFlags();
inline Flags *flags() { return &xray_flags_dont_use_directly; }
void initializeFlags();
diff --git a/lib/xray/xray_flags.inc b/lib/xray/xray_flags.inc
index 29f1fce7d7f4..c87903963a36 100644
--- a/lib/xray/xray_flags.inc
+++ b/lib/xray/xray_flags.inc
@@ -27,23 +27,24 @@ XRAY_FLAG(uptr, xray_page_size_override, 0,
XRAY_FLAG(bool, xray_naive_log, false,
"DEPRECATED: Use xray_mode=xray-basic instead.")
XRAY_FLAG(int, xray_naive_log_func_duration_threshold_us, 5,
- "Naive logging will try to skip functions that execute for fewer "
- "microseconds than this threshold.")
+ "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set "
+ "func_duration_threshold_us instead.")
XRAY_FLAG(int, xray_naive_log_max_stack_depth, 64,
- "Naive logging will keep track of at most this deep a call stack, "
- "any more and the recordings will be droppped.")
+ "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set "
+ "max_stack_depth instead.")
XRAY_FLAG(int, xray_naive_log_thread_buffer_size, 1024,
- "The number of entries to keep on a per-thread buffer.")
+ "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set "
+ "thread_buffer_size instead.")
// FDR (Flight Data Recorder) Mode logging options.
XRAY_FLAG(bool, xray_fdr_log, false,
"DEPRECATED: Use xray_mode=xray-fdr instead.")
XRAY_FLAG(int, xray_fdr_log_func_duration_threshold_us, 5,
- "FDR logging will try to skip functions that execute for fewer "
- "microseconds than this threshold.")
+ "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set "
+ "func_duration_threshold_us instead.")
XRAY_FLAG(int, xray_fdr_log_grace_period_us, 0,
- "DEPRECATED: use xray_fdr_log_grace_period_ms instead.")
+ "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set "
+ "grace_period_ms instead.")
XRAY_FLAG(int, xray_fdr_log_grace_period_ms, 100,
- "FDR logging will wait this much time in microseconds before "
- "actually flushing the log; this gives a chance for threads to "
- "notice that the log has been finalized and clean up.")
+ "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set "
+ "grace_period_ms instead.")
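For readers migrating off the deprecated flags above, a rough mapping to the environment-variable based configuration, using the option names given in the new deprecation messages and the old default values (illustrative only):

    xray_naive_log_func_duration_threshold_us=5
        -> XRAY_BASIC_OPTIONS="func_duration_threshold_us=5"
    xray_naive_log_max_stack_depth=64
        -> XRAY_BASIC_OPTIONS="max_stack_depth=64"
    xray_fdr_log_func_duration_threshold_us=5, xray_fdr_log_grace_period_ms=100
        -> XRAY_FDR_OPTIONS="func_duration_threshold_us=5:grace_period_ms=100"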
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
new file mode 100644
index 000000000000..2acf14aa5625
--- /dev/null
+++ b/lib/xray/xray_function_call_trie.h
@@ -0,0 +1,455 @@
+//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a function call trie.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FUNCTION_CALL_TRIE_H
+#define XRAY_FUNCTION_CALL_TRIE_H
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
+#include <memory> // For placement new.
+#include <utility>
+
+namespace __xray {
+
+/// A FunctionCallTrie represents the stack traces of XRay instrumented
+/// functions that we've encountered, where a node corresponds to a function
+/// and the path from the root to the node represents its stack trace. Each
+/// node in the trie will contain some useful values, including:
+///
+/// * The cumulative amount of time spent in this particular node/stack.
+/// * The number of times this stack has appeared.
+/// * A histogram of latencies for that particular node.
+///
+/// Each node in the trie will also contain a list of callees, represented
+/// using an Array<NodeIdPair> -- each NodeIdPair instance will contain the
+/// function ID of the callee, and a pointer to the node.
+///
+/// If we visualise this data structure, we'll find the following potential
+/// representation:
+///
+/// [function id node] -> [callees] [cumulative time]
+/// [call counter] [latency histogram]
+///
+/// As an example, when we have a function in this pseudocode:
+///
+/// func f(N) {
+/// g()
+/// h()
+/// for i := 1..N { j() }
+/// }
+///
+/// We may end up with a trie of the following form:
+///
+/// f -> [ g, h, j ] [...] [1] [...]
+/// g -> [ ... ] [...] [1] [...]
+/// h -> [ ... ] [...] [1] [...]
+/// j -> [ ... ] [...] [N] [...]
+///
+/// If for instance the function g() called j() like so:
+///
+/// func g() {
+/// for i := 1..10 { j() }
+/// }
+///
+/// We'll find the following updated trie:
+///
+/// f -> [ g, h, j ] [...] [1] [...]
+/// g -> [ j' ] [...] [1] [...]
+/// h -> [ ... ] [...] [1] [...]
+/// j -> [ ... ] [...] [N] [...]
+/// j' -> [ ... ] [...] [10] [...]
+///
+/// Note that we'll have a new node representing the path `f -> g -> j'` with
+/// isolated data. This isolation gives us a means of representing the stack
+/// traces as a path, as opposed to a key in a table. The alternative
+/// implementation here would be to use a separate table for the path, and use
+/// hashes of the path as an identifier to accumulate the information. We've
+/// moved away from this approach as it takes a lot of time to compute the hash
+/// every time we need to update a function's call information as we're handling
+/// the entry and exit events.
+///
+/// This approach allows us to maintain a shadow stack, which represents the
+/// currently executing path, and on function exits quickly compute the amount
+/// of time elapsed from the entry, then update the counters for the node
+/// already represented in the trie. This necessitates an efficient
+/// representation of the various data structures (the list of callees must be
+/// cache-aware and efficient to look up, and the histogram must be compact and
+/// quick to update) to enable us to keep the overheads of this implementation
+/// to the minimum.
+class FunctionCallTrie {
+public:
+ struct Node;
+
+ // We use a NodeIdPair type instead of a std::pair<...> to not rely on the
+ // standard library types in this header.
+ struct NodeIdPair {
+ Node *NodePtr;
+ int32_t FId;
+
+ // Constructor for inplace-construction.
+ NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
+ };
+
+ using NodeIdPairArray = Array<NodeIdPair>;
+ using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
+
+ // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
+ // number of times this node actually appeared, the cumulative amount of time
+ // for this particular node including its children call times, and just the
+ // local time spent on this node. Each Node will have the ID of the XRay
+  // instrumented function that it is associated with.
+ struct Node {
+ Node *Parent;
+ NodeIdPairArray Callees;
+ int64_t CallCount;
+ int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+ int32_t FId;
+
+ // We add a constructor here to allow us to inplace-construct through
+ // Array<...>'s AppendEmplace.
+ Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+ int32_t F)
+ : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+ FId(F) {}
+
+ // TODO: Include the compact histogram.
+ };
+
+private:
+ struct ShadowStackEntry {
+ uint64_t EntryTSC;
+ Node *NodePtr;
+
+ // We add a constructor here to allow us to inplace-construct through
+ // Array<...>'s AppendEmplace.
+ ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
+ };
+
+ using NodeArray = Array<Node>;
+ using RootArray = Array<Node *>;
+ using ShadowStackArray = Array<ShadowStackEntry>;
+
+public:
+ // We collate the allocators we need into a single struct, as a convenience to
+ // allow us to initialize these as a group.
+ struct Allocators {
+ using NodeAllocatorType = NodeArray::AllocatorType;
+ using RootAllocatorType = RootArray::AllocatorType;
+ using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;
+
+ NodeAllocatorType *NodeAllocator = nullptr;
+ RootAllocatorType *RootAllocator = nullptr;
+ ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
+ NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+ Allocators() {}
+ Allocators(const Allocators &) = delete;
+ Allocators &operator=(const Allocators &) = delete;
+
+ Allocators(Allocators &&O)
+ : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
+ ShadowStackAllocator(O.ShadowStackAllocator),
+ NodeIdPairAllocator(O.NodeIdPairAllocator) {
+ O.NodeAllocator = nullptr;
+ O.RootAllocator = nullptr;
+ O.ShadowStackAllocator = nullptr;
+ O.NodeIdPairAllocator = nullptr;
+ }
+
+ Allocators &operator=(Allocators &&O) {
+ {
+ auto Tmp = O.NodeAllocator;
+ O.NodeAllocator = this->NodeAllocator;
+ this->NodeAllocator = Tmp;
+ }
+ {
+ auto Tmp = O.RootAllocator;
+ O.RootAllocator = this->RootAllocator;
+ this->RootAllocator = Tmp;
+ }
+ {
+ auto Tmp = O.ShadowStackAllocator;
+ O.ShadowStackAllocator = this->ShadowStackAllocator;
+ this->ShadowStackAllocator = Tmp;
+ }
+ {
+ auto Tmp = O.NodeIdPairAllocator;
+ O.NodeIdPairAllocator = this->NodeIdPairAllocator;
+ this->NodeIdPairAllocator = Tmp;
+ }
+ return *this;
+ }
+
+ ~Allocators() {
+ // Note that we cannot use delete on these pointers, as they need to be
+ // returned to the sanitizer_common library's internal memory tracking
+ // system.
+ if (NodeAllocator != nullptr) {
+ NodeAllocator->~NodeAllocatorType();
+ InternalFree(NodeAllocator);
+ NodeAllocator = nullptr;
+ }
+ if (RootAllocator != nullptr) {
+ RootAllocator->~RootAllocatorType();
+ InternalFree(RootAllocator);
+ RootAllocator = nullptr;
+ }
+ if (ShadowStackAllocator != nullptr) {
+ ShadowStackAllocator->~ShadowStackAllocatorType();
+ InternalFree(ShadowStackAllocator);
+ ShadowStackAllocator = nullptr;
+ }
+ if (NodeIdPairAllocator != nullptr) {
+ NodeIdPairAllocator->~NodeIdPairAllocatorType();
+ InternalFree(NodeIdPairAllocator);
+ NodeIdPairAllocator = nullptr;
+ }
+ }
+ };
+
+ // TODO: Support configuration of options through the arguments.
+ static Allocators InitAllocators() {
+ return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
+ }
+
+ static Allocators InitAllocatorsCustom(uptr Max) {
+ Allocators A;
+ auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
+ new (NodeAllocator) Allocators::NodeAllocatorType(Max);
+ A.NodeAllocator = NodeAllocator;
+
+ auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::RootAllocatorType)));
+ new (RootAllocator) Allocators::RootAllocatorType(Max);
+ A.RootAllocator = RootAllocator;
+
+ auto ShadowStackAllocator =
+ reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
+ new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
+ A.ShadowStackAllocator = ShadowStackAllocator;
+
+ auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+ InternalAlloc(sizeof(NodeIdPairAllocatorType)));
+ new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
+ A.NodeIdPairAllocator = NodeIdPairAllocator;
+ return A;
+ }
+
+private:
+ NodeArray Nodes;
+ RootArray Roots;
+ ShadowStackArray ShadowStack;
+ NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+public:
+ explicit FunctionCallTrie(const Allocators &A)
+ : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+ ShadowStack(*A.ShadowStackAllocator),
+ NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+
+ void enterFunction(const int32_t FId, uint64_t TSC) {
+ DCHECK_NE(FId, 0);
+ // This function primarily deals with ensuring that the ShadowStack is
+ // consistent and ready for when an exit event is encountered.
+ if (UNLIKELY(ShadowStack.empty())) {
+ auto NewRoot =
+ Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ if (UNLIKELY(NewRoot == nullptr))
+ return;
+ Roots.Append(NewRoot);
+ ShadowStack.AppendEmplace(TSC, NewRoot);
+ return;
+ }
+
+ auto &Top = ShadowStack.back();
+ auto TopNode = Top.NodePtr;
+ DCHECK_NE(TopNode, nullptr);
+
+ // If we've seen this callee before, then we just access that node and place
+ // that on the top of the stack.
+ auto Callee = TopNode->Callees.find_element(
+ [FId](const NodeIdPair &NR) { return NR.FId == FId; });
+ if (Callee != nullptr) {
+ CHECK_NE(Callee->NodePtr, nullptr);
+ ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+ return;
+ }
+
+    // We've never seen this stack before, so we create a new node here.
+ auto NewNode =
+ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ if (UNLIKELY(NewNode == nullptr))
+ return;
+ DCHECK_NE(NewNode, nullptr);
+ TopNode->Callees.AppendEmplace(NewNode, FId);
+ ShadowStack.AppendEmplace(TSC, NewNode);
+ DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
+ return;
+ }
+
+ void exitFunction(int32_t FId, uint64_t TSC) {
+ // When we exit a function, we look up the ShadowStack to see whether we've
+ // entered this function before. We do as little processing here as we can,
+ // since most of the hard work would have already been done at function
+ // entry.
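+    //
+    // Worked example (illustrative TSC values): with a shadow stack holding
+    // f (entered at TSC 100) and g (entered at TSC 110), a single call to
+    // exitFunction(f, 200) unwinds both entries; g is credited 200 - 110 = 90
+    // and f is credited (200 - 100) - 90 = 10, with CumulativeTreeTime
+    // tracking the 90 already attributed to g.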
+ uint64_t CumulativeTreeTime = 0;
+ while (!ShadowStack.empty()) {
+ const auto &Top = ShadowStack.back();
+ auto TopNode = Top.NodePtr;
+ DCHECK_NE(TopNode, nullptr);
+ auto LocalTime = TSC - Top.EntryTSC;
+ TopNode->CallCount++;
+ TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
+ CumulativeTreeTime += LocalTime;
+ ShadowStack.trim(1);
+
+ // TODO: Update the histogram for the node.
+ if (TopNode->FId == FId)
+ break;
+ }
+ }
+
+ const RootArray &getRoots() const { return Roots; }
+
+  // The deepCopyInto operation will update the provided FunctionCallTrie by
+  // re-creating the contents of this particular FunctionCallTrie in the other
+  // FunctionCallTrie. It does this with a depth-first traversal from the
+  // roots, re-creating each visited node (and its callee links) in the
+  // provided FunctionCallTrie.
+ //
+ // This operation will *not* destroy the state in `O`, and thus may cause some
+ // duplicate entries in `O` if it is not empty.
+ //
+ // This function is *not* thread-safe, and may require external
+ // synchronisation of both "this" and |O|.
+ //
+ // This function must *not* be called with a non-empty FunctionCallTrie |O|.
+ void deepCopyInto(FunctionCallTrie &O) const {
+ DCHECK(O.getRoots().empty());
+
+ // We then push the root into a stack, to use as the parent marker for new
+ // nodes we push in as we're traversing depth-first down the call tree.
+ struct NodeAndParent {
+ FunctionCallTrie::Node *Node;
+ FunctionCallTrie::Node *NewNode;
+ };
+ using Stack = Array<NodeAndParent>;
+
+ typename Stack::AllocatorType StackAllocator(
+ profilingFlags()->stack_allocator_max);
+ Stack DFSStack(StackAllocator);
+
+ for (const auto Root : getRoots()) {
+ // Add a node in O for this root.
+ auto NewRoot = O.Nodes.AppendEmplace(
+ nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+ Root->CumulativeLocalTime, Root->FId);
+
+ // Because we cannot allocate more memory we should bail out right away.
+ if (UNLIKELY(NewRoot == nullptr))
+ return;
+
+ O.Roots.Append(NewRoot);
+
+ // TODO: Figure out what to do if we fail to allocate any more stack
+ // space. Maybe warn or report once?
+ DFSStack.AppendEmplace(Root, NewRoot);
+ while (!DFSStack.empty()) {
+ NodeAndParent NP = DFSStack.back();
+ DCHECK_NE(NP.Node, nullptr);
+ DCHECK_NE(NP.NewNode, nullptr);
+ DFSStack.trim(1);
+ for (const auto Callee : NP.Node->Callees) {
+ auto NewNode = O.Nodes.AppendEmplace(
+ NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
+ Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+ if (UNLIKELY(NewNode == nullptr))
+ return;
+ NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
+ DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
+ }
+ }
+ }
+ }
+
+ // The mergeInto operation will update the provided FunctionCallTrie by
+ // traversing the current trie's roots and updating (i.e. merging) the data in
+ // the nodes with the data in the target's nodes. If the node doesn't exist in
+ // the provided trie, we add a new one in the right position, and inherit the
+ // data from the original (current) trie, along with all its callees.
+ //
+ // This function is *not* thread-safe, and may require external
+ // synchronisation of both "this" and |O|.
+ void mergeInto(FunctionCallTrie &O) const {
+ struct NodeAndTarget {
+ FunctionCallTrie::Node *OrigNode;
+ FunctionCallTrie::Node *TargetNode;
+ };
+ using Stack = Array<NodeAndTarget>;
+ typename Stack::AllocatorType StackAllocator(
+ profilingFlags()->stack_allocator_max);
+ Stack DFSStack(StackAllocator);
+
+ for (const auto Root : getRoots()) {
+ Node *TargetRoot = nullptr;
+ auto R = O.Roots.find_element(
+ [&](const Node *Node) { return Node->FId == Root->FId; });
+ if (R == nullptr) {
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
+ 0, Root->FId);
+ if (UNLIKELY(TargetRoot == nullptr))
+ return;
+
+ O.Roots.Append(TargetRoot);
+ } else {
+ TargetRoot = *R;
+ }
+
+ DFSStack.Append(NodeAndTarget{Root, TargetRoot});
+ while (!DFSStack.empty()) {
+ NodeAndTarget NT = DFSStack.back();
+ DCHECK_NE(NT.OrigNode, nullptr);
+ DCHECK_NE(NT.TargetNode, nullptr);
+ DFSStack.trim(1);
+ // TODO: Update the histogram as well when we have it ready.
+ NT.TargetNode->CallCount += NT.OrigNode->CallCount;
+ NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
+ for (const auto Callee : NT.OrigNode->Callees) {
+ auto TargetCallee = NT.TargetNode->Callees.find_element(
+ [&](const FunctionCallTrie::NodeIdPair &C) {
+ return C.FId == Callee.FId;
+ });
+ if (TargetCallee == nullptr) {
+ auto NewTargetNode = O.Nodes.AppendEmplace(
+ NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+
+ if (UNLIKELY(NewTargetNode == nullptr))
+ return;
+
+ TargetCallee =
+ NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
+ }
+ DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
+ }
+ }
+ }
+ }
+};
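+
+// Illustrative usage sketch, not part of the runtime: roughly how a handler
+// could drive a FunctionCallTrie (the profiling handler in xray_profiling.cc
+// follows this shape). The function ids and TSC values below are made up.
+//
+//   auto Allocators = FunctionCallTrie::InitAllocators();
+//   FunctionCallTrie Trie(Allocators);
+//   Trie.enterFunction(/*FId=*/1, /*TSC=*/100);  // enter f
+//   Trie.enterFunction(/*FId=*/2, /*TSC=*/110);  // f calls g
+//   Trie.exitFunction(/*FId=*/2, /*TSC=*/150);   // g returns
+//   Trie.exitFunction(/*FId=*/1, /*TSC=*/200);   // f returns
+//
+//   FunctionCallTrie Copy(Allocators);
+//   Trie.deepCopyInto(Copy);  // Copy must start out empty; this is what the
+//                             // profile collector relies on when posting.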
+
+} // namespace __xray
+
+#endif // XRAY_FUNCTION_CALL_TRIE_H
diff --git a/lib/xray/xray_init.cc b/lib/xray/xray_init.cc
index 11892cb8b7a3..b4e069795195 100644
--- a/lib/xray/xray_init.cc
+++ b/lib/xray/xray_init.cc
@@ -38,32 +38,29 @@ using namespace __xray;
//
// FIXME: Support DSO instrumentation maps too. The current solution only works
// for statically linked executables.
-__sanitizer::atomic_uint8_t XRayInitialized{0};
+atomic_uint8_t XRayInitialized{0};
// This should always be updated before XRayInitialized is updated.
-__sanitizer::SpinMutex XRayInstrMapMutex;
+SpinMutex XRayInstrMapMutex;
XRaySledMap XRayInstrMap;
// Global flag to determine whether the flags have been initialized.
-__sanitizer::atomic_uint8_t XRayFlagsInitialized{0};
+atomic_uint8_t XRayFlagsInitialized{0};
// A mutex to allow only one thread to initialize the XRay data structures.
-__sanitizer::SpinMutex XRayInitMutex;
+SpinMutex XRayInitMutex;
// __xray_init() will do the actual loading of the current process' memory map
// and then proceed to look for the .xray_instr_map section/segment.
void __xray_init() XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayInitMutex);
+ SpinMutexLock Guard(&XRayInitMutex);
// Short-circuit if we've already initialized XRay before.
- if (__sanitizer::atomic_load(&XRayInitialized,
- __sanitizer::memory_order_acquire))
+ if (atomic_load(&XRayInitialized, memory_order_acquire))
return;
- if (!__sanitizer::atomic_load(&XRayFlagsInitialized,
- __sanitizer::memory_order_acquire)) {
+ if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) {
initializeFlags();
- __sanitizer::atomic_store(&XRayFlagsInitialized, true,
- __sanitizer::memory_order_release);
+ atomic_store(&XRayFlagsInitialized, true, memory_order_release);
}
if (__start_xray_instr_map == nullptr) {
@@ -73,14 +70,13 @@ void __xray_init() XRAY_NEVER_INSTRUMENT {
}
{
- __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+ SpinMutexLock Guard(&XRayInstrMapMutex);
XRayInstrMap.Sleds = __start_xray_instr_map;
XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map;
XRayInstrMap.SledsIndex = __start_xray_fn_idx;
XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx;
}
- __sanitizer::atomic_store(&XRayInitialized, true,
- __sanitizer::memory_order_release);
+ atomic_store(&XRayInitialized, true, memory_order_release);
#ifndef XRAY_NO_PREINIT
if (flags()->patch_premain)
@@ -88,7 +84,13 @@ void __xray_init() XRAY_NEVER_INSTRUMENT {
#endif
}
-#if !defined(XRAY_NO_PREINIT) && SANITIZER_CAN_USE_PREINIT_ARRAY
+// FIXME: Make check-xray tests work on FreeBSD without
+// SANITIZER_CAN_USE_PREINIT_ARRAY.
+// See sanitizer_internal_defs.h where the macro is defined.
+// Calling unresolved PLT functions in .preinit_array can lead to deadlock on
+// FreeBSD but here it seems benign.
+#if !defined(XRAY_NO_PREINIT) && \
+ (SANITIZER_CAN_USE_PREINIT_ARRAY || SANITIZER_FREEBSD)
// Only add the preinit array initialization if the sanitizers can.
__attribute__((section(".preinit_array"),
used)) void (*__local_xray_preinit)(void) = __xray_init;
diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc
index 766313e85c58..01bf6ddc607e 100644
--- a/lib/xray/xray_interface.cc
+++ b/lib/xray/xray_interface.cc
@@ -19,9 +19,12 @@
#include <cstdio>
#include <errno.h>
#include <limits>
+#include <string.h>
#include <sys/mman.h>
+#include "sanitizer_common/sanitizer_addrhashmap.h"
#include "sanitizer_common/sanitizer_common.h"
+
#include "xray_defs.h"
#include "xray_flags.h"
@@ -48,26 +51,40 @@ static const int16_t cSledLength = 8;
#endif /* CPU architecture */
// This is the function to call when we encounter the entry or exit sleds.
-__sanitizer::atomic_uintptr_t XRayPatchedFunction{0};
+atomic_uintptr_t XRayPatchedFunction{0};
// This is the function to call from the arg1-enabled sleds/trampolines.
-__sanitizer::atomic_uintptr_t XRayArgLogger{0};
+atomic_uintptr_t XRayArgLogger{0};
// This is the function to call when we encounter a custom event log call.
-__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0};
+atomic_uintptr_t XRayPatchedCustomEvent{0};
+
+// This is the function to call when we encounter a typed event log call.
+atomic_uintptr_t XRayPatchedTypedEvent{0};
// This is the global status to determine whether we are currently
// patching/unpatching.
-__sanitizer::atomic_uint8_t XRayPatching{0};
+atomic_uint8_t XRayPatching{0};
+
+struct TypeDescription {
+ uint32_t type_id;
+ std::size_t description_string_length;
+};
+
+using TypeDescriptorMapType = AddrHashMap<TypeDescription, 11>;
+// An address map from immutable descriptors to type ids.
+TypeDescriptorMapType TypeDescriptorAddressMap{};
-// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo
-// any successful mprotect(...) changes. This is used to make a page writeable
-// and executable, and upon destruction if it was successful in doing so returns
-// the page into a read-only and executable page.
+atomic_uint32_t TypeEventDescriptorCounter{0};
+
+// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will
+// undo any successful mprotect(...) changes. This is used to make a page
+// writeable and executable, and upon destruction if it was successful in
+// doing so returns the page into a read-only and executable page.
//
// This is only used specifically for runtime-patching of the XRay
-// instrumentation points. This assumes that the executable pages are originally
-// read-and-execute only.
+// instrumentation points. This assumes that the executable pages are
+// originally read-and-execute only.
class MProtectHelper {
void *PageAlignedAddr;
std::size_t MProtectLen;
@@ -116,6 +133,9 @@ bool patchSled(const XRaySledEntry &Sled, bool Enable,
case XRayEntryType::CUSTOM_EVENT:
Success = patchCustomEvent(Enable, FuncId, Sled);
break;
+ case XRayEntryType::TYPED_EVENT:
+ Success = patchTypedEvent(Enable, FuncId, Sled);
+ break;
default:
Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind));
return false;
@@ -125,19 +145,19 @@ bool patchSled(const XRaySledEntry &Sled, bool Enable,
XRayPatchingStatus patchFunction(int32_t FuncId,
bool Enable) XRAY_NEVER_INSTRUMENT {
- if (!__sanitizer::atomic_load(&XRayInitialized,
- __sanitizer::memory_order_acquire))
+ if (!atomic_load(&XRayInitialized,
+ memory_order_acquire))
return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
uint8_t NotPatching = false;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel))
+ if (!atomic_compare_exchange_strong(
+ &XRayPatching, &NotPatching, true, memory_order_acq_rel))
return XRayPatchingStatus::ONGOING; // Already patching.
// Next, we look for the function index.
XRaySledMap InstrMap;
{
- __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+ SpinMutexLock Guard(&XRayInstrMapMutex);
InstrMap = XRayInstrMap;
}
@@ -161,8 +181,8 @@ XRayPatchingStatus patchFunction(int32_t FuncId,
while (f != e)
SucceedOnce |= patchSled(*f++, Enable, FuncId);
- __sanitizer::atomic_store(&XRayPatching, false,
- __sanitizer::memory_order_release);
+ atomic_store(&XRayPatching, false,
+ memory_order_release);
if (!SucceedOnce) {
Report("Failed patching any sled for function '%d'.", FuncId);
@@ -176,26 +196,26 @@ XRayPatchingStatus patchFunction(int32_t FuncId,
// implementation. |Enable| defines whether we're enabling or disabling the
// runtime XRay instrumentation.
XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
- if (!__sanitizer::atomic_load(&XRayInitialized,
- __sanitizer::memory_order_acquire))
+ if (!atomic_load(&XRayInitialized,
+ memory_order_acquire))
return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
uint8_t NotPatching = false;
- if (!__sanitizer::atomic_compare_exchange_strong(
- &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel))
+ if (!atomic_compare_exchange_strong(
+ &XRayPatching, &NotPatching, true, memory_order_acq_rel))
return XRayPatchingStatus::ONGOING; // Already patching.
uint8_t PatchingSuccess = false;
auto XRayPatchingStatusResetter =
- __sanitizer::at_scope_exit([&PatchingSuccess] {
+ at_scope_exit([&PatchingSuccess] {
if (!PatchingSuccess)
- __sanitizer::atomic_store(&XRayPatching, false,
- __sanitizer::memory_order_release);
+ atomic_store(&XRayPatching, false,
+ memory_order_release);
});
XRaySledMap InstrMap;
{
- __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+ SpinMutexLock Guard(&XRayInstrMapMutex);
InstrMap = XRayInstrMap;
}
if (InstrMap.Entries == 0)
@@ -251,8 +271,8 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
}
patchSled(Sled, Enable, FuncId);
}
- __sanitizer::atomic_store(&XRayPatching, false,
- __sanitizer::memory_order_release);
+ atomic_store(&XRayPatching, false,
+ memory_order_release);
PatchingSuccess = true;
return XRayPatchingStatus::SUCCESS;
}
@@ -261,7 +281,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId,
bool Enable) XRAY_NEVER_INSTRUMENT {
XRaySledMap InstrMap;
{
- __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+ SpinMutexLock Guard(&XRayInstrMapMutex);
InstrMap = XRayInstrMap;
}
@@ -318,12 +338,12 @@ using namespace __xray;
int __xray_set_handler(void (*entry)(int32_t,
XRayEntryType)) XRAY_NEVER_INSTRUMENT {
- if (__sanitizer::atomic_load(&XRayInitialized,
- __sanitizer::memory_order_acquire)) {
+ if (atomic_load(&XRayInitialized,
+ memory_order_acquire)) {
- __sanitizer::atomic_store(&__xray::XRayPatchedFunction,
+ atomic_store(&__xray::XRayPatchedFunction,
reinterpret_cast<uintptr_t>(entry),
- __sanitizer::memory_order_release);
+ memory_order_release);
return 1;
}
return 0;
@@ -331,11 +351,23 @@ int __xray_set_handler(void (*entry)(int32_t,
int __xray_set_customevent_handler(void (*entry)(void *, size_t))
XRAY_NEVER_INSTRUMENT {
- if (__sanitizer::atomic_load(&XRayInitialized,
- __sanitizer::memory_order_acquire)) {
- __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent,
+ if (atomic_load(&XRayInitialized,
+ memory_order_acquire)) {
+ atomic_store(&__xray::XRayPatchedCustomEvent,
+ reinterpret_cast<uintptr_t>(entry),
+ memory_order_release);
+ return 1;
+ }
+ return 0;
+}
+
+int __xray_set_typedevent_handler(void (*entry)(
+ uint16_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT {
+ if (atomic_load(&XRayInitialized,
+ memory_order_acquire)) {
+ atomic_store(&__xray::XRayPatchedTypedEvent,
reinterpret_cast<uintptr_t>(entry),
- __sanitizer::memory_order_release);
+ memory_order_release);
return 1;
}
return 0;
@@ -349,6 +381,21 @@ int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT {
return __xray_set_customevent_handler(nullptr);
}
+int __xray_remove_typedevent_handler() XRAY_NEVER_INSTRUMENT {
+ return __xray_set_typedevent_handler(nullptr);
+}
+
+uint16_t __xray_register_event_type(
+ const char *const event_type) XRAY_NEVER_INSTRUMENT {
+ TypeDescriptorMapType::Handle h(&TypeDescriptorAddressMap, (uptr)event_type);
+ if (h.created()) {
+ h->type_id = atomic_fetch_add(
+ &TypeEventDescriptorCounter, 1, memory_order_acq_rel);
+ h->description_string_length = strnlen(event_type, 1024);
+ }
+ return h->type_id;
+}
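+
+// Illustrative only: because the map above is keyed on the address of the
+// descriptor, callers are expected to pass a string with stable storage, e.g.
+//
+//   static const char kMyEvent[] = "my-event";  // hypothetical descriptor
+//   uint16_t MyEventType = __xray_register_event_type(kMyEvent);
+//
+// Repeated calls with the same pointer yield the same type id.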
+
XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT {
return controlPatching(true);
}
@@ -367,22 +414,22 @@ __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
}
int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) {
- if (!__sanitizer::atomic_load(&XRayInitialized,
- __sanitizer::memory_order_acquire))
+ if (!atomic_load(&XRayInitialized,
+ memory_order_acquire))
return 0;
// A relaxed write might not be visible even if the current thread gets
// scheduled on a different CPU/NUMA node. We need to wait for everyone to
// have this handler installed for consistency of collected data across CPUs.
- __sanitizer::atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry),
- __sanitizer::memory_order_release);
+ atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry),
+ memory_order_release);
return 1;
}
int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); }
uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+ SpinMutexLock Guard(&XRayInstrMapMutex);
if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions)
return 0;
return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function
@@ -396,6 +443,6 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
}
size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+ SpinMutexLock Guard(&XRayInstrMapMutex);
return XRayInstrMap.Functions;
}
diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h
index 5811e2b7300a..8ca87457437e 100644
--- a/lib/xray/xray_interface_internal.h
+++ b/lib/xray/xray_interface_internal.h
@@ -43,8 +43,8 @@ struct XRaySledEntry {
};
struct XRayFunctionSledIndex {
- const XRaySledEntry* Begin;
- const XRaySledEntry* End;
+ const XRaySledEntry *Begin;
+ const XRaySledEntry *End;
};
}
@@ -57,12 +57,13 @@ struct XRaySledMap {
size_t Functions;
};
-bool patchFunctionEntry(bool Enable, uint32_t FuncId,
- const XRaySledEntry &Sled, void (*Trampoline)());
+bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled,
+ void (*Trampoline)());
bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
bool patchFunctionTailExit(bool Enable, uint32_t FuncId,
const XRaySledEntry &Sled);
bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
+bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
} // namespace __xray
@@ -74,6 +75,7 @@ extern void __xray_FunctionExit();
extern void __xray_FunctionTailExit();
extern void __xray_ArgLoggerEntry();
extern void __xray_CustomEvent();
+extern void __xray_TypedEvent();
}
#endif
diff --git a/lib/xray/xray_log_interface.cc b/lib/xray/xray_log_interface.cc
index 783f004d292a..0886fd0d1210 100644
--- a/lib/xray/xray_log_interface.cc
+++ b/lib/xray/xray_log_interface.cc
@@ -18,9 +18,20 @@
#include "xray/xray_interface.h"
#include "xray_defs.h"
-__sanitizer::SpinMutex XRayImplMutex;
-XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr};
-XRayLogImpl *GlobalXRayImpl = nullptr;
+namespace __xray {
+static SpinMutex XRayImplMutex;
+static XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr};
+static XRayLogImpl *GlobalXRayImpl = nullptr;
+
+// This is the default implementation of a buffer iterator, which always yields
+// a null buffer.
+XRayBuffer NullBufferIterator(XRayBuffer) XRAY_NEVER_INSTRUMENT {
+ return {nullptr, 0};
+}
+
+// This is the global function responsible for iterating through given buffers.
+atomic_uintptr_t XRayBufferIterator{
+ reinterpret_cast<uintptr_t>(&NullBufferIterator)};
// We use a linked list of Mode to XRayLogImpl mappings. This is a linked list
// when it should be a map because we're avoiding having to depend on C++
@@ -31,9 +42,24 @@ struct ModeImpl {
XRayLogImpl Impl;
};
-ModeImpl SentinelModeImpl{
+static ModeImpl SentinelModeImpl{
nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}};
-ModeImpl *ModeImpls = &SentinelModeImpl;
+static ModeImpl *ModeImpls = &SentinelModeImpl;
+static const ModeImpl *CurrentMode = nullptr;
+
+} // namespace __xray
+
+using namespace __xray;
+
+void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer))
+ XRAY_NEVER_INSTRUMENT {
+ atomic_store(&__xray::XRayBufferIterator,
+ reinterpret_cast<uintptr_t>(Iterator), memory_order_release);
+}
+
+void __xray_log_remove_buffer_iterator() XRAY_NEVER_INSTRUMENT {
+ __xray_log_set_buffer_iterator(&NullBufferIterator);
+}
XRayLogRegisterStatus
__xray_log_register_mode(const char *Mode,
@@ -42,16 +68,15 @@ __xray_log_register_mode(const char *Mode,
Impl.log_finalize == nullptr || Impl.log_init == nullptr)
return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL;
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
// First, look for whether the mode already has a registered implementation.
for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) {
- if (!__sanitizer::internal_strcmp(Mode, it->Mode))
+ if (!internal_strcmp(Mode, it->Mode))
return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE;
}
- auto *NewModeImpl =
- static_cast<ModeImpl *>(__sanitizer::InternalAlloc(sizeof(ModeImpl)));
+ auto *NewModeImpl = static_cast<ModeImpl *>(InternalAlloc(sizeof(ModeImpl)));
NewModeImpl->Next = ModeImpls;
- NewModeImpl->Mode = __sanitizer::internal_strdup(Mode);
+ NewModeImpl->Mode = internal_strdup(Mode);
NewModeImpl->Impl = Impl;
ModeImpls = NewModeImpl;
return XRayLogRegisterStatus::XRAY_REGISTRATION_OK;
@@ -59,9 +84,10 @@ __xray_log_register_mode(const char *Mode,
XRayLogRegisterStatus
__xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) {
- if (!__sanitizer::internal_strcmp(Mode, it->Mode)) {
+ if (!internal_strcmp(Mode, it->Mode)) {
+ CurrentMode = it;
CurrentXRayImpl = it->Impl;
GlobalXRayImpl = &CurrentXRayImpl;
__xray_set_handler(it->Impl.handle_arg0);
@@ -71,24 +97,32 @@ __xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT {
return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND;
}
+const char *__xray_log_get_current_mode() XRAY_NEVER_INSTRUMENT {
+ SpinMutexLock Guard(&XRayImplMutex);
+ if (CurrentMode != nullptr)
+ return CurrentMode->Mode;
+ return nullptr;
+}
+
void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT {
if (Impl.log_init == nullptr || Impl.log_finalize == nullptr ||
Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) {
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
GlobalXRayImpl = nullptr;
+ CurrentMode = nullptr;
__xray_remove_handler();
__xray_remove_handler_arg1();
return;
}
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
CurrentXRayImpl = Impl;
GlobalXRayImpl = &CurrentXRayImpl;
__xray_set_handler(Impl.handle_arg0);
}
void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
GlobalXRayImpl = nullptr;
__xray_remove_handler();
__xray_remove_handler_arg1();
@@ -97,22 +131,80 @@ void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT {
XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers,
void *Args,
size_t ArgsSize) XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
if (!GlobalXRayImpl)
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
return GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize);
}
+XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config)
+ XRAY_NEVER_INSTRUMENT {
+ SpinMutexLock Guard(&XRayImplMutex);
+ if (!GlobalXRayImpl)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+ if (Config == nullptr)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+ // Check first whether the current mode is the same as what we expect.
+ if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+ // Here we do some work to coerce the pointer we're provided, so that
+ // the implementations that still take void* pointers can handle the
+ // data provided in the Config argument.
+ return GlobalXRayImpl->log_init(
+ 0, 0, const_cast<void *>(static_cast<const void *>(Config)), 0);
+}
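+
+// Illustrative only (the mode name and flag below are placeholders): a caller
+// that has selected a mode can then hand it a flag-style configuration string:
+//
+//   __xray_log_select_mode("xray-profiling");
+//   __xray_log_init_mode("xray-profiling", "some_flag=true");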
+
+XRayLogInitStatus
+__xray_log_init_mode_bin(const char *Mode, const char *Config,
+ size_t ConfigSize) XRAY_NEVER_INSTRUMENT {
+ SpinMutexLock Guard(&XRayImplMutex);
+ if (!GlobalXRayImpl)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+ if (Config == nullptr)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+ // Check first whether the current mode is the same as what we expect.
+ if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+ // Here we do some work to coerce the pointer we're provided, so that
+ // the implementations that still take void* pointers can handle the
+ // data provided in the Config argument.
+ return GlobalXRayImpl->log_init(
+ 0, 0, const_cast<void *>(static_cast<const void *>(Config)), ConfigSize);
+}
+
XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
if (!GlobalXRayImpl)
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
return GlobalXRayImpl->log_finalize();
}
XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT {
- __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+ SpinMutexLock Guard(&XRayImplMutex);
if (!GlobalXRayImpl)
return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
return GlobalXRayImpl->flush_log();
}
+
+XRayLogFlushStatus __xray_log_process_buffers(
+ void (*Processor)(const char *, XRayBuffer)) XRAY_NEVER_INSTRUMENT {
+  // We rely on the registered buffer iterator, which is stored in the atomic
+  // XRayBufferIterator and loaded below; the global state for the log is not
+  // otherwise modified here.
+ if (!GlobalXRayImpl)
+ return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+ auto Iterator = reinterpret_cast<XRayBuffer (*)(XRayBuffer)>(
+ atomic_load(&XRayBufferIterator, memory_order_acquire));
+ auto Buffer = (*Iterator)(XRayBuffer{nullptr, 0});
+ auto Mode = CurrentMode ? CurrentMode->Mode : nullptr;
+ while (Buffer.Data != nullptr) {
+ (*Processor)(Mode, Buffer);
+ Buffer = (*Iterator)(Buffer);
+ }
+ return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
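+
+// Illustrative only: a consumer could drain whatever the current mode has
+// serialized by providing a processor callback (handleBuffer is hypothetical):
+//
+//   void handleBuffer(const char *Mode, XRayBuffer B) {
+//     // Inspect or persist B.Data / B.Size; Mode names the current mode.
+//   }
+//   ...
+//   __xray_log_process_buffers(handleBuffer);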
diff --git a/lib/xray/xray_mips.cc b/lib/xray/xray_mips.cc
index cd863304db29..6f8243828668 100644
--- a/lib/xray/xray_mips.cc
+++ b/lib/xray/xray_mips.cc
@@ -158,6 +158,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
return false;
}
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+ // FIXME: Implement in mips?
+ return false;
+}
+
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_mips64.cc b/lib/xray/xray_mips64.cc
index fa8fdd5abccc..f1bdf1d7d22d 100644
--- a/lib/xray/xray_mips64.cc
+++ b/lib/xray/xray_mips64.cc
@@ -166,6 +166,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
// FIXME: Implement in mips64?
return false;
}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+ // FIXME: Implement in mips64?
+ return false;
+}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_powerpc64.cc b/lib/xray/xray_powerpc64.cc
index ab03cb10042f..5e4938361c0c 100644
--- a/lib/xray/xray_powerpc64.cc
+++ b/lib/xray/xray_powerpc64.cc
@@ -99,6 +99,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
return false;
}
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+ // FIXME: Implement in powerpc64?
+ return false;
+}
+
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
new file mode 100644
index 000000000000..a43744d9a0cb
--- /dev/null
+++ b/lib/xray/xray_profile_collector.cc
@@ -0,0 +1,318 @@
+//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_profiling_flags.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+ tid_t TId;
+ FunctionCallTrie *Trie;
+};
+
+struct ProfileBuffer {
+ void *Data;
+ size_t Size;
+};
+
+struct BlockHeader {
+ u32 BlockSize;
+ u32 BlockNum;
+ u64 ThreadId;
+};
+
+// These need to be pointers that point to heap/internal-allocator-allocated
+// objects because these are accessed even at program exit.
+Vector<ThreadTrie> *ThreadTries = nullptr;
+Vector<ProfileBuffer> *ProfileBuffers = nullptr;
+FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+} // namespace
+
+void post(const FunctionCallTrie &T, tid_t TId) {
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ pthread_once(&Once, +[] {
+ SpinMutexLock Lock(&GlobalMutex);
+ GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+ InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+ new (GlobalAllocators) FunctionCallTrie::Allocators();
+ *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+ profilingFlags()->global_allocator_max);
+ ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+ InternalAlloc(sizeof(Vector<ThreadTrie>)));
+ new (ThreadTries) Vector<ThreadTrie>();
+ ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+ InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+ new (ProfileBuffers) Vector<ProfileBuffer>();
+ });
+ DCHECK_NE(GlobalAllocators, nullptr);
+ DCHECK_NE(ThreadTries, nullptr);
+ DCHECK_NE(ProfileBuffers, nullptr);
+
+ ThreadTrie *Item = nullptr;
+ {
+ SpinMutexLock Lock(&GlobalMutex);
+ if (GlobalAllocators == nullptr)
+ return;
+
+ Item = ThreadTries->PushBack();
+ Item->TId = TId;
+
+ // Here we're using the internal allocator instead of the managed allocator
+ // because:
+ //
+ // 1) We're not using the segmented array data structure to host
+ // FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
+ // which works like a std::vector<...> keeping elements contiguous in
+ // memory. The segmented array data structure assumes that elements are
+ // trivially destructible, where FunctionCallTrie isn't.
+ //
+ // 2) Using a managed allocator means we need to manage that separately,
+ // which complicates the nature of this code. To get around that, we're
+ // using the internal allocator instead, which has its own global state
+ // and is decoupled from the lifetime management required by the managed
+ // allocator we have in XRay.
+ //
+ Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+ sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
+ DCHECK_NE(Item->Trie, nullptr);
+ new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+ }
+
+ T.deepCopyInto(*Item->Trie);
+}
+
+// A PathArray holds the function IDs that make up a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+ using PathAllocator = typename PathArray::AllocatorType;
+
+  // The Path in this record is the list of function IDs from the leaf to the
+  // root of the function call stack, as represented in a FunctionCallTrie.
+ PathArray *Path = nullptr;
+ const FunctionCallTrie::Node *Node = nullptr;
+
+ // Constructor for in-place construction.
+ ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+ : Path([&] {
+ auto P =
+ reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
+ new (P) PathArray(A);
+ return P;
+ }()),
+ Node(N) {}
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Perform a depth-first traversal from each root of the FunctionCallTrie to
+// generate the path(s) and the data associated with each path.
+static void populateRecords(ProfileRecordArray &PRs,
+ ProfileRecord::PathAllocator &PA,
+ const FunctionCallTrie &Trie) {
+ using StackArray = Array<const FunctionCallTrie::Node *>;
+ using StackAllocator = typename StackArray::AllocatorType;
+ StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+ StackArray DFSStack(StackAlloc);
+ for (const auto R : Trie.getRoots()) {
+ DFSStack.Append(R);
+ while (!DFSStack.empty()) {
+ auto Node = DFSStack.back();
+ DFSStack.trim(1);
+ auto Record = PRs.AppendEmplace(PA, Node);
+ if (Record == nullptr)
+ return;
+ DCHECK_NE(Record, nullptr);
+
+ // Traverse the Node's parents and as we're doing so, get the FIds in
+ // the order they appear.
+ for (auto N = Node; N != nullptr; N = N->Parent)
+ Record->Path->Append(N->FId);
+ DCHECK(!Record->Path->empty());
+
+ for (const auto C : Node->Callees)
+ DFSStack.Append(C.NodePtr);
+ }
+ }
+}
+
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+ const ProfileRecordArray &ProfileRecords) {
+ auto NextPtr = static_cast<char *>(
+ internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+ sizeof(Header);
+ for (const auto &Record : ProfileRecords) {
+ // List of IDs follow:
+ for (const auto FId : *Record.Path)
+ NextPtr =
+ static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+ sizeof(FId);
+
+ // Add the sentinel here.
+ constexpr int32_t SentinelFId = 0;
+ NextPtr = static_cast<char *>(
+ internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+ sizeof(SentinelFId);
+
+ // Add the node data here.
+ NextPtr =
+ static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
+ sizeof(Record.Node->CallCount))) +
+ sizeof(Record.Node->CallCount);
+ NextPtr = static_cast<char *>(
+ internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+ sizeof(Record.Node->CumulativeLocalTime))) +
+ sizeof(Record.Node->CumulativeLocalTime);
+ }
+
+ DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+void serialize() {
+ SpinMutexLock Lock(&GlobalMutex);
+
+ // Clear out the global ProfileBuffers.
+ for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+ InternalFree((*ProfileBuffers)[I].Data);
+ ProfileBuffers->Reset();
+
+ if (ThreadTries->Size() == 0)
+ return;
+
+ // Then repopulate the global ProfileBuffers.
+ for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+ using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+ ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+ ProfileRecord::PathAllocator PathAlloc(
+ profilingFlags()->global_allocator_max);
+ ProfileRecordArray ProfileRecords(PRAlloc);
+
+ // First, we want to compute the amount of space we're going to need. We'll
+ // use a local allocator and an __xray::Array<...> to store the intermediary
+ // data, then compute the size as we're going along. Then we'll allocate the
+ // contiguous space to contain the thread buffer data.
+ const auto &Trie = *(*ThreadTries)[I].Trie;
+ if (Trie.getRoots().empty())
+ continue;
+ populateRecords(ProfileRecords, PathAlloc, Trie);
+ DCHECK(!Trie.getRoots().empty());
+ DCHECK(!ProfileRecords.empty());
+
+ // Go through each record, to compute the sizes.
+ //
+ // header size = block size (4 bytes)
+ // + block number (4 bytes)
+ // + thread id (8 bytes)
+ // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+ // + call count (8 bytes)
+ // + local time (8 bytes)
+ u32 CumulativeSizes = 0;
+ for (const auto &Record : ProfileRecords)
+ CumulativeSizes += 20 + (4 * Record.Path->size());
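+
+    // Worked example (illustrative): a record whose path has three function
+    // ids contributes 4*3 + 4 (sentinel) + 8 (call count) + 8 (local time)
+    // = 32 bytes, so a block holding just that record occupies 16 (header)
+    // + 32 = 48 bytes.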
+
+ BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
+ auto Buffer = ProfileBuffers->PushBack();
+ Buffer->Size = sizeof(Header) + CumulativeSizes;
+ Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+ DCHECK_NE(Buffer->Data, nullptr);
+ serializeRecords(Buffer, Header, ProfileRecords);
+
+ // Now clean up the ProfileRecords array, one at a time.
+ for (auto &Record : ProfileRecords) {
+ Record.Path->~PathArray();
+ InternalFree(Record.Path);
+ }
+ }
+}
+
+void reset() {
+ SpinMutexLock Lock(&GlobalMutex);
+ if (ProfileBuffers != nullptr) {
+ // Clear out the profile buffers that have been serialized.
+ for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+ InternalFree((*ProfileBuffers)[I].Data);
+ ProfileBuffers->Reset();
+ InternalFree(ProfileBuffers);
+ ProfileBuffers = nullptr;
+ }
+
+ if (ThreadTries != nullptr) {
+ // Clear out the function call tries per thread.
+ for (uptr I = 0; I < ThreadTries->Size(); ++I) {
+ auto &T = (*ThreadTries)[I];
+ T.Trie->~FunctionCallTrie();
+ InternalFree(T.Trie);
+ }
+ ThreadTries->Reset();
+ InternalFree(ThreadTries);
+ ThreadTries = nullptr;
+ }
+
+ // Reset the global allocators.
+ if (GlobalAllocators != nullptr) {
+ GlobalAllocators->~Allocators();
+ InternalFree(GlobalAllocators);
+ GlobalAllocators = nullptr;
+ }
+ GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+ InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+ new (GlobalAllocators) FunctionCallTrie::Allocators();
+ *GlobalAllocators = FunctionCallTrie::InitAllocators();
+ ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+ InternalAlloc(sizeof(Vector<ThreadTrie>)));
+ new (ThreadTries) Vector<ThreadTrie>();
+ ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+ InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+ new (ProfileBuffers) Vector<ProfileBuffer>();
+}
+
+XRayBuffer nextBuffer(XRayBuffer B) {
+ SpinMutexLock Lock(&GlobalMutex);
+
+ if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+ return {nullptr, 0};
+
+ if (B.Data == nullptr)
+ return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
+
+ BlockHeader Header;
+ internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+ auto NextBlock = Header.BlockNum + 1;
+ if (NextBlock < ProfileBuffers->Size())
+ return {(*ProfileBuffers)[NextBlock].Data,
+ (*ProfileBuffers)[NextBlock].Size};
+ return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray
diff --git a/lib/xray/xray_profile_collector.h b/lib/xray/xray_profile_collector.h
new file mode 100644
index 000000000000..335043db9526
--- /dev/null
+++ b/lib/xray/xray_profile_collector.h
@@ -0,0 +1,88 @@
+//===-- xray_profile_collector.h -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a data collection service, for XRay
+// profiling. What we implement here is an in-process service where
+// FunctionCallTrie instances can be handed off by threads, to be
+// consolidated/collected.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H
+#define XRAY_XRAY_PROFILE_COLLECTOR_H
+
+#include "xray_function_call_trie.h"
+
+#include "xray/xray_log_interface.h"
+
+namespace __xray {
+
+/// The ProfileCollectorService implements a centralised mechanism for
+/// collecting FunctionCallTrie instances, indexed by thread ID. On demand, the
+/// ProfileCollectorService can be queried for the most recent state of the
+/// data, in a form that allows traversal.
+namespace profileCollectorService {
+
+/// Posts the FunctionCallTrie associated with a specific Thread ID. This
+/// will:
+///
+/// - Make a copy of the FunctionCallTrie and store that against the Thread
+/// ID. This will use the global allocator for the service-managed
+/// FunctionCallTrie instances.
+/// - Queue up a pointer to the FunctionCallTrie.
+/// - If the queue is long enough (longer than some arbitrary threshold) we
+/// then pre-calculate a single FunctionCallTrie for the whole process.
+///
+///
+/// We are making a copy of the FunctionCallTrie because the intent is to have
+/// this function be called at thread exit, or soon after the profiling
+/// handler is finalized through the XRay APIs. By letting threads each
+/// process their own thread-local FunctionCallTrie instances, we're removing
+/// the need for synchronisation across threads while we're profiling.
+/// However, once we're done profiling, we can then collect copies of these
+/// FunctionCallTrie instances and pay the cost of the copy.
+///
+/// NOTE: In the future, if this turns out to be more costly than "moving" the
+/// FunctionCallTrie instances from the owning thread to the collector
+/// service, then we can change the implementation to do it this way (moving)
+/// instead.
+void post(const FunctionCallTrie &T, tid_t TId);
+
+/// The serialize function will process all FunctionCallTrie instances in
+/// memory, and turn those into specifically formatted blocks, each
+/// describing the function call trie's contents in a compact form. In
+/// memory, this looks
+/// like the following layout:
+///
+/// - block size (32 bits)
+/// - block number (32 bits)
+/// - thread id (64 bits)
+/// - list of records:
+/// - function ids in leaf to root order, terminated by
+/// 0 (32 bits per function id)
+/// - call count (64 bit)
+/// - cumulative local time (64 bit)
+///
+void serialize();
+
+/// The reset function will clear out any internal memory held by the
+/// service. The intent is to have the resetting be done in calls to the
+/// initialization routine, or explicitly through the flush log API.
+void reset();
+
+/// This nextBuffer function is meant to implement the iterator functionality
+/// provided in the XRay API.
+XRayBuffer nextBuffer(XRayBuffer B);
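+
+// Illustrative only -- consumers (e.g. __xray_log_process_buffers) can drain
+// the serialized blocks by seeding the iterator with a null buffer:
+//
+//   for (auto B = nextBuffer({nullptr, 0}); B.Data != nullptr;
+//        B = nextBuffer(B)) {
+//     // B.Data points to one serialized block of B.Size bytes.
+//   }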
+
+} // namespace profileCollectorService
+
+} // namespace __xray
+
+#endif // XRAY_XRAY_PROFILE_COLLECTOR_H
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
new file mode 100644
index 000000000000..786084c77226
--- /dev/null
+++ b/lib/xray/xray_profiling.cc
@@ -0,0 +1,372 @@
+//===-- xray_profiling.cc ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This is the implementation of a profiling handler.
+//
+//===----------------------------------------------------------------------===//
+#include <memory>
+#include <time.h>
+
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_flags.h"
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+
+#include "xray_flags.h"
+#include "xray_profile_collector.h"
+#include "xray_profiling_flags.h"
+#include "xray_recursion_guard.h"
+#include "xray_tsc.h"
+#include "xray_utils.h"
+#include <pthread.h>
+
+namespace __xray {
+
+namespace {
+
+constexpr uptr XRayProfilingVersion = 0x20180424;
+
+struct XRayProfilingFileHeader {
+ const u64 MagicBytes = 0x7872617970726f66; // Identifier for XRay profiling
+ // files 'xrayprof' in hex.
+ const uptr Version = XRayProfilingVersion;
+ uptr Timestamp = 0; // System time in nanoseconds.
+ uptr PID = 0; // Process ID.
+};
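+
+// On disk, profilingFlush() below writes this header once, followed verbatim
+// by each block obtained from profileCollectorService::nextBuffer().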
+
+atomic_sint32_t ProfilerLogFlushStatus = {
+ XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+
+atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+SpinMutex ProfilerOptionsMutex;
+
+struct alignas(64) ProfilingData {
+ FunctionCallTrie::Allocators *Allocators = nullptr;
+ FunctionCallTrie *FCT = nullptr;
+};
+
+static pthread_key_t ProfilingKey;
+
+thread_local std::aligned_storage<sizeof(ProfilingData),
+                                  alignof(ProfilingData)>::type
+    ThreadStorage{};
+static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+ thread_local auto ThreadOnce = [] {
+ new (&ThreadStorage) ProfilingData{};
+ pthread_setspecific(ProfilingKey, &ThreadStorage);
+ return false;
+ }();
+ (void)ThreadOnce;
+
+ auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+
+  // We need to check whether the global status has been switched to
+  // finalizing/finalized. If it has, we ought not to initialize the data.
+ auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
+ if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
+ Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)
+ return TLD;
+
+  // If we're live, then we initialize TLD if its pointers are still null.
+ if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) {
+ TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+ InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+ new (TLD.Allocators) FunctionCallTrie::Allocators();
+ *TLD.Allocators = FunctionCallTrie::InitAllocators();
+ TLD.FCT = reinterpret_cast<FunctionCallTrie *>(
+ InternalAlloc(sizeof(FunctionCallTrie)));
+ new (TLD.FCT) FunctionCallTrie(*TLD.Allocators);
+ }
+
+ return TLD;
+}
+
+static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
+ auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+ if (TLD.Allocators != nullptr && TLD.FCT != nullptr) {
+ TLD.FCT->~FunctionCallTrie();
+ TLD.Allocators->~Allocators();
+ InternalFree(TLD.FCT);
+ InternalFree(TLD.Allocators);
+ TLD.FCT = nullptr;
+ TLD.Allocators = nullptr;
+ }
+}
+
+} // namespace
+
+const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_PROFILER_DEFAULT_OPTIONS
+ return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS);
+#else
+ return "";
+#endif
+}
+
+atomic_sint32_t ProfileFlushStatus = {
+ XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+
+XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
+ if (atomic_load(&ProfilerLogStatus, memory_order_acquire) !=
+ XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+ if (Verbosity())
+ Report("Not flushing profiles, profiling not been finalized.\n");
+ return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+ }
+
+ s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+ if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result,
+ XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+ memory_order_acq_rel)) {
+ if (Verbosity())
+ Report("Not flushing profiles, implementation still finalizing.\n");
+ }
+
+ // At this point, we'll create the file that will contain the profile, but
+ // only if the options say so.
+ if (!profilingFlags()->no_flush) {
+ // First check whether we have data in the profile collector service
+ // before we try and write anything down.
+ XRayBuffer B = profileCollectorService::nextBuffer({nullptr, 0});
+ if (B.Data == nullptr) {
+ if (Verbosity())
+ Report("profiling: No data to flush.\n");
+ } else {
+ int Fd = getLogFD();
+ if (Fd == -1) {
+ if (Verbosity())
+ Report("profiling: Failed to flush to file, dropping data.\n");
+ } else {
+ XRayProfilingFileHeader Header;
+ Header.Timestamp = NanoTime();
+ Header.PID = internal_getpid();
+ retryingWriteAll(Fd, reinterpret_cast<const char *>(&Header),
+ reinterpret_cast<const char *>(&Header) +
+ sizeof(Header));
+
+ // Now for each of the threads, write out the profile data as we would
+ // see it in memory, verbatim.
+ while (B.Data != nullptr && B.Size != 0) {
+ retryingWriteAll(Fd, reinterpret_cast<const char *>(B.Data),
+ reinterpret_cast<const char *>(B.Data) + B.Size);
+ B = profileCollectorService::nextBuffer(B);
+ }
+ // Then we close out the file.
+ internal_close(Fd);
+ }
+ }
+ }
+
+ profileCollectorService::reset();
+
+ // Flush the current thread's local data structures as well.
+ cleanupTLD();
+
+ atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+ memory_order_release);
+
+ return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
+
+namespace {
+
+thread_local atomic_uint8_t ReentranceGuard{0};
+
+static void postCurrentThreadFCT(ProfilingData &TLD) {
+ if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
+ return;
+
+ profileCollectorService::post(*TLD.FCT, GetTid());
+ cleanupTLD();
+}
+
+} // namespace
+
+void profilingHandleArg0(int32_t FuncId,
+ XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+ unsigned char CPU;
+ auto TSC = readTSC(CPU);
+ RecursionGuard G(ReentranceGuard);
+ if (!G)
+ return;
+
+ auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
+ auto &TLD = getThreadLocalData();
+ if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED ||
+ Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) {
+ postCurrentThreadFCT(TLD);
+ return;
+ }
+
+ switch (Entry) {
+ case XRayEntryType::ENTRY:
+ case XRayEntryType::LOG_ARGS_ENTRY:
+ TLD.FCT->enterFunction(FuncId, TSC);
+ break;
+ case XRayEntryType::EXIT:
+ case XRayEntryType::TAIL:
+ TLD.FCT->exitFunction(FuncId, TSC);
+ break;
+ default:
+ // FIXME: Handle bugs.
+ break;
+ }
+}
+
+void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry,
+ uint64_t) XRAY_NEVER_INSTRUMENT {
+ return profilingHandleArg0(FuncId, Entry);
+}
+
+XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
+ s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+ if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
+ XRayLogInitStatus::XRAY_LOG_FINALIZING,
+ memory_order_release)) {
+ if (Verbosity())
+ Report("Cannot finalize profile, the profiling is not initialized.\n");
+ return static_cast<XRayLogInitStatus>(CurrentStatus);
+ }
+
+ // Wait a grace period to allow threads to see that we're finalizing.
+ SleepForMillis(profilingFlags()->grace_period_ms);
+
+ // We also want to make sure that the current thread's data is cleaned up,
+ // if we have any.
+ auto &TLD = getThreadLocalData();
+ postCurrentThreadFCT(TLD);
+
+ // Then we force serialize the log data.
+ profileCollectorService::serialize();
+
+ atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
+ memory_order_release);
+ return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+XRayLogInitStatus
+profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
+ size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+ if (BufferSize != 0 || BufferMax != 0) {
+ if (Verbosity())
+ Report("__xray_log_init() being used, and is unsupported. Use "
+ "__xray_log_init_mode(...) instead. Bailing out.");
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ }
+
+ s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
+ XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+ memory_order_release)) {
+ if (Verbosity())
+ Report("Cannot initialize already initialised profiling "
+ "implementation.\n");
+ return static_cast<XRayLogInitStatus>(CurrentStatus);
+ }
+
+ {
+ SpinMutexLock Lock(&ProfilerOptionsMutex);
+ FlagParser ConfigParser;
+ ProfilerFlags Flags;
+ Flags.setDefaults();
+ registerProfilerFlags(&ConfigParser, &Flags);
+ ConfigParser.ParseString(profilingCompilerDefinedFlags());
+ const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
+ if (Env == nullptr)
+ Env = "";
+ ConfigParser.ParseString(Env);
+
+ // Then parse the configuration string provided.
+ ConfigParser.ParseString(static_cast<const char *>(Options));
+ if (Verbosity())
+ ReportUnrecognizedFlags();
+ *profilingFlags() = Flags;
+ }
+
+ // We need to reset the profile data collection implementation now.
+ profileCollectorService::reset();
+
+ // We need to set up the exit handlers.
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ pthread_once(&Once, +[] {
+ pthread_key_create(&ProfilingKey, +[](void *P) {
+ // This is the thread-exit handler.
+ auto &TLD = *reinterpret_cast<ProfilingData *>(P);
+ if (TLD.Allocators == nullptr && TLD.FCT == nullptr)
+ return;
+
+ postCurrentThreadFCT(TLD);
+ });
+
+ // We also need to set up an exit handler, so that we can get the profile
+ // information at exit time. We use the C API to do this, to not rely on C++
+ // ABI functions for registering exit handlers.
+ Atexit(+[] {
+ // Finalize and flush.
+ if (profilingFinalize() != XRAY_LOG_FINALIZED) {
+ cleanupTLD();
+ return;
+ }
+ if (profilingFlush() != XRAY_LOG_FLUSHED) {
+ cleanupTLD();
+ return;
+ }
+ if (Verbosity())
+ Report("XRay Profile flushed at exit.");
+ });
+ });
+
+ __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
+ __xray_set_handler(profilingHandleArg0);
+ __xray_set_handler_arg1(profilingHandleArg1);
+
+ atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+ memory_order_release);
+ if (Verbosity())
+ Report("XRay Profiling init successful.\n");
+
+ return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+ // Set up the flag defaults from the static defaults and the
+ // compiler-provided defaults.
+ {
+ SpinMutexLock Lock(&ProfilerOptionsMutex);
+ auto *F = profilingFlags();
+ F->setDefaults();
+ FlagParser ProfilingParser;
+ registerProfilerFlags(&ProfilingParser, F);
+ ProfilingParser.ParseString(profilingCompilerDefinedFlags());
+ }
+
+ XRayLogImpl Impl{
+ profilingLoggingInit,
+ profilingFinalize,
+ profilingHandleArg0,
+ profilingFlush,
+ };
+ auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl);
+ if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+ if (Verbosity())
+ Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = "
+ "%d\n",
+ RegistrationResult);
+ return false;
+ }
+
+ if (!internal_strcmp(flags()->xray_mode, "xray-profiling"))
+ __xray_log_select_mode("xray_profiling");
+ return true;
+}
+
+} // namespace __xray
+
+static auto UNUSED Unused = __xray::profilingDynamicInitializer();
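
With the mode registered above, an application can drive profiling through the public XRay log interface. The following is a minimal sketch, assuming the public headers ship under xray/ and the binary is built with -fxray-instrument; the workload marker is a placeholder.

#include "xray/xray_interface.h"
#include "xray/xray_log_interface.h"
#include <cstdio>

[[clang::xray_never_instrument]] int main() {
  // Select the implementation registered as "xray-profiling" and initialize
  // it with an empty configuration string (defaults plus env options apply).
  if (__xray_log_select_mode("xray-profiling") !=
      XRayLogRegisterStatus::XRAY_REGISTRATION_OK)
    return 1;
  if (__xray_log_init_mode("xray-profiling", "") !=
      XRayLogInitStatus::XRAY_LOG_INITIALIZED)
    return 1;
  __xray_patch(); // Turn the sleds on so the profiling handlers fire.

  // ... run the instrumented workload here ...

  __xray_unpatch();
  __xray_log_finalize();  // Serializes the per-thread tries.
  __xray_log_flushlog();  // Writes the profile file unless no_flush is set.
  std::puts("xray-profiling run complete");
  return 0;
}
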
diff --git a/lib/xray/xray_profiling_flags.cc b/lib/xray/xray_profiling_flags.cc
new file mode 100644
index 000000000000..593e66a78ad2
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.cc
@@ -0,0 +1,40 @@
+//===-- xray_profiling_flags.cc ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay runtime flags.
+//===----------------------------------------------------------------------===//
+
+#include "xray_profiling_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+// Storage for the profiling flags.
+ProfilerFlags xray_profiling_flags_dont_use_directly;
+
+void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerProfilerFlags(FlagParser *P,
+ ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) \
+ RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_profiling_flags.h b/lib/xray/xray_profiling_flags.h
new file mode 100644
index 000000000000..2f9a7514799a
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.h
@@ -0,0 +1,39 @@
+//===-- xray_profiling_flags.h ----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiling runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_PROFILER_FLAGS_H
+#define XRAY_PROFILER_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct ProfilerFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+
+ void setDefaults();
+};
+
+extern ProfilerFlags xray_profiling_flags_dont_use_directly;
+inline ProfilerFlags *profilingFlags() {
+ return &xray_profiling_flags_dont_use_directly;
+}
+void registerProfilerFlags(FlagParser *P, ProfilerFlags *F);
+
+} // namespace __xray
+
+#endif // XRAY_PROFILER_FLAGS_H
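
The ProfilerFlags struct above is generated with an X-macro: the same .inc file expands once to declare the fields and again to assign their defaults. A standalone sketch of the pattern (with hypothetical flag names, not part of the XRay runtime) looks like this:

#include <cstdio>

#define DEMO_FLAGS(F)                                                          \
  F(bool, verbose, false, "Print extra diagnostics.")                          \
  F(int, max_buffers, 16, "Upper bound on buffers to allocate.")

struct DemoFlags {
#define DEMO_FLAG(Type, Name, Default, Desc) Type Name;
  DEMO_FLAGS(DEMO_FLAG)
#undef DEMO_FLAG

  void setDefaults() {
#define DEMO_FLAG(Type, Name, Default, Desc) Name = Default;
    DEMO_FLAGS(DEMO_FLAG)
#undef DEMO_FLAG
  }
};

int main() {
  DemoFlags F;
  F.setDefaults();
  std::printf("verbose=%d max_buffers=%d\n", F.verbose, F.max_buffers);
}
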
diff --git a/lib/xray/xray_profiling_flags.inc b/lib/xray/xray_profiling_flags.inc
new file mode 100644
index 000000000000..e9230ae64187
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.inc
@@ -0,0 +1,29 @@
+//===-- xray_profiling_flags.inc --------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay profiling runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+ "Maximum size of any single per-thread allocator.")
+XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
+ "Maximum size of the global allocator for profile storage.")
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 20,
+ "Maximum size of the traversal stack allocator.")
+XRAY_FLAG(int, grace_period_ms, 1,
+ "Profile collection will wait this much time in milliseconds before "
+ "resetting the global state. This gives a chance to threads to "
+ "notice that the profiler has been finalized and clean up.")
+XRAY_FLAG(bool, no_flush, false,
+ "Set to true if we want the profiling implementation to not write "
+ "out files.")
diff --git a/lib/xray/xray_recursion_guard.h b/lib/xray/xray_recursion_guard.h
new file mode 100644
index 000000000000..6edadea563bc
--- /dev/null
+++ b/lib/xray/xray_recursion_guard.h
@@ -0,0 +1,57 @@
+//===-- xray_recursion_guard.h ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_RECURSION_GUARD_H
+#define XRAY_XRAY_RECURSION_GUARD_H
+
+#include "sanitizer_common/sanitizer_atomic.h"
+
+namespace __xray {
+
+/// The RecursionGuard is useful for guarding against signal handlers which are
+/// also potentially calling XRay-instrumented functions. To use the
+/// RecursionGuard, you'll typically need a thread_local atomic_uint8_t:
+///
+/// thread_local atomic_uint8_t Guard{0};
+///
+/// // In a handler function:
+/// void handleArg0(int32_t F, XRayEntryType T) {
+/// RecursionGuard G(Guard);
+/// if (!G)
+/// return; // Failed to acquire the guard.
+/// ...
+/// }
+///
+class RecursionGuard {
+ atomic_uint8_t &Running;
+ const bool Valid;
+
+public:
+ explicit inline RecursionGuard(atomic_uint8_t &R)
+ : Running(R), Valid(!atomic_exchange(&R, 1, memory_order_acq_rel)) {}
+
+ inline RecursionGuard(const RecursionGuard &) = delete;
+ inline RecursionGuard(RecursionGuard &&) = delete;
+ inline RecursionGuard &operator=(const RecursionGuard &) = delete;
+ inline RecursionGuard &operator=(RecursionGuard &&) = delete;
+
+ explicit inline operator bool() const { return Valid; }
+
+ inline ~RecursionGuard() noexcept {
+ if (Valid)
+ atomic_store(&Running, 0, memory_order_release);
+ }
+};
+
+} // namespace __xray
+
+#endif // XRAY_XRAY_RECURSION_GUARD_H
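
The guard relies on an atomic exchange with acquire/release ordering: the first construction on a thread observes 0 and claims the guard, and any nested construction observes 1 and reports failure. A self-contained sketch of the same idea using <atomic> directly (outside the runtime, so it is only illustrative):

#include <atomic>
#include <cassert>
#include <cstdint>

class Guard {
  std::atomic<uint8_t> &Running;
  const bool Valid;

public:
  explicit Guard(std::atomic<uint8_t> &R)
      : Running(R), Valid(!R.exchange(1, std::memory_order_acq_rel)) {}
  explicit operator bool() const { return Valid; }
  ~Guard() {
    if (Valid)
      Running.store(0, std::memory_order_release);
  }
};

thread_local std::atomic<uint8_t> Reentrancy{0};

void handler() {
  Guard Outer(Reentrancy);
  assert(Outer);   // First acquisition on this thread succeeds.
  Guard Nested(Reentrancy);
  assert(!Nested); // A re-entrant acquisition is rejected.
}

int main() { handler(); }
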
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
new file mode 100644
index 000000000000..11dd794fa520
--- /dev/null
+++ b/lib/xray/xray_segmented_array.h
@@ -0,0 +1,375 @@
+//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Defines the implementation of a segmented array, with fixed-size segments
+// as the backing store.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_SEGMENTED_ARRAY_H
+#define XRAY_SEGMENTED_ARRAY_H
+
+#include "sanitizer_common/sanitizer_allocator.h"
+#include "xray_allocator.h"
+#include "xray_utils.h"
+#include <cassert>
+#include <type_traits>
+#include <utility>
+
+namespace __xray {
+
+/// The Array type provides an interface similar to std::vector<...> but does
+/// not shrink in size. Once constructed, elements can be appended but cannot be
+/// removed. The implementation is heavily dependent on the contract provided by
+/// the Allocator type, in that all memory will be released when the Allocator
+/// is destroyed. When an Array is destroyed, it will destroy elements in the
+/// backing store but will not free the memory.
+template <class T> class Array {
+ struct SegmentBase {
+ SegmentBase *Prev;
+ SegmentBase *Next;
+ };
+
+  // We want each segment of the array to be cache-line aligned, and elements
+  // of the array to be offset from the beginning of the segment.
+ struct Segment : SegmentBase {
+ char Data[1];
+ };
+
+public:
+ // Each segment of the array will be laid out with the following assumptions:
+ //
+ // - Each segment will be on a cache-line address boundary (kCacheLineSize
+ // aligned).
+ //
+ // - The elements will be accessed through an aligned pointer, dependent on
+ // the alignment of T.
+ //
+ // - Each element is at least two-pointers worth from the beginning of the
+ // Segment, aligned properly, and the rest of the elements are accessed
+ // through appropriate alignment.
+ //
+ // We then compute the size of the segment to follow this logic:
+ //
+ // - Compute the number of elements that can fit within
+ // kCacheLineSize-multiple segments, minus the size of two pointers.
+ //
+ // - Request cacheline-multiple sized elements from the allocator.
+ static constexpr size_t AlignedElementStorageSize =
+ sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type);
+
+ static constexpr size_t SegmentSize =
+ nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize);
+
+ using AllocatorType = Allocator<SegmentSize>;
+
+ static constexpr size_t ElementsPerSegment =
+ (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T));
+
+ static_assert(ElementsPerSegment > 0,
+ "Must have at least 1 element per segment.");
+
+ static SegmentBase SentinelSegment;
+
+private:
+ AllocatorType *Alloc;
+ SegmentBase *Head = &SentinelSegment;
+ SegmentBase *Tail = &SentinelSegment;
+ size_t Size = 0;
+
+ // Here we keep track of segments in the freelist, to allow us to re-use
+ // segments when elements are trimmed off the end.
+ SegmentBase *Freelist = &SentinelSegment;
+
+ Segment *NewSegment() {
+ // We need to handle the case in which enough elements have been trimmed to
+ // allow us to re-use segments we've allocated before. For this we look into
+ // the Freelist, to see whether we need to actually allocate new blocks or
+ // just re-use blocks we've already seen before.
+ if (Freelist != &SentinelSegment) {
+ auto *FreeSegment = Freelist;
+ Freelist = FreeSegment->Next;
+ FreeSegment->Next = &SentinelSegment;
+ Freelist->Prev = &SentinelSegment;
+ return static_cast<Segment *>(FreeSegment);
+ }
+
+ auto SegmentBlock = Alloc->Allocate();
+ if (SegmentBlock.Data == nullptr)
+ return nullptr;
+
+ // Placement-new the Segment element at the beginning of the SegmentBlock.
+ auto S = reinterpret_cast<Segment *>(SegmentBlock.Data);
+ new (S) SegmentBase{&SentinelSegment, &SentinelSegment};
+ return S;
+ }
+
+ Segment *InitHeadAndTail() {
+ DCHECK_EQ(Head, &SentinelSegment);
+ DCHECK_EQ(Tail, &SentinelSegment);
+ auto Segment = NewSegment();
+ if (Segment == nullptr)
+ return nullptr;
+ DCHECK_EQ(Segment->Next, &SentinelSegment);
+ DCHECK_EQ(Segment->Prev, &SentinelSegment);
+ Head = Tail = static_cast<SegmentBase *>(Segment);
+ return Segment;
+ }
+
+ Segment *AppendNewSegment() {
+ auto S = NewSegment();
+ if (S == nullptr)
+ return nullptr;
+ DCHECK_NE(Tail, &SentinelSegment);
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ DCHECK_EQ(S->Prev, &SentinelSegment);
+ DCHECK_EQ(S->Next, &SentinelSegment);
+ Tail->Next = S;
+ S->Prev = Tail;
+ Tail = S;
+ return static_cast<Segment *>(Tail);
+ }
+
+ // This Iterator models a BidirectionalIterator.
+ template <class U> class Iterator {
+ SegmentBase *S = &SentinelSegment;
+ size_t Offset = 0;
+ size_t Size = 0;
+
+ public:
+ Iterator(SegmentBase *IS, size_t Off, size_t S)
+ : S(IS), Offset(Off), Size(S) {}
+ Iterator(const Iterator &) noexcept = default;
+ Iterator() noexcept = default;
+ Iterator(Iterator &&) noexcept = default;
+ Iterator &operator=(const Iterator &) = default;
+ Iterator &operator=(Iterator &&) = default;
+ ~Iterator() = default;
+
+ Iterator &operator++() {
+ if (++Offset % ElementsPerSegment || Offset == Size)
+ return *this;
+
+ // At this point, we know that Offset % N == 0, so we must advance the
+ // segment pointer.
+ DCHECK_EQ(Offset % ElementsPerSegment, 0);
+ DCHECK_NE(Offset, Size);
+ DCHECK_NE(S, &SentinelSegment);
+ DCHECK_NE(S->Next, &SentinelSegment);
+ S = S->Next;
+ DCHECK_NE(S, &SentinelSegment);
+ return *this;
+ }
+
+ Iterator &operator--() {
+ DCHECK_NE(S, &SentinelSegment);
+ DCHECK_GT(Offset, 0);
+
+ auto PreviousOffset = Offset--;
+ if (PreviousOffset != Size && PreviousOffset % ElementsPerSegment == 0) {
+ DCHECK_NE(S->Prev, &SentinelSegment);
+ S = S->Prev;
+ }
+
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ Iterator Copy(*this);
+ ++(*this);
+ return Copy;
+ }
+
+ Iterator operator--(int) {
+ Iterator Copy(*this);
+ --(*this);
+ return Copy;
+ }
+
+ template <class V, class W>
+ friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
+ return L.S == R.S && L.Offset == R.Offset;
+ }
+
+ template <class V, class W>
+ friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) {
+ return !(L == R);
+ }
+
+ U &operator*() const {
+ DCHECK_NE(S, &SentinelSegment);
+ auto RelOff = Offset % ElementsPerSegment;
+
+ // We need to compute the character-aligned pointer, offset from the
+ // segment's Data location to get the element in the position of Offset.
+ auto Base = static_cast<Segment *>(S)->Data;
+ auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize);
+ return *reinterpret_cast<U *>(AlignedOffset);
+ }
+
+ U *operator->() const { return &(**this); }
+ };
+
+public:
+ explicit Array(AllocatorType &A) : Alloc(&A) {}
+
+ Array(const Array &) = delete;
+ Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
+ Head(O.Head),
+ Tail(O.Tail),
+ Size(O.Size) {
+ O.Head = &SentinelSegment;
+ O.Tail = &SentinelSegment;
+ O.Size = 0;
+ }
+
+ bool empty() const { return Size == 0; }
+
+ AllocatorType &allocator() const {
+ DCHECK_NE(Alloc, nullptr);
+ return *Alloc;
+ }
+
+ size_t size() const { return Size; }
+
+ T *Append(const T &E) {
+ if (UNLIKELY(Head == &SentinelSegment))
+ if (InitHeadAndTail() == nullptr)
+ return nullptr;
+
+ auto Offset = Size % ElementsPerSegment;
+ if (UNLIKELY(Size != 0 && Offset == 0))
+ if (AppendNewSegment() == nullptr)
+ return nullptr;
+
+ auto Base = static_cast<Segment *>(Tail)->Data;
+ auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+ auto Position = reinterpret_cast<T *>(AlignedOffset);
+ *Position = E;
+ ++Size;
+ return Position;
+ }
+
+ template <class... Args> T *AppendEmplace(Args &&... args) {
+ if (UNLIKELY(Head == &SentinelSegment))
+ if (InitHeadAndTail() == nullptr)
+ return nullptr;
+
+ auto Offset = Size % ElementsPerSegment;
+ auto *LatestSegment = Tail;
+ if (UNLIKELY(Size != 0 && Offset == 0)) {
+ LatestSegment = AppendNewSegment();
+ if (LatestSegment == nullptr)
+ return nullptr;
+ }
+
+ DCHECK_NE(Tail, &SentinelSegment);
+ auto Base = static_cast<Segment *>(LatestSegment)->Data;
+ auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+ auto Position = reinterpret_cast<T *>(AlignedOffset);
+
+ // In-place construct at Position.
+ new (Position) T{std::forward<Args>(args)...};
+ ++Size;
+ return reinterpret_cast<T *>(Position);
+ }
+
+ T &operator[](size_t Offset) const {
+ DCHECK_LE(Offset, Size);
+ // We need to traverse the array enough times to find the element at Offset.
+ auto S = Head;
+ while (Offset >= ElementsPerSegment) {
+ S = S->Next;
+ Offset -= ElementsPerSegment;
+ DCHECK_NE(S, &SentinelSegment);
+ }
+ auto Base = static_cast<Segment *>(S)->Data;
+ auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+ auto Position = reinterpret_cast<T *>(AlignedOffset);
+ return *reinterpret_cast<T *>(Position);
+ }
+
+ T &front() const {
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Size, 0u);
+ return *begin();
+ }
+
+ T &back() const {
+ DCHECK_NE(Tail, &SentinelSegment);
+ DCHECK_NE(Size, 0u);
+ auto It = end();
+ --It;
+ return *It;
+ }
+
+ template <class Predicate> T *find_element(Predicate P) const {
+ if (empty())
+ return nullptr;
+
+ auto E = end();
+ for (auto I = begin(); I != E; ++I)
+ if (P(*I))
+ return &(*I);
+
+ return nullptr;
+ }
+
+  /// Remove N elements from the end. This leaves the blocks behind, so that
+  /// no new allocation is required for elements appended after trimming.
+ void trim(size_t Elements) {
+ DCHECK_LE(Elements, Size);
+ DCHECK_GT(Size, 0);
+ auto OldSize = Size;
+ Size -= Elements;
+
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Tail, &SentinelSegment);
+
+ for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) -
+ nearest_boundary(Size, ElementsPerSegment)) /
+ ElementsPerSegment;
+ SegmentsToTrim > 0; --SegmentsToTrim) {
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Tail, &SentinelSegment);
+ // Put the tail into the Freelist.
+ auto *FreeSegment = Tail;
+ Tail = Tail->Prev;
+ if (Tail == &SentinelSegment)
+ Head = Tail;
+ else
+ Tail->Next = &SentinelSegment;
+
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ FreeSegment->Next = Freelist;
+ FreeSegment->Prev = &SentinelSegment;
+ if (Freelist != &SentinelSegment)
+ Freelist->Prev = FreeSegment;
+ Freelist = FreeSegment;
+ }
+ }
+
+ // Provide iterators.
+ Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
+ Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
+ Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
+ Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
+};
+
+// We need to have this storage definition out-of-line so that the compiler can
+// ensure that storage for the SentinelSegment is defined and has a single
+// address.
+template <class T>
+typename Array<T>::SegmentBase Array<T>::SentinelSegment{
+ &Array<T>::SentinelSegment, &Array<T>::SentinelSegment};
+
+} // namespace __xray
+
+#endif // XRAY_SEGMENTED_ARRAY_H
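
The segment layout above is all compile-time arithmetic. The sketch below evaluates the same formulas for a sample 16-byte element type, assuming a 64-byte cache line (the runtime takes kCacheLineSize from sanitizer_common), to make SegmentSize and ElementsPerSegment concrete:

#include <cstddef>
#include <cstdio>

constexpr size_t kCacheLine = 64;

constexpr size_t nearestBoundary(size_t N, size_t M) {
  return M * ((N / M) + (N % M ? 1 : 0));
}
constexpr size_t nextPow2(size_t N, size_t Acc = 1) {
  return (size_t{1} << Acc) >= N ? (size_t{1} << Acc) : nextPow2(N, Acc + 1);
}

// Mirrors the Segment header: two list pointers plus the Data flexible start.
struct SegmentHeader { SegmentHeader *Prev; SegmentHeader *Next; char Data[1]; };

struct Sample { void *A; long B; }; // 16 bytes on LP64.

constexpr size_t SegmentSize =
    nearestBoundary(sizeof(SegmentHeader) + nextPow2(sizeof(Sample)), kCacheLine);
constexpr size_t ElementsPerSegment =
    (SegmentSize - sizeof(SegmentHeader)) / nextPow2(sizeof(Sample));

static_assert(ElementsPerSegment > 0, "at least one element per segment");

int main() {
  std::printf("SegmentSize=%zu ElementsPerSegment=%zu\n", SegmentSize,
              ElementsPerSegment);
}
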
diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S
index 350afd9265fd..99ad3966ee3a 100644
--- a/lib/xray/xray_trampoline_x86_64.S
+++ b/lib/xray/xray_trampoline_x86_64.S
@@ -19,47 +19,56 @@
.macro SAVE_REGISTERS
- subq $192, %rsp
- CFI_DEF_CFA_OFFSET(200)
- // At this point, the stack pointer should be aligned to an 8-byte boundary,
- // because any call instructions that come after this will add another 8
- // bytes and therefore align it to 16-bytes.
- movq %rbp, 184(%rsp)
- movupd %xmm0, 168(%rsp)
- movupd %xmm1, 152(%rsp)
- movupd %xmm2, 136(%rsp)
- movupd %xmm3, 120(%rsp)
- movupd %xmm4, 104(%rsp)
- movupd %xmm5, 88(%rsp)
- movupd %xmm6, 72(%rsp)
- movupd %xmm7, 56(%rsp)
- movq %rdi, 48(%rsp)
- movq %rax, 40(%rsp)
- movq %rdx, 32(%rsp)
- movq %rsi, 24(%rsp)
- movq %rcx, 16(%rsp)
- movq %r8, 8(%rsp)
- movq %r9, 0(%rsp)
+ subq $240, %rsp
+ CFI_DEF_CFA_OFFSET(248)
+ movq %rbp, 232(%rsp)
+ movupd %xmm0, 216(%rsp)
+ movupd %xmm1, 200(%rsp)
+ movupd %xmm2, 184(%rsp)
+ movupd %xmm3, 168(%rsp)
+ movupd %xmm4, 152(%rsp)
+ movupd %xmm5, 136(%rsp)
+ movupd %xmm6, 120(%rsp)
+ movupd %xmm7, 104(%rsp)
+ movq %rdi, 96(%rsp)
+ movq %rax, 88(%rsp)
+ movq %rdx, 80(%rsp)
+ movq %rsi, 72(%rsp)
+ movq %rcx, 64(%rsp)
+ movq %r8, 56(%rsp)
+ movq %r9, 48(%rsp)
+ movq %r10, 40(%rsp)
+ movq %r11, 32(%rsp)
+ movq %r12, 24(%rsp)
+ movq %r13, 16(%rsp)
+ movq %r14, 8(%rsp)
+ movq %r15, 0(%rsp)
.endm
.macro RESTORE_REGISTERS
- movq 184(%rsp), %rbp
- movupd 168(%rsp), %xmm0
- movupd 152(%rsp), %xmm1
- movupd 136(%rsp), %xmm2
- movupd 120(%rsp), %xmm3
- movupd 104(%rsp), %xmm4
- movupd 88(%rsp), %xmm5
- movupd 72(%rsp) , %xmm6
- movupd 56(%rsp) , %xmm7
- movq 48(%rsp), %rdi
- movq 40(%rsp), %rax
- movq 32(%rsp), %rdx
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rcx
- movq 8(%rsp), %r8
- movq 0(%rsp), %r9
- addq $192, %rsp
+ movq 232(%rsp), %rbp
+ movupd 216(%rsp), %xmm0
+ movupd 200(%rsp), %xmm1
+ movupd 184(%rsp), %xmm2
+ movupd 168(%rsp), %xmm3
+ movupd 152(%rsp), %xmm4
+ movupd 136(%rsp), %xmm5
+ movupd 120(%rsp) , %xmm6
+ movupd 104(%rsp) , %xmm7
+ movq 96(%rsp), %rdi
+ movq 88(%rsp), %rax
+ movq 80(%rsp), %rdx
+ movq 72(%rsp), %rsi
+ movq 64(%rsp), %rcx
+ movq 56(%rsp), %r8
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r10
+ movq 32(%rsp), %r11
+ movq 24(%rsp), %r12
+ movq 16(%rsp), %r13
+ movq 8(%rsp), %r14
+ movq 0(%rsp), %r15
+ addq $240, %rsp
CFI_DEF_CFA_OFFSET(8)
.endm
@@ -90,6 +99,7 @@
.globl ASM_SYMBOL(__xray_FunctionEntry)
.align 16, 0x90
ASM_TYPE_FUNCTION(__xray_FunctionEntry)
+# LLVM-MCA-BEGIN __xray_FunctionEntry
ASM_SYMBOL(__xray_FunctionEntry):
CFI_STARTPROC
SAVE_REGISTERS
@@ -100,7 +110,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
testq %rax, %rax
je .Ltmp0
- // The patched function prolog puts its xray_instr_map index into %r10d.
+ // The patched function prologue puts its xray_instr_map index into %r10d.
movl %r10d, %edi
xor %esi,%esi
ALIGNED_CALL_RAX
@@ -108,6 +118,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
.Ltmp0:
RESTORE_REGISTERS
retq
+# LLVM-MCA-END
ASM_SIZE(__xray_FunctionEntry)
CFI_ENDPROC
@@ -116,6 +127,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
.globl ASM_SYMBOL(__xray_FunctionExit)
.align 16, 0x90
ASM_TYPE_FUNCTION(__xray_FunctionExit)
+# LLVM-MCA-BEGIN __xray_FunctionExit
ASM_SYMBOL(__xray_FunctionExit):
CFI_STARTPROC
// Save the important registers first. Since we're assuming that this
@@ -146,6 +158,7 @@ ASM_SYMBOL(__xray_FunctionExit):
addq $56, %rsp
CFI_DEF_CFA_OFFSET(8)
retq
+# LLVM-MCA-END
ASM_SIZE(__xray_FunctionExit)
CFI_ENDPROC
@@ -154,6 +167,7 @@ ASM_SYMBOL(__xray_FunctionExit):
.globl ASM_SYMBOL(__xray_FunctionTailExit)
.align 16, 0x90
ASM_TYPE_FUNCTION(__xray_FunctionTailExit)
+# LLVM-MCA-BEGIN __xray_FunctionTailExit
ASM_SYMBOL(__xray_FunctionTailExit):
CFI_STARTPROC
SAVE_REGISTERS
@@ -170,6 +184,7 @@ ASM_SYMBOL(__xray_FunctionTailExit):
.Ltmp4:
RESTORE_REGISTERS
retq
+# LLVM-MCA-END
ASM_SIZE(__xray_FunctionTailExit)
CFI_ENDPROC
@@ -178,6 +193,7 @@ ASM_SYMBOL(__xray_FunctionTailExit):
.globl ASM_SYMBOL(__xray_ArgLoggerEntry)
.align 16, 0x90
ASM_TYPE_FUNCTION(__xray_ArgLoggerEntry)
+# LLVM-MCA-BEGIN __xray_ArgLoggerEntry
ASM_SYMBOL(__xray_ArgLoggerEntry):
CFI_STARTPROC
SAVE_REGISTERS
@@ -207,6 +223,7 @@ ASM_SYMBOL(__xray_ArgLoggerEntry):
.Larg1entryFail:
RESTORE_REGISTERS
retq
+# LLVM-MCA-END
ASM_SIZE(__xray_ArgLoggerEntry)
CFI_ENDPROC
@@ -215,13 +232,13 @@ ASM_SYMBOL(__xray_ArgLoggerEntry):
.global ASM_SYMBOL(__xray_CustomEvent)
.align 16, 0x90
ASM_TYPE_FUNCTION(__xray_CustomEvent)
+# LLVM-MCA-BEGIN __xray_CustomEvent
ASM_SYMBOL(__xray_CustomEvent):
CFI_STARTPROC
SAVE_REGISTERS
// We take two arguments to this trampoline, which should be in rdi and rsi
- // already. We also make sure that we stash %rax because we use that register
- // to call the logging handler.
+ // already.
movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax
testq %rax,%rax
je .LcustomEventCleanup
@@ -231,7 +248,35 @@ ASM_SYMBOL(__xray_CustomEvent):
.LcustomEventCleanup:
RESTORE_REGISTERS
retq
+# LLVM-MCA-END
ASM_SIZE(__xray_CustomEvent)
CFI_ENDPROC
+//===----------------------------------------------------------------------===//
+
+ .global ASM_SYMBOL(__xray_TypedEvent)
+ .align 16, 0x90
+ ASM_TYPE_FUNCTION(__xray_TypedEvent)
+# LLVM-MCA-BEGIN __xray_TypedEvent
+ASM_SYMBOL(__xray_TypedEvent):
+ CFI_STARTPROC
+ SAVE_REGISTERS
+
+ // We pass three arguments to this trampoline, which should be in rdi, rsi
+ // and rdx without our intervention.
+ movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax
+ testq %rax,%rax
+ je .LtypedEventCleanup
+
+ ALIGNED_CALL_RAX
+
+.LtypedEventCleanup:
+ RESTORE_REGISTERS
+ retq
+# LLVM-MCA-END
+ ASM_SIZE(__xray_TypedEvent)
+ CFI_ENDPROC
+
+//===----------------------------------------------------------------------===//
+
NO_EXEC_STACK_DIRECTIVE
diff --git a/lib/xray/xray_utils.cc b/lib/xray/xray_utils.cc
index cf800d3aeaf8..68f4e8c1094c 100644
--- a/lib/xray/xray_utils.cc
+++ b/lib/xray/xray_utils.cc
@@ -15,11 +15,11 @@
#include "sanitizer_common/sanitizer_common.h"
#include "xray_defs.h"
#include "xray_flags.h"
-#include <stdlib.h>
#include <cstdio>
#include <errno.h>
#include <fcntl.h>
#include <iterator>
+#include <stdlib.h>
#include <sys/types.h>
#include <tuple>
#include <unistd.h>
@@ -31,7 +31,7 @@ void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT {
fprintf(stderr, "%s", Buffer);
}
-void retryingWriteAll(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
+void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
if (Begin == End)
return;
auto TotalBytes = std::distance(Begin, End);
@@ -82,7 +82,7 @@ bool readValueFromFile(const char *Filename,
if (!Success)
return false;
close(Fd);
- char *End = nullptr;
+ const char *End = nullptr;
long long Tmp = internal_simple_strtoll(Line, &End, 10);
bool Result = false;
if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
@@ -94,10 +94,10 @@ bool readValueFromFile(const char *Filename,
int getLogFD() XRAY_NEVER_INSTRUMENT {
// Open a temporary file once for the log.
- static char TmpFilename[256] = {};
- static char TmpWildcardPattern[] = "XXXXXX";
- auto Argv = GetArgv();
- const char *Progname = Argv[0] == nullptr ? "(unknown)" : Argv[0];
+ char TmpFilename[256] = {};
+ char TmpWildcardPattern[] = "XXXXXX";
+ auto **Argv = GetArgv();
+ const char *Progname = !Argv ? "(unknown)" : Argv[0];
const char *LastSlash = internal_strrchr(Progname, '/');
if (LastSlash != nullptr)
@@ -117,7 +117,7 @@ int getLogFD() XRAY_NEVER_INSTRUMENT {
TmpFilename);
return -1;
}
- if (__sanitizer::Verbosity())
+ if (Verbosity())
Report("XRay: Log file in '%s'\n", TmpFilename);
return Fd;
diff --git a/lib/xray/xray_utils.h b/lib/xray/xray_utils.h
index 1ecc74a2dce8..eafa16e1a9d5 100644
--- a/lib/xray/xray_utils.h
+++ b/lib/xray/xray_utils.h
@@ -15,6 +15,8 @@
#ifndef XRAY_UTILS_H
#define XRAY_UTILS_H
+#include <cstddef>
+#include <cstdint>
#include <sys/types.h>
#include <utility>
@@ -24,7 +26,7 @@ namespace __xray {
void printToStdErr(const char *Buffer);
// EINTR-safe write routine, provided a file descriptor and a character range.
-void retryingWriteAll(int Fd, char *Begin, char *End);
+void retryingWriteAll(int Fd, const char *Begin, const char *End);
// Reads a long long value from a provided file.
bool readValueFromFile(const char *Filename, long long *Value);
@@ -36,6 +38,32 @@ std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End);
// file.
int getLogFD();
+constexpr size_t gcd(size_t a, size_t b) {
+ return (b == 0) ? a : gcd(b, a % b);
+}
+
+constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
+
+constexpr size_t nearest_boundary(size_t number, size_t multiple) {
+ return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+}
+
+constexpr size_t next_pow2_helper(size_t num, size_t acc) {
+ return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1);
+}
+
+constexpr size_t next_pow2(size_t number) {
+ return next_pow2_helper(number, 1);
+}
+
+template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; }
+
+template <class T> constexpr T &min(T &A, T &B) { return A <= B ? A : B; }
+
+constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) {
+ return max(A, B) - min(A, B);
+}
+
} // namespace __xray
#endif // XRAY_UTILS_H
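
The constexpr helpers added here are plain arithmetic and evaluate at compile time; the following standalone copies with static_asserts illustrate the values they produce (illustrative only, not part of the header):

#include <cstddef>

constexpr size_t gcd(size_t a, size_t b) { return b == 0 ? a : gcd(b, a % b); }
constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
constexpr size_t nearest_boundary(size_t n, size_t m) {
  return m * ((n / m) + (n % m ? 1 : 0));
}

static_assert(gcd(48, 36) == 12, "greatest common divisor");
static_assert(lcm(4, 6) == 12, "least common multiple");
static_assert(nearest_boundary(40, 64) == 64, "rounds up to the next multiple");
static_assert(nearest_boundary(64, 64) == 64, "exact multiples are unchanged");

int main() {}
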
diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc
index e17f00ac3a62..51dc4ce43b1c 100644
--- a/lib/xray/xray_x86_64.cc
+++ b/lib/xray/xray_x86_64.cc
@@ -3,6 +3,15 @@
#include "xray_defs.h"
#include "xray_interface_internal.h"
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#include <sys/types.h>
+#if SANITIZER_OPENBSD
+#include <sys/time.h>
+#include <machine/cpu.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+
#include <atomic>
#include <cstdint>
#include <errno.h>
@@ -14,6 +23,7 @@
namespace __xray {
+#if SANITIZER_LINUX
static std::pair<ssize_t, bool>
retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
auto BytesToRead = std::distance(Begin, End);
@@ -47,7 +57,7 @@ static bool readValueFromFile(const char *Filename,
close(Fd);
if (!Success)
return false;
- char *End = nullptr;
+ const char *End = nullptr;
long long Tmp = internal_simple_strtoll(Line, &End, 10);
bool Result = false;
if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
@@ -71,6 +81,31 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
}
return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
}
+#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+ long long TSCFrequency = -1;
+ size_t tscfreqsz = sizeof(TSCFrequency);
+#if SANITIZER_OPENBSD
+ int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ };
+  if (sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
+#else
+ if (sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
+ NULL, 0) != -1) {
+#endif
+ return static_cast<uint64_t>(TSCFrequency);
+ } else {
+ Report("Unable to determine CPU frequency for TSC accounting.\n");
+ }
+
+ return 0;
+}
+#else
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+ /* Not supported */
+ return 0;
+}
+#endif
static constexpr uint8_t CallOpCode = 0xe8;
static constexpr uint16_t MovR10Seq = 0xba41;
@@ -184,8 +219,8 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
reinterpret_cast<int64_t>(__xray_FunctionTailExit) -
(static_cast<int64_t>(Sled.Address) + 11);
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
- Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
- __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address));
+ Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
+ __xray_FunctionTailExit, reinterpret_cast<void *>(Sled.Address));
return false;
}
if (Enable) {
@@ -251,6 +286,37 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
return false;
}
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+ // Here we do the dance of replacing the following sled:
+ //
+ // xray_sled_n:
+ // jmp +20 // 2 byte instruction
+ // ...
+ //
+ // With the following:
+ //
+ // nopw // 2 bytes
+ // ...
+ //
+ //
+ // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
+ // The 20 byte sled stashes three argument registers, calls the trampoline,
+ // unstashes the registers and returns. If the arguments are already in
+ // the correct registers, the stashing and unstashing become equivalently
+ // sized nops.
+ if (Enable) {
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
+ std::memory_order_release);
+ } else {
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
+ std::memory_order_release);
+ }
+ return false;
+}
+
// We determine whether the CPU we're running on has the correct features we
// need. In x86_64 this will be rdtscp support.
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
@@ -259,7 +325,8 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
// We check whether rdtscp support is enabled. According to the x86_64 manual,
// level should be set at 0x80000001, and we should have a look at bit 27 in
// EDX. That's 0x8000000 (or 1u << 27).
- __get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ __asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
+ : "0"(0x80000001));
if (!(EDX & (1u << 27))) {
Report("Missing rdtscp support.\n");
return false;
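
The probe above replaces __get_cpuid with raw inline assembly so the runtime does not depend on the compiler intrinsic header. A standalone sketch of the same check (x86_64, GCC/Clang asm syntax): leaf 0x80000001, bit 27 of EDX indicates RDTSCP support.

#include <cstdint>
#include <cstdio>

bool hasRdtscp() {
  uint32_t EAX, EBX, ECX, EDX;
  __asm__ __volatile__("cpuid"
                       : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
                       : "0"(0x80000001));
  return (EDX & (1u << 27)) != 0;
}

int main() {
  std::printf("rdtscp supported: %s\n", hasRdtscp() ? "yes" : "no");
}
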
diff --git a/lib/xray/xray_x86_64.inc b/lib/xray/xray_x86_64.inc
index 4ad3f9810946..b3c475f9110c 100644
--- a/lib/xray/xray_x86_64.inc
+++ b/lib/xray/xray_x86_64.inc
@@ -21,9 +21,10 @@ namespace __xray {
ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
unsigned LongCPU;
- uint64_t TSC = __rdtscp(&LongCPU);
+ unsigned long Rax, Rdx;
+ __asm__ __volatile__("rdtscp\n" : "=a"(Rax), "=d"(Rdx), "=c"(LongCPU) ::);
CPU = LongCPU;
- return TSC;
+ return (Rdx << 32) + Rax;
}
uint64_t getTSCFrequency();
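
readTSC pairs with getTSCFrequency() so callers can turn cycle deltas into durations. A hedged sketch of that conversion, using a standalone rdtscp read and a placeholder 3 GHz frequency rather than the runtime's detected value:

#include <cstdint>
#include <cstdio>

static uint64_t readTSC(uint8_t &CPU) {
  unsigned long Rax, Rdx;
  unsigned LongCPU;
  __asm__ __volatile__("rdtscp" : "=a"(Rax), "=d"(Rdx), "=c"(LongCPU)::);
  CPU = static_cast<uint8_t>(LongCPU);
  return (Rdx << 32) + Rax;
}

static uint64_t cyclesToNanos(uint64_t Cycles, uint64_t TSCFrequencyHz) {
  // Fine for short deltas; a production version would guard against the
  // multiplication overflowing for very long intervals.
  return Cycles * 1000000000ull / TSCFrequencyHz;
}

int main() {
  uint8_t CPU;
  uint64_t Start = readTSC(CPU);
  for (volatile int I = 0; I < 1000000; ++I) {
  }
  uint64_t End = readTSC(CPU);
  // 3 GHz is a placeholder; the runtime reads the real value from the OS.
  std::printf("elapsed ~%llu ns (assuming 3 GHz)\n",
              static_cast<unsigned long long>(
                  cyclesToNanos(End - Start, 3000000000ull)));
}
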