49 files changed, 4877 insertions, 1206 deletions
diff --git a/lib/xray/CMakeLists.txt b/lib/xray/CMakeLists.txt
index 5547600b943a..8e18f55658f8 100644
--- a/lib/xray/CMakeLists.txt
+++ b/lib/xray/CMakeLists.txt
@@ -1,16 +1,29 @@
-# Build for the XRay runtime support library.
+# Build for all components of the XRay runtime support library.
 
 # XRay runtime library implementation files.
 set(XRAY_SOURCES
-  xray_inmemory_log.cc
-  xray_init.cc
-  xray_flags.cc
-  xray_interface.cc
-  xray_buffer_queue.cc
-  xray_log_interface.cc
-  xray_fdr_logging.cc
-  xray_utils.cc)
+    xray_init.cc
+    xray_flags.cc
+    xray_interface.cc
+    xray_log_interface.cc
+    xray_utils.cc)
 
+# Implementation files for all XRay modes.
+set(XRAY_FDR_MODE_SOURCES
+    xray_fdr_flags.cc
+    xray_buffer_queue.cc
+    xray_fdr_logging.cc)
+
+set(XRAY_BASIC_MODE_SOURCES
+    xray_basic_flags.cc
+    xray_basic_logging.cc)
+
+set(XRAY_PROFILING_MODE_SOURCES
+    xray_profile_collector.cc
+    xray_profiling.cc
+    xray_profiling_flags.cc)
+
+# Implementation files for all XRay architectures.
 set(x86_64_SOURCES
     xray_x86_64.cc
     xray_trampoline_x86_64.S)
@@ -23,8 +36,8 @@ set(armhf_SOURCES
     ${arm_SOURCES})
 
 set(aarch64_SOURCES
-  xray_AArch64.cc
-  xray_trampoline_AArch64.S)
+    xray_AArch64.cc
+    xray_trampoline_AArch64.S)
 
 set(mips_SOURCES
     xray_mips.cc
@@ -47,11 +60,68 @@ set(powerpc64le_SOURCES
     xray_trampoline_powerpc64.cc
     xray_trampoline_powerpc64_asm.S)
 
+set(XRAY_IMPL_HEADERS
+  xray_allocator.h
+  xray_basic_flags.h
+  xray_basic_flags.inc
+  xray_basic_logging.h
+  xray_buffer_queue.h
+  xray_defs.h
+  xray_fdr_flags.h
+  xray_fdr_flags.inc
+  xray_fdr_log_records.h
+  xray_fdr_logging.h
+  xray_flags.h
+  xray_flags.inc
+  xray_function_call_trie.h
+  xray_interface_internal.h
+  xray_powerpc64.inc
+  xray_profile_collector.h
+  xray_profiling_flags.h
+  xray_profiling_flags.inc
+  xray_recursion_guard.h
+  xray_segmented_array.h
+  xray_tsc.h
+  xray_utils.h
+  xray_x86_64.inc)
+
+# Create list of all source files for
+# consumption by tests.
+set(XRAY_ALL_SOURCE_FILES
+  ${XRAY_SOURCES}
+  ${XRAY_FDR_MODE_SOURCES}
+  ${XRAY_BASIC_MODE_SOURCES}
+  ${XRAY_PROFILING_MODE_SOURCES}
+  ${x86_64_SOURCES}
+  ${arm_SOURCES}
+  ${armhf_SOURCES}
+  ${mips_SOURCES}
+  ${mipsel_SOURCES}
+  ${mips64_SOURCES}
+  ${mips64el_SOURCES}
+  ${powerpc64le_SOURCES}
+  ${XRAY_IMPL_HEADERS}
+)
+list(REMOVE_DUPLICATES XRAY_ALL_SOURCE_FILES)
+# Make list that uses absolute paths
+set(XRAY_ALL_SOURCE_FILES_ABS_PATHS "")
+foreach (src_file ${XRAY_ALL_SOURCE_FILES})
+  list(APPEND
+    XRAY_ALL_SOURCE_FILES_ABS_PATHS
+    "${CMAKE_CURRENT_SOURCE_DIR}/${src_file}")
+endforeach()
+
+
+# Now put it all together...
 include_directories(..)
 include_directories(../../include)
 
 set(XRAY_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 set(XRAY_COMMON_DEFINITIONS XRAY_HAS_EXCEPTIONS=1)
+
+# We don't need RTTI in XRay, so turn that off.
+append_rtti_flag(OFF XRAY_CFLAGS)
+
 append_list_if(
   COMPILER_RT_HAS_XRAY_COMPILER_FLAG XRAY_SUPPORTED=1 XRAY_COMMON_DEFINITIONS)
 append_list_if(
@@ -60,10 +130,13 @@ append_list_if(
 add_compiler_rt_component(xray)
 
 set(XRAY_COMMON_RUNTIME_OBJECT_LIBS
-    RTXray
     RTSanitizerCommon
     RTSanitizerCommonLibc)
 
+if (TARGET cxx-headers OR HAVE_LIBCXX)
+  set(XRAY_DEPS cxx-headers)
+endif()
+
 if (APPLE)
   set(XRAY_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
   add_asm_sources(XRAY_ASM_SOURCES xray_trampoline_x86_64.S)
@@ -75,8 +148,34 @@ if (APPLE)
     OS ${XRAY_SUPPORTED_OS}
     ARCHS ${XRAY_SUPPORTED_ARCH}
     SOURCES ${x86_64_SOURCES}
+    ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    DEPS ${XRAY_DEPS})
+  add_compiler_rt_object_libraries(RTXrayFDR
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    SOURCES ${XRAY_FDR_MODE_SOURCES}
+    ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
     CFLAGS ${XRAY_CFLAGS}
-    DEFS ${XRAY_COMMON_DEFINITIONS})
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    DEPS ${XRAY_DEPS})
+  add_compiler_rt_object_libraries(RTXrayBASIC
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    SOURCES ${XRAY_BASIC_MODE_SOURCES}
+    ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    DEPS ${XRAY_DEPS})
+  add_compiler_rt_object_libraries(RTXrayPROFILING
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    SOURCES ${XRAY_PROFILING_MODE_SOURCES}
+    ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    DEPS ${XRAY_DEPS})
 
   # We only support running on osx for now.
   add_compiler_rt_runtime(clang_rt.xray
@@ -91,24 +190,104 @@ if (APPLE)
     LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
     LINK_LIBS ${XRAY_LINK_LIBS}
     PARENT_TARGET xray)
-else()
-foreach(arch ${XRAY_SUPPORTED_ARCH})
-  if(CAN_TARGET_${arch})
+  add_compiler_rt_runtime(clang_rt.xray-fdr
+    STATIC
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    OBJECT_LIBS RTXrayFDR
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+    LINK_LIBS ${XRAY_LINK_LIBS}
+    PARENT_TARGET xray)
+  add_compiler_rt_runtime(clang_rt.xray-basic
+    STATIC
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    OBJECT_LIBS RTXrayBASIC
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+    LINK_LIBS ${XRAY_LINK_LIBS}
+    PARENT_TARGET xray)
+  add_compiler_rt_runtime(clang_rt.xray-profiling
+    STATIC
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    OBJECT_LIBS RTXrayPROFILING
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+    LINK_LIBS ${XRAY_LINK_LIBS}
+    PARENT_TARGET xray)
+else() # not Apple
+  foreach(arch ${XRAY_SUPPORTED_ARCH})
+    if(NOT CAN_TARGET_${arch})
+      continue()
+    endif()
     add_compiler_rt_object_libraries(RTXray
       ARCHS ${arch}
-      SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS}
-      DEFS ${XRAY_COMMON_DEFINITIONS})
+      SOURCES ${XRAY_SOURCES} ${${arch}_SOURCES}
+      ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      DEPS ${XRAY_DEPS})
+    add_compiler_rt_object_libraries(RTXrayFDR
+      ARCHS ${arch}
+      SOURCES ${XRAY_FDR_MODE_SOURCES}
+      ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      DEPS ${XRAY_DEPS})
+    add_compiler_rt_object_libraries(RTXrayBASIC
+      ARCHS ${arch}
+      SOURCES ${XRAY_BASIC_MODE_SOURCES}
+      ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      DEPS ${XRAY_DEPS})
+    add_compiler_rt_object_libraries(RTXrayPROFILING
+      ARCHS ${arch}
+      SOURCES ${XRAY_PROFILING_MODE_SOURCES}
+      ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      DEPS ${XRAY_DEPS})
+
+    # Common XRay archive for instrumented binaries.
     add_compiler_rt_runtime(clang_rt.xray
      STATIC
      ARCHS ${arch}
-     SOURCES ${${arch}_SOURCES}
      CFLAGS ${XRAY_CFLAGS}
      DEFS ${XRAY_COMMON_DEFINITIONS}
-     OBJECT_LIBS ${XRAY_COMMON_RUNTIME_OBJECT_LIBS}
+     OBJECT_LIBS ${XRAY_COMMON_RUNTIME_OBJECT_LIBS} RTXray
      PARENT_TARGET xray)
-  endif()
-endforeach()
-endif()
+    # FDR mode runtime archive (addon for clang_rt.xray)
+    add_compiler_rt_runtime(clang_rt.xray-fdr
+      STATIC
+      ARCHS ${arch}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      OBJECT_LIBS RTXrayFDR
+      PARENT_TARGET xray)
+    # Basic mode runtime archive (addon for clang_rt.xray)
+    add_compiler_rt_runtime(clang_rt.xray-basic
+      STATIC
+      ARCHS ${arch}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      OBJECT_LIBS RTXrayBASIC
+      PARENT_TARGET xray)
+    # Profiling mode runtime archive (addon for clang_rt.xray)
+    add_compiler_rt_runtime(clang_rt.xray-profiling
+      STATIC
+      ARCHS ${arch}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      OBJECT_LIBS RTXrayPROFILING
+      PARENT_TARGET xray)
+  endforeach()
+endif() # not Apple
 
 if(COMPILER_RT_INCLUDE_TESTS)
   add_subdirectory(tests)
diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt
index e54e63f27890..11f373167d24 100644
--- a/lib/xray/tests/CMakeLists.txt
+++ b/lib/xray/tests/CMakeLists.txt
@@ -3,6 +3,18 @@ include_directories(..)
 add_custom_target(XRayUnitTests)
 set_target_properties(XRayUnitTests PROPERTIES FOLDER "XRay unittests")
 
+# Sanity check XRAY_ALL_SOURCE_FILES_ABS_PATHS
+list(LENGTH XRAY_ALL_SOURCE_FILES_ABS_PATHS XASFAP_LENGTH)
+if (${XASFAP_LENGTH} EQUAL 0)
+  message(FATAL_ERROR "XRAY_ALL_SOURCE_FILES_ABS_PATHS cannot be empty")
+endif()
+unset(XASFAP_LENGTH)
+foreach (src_file ${XRAY_ALL_SOURCE_FILES_ABS_PATHS})
+  if (NOT EXISTS "${src_file}")
+    message(FATAL_ERROR "Source file \"${src_file}\" does not exist")
+  endif()
+endforeach()
+
 set(XRAY_UNITTEST_CFLAGS
   ${XRAY_CFLAGS}
   ${COMPILER_RT_UNITTEST_CFLAGS}
@@ -11,27 +23,77 @@ set(XRAY_UNITTEST_CFLAGS
   -I${COMPILER_RT_SOURCE_DIR}/lib/xray
   -I${COMPILER_RT_SOURCE_DIR}/lib)
 
+function(add_xray_lib library)
+  add_library(${library} STATIC ${ARGN})
+  set_target_properties(${library} PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    FOLDER "Compiler-RT Runtime tests")
+endfunction()
+
+function(get_xray_lib_for_arch arch lib)
+  if(APPLE)
+    set(tgt_name "RTXRay.test.osx")
+  else()
+    set(tgt_name "RTXRay.test.${arch}")
+  endif()
+  set(${lib} "${tgt_name}" PARENT_SCOPE)
+endfunction()
+
 set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
+set(XRAY_UNITTEST_LINK_FLAGS
+  ${CMAKE_THREAD_LIBS_INIT}
+  -l${SANITIZER_CXX_ABI_LIBRARY}
+  -fxray-instrument
+  )
+if (NOT APPLE)
+  append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS)
+  append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS)
+  append_list_if(COMPILER_RT_HAS_LIBDL -ldl XRAY_UNITTEST_LINK_FLAGS)
+  append_list_if(COMPILER_RT_HAS_LIBPTHREAD -pthread XRAY_UNITTEST_LINK_FLAGS)
+endif()
+
 macro(add_xray_unittest testname)
   cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN})
   if(UNIX AND NOT APPLE)
+    set(CMAKE_DL_LIBS_INIT "")
     foreach(arch ${XRAY_TEST_ARCH})
       set(TEST_OBJECTS)
+      get_xray_lib_for_arch(${arch} XRAY_RUNTIME_LIBS)
       generate_compiler_rt_tests(TEST_OBJECTS
         XRayUnitTests "${testname}-${arch}-Test" "${arch}"
         SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
+        # Note that any change in the implementations will cause all the unit
+        # tests to be re-built. This is by design, but may be cumbersome during
+        # the build/test cycle.
+        COMPILE_DEPS ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
+        ${XRAY_HEADERS} ${XRAY_ALL_SOURCE_FILES_ABS_PATHS}
+        RUNTIME "${XRAY_RUNTIME_LIBS}"
         DEPS gtest xray llvm-xray
         CFLAGS ${XRAY_UNITTEST_CFLAGS}
-        LINK_FLAGS -fxray-instrument
-          ${TARGET_LINK_FLAGS}
-          -lstdc++ -lm ${CMAKE_THREAD_LIBS_INIT}
-          -lpthread
-          -ldl -lrt)
-      set_target_properties(XRayUnitTests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+        LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS})
+      set_target_properties(XRayUnitTests
+        PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endforeach()
   endif()
 endmacro()
 
 if(COMPILER_RT_CAN_EXECUTE_TESTS)
+  if (APPLE)
+    add_xray_lib("RTXRay.test.osx"
+      $<TARGET_OBJECTS:RTXray.osx>
+      $<TARGET_OBJECTS:RTXrayFDR.osx>
+      $<TARGET_OBJECTS:RTXrayPROFILING.osx>
+      $<TARGET_OBJECTS:RTSanitizerCommon.osx>
+      $<TARGET_OBJECTS:RTSanitizerCommonLibc.osx>)
+  else()
+    foreach(arch ${XRAY_SUPPORTED_ARCH})
+      add_xray_lib("RTXRay.test.${arch}"
+        $<TARGET_OBJECTS:RTXray.${arch}>
+        $<TARGET_OBJECTS:RTXrayFDR.${arch}>
+        $<TARGET_OBJECTS:RTXrayPROFILING.${arch}>
+        $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+        $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
+    endforeach()
+  endif()
   add_subdirectory(unit)
 endif()
diff --git a/lib/xray/tests/unit/CMakeLists.txt b/lib/xray/tests/unit/CMakeLists.txt
index 62d01f239581..b42eb50d0790 100644
--- a/lib/xray/tests/unit/CMakeLists.txt
+++ b/lib/xray/tests/unit/CMakeLists.txt
@@ -2,3 +2,11 @@ add_xray_unittest(XRayBufferQueueTest SOURCES
   buffer_queue_test.cc xray_unit_test_main.cc)
 add_xray_unittest(XRayFDRLoggingTest SOURCES
   fdr_logging_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayAllocatorTest SOURCES
+  allocator_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRaySegmentedArrayTest SOURCES
+  segmented_array_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayFunctionCallTrieTest SOURCES
+  function_call_trie_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayProfileCollectorTest SOURCES
+  profile_collector_test.cc xray_unit_test_main.cc)
diff --git a/lib/xray/tests/unit/allocator_test.cc b/lib/xray/tests/unit/allocator_test.cc
new file mode 100644
index 000000000000..be404160e417
--- /dev/null
+++ b/lib/xray/tests/unit/allocator_test.cc
@@ -0,0 +1,42 @@
+//===-- allocator_test.cc -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+
+#include "xray_allocator.h"
+#include "gtest/gtest.h"
+
+namespace __xray {
+namespace {
+
+struct TestData {
+  s64 First;
+  s64 Second;
+};
+
+TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11); }
+
+TEST(AllocatorTest, Allocate) {
+  Allocator<sizeof(TestData)> A(2 << 11);
+  auto B = A.Allocate();
+  ASSERT_NE(B.Data, nullptr);
+}
+
+TEST(AllocatorTest, OverAllocate) {
+  Allocator<sizeof(TestData)> A(sizeof(TestData));
+  auto B1 = A.Allocate();
+  (void)B1;
+  auto B2 = A.Allocate();
+  ASSERT_EQ(B2.Data, nullptr);
+}
+
+} // namespace
+} // namespace __xray
diff --git a/lib/xray/tests/unit/buffer_queue_test.cc b/lib/xray/tests/unit/buffer_queue_test.cc
index 1ec7469ce187..c0d4ccb268d6 100644
--- a/lib/xray/tests/unit/buffer_queue_test.cc
+++ b/lib/xray/tests/unit/buffer_queue_test.cc
@@ -32,9 +32,9 @@ TEST(BufferQueueTest, GetAndRelease) {
   ASSERT_TRUE(Success);
   BufferQueue::Buffer Buf;
   ASSERT_EQ(Buffers.getBuffer(Buf), BufferQueue::ErrorCode::Ok);
-  ASSERT_NE(nullptr, Buf.Buffer);
+  ASSERT_NE(nullptr, Buf.Data);
   ASSERT_EQ(Buffers.releaseBuffer(Buf), BufferQueue::ErrorCode::Ok);
-  ASSERT_EQ(nullptr, Buf.Buffer);
+  ASSERT_EQ(nullptr, Buf.Data);
 }
 
 TEST(BufferQueueTest, GetUntilFailed) {
@@ -53,7 +53,7 @@ TEST(BufferQueueTest, ReleaseUnknown) {
   BufferQueue Buffers(kSize, 1, Success);
   ASSERT_TRUE(Success);
   BufferQueue::Buffer Buf;
-  Buf.Buffer = reinterpret_cast<void *>(0xdeadbeef);
+  Buf.Data = reinterpret_cast<void *>(0xdeadbeef);
   Buf.Size = kSize;
   EXPECT_EQ(BufferQueue::ErrorCode::UnrecognizedBuffer,
             Buffers.releaseBuffer(Buf));
@@ -65,7 +65,7 @@ TEST(BufferQueueTest, ErrorsWhenFinalising) {
   ASSERT_TRUE(Success);
   BufferQueue::Buffer Buf;
   ASSERT_EQ(Buffers.getBuffer(Buf), BufferQueue::ErrorCode::Ok);
-  ASSERT_NE(nullptr, Buf.Buffer);
+  ASSERT_NE(nullptr, Buf.Data);
   ASSERT_EQ(Buffers.finalize(), BufferQueue::ErrorCode::Ok);
   BufferQueue::Buffer OtherBuf;
   ASSERT_EQ(BufferQueue::ErrorCode::QueueFinalizing,
diff --git a/lib/xray/tests/unit/fdr_logging_test.cc b/lib/xray/tests/unit/fdr_logging_test.cc
index 76738ea4dff3..b6961efbc351 100644
--- a/lib/xray/tests/unit/fdr_logging_test.cc
+++ b/lib/xray/tests/unit/fdr_logging_test.cc
@@ -10,6 +10,7 @@
 // This file is a part of XRay, a function call tracing system.
 //
 //===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
 #include "xray_fdr_logging.h"
 #include "gtest/gtest.h"
 
@@ -86,7 +87,7 @@ TEST(FDRLoggingTest, Simple) {
 
   XRayFileHeader H;
   memcpy(&H, Contents, sizeof(XRayFileHeader));
-  ASSERT_EQ(H.Version, 2);
+  ASSERT_EQ(H.Version, 3);
   ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
 
   // We require one buffer at least to have the "extents" metadata record,
@@ -131,7 +132,7 @@ TEST(FDRLoggingTest, Multiple) {
 
   XRayFileHeader H;
   memcpy(&H, Contents, sizeof(XRayFileHeader));
-  ASSERT_EQ(H.Version, 2);
+  ASSERT_EQ(H.Version, 3);
   ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
 
   MetadataRecord MDR0, MDR1;
@@ -154,12 +155,12 @@ TEST(FDRLoggingTest, MultiThreadedCycling) {
   // Now we want to create one thread, do some logging, then create another one,
   // in succession and making sure that we're able to get thread records from
   // the latest thread (effectively being able to recycle buffers).
-  std::array<pid_t, 2> Threads;
+  std::array<tid_t, 2> Threads;
   for (uint64_t I = 0; I < 2; ++I) {
     std::thread t{[I, &Threads] {
       fdrLoggingHandleArg0(I + 1, XRayEntryType::ENTRY);
       fdrLoggingHandleArg0(I + 1, XRayEntryType::EXIT);
-      Threads[I] = syscall(SYS_gettid);
+      Threads[I] = GetTid();
     }};
     t.join();
   }
@@ -182,7 +183,7 @@ TEST(FDRLoggingTest, MultiThreadedCycling) {
 
   XRayFileHeader H;
   memcpy(&H, Contents, sizeof(XRayFileHeader));
-  ASSERT_EQ(H.Version, 2);
+  ASSERT_EQ(H.Version, 3);
   ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
 
   MetadataRecord MDR0, MDR1;
@@ -192,9 +193,9 @@ TEST(FDRLoggingTest, MultiThreadedCycling) {
   ASSERT_EQ(MDR0.RecordKind,
             uint8_t(MetadataRecord::RecordKinds::BufferExtents));
   ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer));
-  pid_t Latest = 0;
-  memcpy(&Latest, MDR1.Data, sizeof(pid_t));
-  ASSERT_EQ(Latest, Threads[1]);
+  int32_t Latest = 0;
+  memcpy(&Latest, MDR1.Data, sizeof(int32_t));
+  ASSERT_EQ(Latest, static_cast<int32_t>(Threads[1]));
 }
 
 } // namespace
diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc
new file mode 100644
index 000000000000..049ecfb07e01
--- /dev/null
+++ b/lib/xray/tests/unit/function_call_trie_test.cc
@@ -0,0 +1,286 @@
+//===-- function_call_trie_test.cc ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_function_call_trie.h"
+
+namespace __xray {
+
+namespace {
+
+TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
+  profilingFlags()->setDefaults();
+  FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(Allocators);
+}
+
+TEST(FunctionCallTrieTest, EnterAndExitFunction) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 1);
+  Trie.exitFunction(1, 2);
+
+  // We need a way to pull the data out. At this point, until we get a data
+  // collection service implemented, we're going to export the data as a list of
+  // roots, and manually walk through the structure ourselves.
+
+  const auto &R = Trie.getRoots();
+
+  ASSERT_EQ(R.size(), 1u);
+  ASSERT_EQ(R.front()->FId, 1);
+  ASSERT_EQ(R.front()->CallCount, 1);
+  ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  Trie.exitFunction(1, 1);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_TRUE(R.empty());
+}
+
+TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  Trie.enterFunction(2, 1);
+  Trie.enterFunction(3, 3);
+  Trie.exitFunction(1, 5);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_FALSE(R.empty());
+  EXPECT_EQ(R.size(), size_t{1});
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionExit) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  Trie.enterFunction(1, 1);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_FALSE(R.empty());
+  EXPECT_EQ(R.size(), size_t{1});
+}
+
+TEST(FunctionCallTrieTest, MultipleRoots) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  // Enter and exit FId = 1.
+  Trie.enterFunction(1, 1);
+  Trie.exitFunction(1, 2);
+
+  // Enter and exit FId = 2.
+  Trie.enterFunction(2, 3);
+  Trie.exitFunction(2, 4);
+
+  const auto &R = Trie.getRoots();
+  ASSERT_FALSE(R.empty());
+  ASSERT_EQ(R.size(), 2u);
+
+  // Check both roots are non-null before any dereference, then compare IDs.
+  const auto R0 = R[0];
+  const auto R1 = R[1];
+  ASSERT_NE(R0, nullptr);
+  ASSERT_NE(R1, nullptr);
+  ASSERT_NE(R0->FId, R1->FId);
+
+  // Inspect the roots that they have the right data.
+  EXPECT_EQ(R0->CallCount, 1u);
+  EXPECT_EQ(R0->CumulativeLocalTime, 1u);
+
+  EXPECT_EQ(R1->CallCount, 1u);
+  EXPECT_EQ(R1->CumulativeLocalTime, 1u);
+}
+
+// While missing an intermediary entry may be rare in practice, we still enforce
+// that we can handle the case where we've missed the entry event somehow, in
+// between call entry/exits. To illustrate, imagine the following shadow call
+// stack:
+//
+//   f0@t0 -> f1@t1 -> f2@t2
+//
+// If for whatever reason we see an exit for `f2` @ t3, followed by an exit for
+// `f0` @ t4 (i.e. no `f1` exit in between) then we need to handle the case of
+// accounting local time to `f2` from d = (t3 - t2), then local time to `f1`
+// as d' = (t3 - t1) - d, and then local time to `f0` as d'' = (t3 - t0) - d'.
+TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 0);
+  Trie.enterFunction(2, 100);
+  Trie.enterFunction(3, 200);
+  Trie.exitFunction(3, 300);
+  Trie.exitFunction(1, 400);
+
+  // What we should see at this point is all the functions in the trie in a
+  // specific order (1 -> 2 -> 3) with the appropriate count(s) and local
+  // latencies.
+  const auto &R = Trie.getRoots();
+  ASSERT_FALSE(R.empty());
+  ASSERT_EQ(R.size(), 1u);
+
+  const auto &F1 = *R[0];
+  ASSERT_EQ(F1.FId, 1);
+  ASSERT_FALSE(F1.Callees.empty());
+
+  const auto &F2 = *F1.Callees[0].NodePtr;
+  ASSERT_EQ(F2.FId, 2);
+  ASSERT_FALSE(F2.Callees.empty());
+
+  const auto &F3 = *F2.Callees[0].NodePtr;
+  ASSERT_EQ(F3.FId, 3);
+  ASSERT_TRUE(F3.Callees.empty());
+
+  // Now that we've established the preconditions, we check for specific aspects
+  // of the nodes.
+  EXPECT_EQ(F3.CallCount, 1);
+  EXPECT_EQ(F2.CallCount, 1);
+  EXPECT_EQ(F1.CallCount, 1);
+  EXPECT_EQ(F3.CumulativeLocalTime, 100);
+  EXPECT_EQ(F2.CumulativeLocalTime, 300);
+  EXPECT_EQ(F1.CumulativeLocalTime, 100);
+}
+
+TEST(FunctionCallTrieTest, DeepCallStack) {
+  // Simulate a relatively deep call stack (32 levels) and ensure that we can
+  // properly pop all the way up the stack.
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  for (int i = 0; i < 32; ++i)
+    Trie.enterFunction(i + 1, i);
+  Trie.exitFunction(1, 33);
+
+  // Here, validate that we have a 32-level deep function call path from the
+  // root (1) down to the leaf (32).
+  const auto &R = Trie.getRoots();
+  ASSERT_EQ(R.size(), 1u);
+  auto F = R[0];
+  for (int i = 0; i < 32; ++i) {
+    EXPECT_EQ(F->FId, i + 1);
+    EXPECT_EQ(F->CallCount, 1);
+    if (F->Callees.empty() && i != 31)
+      FAIL() << "Empty callees for FId " << F->FId;
+    if (i != 31)
+      F = F->Callees[0].NodePtr;
+  }
+}
+
+// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
+// guaranteed to be synchronised.
+TEST(FunctionCallTrieTest, DeepCopy) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 0);
+  Trie.enterFunction(2, 1);
+  Trie.exitFunction(2, 2);
+  Trie.enterFunction(3, 3);
+  Trie.exitFunction(3, 4);
+  Trie.exitFunction(1, 5);
+
+  // We want to make a deep copy and compare notes.
+  auto B = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Copy(B);
+  Trie.deepCopyInto(Copy);
+
+  ASSERT_NE(Trie.getRoots().size(), 0u);
+  ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size());
+  const auto &R0Orig = *Trie.getRoots()[0];
+  const auto &R0Copy = *Copy.getRoots()[0];
+  EXPECT_EQ(R0Orig.FId, 1);
+  EXPECT_EQ(R0Orig.FId, R0Copy.FId);
+
+  ASSERT_EQ(R0Orig.Callees.size(), 2u);
+  ASSERT_EQ(R0Copy.Callees.size(), 2u);
+
+  const auto &F1Orig =
+      *R0Orig.Callees
+           .find_element(
+               [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+           ->NodePtr;
+  const auto &F1Copy =
+      *R0Copy.Callees
+           .find_element(
+               [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+           ->NodePtr;
+  EXPECT_EQ(&R0Orig, F1Orig.Parent);
+  EXPECT_EQ(&R0Copy, F1Copy.Parent);
+}
+
+TEST(FunctionCallTrieTest, MergeInto) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie T0(A);
+  FunctionCallTrie T1(A);
+
+  // 1 -> 2 -> 3
+  T0.enterFunction(1, 0);
+  T0.enterFunction(2, 1);
+  T0.enterFunction(3, 2);
+  T0.exitFunction(3, 3);
+  T0.exitFunction(2, 4);
+  T0.exitFunction(1, 5);
+
+  // 1 -> 2 -> 3
+  T1.enterFunction(1, 0);
+  T1.enterFunction(2, 1);
+  T1.enterFunction(3, 2);
+  T1.exitFunction(3, 3);
+  T1.exitFunction(2, 4);
+  T1.exitFunction(1, 5);
+
+  // We use a different allocator here to make sure that we're able to transfer
+  // data into a FunctionCallTrie which uses a different allocator. This
+  // reflects the intended usage scenario for when we're collecting profiles
+  // that aggregate across threads.
+  auto B = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Merged(B);
+
+  T0.mergeInto(Merged);
+  T1.mergeInto(Merged);
+
+  ASSERT_EQ(Merged.getRoots().size(), 1u);
+  const auto &R0 = *Merged.getRoots()[0];
+  EXPECT_EQ(R0.FId, 1);
+  EXPECT_EQ(R0.CallCount, 2);
+  EXPECT_EQ(R0.CumulativeLocalTime, 10);
+  EXPECT_EQ(R0.Callees.size(), 1u);
+
+  const auto &F1 = *R0.Callees[0].NodePtr;
+  EXPECT_EQ(F1.FId, 2);
+  EXPECT_EQ(F1.CallCount, 2);
+  EXPECT_EQ(F1.CumulativeLocalTime, 6);
+  EXPECT_EQ(F1.Callees.size(), 1u);
+
+  const auto &F2 = *F1.Callees[0].NodePtr;
+  EXPECT_EQ(F2.FId, 3);
+  EXPECT_EQ(F2.CallCount, 2);
+  EXPECT_EQ(F2.CumulativeLocalTime, 2);
+  EXPECT_EQ(F2.Callees.size(), 0u);
+}
+
+} // namespace
+
+} // namespace __xray
diff --git a/lib/xray/tests/unit/profile_collector_test.cc b/lib/xray/tests/unit/profile_collector_test.cc
new file mode 100644
index 000000000000..b7dbe567312a
--- /dev/null
+++ b/lib/xray/tests/unit/profile_collector_test.cc
@@ -0,0 +1,179 @@
+//===-- profile_collector_test.cc -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_profile_collector.h"
+#include "xray_profiling_flags.h"
+#include <cstdint>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace __xray {
+namespace {
+
+static constexpr auto kHeaderSize = 16u;
+
+void ValidateBlock(XRayBuffer B) {
+  profilingFlags()->setDefaults();
+  ASSERT_NE(static_cast<const void *>(B.Data), nullptr);
+  ASSERT_NE(B.Size, 0u);
+  ASSERT_GE(B.Size, kHeaderSize);
+  // We look at the block size, the block number, and the thread ID to ensure
+  // that none of them are zero (or that the header data is laid out as we
+  // expect).
+  char LocalBuffer[kHeaderSize] = {};
+  internal_memcpy(LocalBuffer, B.Data, kHeaderSize);
+  u32 BlockSize = 0;
+  u32 BlockNumber = 0;
+  u64 ThreadId = 0;
+  internal_memcpy(&BlockSize, LocalBuffer, sizeof(u32));
+  internal_memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32));
+  internal_memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64));
+  ASSERT_NE(BlockSize, 0u);
+  ASSERT_GE(BlockNumber, 0u);
+  ASSERT_NE(ThreadId, 0u);
+}
+
+std::tuple<u32, u32, u64> ParseBlockHeader(XRayBuffer B) {
+  char LocalBuffer[kHeaderSize] = {};
+  internal_memcpy(LocalBuffer, B.Data, kHeaderSize);
+  u32 BlockSize = 0;
+  u32 BlockNumber = 0;
+  u64 ThreadId = 0;
+  internal_memcpy(&BlockSize, LocalBuffer, sizeof(u32));
+  internal_memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32));
+  internal_memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64));
+  return std::make_tuple(BlockSize, BlockNumber, ThreadId);
+}
+
+struct Profile {
+  int64_t CallCount;
+  int64_t CumulativeLocalTime;
+  std::vector<int32_t> Path;
+};
+
+std::tuple<Profile, const char *> ParseProfile(const char *P) {
+  Profile Result;
+  // Read the path first, until we find a sentinel 0.
+  int32_t F;
+  do {
+    internal_memcpy(&F, P, sizeof(int32_t));
+    P += sizeof(int32_t);
+    Result.Path.push_back(F);
+  } while (F != 0);
+
+  // Then read the CallCount.
+  internal_memcpy(&Result.CallCount, P, sizeof(int64_t));
+  P += sizeof(int64_t);
+
+  // Then read the CumulativeLocalTime.
+  internal_memcpy(&Result.CumulativeLocalTime, P, sizeof(int64_t));
+  P += sizeof(int64_t);
+  return std::make_tuple(std::move(Result), P);
+}
+
+TEST(profileCollectorServiceTest, PostSerializeCollect) {
+  profilingFlags()->setDefaults();
+  // The most basic use-case (the one we actually only care about) is the one
+  // where we ensure that we can post FunctionCallTrie instances, which are then
+  // destroyed but serialized properly.
+  //
+  // First, we initialise a set of allocators in the local scope. This ensures
+  // that we're able to copy the contents of the FunctionCallTrie that uses
+  // the local allocators.
+  auto Allocators = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie T(Allocators);
+
+  // Then, we populate the trie with some data.
+  T.enterFunction(1, 1);
+  T.enterFunction(2, 2);
+  T.exitFunction(2, 3);
+  T.exitFunction(1, 4);
+
+  // Then we post the data to the global profile collector service.
+  profileCollectorService::post(T, 1);
+
+  // Then we serialize the data.
+  profileCollectorService::serialize();
+
+  // Then we go through a single buffer to see whether we're getting the data we
+  // expect.
+  auto B = profileCollectorService::nextBuffer({nullptr, 0});
+  ValidateBlock(B);
+  u32 BlockSize;
+  u32 BlockNum;
+  u64 ThreadId;
+  std::tie(BlockSize, BlockNum, ThreadId) = ParseBlockHeader(B);
+
+  // We look at the serialized buffer to see whether the Trie we're expecting
+  // to see is there.
+  auto DStart = static_cast<const char *>(B.Data) + kHeaderSize;
+  std::vector<char> D(DStart, DStart + BlockSize);
+  B = profileCollectorService::nextBuffer(B);
+  ASSERT_EQ(B.Data, nullptr);
+  ASSERT_EQ(B.Size, 0u);
+
+  Profile Profile1, Profile2;
+  auto P = static_cast<const char *>(D.data());
+  std::tie(Profile1, P) = ParseProfile(P);
+  std::tie(Profile2, P) = ParseProfile(P);
+
+  ASSERT_NE(Profile1.Path.size(), Profile2.Path.size());
+  auto &P1 = Profile1.Path.size() < Profile2.Path.size() ? Profile2 : Profile1;
+  auto &P2 = Profile1.Path.size() < Profile2.Path.size() ? Profile1 : Profile2;
+  std::vector<int32_t> P1Expected = {2, 1, 0};
+  std::vector<int32_t> P2Expected = {1, 0};
+  ASSERT_EQ(P1.Path.size(), P1Expected.size());
+  ASSERT_EQ(P2.Path.size(), P2Expected.size());
+  ASSERT_EQ(P1.Path, P1Expected);
+  ASSERT_EQ(P2.Path, P2Expected);
+}
+
+// We break out a function that will be run in multiple threads, one that will
+// use a thread local allocator, and will post the FunctionCallTrie to the
+// profileCollectorService. This simulates what the threads being profiled would
+// be doing anyway, but through the XRay logging implementation.
+void threadProcessing() {
+  thread_local auto Allocators = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie T(Allocators);
+
+  T.enterFunction(1, 1);
+  T.enterFunction(2, 2);
+  T.exitFunction(2, 3);
+  T.exitFunction(1, 4);
+
+  profileCollectorService::post(T, GetTid());
+}
+
+TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) {
+  profilingFlags()->setDefaults();
+  std::thread t1(threadProcessing);
+  std::thread t2(threadProcessing);
+
+  t1.join();
+  t2.join();
+
+  // At this point, t1 and t2 are already done with what they were doing.
+  profileCollectorService::serialize();
+
+  // Ensure that we see two buffers.
+  auto B = profileCollectorService::nextBuffer({nullptr, 0});
+  ValidateBlock(B);
+
+  B = profileCollectorService::nextBuffer(B);
+  ValidateBlock(B);
+}
+
+} // namespace
+} // namespace __xray
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
new file mode 100644
index 000000000000..035674ccfaf5
--- /dev/null
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -0,0 +1,200 @@
+#include "xray_segmented_array.h"
+#include "gtest/gtest.h"
+
+namespace __xray {
+namespace {
+
+struct TestData { // Element type stored in Array by the tests: a pair of s64.
+  s64 First;
+  s64 Second;
+
+  // Need a constructor for emplace operations.
+  TestData(s64 F, s64 S) : First(F), Second(S) {}
+};
+
+TEST(SegmentedArrayTest, ConstructWithAllocators) { // Construction only.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> Data(A);
+  (void)Data; // Mark as used; the test makes no further checks.
+}
+
+TEST(SegmentedArrayTest, ConstructAndPopulate) { // Two appends => size 2.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> data(A);
+  ASSERT_NE(data.Append(TestData{0, 0}), nullptr); // Must return non-null.
+  ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
+  ASSERT_EQ(data.size(), 2u); // Both appended elements are counted.
+}
+
+TEST(SegmentedArrayTest, ConstructPopulateAndLookup) { // Append, then index.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> data(A);
+  ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
+  ASSERT_EQ(data.size(), 1u);
+  ASSERT_EQ(data[0].First, 0);  // operator[] must expose the values that
+  ASSERT_EQ(data[0].Second, 1); // were passed to Append above.
+}
+
+TEST(SegmentedArrayTest, PopulateWithMoreElements) { // 100 appends + readback.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 24);
+  Array<TestData> data(A);
+  static const auto kMaxElements = 100u;
+  for (auto I = 0u; I < kMaxElements; ++I) { // Element I holds {I, I + 1}.
+    ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
+  }
+  ASSERT_EQ(data.size(), kMaxElements);
+  for (auto I = 0u; I < kMaxElements; ++I) { // Read every element back by index.
+    ASSERT_EQ(data[I].First, I);
+    ASSERT_EQ(data[I].Second, I + 1);
+  }
+}
+
+TEST(SegmentedArrayTest, AppendEmplace) { // Emplaced values are readable.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> data(A);
+  ASSERT_NE(data.AppendEmplace(1, 1), nullptr); // Must return non-null.
+  ASSERT_EQ(data[0].First, 1);  // The element at index 0 carries the two
+  ASSERT_EQ(data[0].Second, 1); // arguments given to AppendEmplace.
+}
+
+TEST(SegmentedArrayTest, AppendAndTrim) { // One append, then trim it away.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> data(A);
+  ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+  ASSERT_EQ(data.size(), 1u);
+  data.trim(1); // Remove as many elements as were added.
+  ASSERT_EQ(data.size(), 0u);
+  ASSERT_TRUE(data.empty()); // size() and empty() must agree.
+}
+
+TEST(SegmentedArrayTest, IteratorAdvance) { // Forward iteration checks.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> data(A);
+  ASSERT_TRUE(data.empty());
+  ASSERT_EQ(data.begin(), data.end()); // Empty array: begin() equals end().
+  auto I0 = data.begin();
+  ASSERT_EQ(I0++, data.begin()); // Post-increment yields the old position...
+  ASSERT_NE(I0, data.begin());   // ...and moves I0 away from begin().
+  for (const auto &D : data) {
+    (void)D;
+    FAIL(); // Range-for over an empty array must not run its body.
+  }
+  ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+  ASSERT_EQ(data.size(), 1u);
+  ASSERT_NE(data.begin(), data.end()); // One element: the range is non-empty.
+  auto &D0 = *data.begin(); // Expected to be the element emplaced above.
+  ASSERT_EQ(D0.First, 1);
+  ASSERT_EQ(D0.Second, 1);
+}
+
+TEST(SegmentedArrayTest, IteratorRetreat) { // Backward iteration checks.
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  Array<TestData> data(A);
+  ASSERT_TRUE(data.empty());
+  ASSERT_EQ(data.begin(), data.end());
+  ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
+  ASSERT_EQ(data.size(), 1u);
+  ASSERT_NE(data.begin(), data.end());
+  auto &D0 = *data.begin();
+  ASSERT_EQ(D0.First, 1);
+  ASSERT_EQ(D0.Second, 1);
+
+  auto I0 = data.end();
+  ASSERT_EQ(I0--, data.end()); // Post-decrement yields the old position...
+  ASSERT_NE(I0, data.end());   // ...and moves I0 away from end().
+  ASSERT_EQ(I0, data.begin()); // With one element, end() - 1 is begin().
+  ASSERT_EQ(I0->First, 1);
+  ASSERT_EQ(I0->Second, 1);
+}
+
+TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 20);
+  Array<TestData> Data(A);
+  ASSERT_TRUE(Data.empty());
+  auto I0Begin = Data.begin(), I0End = Data.end(); // Taken while still empty.
+  // Add enough elements in Data to have more than one chunk.
+  constexpr auto Segment = Array<TestData>::SegmentSize;
+  constexpr auto SegmentX2 = Segment * 2;
+  for (auto i = SegmentX2; i > 0u; --i) { // Values count down: SegmentX2 .. 1.
+    Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+  }
+  ASSERT_EQ(Data.size(), SegmentX2);
+  {
+    auto &Back = Data.back(); // The last element appended carries the value 1.
+    ASSERT_EQ(Back.First, 1);
+    ASSERT_EQ(Back.Second, 1);
+  }
+
+  // Trim one chunk's elements worth.
+  Data.trim(Segment);
+  ASSERT_EQ(Data.size(), Segment);
+
+  // Check that we are still able to access 'back' properly.
+  {
+    auto &Back = Data.back(); // The Segment-th append had value Segment + 1.
+    ASSERT_EQ(Back.First, static_cast<s64>(Segment + 1));
+    ASSERT_EQ(Back.Second, static_cast<s64>(Segment + 1));
+  }
+
+  // Then trim until it's empty.
+  Data.trim(Segment);
+  ASSERT_TRUE(Data.empty());
+
+  // Here our iterators should be the same.
+  auto I1Begin = Data.begin(), I1End = Data.end();
+  EXPECT_EQ(I0Begin, I1Begin); // Compared against the iterators captured
+  EXPECT_EQ(I0End, I1End);     // before any element was appended.
+
+  // Then we ensure that adding elements back works just fine.
+  for (auto i = SegmentX2; i > 0u; --i) {
+    Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+  }
+  EXPECT_EQ(Data.size(), SegmentX2);
+}
+
+struct ShadowStackEntry { // Element type for SimulateStackBehaviour below.
+  uint64_t EntryTSC = 0;
+  uint64_t *NodePtr = nullptr;
+  ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {}
+};
+
+TEST(SegmentedArrayTest, SimulateStackBehaviour) {
+  using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
+  AllocatorType A(1 << 10);
+  Array<ShadowStackEntry> Data(A);
+  static uint64_t Dummy = 0; // Only its address is used, as every NodePtr.
+  constexpr uint64_t Max = 9;
+
+  for (uint64_t i = 0; i < Max; ++i) { // Push phase: EntryTSC is the index i.
+    auto P = Data.Append({i, &Dummy});
+    ASSERT_NE(P, nullptr);
+    ASSERT_EQ(P->NodePtr, &Dummy);
+    auto &Back = Data.back(); // back() must see the element just appended.
+    ASSERT_EQ(Back.NodePtr, &Dummy);
+    ASSERT_EQ(Back.EntryTSC, i);
+  }
+
+  // Simulate a stack by checking the data from the end as we're trimming.
+  auto Counter = Max; // Tracks the size expected after each pop.
+  ASSERT_EQ(Data.size(), size_t(Max));
+  while (!Data.empty()) {
+    const auto &Top = Data.back();
+    uint64_t *TopNode = Top.NodePtr; // Copy out before trim(1) drops Top.
+    EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter;
+    Data.trim(1);
+    --Counter;
+    ASSERT_EQ(Data.size(), size_t(Counter));
+  }
+}
+
+} // namespace
+} // namespace __xray
diff --git a/lib/xray/xray_AArch64.cc b/lib/xray/xray_AArch64.cc
index f26e77dd7fc1..096de009e83c 100644
--- a/lib/xray/xray_AArch64.cc
+++ b/lib/xray/xray_AArch64.cc
@@ -112,6 +112,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
   return false;
 }
 
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in aarch64? For now all arguments are ignored.
+  return false;
+}
+
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
new file mode 100644
index 000000000000..8244815284a8
--- /dev/null
+++ b/lib/xray/xray_allocator.h
@@ -0,0 +1,129 @@
+//===-- xray_allocator.h ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Defines the allocator interface for an arena allocator, used primarily for
+// the profiling runtime.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_ALLOCATOR_H
+#define XRAY_ALLOCATOR_H
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_posix.h"
+#include "xray_utils.h"
+#include <sys/mman.h>
+#include <cstddef>
+#include <cstdint>
+
+#ifndef MAP_NORESERVE
+// No-op on NetBSD (at least); FreeBSD omits the flag because it is unneeded.
+#define MAP_NORESERVE 0
+#endif
+
+namespace __xray {
+
+/// The Allocator type hands out fixed-sized chunks of memory that are
+/// cache-line aligned and sized. This is useful for placement of
+/// performance-sensitive data in memory that's frequently accessed. The
+/// allocator also self-limits the peak memory usage to a dynamically defined
+/// maximum.
+///
+/// N is the lower-bound size of the block of memory to return from the
+/// allocation function. N is used to compute the size of a block, which is
+/// cache-line-size multiples worth of memory. We compute the size of a block by
+/// determining how many cache lines worth of memory is required to subsume N.
+///
+/// The Allocator instance will manage its own memory acquired through mmap.
+/// This severely constrains the platforms on which this can be used to POSIX
+/// systems where mmap semantics are well-defined.
+///
+/// FIXME: Isolate the lower-level memory management to a different abstraction
+/// that can be platform-specific.
+template <size_t N> struct Allocator {
+  // The Allocator returns memory as Block instances.
+  struct Block {
+    /// Compute the minimum cache-line size multiple that is >= N.
+    static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
+    void *Data;
+  };
+
+private:
+  const size_t MaxMemory{0};        // Arena size, a cache-line multiple.
+  void *BackingStore = nullptr;     // Start of the mapping; null until used.
+  void *AlignedNextBlock = nullptr; // Next block to hand out.
+  size_t AllocatedBlocks = 0;       // Number of blocks handed out so far.
+  SpinMutex Mutex{};                // Serializes calls to Alloc().
+
+  // Hands out the next Block::Size bytes of the arena, mapping the arena on
+  // first use. Returns nullptr if mapping fails or the arena is exhausted.
+  void *Alloc() {
+    SpinMutexLock Lock(&Mutex);
+    if (UNLIKELY(BackingStore == nullptr)) {
+      // Anonymous maps take fd -1; internal_iserror() detects failure.
+      uptr Mapped = internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
+                                  -1, 0);
+      if (internal_iserror(Mapped)) {
+        if (Verbosity())
+          Report("XRay Profiling: Failed to allocate memory for allocator.\n");
+        return nullptr;
+      }
+      BackingStore = reinterpret_cast<void *>(Mapped);
+
+      // Ensure that NextBlock is aligned appropriately.
+      auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore);
+      auto AlignedNextBlockNum =
+          nearest_boundary(BackingStoreNum, kCacheLineSize);
+      if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
+        internal_munmap(BackingStore, MaxMemory);
+        AlignedNextBlock = BackingStore = nullptr;
+        if (Verbosity())
+          Report("XRay Profiling: Cannot obtain enough memory from "
+                 "preallocated region.\n");
+        return nullptr;
+      }
+
+      AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+      // Assert that AlignedNextBlock is cache-line aligned.
+      DCHECK_EQ(AlignedNextBlockNum % kCacheLineSize, 0);
+    }
+
+    // Refuse a block that would extend past the end of the arena.
+    if ((AllocatedBlocks + 1) * Block::Size > MaxMemory)
+      return nullptr;
+
+    // Advance by Block::Size (not N) so that every block handed out stays
+    // cache-line aligned and matches the accounting in the check above.
+    void *Result = AlignedNextBlock;
+    AlignedNextBlock = static_cast<char *>(AlignedNextBlock) + Block::Size;
+    ++AllocatedBlocks;
+    return Result;
+  }
+
+public:
+  // M is the byte budget, rounded up to a multiple of the cache-line size.
+  explicit Allocator(size_t M)
+      : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
+
+  // Returns a Block whose Data is nullptr when no memory could be obtained.
+  Block Allocate() { return {Alloc()}; }
+
+  ~Allocator() NOEXCEPT {
+    if (BackingStore != nullptr)
+      internal_munmap(BackingStore, MaxMemory);
+  }
+};
+
+} // namespace __xray
+
+#endif // XRAY_ALLOCATOR_H
diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc
index da4efcdd2b17..5b828287e3f6 100644
--- a/lib/xray/xray_arm.cc
+++ b/lib/xray/xray_arm.cc
@@ -149,6 +149,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
   return false;
 }
 
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in arm? For now all arguments are ignored.
+  return false;
+}
+
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
diff --git a/lib/xray/xray_basic_flags.cc b/lib/xray/xray_basic_flags.cc
new file mode 100644
index 000000000000..14d805c71a88
--- /dev/null
+++ b/lib/xray/xray_basic_flags.cc
@@ -0,0 +1,50 @@
+//===-- xray_basic_flags.cc -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay Basic flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_basic_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+/// Storage for the Basic Mode flag values. Use via basicFlags().
+BasicFlags xray_basic_flags_dont_use_directly;
+
+void BasicFlags::setDefaults() XRAY_NEVER_INSTRUMENT { // Reset every flag.
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_basic_flags.inc" // Expands to one assignment per listed flag.
+#undef XRAY_FLAG
+}
+
+void registerXRayBasicFlags(FlagParser *P,
+                            BasicFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_basic_flags.inc" // One RegisterFlag call per flag, into *F.
+#undef XRAY_FLAG
+}
+
+const char *useCompilerDefinedBasicFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_BASIC_OPTIONS // Macro supplied at build time, if at all.
+  return SANITIZER_STRINGIFY(XRAY_BASIC_OPTIONS);
+#else
+  return ""; // Macro not defined: report an empty option string.
+#endif
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_basic_flags.h b/lib/xray/xray_basic_flags.h
new file mode 100644
index 000000000000..041578f0663c
--- /dev/null
+++ b/lib/xray/xray_basic_flags.h
@@ -0,0 +1,38 @@
+//===-- xray_basic_flags.h -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay Basic Mode runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_BASIC_FLAGS_H
+#define XRAY_BASIC_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct BasicFlags { // One data member per XRAY_FLAG entry in the .inc file.
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_basic_flags.inc" // Expands to the member declarations.
+#undef XRAY_FLAG
+
+  void setDefaults(); // Defined in xray_basic_flags.cc.
+};
+
+extern BasicFlags xray_basic_flags_dont_use_directly; // Access via basicFlags().
+extern void registerXRayBasicFlags(FlagParser *P, BasicFlags *F);
+const char *useCompilerDefinedBasicFlags(); // Defined in xray_basic_flags.cc.
+inline BasicFlags *basicFlags() { return &xray_basic_flags_dont_use_directly; }
+
+} // namespace __xray
+
+#endif // XRAY_BASIC_FLAGS_H
diff --git a/lib/xray/xray_basic_flags.inc b/lib/xray/xray_basic_flags.inc
new file mode 100644
index 000000000000..327735b51055
--- /dev/null
+++ b/lib/xray/xray_basic_flags.inc
@@ -0,0 +1,24 @@
+//===-- xray_basic_flags.inc ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay Basic Mode runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+XRAY_FLAG(int, func_duration_threshold_us, 5, // Default: 5.
+          "Basic logging will try to skip functions that execute for fewer "
+          "microseconds than this threshold.")
+XRAY_FLAG(int, max_stack_depth, 64, // Default: 64.
+          "Basic logging will keep track of at most this deep a call stack, "
+          "any more and the recordings will be dropped.")
+XRAY_FLAG(int, thread_buffer_size, 1024, // Default: 1024.
+          "The number of entries to keep on a per-thread buffer.")
diff --git a/lib/xray/xray_inmemory_log.cc b/lib/xray/xray_basic_logging.cc
index a27ffbcbd12e..585ca641cd0c 100644
--- a/lib/xray/xray_inmemory_log.cc
+++ b/lib/xray/xray_basic_logging.cc
@@ -1,4 +1,4 @@
-//===-- xray_inmemory_log.cc ------------------------------------*- C++ -*-===//
+//===-- xray_basic_logging.cc -----------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,8 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cassert>
-#include <cstring>
 #include <errno.h>
 #include <fcntl.h>
 #include <pthread.h>
@@ -29,16 +27,18 @@
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "xray/xray_records.h"
+#include "xray_recursion_guard.h"
+#include "xray_basic_flags.h"
+#include "xray_basic_logging.h"
 #include "xray_defs.h"
 #include "xray_flags.h"
-#include "xray_inmemory_log.h"
 #include "xray_interface_internal.h"
 #include "xray_tsc.h"
 #include "xray_utils.h"
 
 namespace __xray {
 
-__sanitizer::SpinMutex LogMutex;
+SpinMutex LogMutex;
 
 // We use elements of this type to record the entry TSC of every function ID we
 // see as we're tracing a particular thread's execution.
@@ -60,43 +60,41 @@ struct alignas(64) ThreadLocalData {
   size_t StackSize = 0;
   size_t StackEntries = 0;
   int Fd = -1;
-  pid_t TID = 0;
 };
 
 static pthread_key_t PThreadKey;
 
-static __sanitizer::atomic_uint8_t BasicInitialized{0};
+static atomic_uint8_t BasicInitialized{0};
 
 BasicLoggingOptions GlobalOptions;
 
-thread_local volatile bool RecursionGuard = false;
+thread_local atomic_uint8_t Guard{0};
 
-static uint64_t thresholdTicks() XRAY_NEVER_INSTRUMENT {
-  static uint64_t TicksPerSec = probeRequiredCPUFeatures()
-                                    ? getTSCFrequency()
-                                    : __xray::NanosecondsPerSecond;
-  static const uint64_t ThresholdTicks =
-      TicksPerSec * GlobalOptions.DurationFilterMicros / 1000000;
-  return ThresholdTicks;
-}
+static atomic_uint8_t UseRealTSC{0};
+static atomic_uint64_t ThresholdTicks{0};
+static atomic_uint64_t TicksPerSec{0};
+static atomic_uint64_t CycleFrequency{NanosecondsPerSecond};
 
 static int openLogFile() XRAY_NEVER_INSTRUMENT {
   int F = getLogFD();
   if (F == -1)
     return -1;
 
-  // Test for required CPU features and cache the cycle frequency
-  static bool TSCSupported = probeRequiredCPUFeatures();
-  static uint64_t CycleFrequency =
-      TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
+  static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT;
+  pthread_once(&DetectOnce, +[] {
+    if (atomic_load(&UseRealTSC, memory_order_acquire))
+      atomic_store(&CycleFrequency, getTSCFrequency(), memory_order_release);
+  });
 
   // Since we're here, we get to write the header. We set it up so that the
   // header will only be written once, at the start, and let the threads
   // logging do writes which just append.
   XRayFileHeader Header;
-  Header.Version = 2; // Version 2 includes tail exit records.
+  // Version 2 includes tail exit records.
+  // Version 3 includes pid inside records.
+  Header.Version = 3;
   Header.Type = FileTypes::NAIVE_LOG;
-  Header.CycleFrequency = CycleFrequency;
+  Header.CycleFrequency = atomic_load(&CycleFrequency, memory_order_acquire);
 
   // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc'
   // before setting the values in the header.
@@ -107,20 +105,21 @@ static int openLogFile() XRAY_NEVER_INSTRUMENT {
   return F;
 }
 
-int getGlobalFd() XRAY_NEVER_INSTRUMENT {
-  static int Fd = openLogFile();
+static int getGlobalFd() XRAY_NEVER_INSTRUMENT {
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  static int Fd = 0;
+  pthread_once(&OnceInit, +[] { Fd = openLogFile(); });
   return Fd;
 }
 
-ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
   thread_local ThreadLocalData TLD;
   thread_local bool UNUSED TOnce = [] {
     if (GlobalOptions.ThreadBufferSize == 0) {
-      if (__sanitizer::Verbosity())
+      if (Verbosity())
         Report("Not initializing TLD since ThreadBufferSize == 0.\n");
       return false;
     }
-    TLD.TID = __sanitizer::GetTid();
     pthread_setspecific(PThreadKey, &TLD);
     TLD.Fd = getGlobalFd();
     TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>(
@@ -129,7 +128,7 @@ ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
     TLD.BufferSize = GlobalOptions.ThreadBufferSize;
     TLD.BufferOffset = 0;
     if (GlobalOptions.MaxStackDepth == 0) {
-      if (__sanitizer::Verbosity())
+      if (Verbosity())
         Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n");
       TLD.StackSize = 0;
       TLD.StackEntries = 0;
@@ -141,13 +140,6 @@ ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
                       alignof(StackEntry)));
     TLD.StackSize = GlobalOptions.MaxStackDepth;
     TLD.StackEntries = 0;
-    if (__sanitizer::Verbosity() >= 2) {
-      static auto UNUSED Once = [] {
-        auto ticks = thresholdTicks();
-        Report("Ticks threshold: %d\n", ticks);
-        return false;
-      }();
-    }
     return false;
   }();
   return TLD;
@@ -157,7 +149,6 @@ template <class RDTSC>
 void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
                     RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
   auto &TLD = getThreadLocalData();
-  auto &InMemoryBuffer = TLD.InMemoryBuffer;
   int Fd = getGlobalFd();
   if (Fd == -1)
     return;
@@ -165,10 +156,9 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
   // Use a simple recursion guard, to handle cases where we're already logging
   // and for one reason or another, this function gets called again in the same
   // thread.
-  if (RecursionGuard)
+  RecursionGuard G(Guard);
+  if (!G)
     return;
-  RecursionGuard = true;
-  auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; });
 
   uint8_t CPU = 0;
   uint64_t TSC = ReadTSC(CPU);
@@ -189,7 +179,7 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
     E.TSC = TSC;
     auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
                          (sizeof(StackEntry) * (TLD.StackEntries - 1));
-    __sanitizer::internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry));
+    internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry));
     break;
   }
   case XRayEntryType::EXIT:
@@ -213,12 +203,12 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
     StackEntry StackTop;
     auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
                          (sizeof(StackEntry) * TLD.StackEntries);
-    __sanitizer::internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry));
+    internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry));
     if (StackTop.FuncId == FuncId && StackTop.CPU == CPU &&
         StackTop.TSC < TSC) {
       auto Delta = TSC - StackTop.TSC;
-      if (Delta < thresholdTicks()) {
-        assert(TLD.BufferOffset > 0);
+      if (Delta < atomic_load(&ThresholdTicks, memory_order_relaxed)) {
+        DCHECK(TLD.BufferOffset > 0);
         TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2;
         return;
       }
@@ -227,27 +217,26 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
   }
   default:
     // Should be unreachable.
-    assert(false && "Unsupported XRayEntryType encountered.");
+    DCHECK(false && "Unsupported XRayEntryType encountered.");
     break;
   }
 
   // First determine whether the delta between the function's enter record and
   // the exit record is higher than the threshold.
-  __xray::XRayRecord R;
+  XRayRecord R;
   R.RecordType = RecordTypes::NORMAL;
   R.CPU = CPU;
   R.TSC = TSC;
-  R.TId = TLD.TID;
+  R.TId = GetTid(); 
+  R.PId = internal_getpid(); 
   R.Type = Type;
   R.FuncId = FuncId;
-  auto EntryPtr = static_cast<char *>(InMemoryBuffer) +
-                  (sizeof(__xray::XRayRecord) * TLD.BufferOffset);
-  __sanitizer::internal_memcpy(EntryPtr, &R, sizeof(R));
+  auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer);
+  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
   if (++TLD.BufferOffset == TLD.BufferSize) {
-    __sanitizer::SpinMutexLock L(&LogMutex);
-    auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer);
-    retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer),
-                     reinterpret_cast<char *>(RecordBuffer + TLD.BufferOffset));
+    SpinMutexLock L(&LogMutex);
+    retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
+                     reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
     TLD.BufferOffset = 0;
     TLD.StackEntries = 0;
   }
@@ -257,8 +246,8 @@ template <class RDTSC>
 void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
                            RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
   auto &TLD = getThreadLocalData();
-  auto &InMemoryBuffer = TLD.InMemoryBuffer;
-  auto &Offset = TLD.BufferOffset;
+  auto FirstEntry =
+      reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer);
   const auto &BuffLen = TLD.BufferSize;
   int Fd = getGlobalFd();
   if (Fd == -1)
@@ -267,45 +256,41 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
   // First we check whether there's enough space to write the data consecutively
   // in the thread-local buffer. If not, we first flush the buffer before
   // attempting to write the two records that must be consecutive.
-  if (Offset + 2 > BuffLen) {
-    __sanitizer::SpinMutexLock L(&LogMutex);
-    auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer);
-    retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer),
-                     reinterpret_cast<char *>(RecordBuffer + Offset));
-    Offset = 0;
+  if (TLD.BufferOffset + 2 > BuffLen) {
+    SpinMutexLock L(&LogMutex);
+    retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
+                     reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+    TLD.BufferOffset = 0;
     TLD.StackEntries = 0;
   }
 
   // Then we write the "we have an argument" record.
   InMemoryRawLog(FuncId, Type, ReadTSC);
 
-  if (RecursionGuard)
+  RecursionGuard G(Guard);
+  if (!G)
     return;
-  RecursionGuard = true;
-  auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; });
 
-  // And from here on write the arg payload.
-  __xray::XRayArgPayload R;
+  // And, from here on write the arg payload.
+  XRayArgPayload R;
   R.RecordType = RecordTypes::ARG_PAYLOAD;
   R.FuncId = FuncId;
-  R.TId = TLD.TID;
+  R.TId = GetTid(); 
+  R.PId = internal_getpid(); 
   R.Arg = Arg1;
-  auto EntryPtr =
-      &reinterpret_cast<__xray::XRayArgPayload *>(&InMemoryBuffer)[Offset];
-  std::memcpy(EntryPtr, &R, sizeof(R));
-  if (++Offset == BuffLen) {
-    __sanitizer::SpinMutexLock L(&LogMutex);
-    auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer);
-    retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer),
-                     reinterpret_cast<char *>(RecordBuffer + Offset));
-    Offset = 0;
+  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
+  if (++TLD.BufferOffset == BuffLen) {
+    SpinMutexLock L(&LogMutex);
+    retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
+                     reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+    TLD.BufferOffset = 0;
     TLD.StackEntries = 0;
   }
 }
 
 void basicLoggingHandleArg0RealTSC(int32_t FuncId,
                                    XRayEntryType Type) XRAY_NEVER_INSTRUMENT {
-  InMemoryRawLog(FuncId, Type, __xray::readTSC);
+  InMemoryRawLog(FuncId, Type, readTSC);
 }
 
 void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type)
@@ -318,13 +303,13 @@ void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type)
       TS = {0, 0};
     }
     CPU = 0;
-    return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
+    return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
   });
 }
 
 void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type,
                                    uint64_t Arg1) XRAY_NEVER_INSTRUMENT {
-  InMemoryRawLogWithArg(FuncId, Type, Arg1, __xray::readTSC);
+  InMemoryRawLogWithArg(FuncId, Type, Arg1, readTSC);
 }
 
 void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type,
@@ -338,34 +323,34 @@ void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type,
           TS = {0, 0};
         }
         CPU = 0;
-        return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
+        return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
       });
 }
 
 static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
   ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P);
-  auto ExitGuard = __sanitizer::at_scope_exit([&TLD] {
+  auto ExitGuard = at_scope_exit([&TLD] {
     // Clean up dynamic resources.
     if (TLD.InMemoryBuffer)
       InternalFree(TLD.InMemoryBuffer);
     if (TLD.ShadowStack)
       InternalFree(TLD.ShadowStack);
-    if (__sanitizer::Verbosity())
-      Report("Cleaned up log for TID: %d\n", TLD.TID);
+    if (Verbosity())
+      Report("Cleaned up log for TID: %d\n", GetTid());
   });
 
   if (TLD.Fd == -1 || TLD.BufferOffset == 0) {
-    if (__sanitizer::Verbosity())
-      Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", TLD.TID,
+    if (Verbosity())
+      Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", GetTid(),
              TLD.Fd, TLD.BufferOffset);
     return;
   }
 
   {
-    __sanitizer::SpinMutexLock L(&LogMutex);
+    SpinMutexLock L(&LogMutex);
     retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer),
                      reinterpret_cast<char *>(TLD.InMemoryBuffer) +
-                         (sizeof(__xray::XRayRecord) * TLD.BufferOffset));
+                         (sizeof(XRayRecord) * TLD.BufferOffset));
   }
 
   // Because this thread's exit could be the last one trying to write to
@@ -378,45 +363,89 @@ static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
 XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax,
                                    void *Options,
                                    size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  static bool UNUSED Once = [] {
-    pthread_key_create(&PThreadKey, TLDDestructor);
-    return false;
-  }();
-
   uint8_t Expected = 0;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &BasicInitialized, &Expected, 1, __sanitizer::memory_order_acq_rel)) {
-    if (__sanitizer::Verbosity())
+  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1,
+                                      memory_order_acq_rel)) {
+    if (Verbosity())
       Report("Basic logging already initialized.\n");
     return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
   }
 
-  if (OptionsSize != sizeof(BasicLoggingOptions)) {
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  pthread_once(&OnceInit, +[] {
+    pthread_key_create(&PThreadKey, TLDDestructor);
+    // Probe once; UseRealTSC and TicksPerSec must agree on the answer.
+    const bool HasTSC = probeRequiredCPUFeatures();
+    atomic_store(&UseRealTSC, HasTSC, memory_order_release);
+    atomic_store(&TicksPerSec,
+                 HasTSC ? getTSCFrequency() : NanosecondsPerSecond,
+                 memory_order_release);
+    if (!HasTSC && Verbosity())
+      Report("WARNING: Required CPU features missing for XRay instrumentation, "
+             "using emulation instead.\n");
+  });
+
+  if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) {
+    FlagParser P;
+    BasicFlags F;
+    F.setDefaults();
+    registerXRayBasicFlags(&P, &F);
+    P.ParseString(useCompilerDefinedBasicFlags());
+    auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
+    if (EnvOpts == nullptr)
+      EnvOpts = "";
+
+    P.ParseString(EnvOpts);
+
+    // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
+    // set through XRAY_OPTIONS instead.
+    if (internal_strlen(EnvOpts) == 0) {
+      F.func_duration_threshold_us =
+          flags()->xray_naive_log_func_duration_threshold_us;
+      F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
+      F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
+    }
+
+    P.ParseString(static_cast<const char *>(Options));
+    GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
+    GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
+    GlobalOptions.MaxStackDepth = F.max_stack_depth;
+    *basicFlags() = F;
+  } else if (OptionsSize != sizeof(BasicLoggingOptions)) {
     Report("Invalid options size, potential ABI mismatch; expected %d got %d",
            sizeof(BasicLoggingOptions), OptionsSize);
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  } else {
+    if (Verbosity())
+      Report("XRay Basic: struct-based init is deprecated, please use "
+             "string-based configuration instead.\n");
+    GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options);
   }
 
-  static auto UseRealTSC = probeRequiredCPUFeatures();
-  if (!UseRealTSC && __sanitizer::Verbosity())
-    Report("WARNING: Required CPU features missing for XRay instrumentation, "
-           "using emulation instead.\n");
-
-  GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options);
-  __xray_set_handler_arg1(UseRealTSC ? basicLoggingHandleArg1RealTSC
-                                     : basicLoggingHandleArg1EmulateTSC);
-  __xray_set_handler(UseRealTSC ? basicLoggingHandleArg0RealTSC
-                                : basicLoggingHandleArg0EmulateTSC);
+  atomic_store(&ThresholdTicks,
+               atomic_load(&TicksPerSec, memory_order_acquire) *
+                   GlobalOptions.DurationFilterMicros / 1000000,
+               memory_order_release);
+  __xray_set_handler_arg1(atomic_load(&UseRealTSC, memory_order_acquire)
+                              ? basicLoggingHandleArg1RealTSC
+                              : basicLoggingHandleArg1EmulateTSC);
+  __xray_set_handler(atomic_load(&UseRealTSC, memory_order_acquire)
+                         ? basicLoggingHandleArg0RealTSC
+                         : basicLoggingHandleArg0EmulateTSC);
+
+  // TODO: Implement custom event and typed event handling support in Basic
+  // Mode.
   __xray_remove_customevent_handler();
+  __xray_remove_typedevent_handler();
 
   return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
 }
 
 XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT {
   uint8_t Expected = 0;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &BasicInitialized, &Expected, 0, __sanitizer::memory_order_acq_rel) &&
-      __sanitizer::Verbosity())
+  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 0,
+                                      memory_order_acq_rel) &&
+      Verbosity())
     Report("Basic logging already finalized.\n");
 
   // Nothing really to do aside from marking state of the global to be
@@ -444,24 +473,41 @@ bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
   };
   auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl);
   if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
-      __sanitizer::Verbosity())
+      Verbosity())
     Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n",
            RegistrationResult);
   if (flags()->xray_naive_log ||
-      !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-basic")) {
-    __xray_set_log_impl(Impl);
-    BasicLoggingOptions Options;
-    Options.DurationFilterMicros =
-        flags()->xray_naive_log_func_duration_threshold_us;
-    Options.MaxStackDepth = flags()->xray_naive_log_max_stack_depth;
-    Options.ThreadBufferSize = flags()->xray_naive_log_thread_buffer_size;
-    __xray_log_init(flags()->xray_naive_log_thread_buffer_size, 0, &Options,
-                    sizeof(BasicLoggingOptions));
-    static auto UNUSED Once = [] {
-      static auto UNUSED &TLD = getThreadLocalData();
-      __sanitizer::Atexit(+[] { TLDDestructor(&TLD); });
+      !internal_strcmp(flags()->xray_mode, "xray-basic")) {
+    auto SelectResult = __xray_log_select_mode("xray-basic");
+    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+      if (Verbosity())
+        Report("Failed selecting XRay Basic Mode; error = %d\n", SelectResult);
       return false;
-    }();
+    }
+
+    // We initialize the implementation using the data we get from the
+    // XRAY_BASIC_OPTIONS environment variable, at this point of the
+    // implementation.
+    auto *Env = GetEnv("XRAY_BASIC_OPTIONS");
+    auto InitResult =
+        __xray_log_init_mode("xray-basic", Env == nullptr ? "" : Env);
+    if (InitResult != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
+      if (Verbosity())
+        Report("Failed initializing XRay Basic Mode; error = %d\n", InitResult);
+      return false;
+    }
+
+    // At this point we know that we've successfully initialized Basic mode
+    // tracing, and the only chance we're going to get for the current thread to
+    // clean-up may be at thread/program exit. To ensure that we're going to get
+    // the cleanup even without calling the finalization routines, we're
+    // registering a program exit function that will do the cleanup.
+    static pthread_once_t DynamicOnce = PTHREAD_ONCE_INIT;
+    pthread_once(&DynamicOnce, +[] {
+      static void *FakeTLD = nullptr;
+      FakeTLD = &getThreadLocalData();
+      Atexit(+[] { TLDDestructor(FakeTLD); });
+    });
   }
   return true;
 }
diff --git a/lib/xray/xray_inmemory_log.h b/lib/xray/xray_basic_logging.h
index e4fcb8ca5ffd..1639b96d91a1 100644
--- a/lib/xray/xray_inmemory_log.h
+++ b/lib/xray/xray_basic_logging.h
@@ -1,5 +1,4 @@
-//===-- xray_inmemory_log.h
-//------------------------------------------------===//
+//===-- xray_basic_logging.h ----------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc
index a0018f6b0cba..8dfcc23540b1 100644
--- a/lib/xray/xray_buffer_queue.cc
+++ b/lib/xray/xray_buffer_queue.cc
@@ -16,14 +16,37 @@
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
+#include <memory>
 
 using namespace __xray;
 using namespace __sanitizer;
 
+template <class T> static T *initArray(size_t N) {
+  // Storage for N value-initialized objects of type T; nullptr on failure.
+  auto *Raw = InternalAlloc(N * sizeof(T), nullptr, kCacheLineSize);
+  auto *Array = reinterpret_cast<T *>(Raw);
+  for (size_t Idx = N; Array != nullptr && Idx > 0; --Idx)
+    new (Array + (Idx - 1)) T();
+  return Array;
+}
+
 BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
-    : BufferSize(B), Buffers(new BufferRep[N]()), BufferCount(N), Finalizing{0},
-      OwnedBuffers(new void *[N]()), Next(Buffers), First(Buffers),
-      LiveBuffers(0) {
+    : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)),
+      BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)),
+      Next(Buffers), First(Buffers), LiveBuffers(0) {
+  if (Buffers == nullptr) {
+    Success = false;
+    return;
+  }
+  if (OwnedBuffers == nullptr) {
+    // Clean up the buffers we've already allocated.
+    for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
+      B->~BufferRep();
+    InternalFree(Buffers);
+    Success = false;
+    return;
+  };
+
   for (size_t i = 0; i < N; ++i) {
     auto &T = Buffers[i];
     void *Tmp = InternalAlloc(BufferSize, nullptr, 64);
@@ -37,7 +60,7 @@ BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
       return;
     }
     auto &Buf = T.Buff;
-    Buf.Buffer = Tmp;
+    Buf.Data = Tmp;
     Buf.Size = B;
     Buf.Extents = reinterpret_cast<BufferExtents *>(Extents);
     OwnedBuffers[i] = Tmp;
@@ -46,9 +69,9 @@ BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
 }
 
 BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
-  if (__sanitizer::atomic_load(&Finalizing, __sanitizer::memory_order_acquire))
+  if (atomic_load(&Finalizing, memory_order_acquire))
     return ErrorCode::QueueFinalizing;
-  __sanitizer::SpinMutexLock Guard(&Mutex);
+  SpinMutexLock Guard(&Mutex);
   if (LiveBuffers == BufferCount)
     return ErrorCode::NotEnoughMemory;
 
@@ -68,7 +91,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
   // Blitz through the buffers array to find the buffer.
   bool Found = false;
   for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) {
-    if (*I == Buf.Buffer) {
+    if (*I == Buf.Data) {
       Found = true;
       break;
     }
@@ -76,7 +99,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
   if (!Found)
     return ErrorCode::UnrecognizedBuffer;
 
-  __sanitizer::SpinMutexLock Guard(&Mutex);
+  SpinMutexLock Guard(&Mutex);
 
   // This points to a semantic bug, we really ought to not be releasing more
   // buffers than we actually get.
@@ -86,7 +109,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
   // Now that the buffer has been released, we mark it as "used".
   First->Buff = Buf;
   First->Used = true;
-  Buf.Buffer = nullptr;
+  Buf.Data = nullptr;
   Buf.Size = 0;
   --LiveBuffers;
   if (++First == (Buffers + BufferCount))
@@ -96,8 +119,7 @@ BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
 }
 
 BufferQueue::ErrorCode BufferQueue::finalize() {
-  if (__sanitizer::atomic_exchange(&Finalizing, 1,
-                                   __sanitizer::memory_order_acq_rel))
+  if (atomic_exchange(&Finalizing, 1, memory_order_acq_rel))
     return ErrorCode::QueueFinalizing;
   return ErrorCode::Ok;
 }
@@ -106,9 +128,11 @@ BufferQueue::~BufferQueue() {
   for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
     auto &T = *I;
     auto &Buf = T.Buff;
-    InternalFree(Buf.Buffer);
+    InternalFree(Buf.Data);
     InternalFree(Buf.Extents);
   }
-  delete[] Buffers;
-  delete[] OwnedBuffers;
+  for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
+    B->~BufferRep();
+  InternalFree(Buffers);
+  InternalFree(OwnedBuffers);
 }
diff --git a/lib/xray/xray_buffer_queue.h b/lib/xray/xray_buffer_queue.h
index 1ceb58274616..e76fa7983c90 100644
--- a/lib/xray/xray_buffer_queue.h
+++ b/lib/xray/xray_buffer_queue.h
@@ -15,9 +15,10 @@
 #ifndef XRAY_BUFFER_QUEUE_H
 #define XRAY_BUFFER_QUEUE_H
 
-#include <cstddef>
 #include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include <cstddef>
 
 namespace __xray {
 
@@ -27,18 +28,17 @@ namespace __xray {
 /// the "flight data recorder" (FDR) mode to support ongoing XRay function call
 /// trace collection.
 class BufferQueue {
- public:
+public:
   struct alignas(64) BufferExtents {
-    __sanitizer::atomic_uint64_t Size;
+    atomic_uint64_t Size;
   };
 
   struct Buffer {
-    void *Buffer = nullptr;
+    void *Data = nullptr;
     size_t Size = 0;
-    BufferExtents* Extents;
+    BufferExtents *Extents;
   };
 
- private:
   struct BufferRep {
     // The managed buffer.
     Buffer Buff;
@@ -48,14 +48,72 @@ class BufferQueue {
     bool Used = false;
   };
 
+private:
+  // This models a ForwardIterator. |T| Must be either a `Buffer` or `const
+  // Buffer`. Note that we only advance to the "used" buffers, when
+  // incrementing, so that at dereference we're always at a valid point.
+  template <class T> class Iterator {
+  public:
+    BufferRep *Buffers = nullptr;
+    size_t Offset = 0;
+    size_t Max = 0;
+
+    Iterator &operator++() {
+      DCHECK_NE(Offset, Max);
+      // Test the bound before 'Used': Buffers[Max] is one past the last slot.
+      while (++Offset != Max && !Buffers[Offset].Used) {
+      }
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      Iterator C = *this;
+      ++(*this);
+      return C;
+    }
+
+    T &operator*() const { return Buffers[Offset].Buff; }
+
+    T *operator->() const { return &(Buffers[Offset].Buff); }
+
+    Iterator(BufferRep *Root, size_t O, size_t M)
+        : Buffers(Root), Offset(O), Max(M) {
+      // Advance to the first Offset whose 'Used' property is true, or to the
+      // end of the queue. The bound is tested first so that an end iterator
+      // (O == M) never reads Buffers[Max], which is past the end of the array.
+      while (Offset != Max && !Buffers[Offset].Used)
+        ++Offset;
+    }
+
+    Iterator() = default;
+    Iterator(const Iterator &) = default;
+    Iterator(Iterator &&) = default;
+    Iterator &operator=(const Iterator &) = default;
+    Iterator &operator=(Iterator &&) = default;
+    ~Iterator() = default;
+
+    template <class V>
+    friend bool operator==(const Iterator &L, const Iterator<V> &R) {
+      DCHECK_EQ(L.Max, R.Max);
+      return L.Buffers == R.Buffers && L.Offset == R.Offset;
+    }
+
+    template <class V>
+    friend bool operator!=(const Iterator &L, const Iterator<V> &R) {
+      return !(L == R);
+    }
+  };
+
   // Size of each individual Buffer.
   size_t BufferSize;
 
   BufferRep *Buffers;
+
+  // Amount of pre-allocated buffers.
   size_t BufferCount;
 
-  __sanitizer::SpinMutex Mutex;
-  __sanitizer::atomic_uint8_t Finalizing;
+  SpinMutex Mutex;
+  atomic_uint8_t Finalizing;
 
   // Pointers to buffers managed/owned by the BufferQueue.
   void **OwnedBuffers;
@@ -70,7 +128,7 @@ class BufferQueue {
   // Count of buffers that have been handed out through 'getBuffer'.
   size_t LiveBuffers;
 
- public:
+public:
   enum class ErrorCode : unsigned {
     Ok,
     NotEnoughMemory,
@@ -81,16 +139,16 @@ class BufferQueue {
 
   static const char *getErrorString(ErrorCode E) {
     switch (E) {
-      case ErrorCode::Ok:
-        return "(none)";
-      case ErrorCode::NotEnoughMemory:
-        return "no available buffers in the queue";
-      case ErrorCode::QueueFinalizing:
-        return "queue already finalizing";
-      case ErrorCode::UnrecognizedBuffer:
-        return "buffer being returned not owned by buffer queue";
-      case ErrorCode::AlreadyFinalized:
-        return "queue already finalized";
+    case ErrorCode::Ok:
+      return "(none)";
+    case ErrorCode::NotEnoughMemory:
+      return "no available buffers in the queue";
+    case ErrorCode::QueueFinalizing:
+      return "queue already finalizing";
+    case ErrorCode::UnrecognizedBuffer:
+      return "buffer being returned not owned by buffer queue";
+    case ErrorCode::AlreadyFinalized:
+      return "queue already finalized";
     }
     return "unknown error";
   }
@@ -122,8 +180,7 @@ class BufferQueue {
   ErrorCode releaseBuffer(Buffer &Buf);
 
   bool finalizing() const {
-    return __sanitizer::atomic_load(&Finalizing,
-                                    __sanitizer::memory_order_acquire);
+    return atomic_load(&Finalizing, memory_order_acquire);
   }
 
   /// Returns the configured size of the buffers in the buffer queue.
@@ -141,19 +198,29 @@ class BufferQueue {
   /// Applies the provided function F to each Buffer in the queue, only if the
   /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a
   /// releaseBuffer(...) operation).
-  template <class F>
-  void apply(F Fn) {
-    __sanitizer::SpinMutexLock G(&Mutex);
-    for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
-      const auto &T = *I;
-      if (T.Used) Fn(T.Buff);
-    }
+  template <class F> void apply(F Fn) {
+    SpinMutexLock G(&Mutex);
+    for (auto I = begin(), E = end(); I != E; ++I)
+      Fn(*I);
+  }
+
+  using const_iterator = Iterator<const Buffer>;
+  using iterator = Iterator<Buffer>;
+
+  /// Provides iterator access to the raw Buffer instances.
+  iterator begin() const { return iterator(Buffers, 0, BufferCount); }
+  const_iterator cbegin() const {
+    return const_iterator(Buffers, 0, BufferCount);
+  }
+  iterator end() const { return iterator(Buffers, BufferCount, BufferCount); }
+  const_iterator cend() const {
+    return const_iterator(Buffers, BufferCount, BufferCount);
   }
 
   // Cleans up allocated buffers.
   ~BufferQueue();
 };
 
-}  // namespace __xray
+} // namespace __xray
 
-#endif  // XRAY_BUFFER_QUEUE_H
+#endif // XRAY_BUFFER_QUEUE_H
diff --git a/lib/xray/xray_fdr_flags.cc b/lib/xray/xray_fdr_flags.cc
new file mode 100644
index 000000000000..a14851b1b616
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.cc
@@ -0,0 +1,48 @@
+//===-- xray_fdr_flags.cc ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay FDR flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_fdr_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+FDRFlags xray_fdr_flags_dont_use_directly; // use via fdrFlags().
+
+void FDRFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_fdr_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerXRayFDRFlags(FlagParser *P, FDRFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_fdr_flags.inc"
+#undef XRAY_FLAG
+}
+
+const char *useCompilerDefinedFDRFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_FDR_OPTIONS
+  return SANITIZER_STRINGIFY(XRAY_FDR_OPTIONS);
+#else
+  return "";
+#endif
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_fdr_flags.h b/lib/xray/xray_fdr_flags.h
new file mode 100644
index 000000000000..9c953f1cabcf
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.h
@@ -0,0 +1,38 @@
+//===-- xray_fdr_flags.h ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the flags for the flight-data-recorder mode implementation.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FDR_FLAGS_H
+#define XRAY_FDR_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct FDRFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_fdr_flags.inc"
+#undef XRAY_FLAG
+
+  void setDefaults();
+};
+
+extern FDRFlags xray_fdr_flags_dont_use_directly;
+extern void registerXRayFDRFlags(FlagParser *P, FDRFlags *F);
+const char *useCompilerDefinedFDRFlags();
+inline FDRFlags *fdrFlags() { return &xray_fdr_flags_dont_use_directly; }
+
+} // namespace __xray
+
+#endif // XRAY_FDR_FLAGS_H
diff --git a/lib/xray/xray_fdr_flags.inc b/lib/xray/xray_fdr_flags.inc
new file mode 100644
index 000000000000..d8721ad12cbe
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.inc
@@ -0,0 +1,29 @@
+//===-- xray_fdr_flags.inc --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay FDR Mode runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+// FDR (Flight Data Recorder) Mode logging options.
+XRAY_FLAG(int, func_duration_threshold_us, 5,
+          "FDR logging will try to skip functions that execute for fewer "
+          "microseconds than this threshold.")
+XRAY_FLAG(int, grace_period_ms, 100,
+          "FDR logging will wait this much time in milliseconds before "
+          "actually flushing the log; this gives a chance for threads to "
+          "notice that the log has been finalized and clean up.")
+XRAY_FLAG(int, buffer_size, 16384,
+          "Size of buffers in the circular buffer queue.")
+XRAY_FLAG(int, buffer_max, 100, "Maximum number of buffers in the queue.")
+XRAY_FLAG(bool, no_file_flush, false,
+          "Set to true to not write log files by default.")
diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h
index 324208db82ca..87096d4fc29e 100644
--- a/lib/xray/xray_fdr_log_records.h
+++ b/lib/xray/xray_fdr_log_records.h
@@ -32,6 +32,8 @@ struct alignas(16) MetadataRecord {
     CustomEventMarker,
     CallArgument,
     BufferExtents,
+    TypedEventMarker,
+    Pid,
   };
 
   // Use 7 bits to identify this record type.
diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc
index 1bfa10c21f5c..6cb2dfa0c658 100644
--- a/lib/xray/xray_fdr_logging.cc
+++ b/lib/xray/xray_fdr_logging.cc
@@ -15,64 +15,836 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_fdr_logging.h"
+#include <cassert>
 #include <errno.h>
+#include <limits>
+#include <memory>
+#include <pthread.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
 #include <time.h>
 #include <unistd.h>
 
+#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "xray/xray_interface.h"
 #include "xray/xray_records.h"
 #include "xray_buffer_queue.h"
 #include "xray_defs.h"
-#include "xray_fdr_logging_impl.h"
+#include "xray_fdr_flags.h"
 #include "xray_flags.h"
+#include "xray_recursion_guard.h"
 #include "xray_tsc.h"
 #include "xray_utils.h"
 
 namespace __xray {
 
+atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+// Group together thread-local-data in a struct, then hide it behind a function
+// call so that it can be initialized on first use instead of as a global. We
+// force the alignment to 64-bytes for x86 cache line alignment, as this
+// structure is used in the hot path of the implementation.
+struct alignas(64) ThreadLocalData {
+  BufferQueue::Buffer Buffer;
+  char *RecordPtr = nullptr;
+  // The number of FunctionEntry records immediately preceding RecordPtr.
+  uint8_t NumConsecutiveFnEnters = 0;
+
+  // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
+  // records preceding RecordPtr.
+  uint8_t NumTailCalls = 0;
+
+  // We use a thread_local variable to keep track of which CPUs we've already
+  // run, and the TSC times for these CPUs. This allows us to stop repeating the
+  // CPU field in the function records.
+  //
+  // We assume that we'll support only 65536 CPUs for x86_64.
+  uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
+  uint64_t LastTSC = 0;
+  uint64_t LastFunctionEntryTSC = 0;
+
+  // Make sure a thread that's ever called handleArg0 has a thread-local
+  // live reference to the buffer queue for this particular instance of
+  // FDRLogging, and that we're going to clean it up when the thread exits.
+  BufferQueue *BQ = nullptr;
+};
+
+static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
+              "ThreadLocalData must be trivially destructible");
+
+static constexpr auto MetadataRecSize = sizeof(MetadataRecord);
+static constexpr auto FunctionRecSize = sizeof(FunctionRecord);
+
+// Use a global pthread key to identify thread-local data for logging.
+static pthread_key_t Key;
+
 // Global BufferQueue.
-BufferQueue *BQ = nullptr;
+static BufferQueue *BQ = nullptr;
 
-__sanitizer::atomic_sint32_t LogFlushStatus = {
+static atomic_sint32_t LogFlushStatus = {
     XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
 
-FDRLoggingOptions FDROptions;
+static FDRLoggingOptions FDROptions;
+
+static SpinMutex FDROptionsMutex;
+
+// This function will initialize the thread-local data structure used by the FDR
+// logging implementation and return a reference to it. The implementation
+// details require a bit of care to maintain.
+//
+// First, some requirements on the implementation in general:
+//
+//   - XRay handlers should not call any memory allocation routines that may
+//     delegate to an instrumented implementation. This means functions like
+//     malloc() and free() should not be called while instrumenting.
+//
+//   - We would like to use some thread-local data initialized on first-use of
+//     the XRay instrumentation. These allow us to implement unsynchronized
+//     routines that access resources associated with the thread.
+//
+// The implementation here uses a few mechanisms that allow us to provide both
+// the requirements listed above. We do this by:
+//
+//   1. Using a thread-local aligned storage buffer for representing the
+//      ThreadLocalData struct. This data will be uninitialized memory by
+//      design.
+//
+//   2. Not requiring a thread exit handler/implementation, keeping the
+//      thread-local as purely a collection of references/data that do not
+//      require cleanup.
+//
+// We're doing this to avoid using a `thread_local` object that has a
+// non-trivial destructor, because the C++ runtime might call std::malloc(...)
+// to register calls to destructors. Deadlocks may arise when, for example, an
+// externally provided malloc implementation is XRay instrumented, and
+// initializing the thread-locals involves calling into malloc. A malloc
+// implementation that does global synchronization might be holding a lock for a
+// critical section, calling a function that might be XRay instrumented (and
+// thus in turn calling into malloc by virtue of registration of the
+// thread_local's destructor).
+static_assert(alignof(ThreadLocalData) >= 64,
+              "ThreadLocalData must be cache line aligned.");
+static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+  thread_local typename std::aligned_storage<
+      sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{};
+
+  // The pthread key doubles as the per-thread "already constructed" marker.
+  if (pthread_getspecific(Key) == NULL) {
+    new (reinterpret_cast<ThreadLocalData *>(&TLDStorage)) ThreadLocalData{};
+    pthread_setspecific(Key, &TLDStorage);
+  }
+  return *reinterpret_cast<ThreadLocalData *>(&TLDStorage);
+}
+
+static void writeNewBufferPreamble(tid_t Tid, timespec TS,
+                                   pid_t Pid) XRAY_NEVER_INSTRUMENT {
+  static constexpr int InitRecordsCount = 3; // NewBuffer, Walltime, Pid.
+  auto &TLD = getThreadLocalData();
+  MetadataRecord Metadata[InitRecordsCount];
+  {
+    // First preamble record: a NewBuffer MetadataRecord marking the start of a
+    // new buffer and tying it to the thread that writes it.  For the data, we
+    // have 15 bytes to squeeze as much information as we can.  At this point
+    // we only write down the following bytes:
+    //   - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes)
+    auto &NewBuffer = Metadata[0];
+    NewBuffer.Type = uint8_t(RecordType::Metadata);
+    NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer);
+    int32_t tid = static_cast<int32_t>(Tid);
+    internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid));
+  }
+
+  // Second preamble record: the WalltimeMarker built from the caller's TS.
+  {
+    static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
+    auto &WalltimeMarker = Metadata[1];
+    WalltimeMarker.Type = uint8_t(RecordType::Metadata);
+    WalltimeMarker.RecordKind =
+        uint8_t(MetadataRecord::RecordKinds::WalltimeMarker);
+
+    // We only really need microsecond precision here, and enforce across
+    // platforms that we need 64-bit seconds and 32-bit microseconds encoded in
+    // the Metadata record (seconds first, microseconds right after them).
+    int32_t Micros = TS.tv_nsec / 1000;
+    int64_t Seconds = TS.tv_sec;
+    internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds));
+    internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros,
+                    sizeof(Micros));
+  }
+
+  // Third preamble record: the Pid record.
+  {
+    // Holds the caller-supplied pid, narrowed to a 32-bit value.
+    auto &PidMetadata = Metadata[2];
+    PidMetadata.Type = uint8_t(RecordType::Metadata);
+    PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid);
+    int32_t pid = static_cast<int32_t>(Pid);
+    internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid));
+  }
+
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
+  if (TLD.BQ == nullptr || TLD.BQ->finalizing())
+    return; // No queue, or it is finalizing: the records are not emitted.
+  internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
+  TLD.RecordPtr += sizeof(Metadata);
+  // Since we write out the extents as the first metadata record of the
+  // buffer, we need to write out the extents including the extents record.
+  atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
+               memory_order_release);
+}
+
+static void setupNewBuffer(int (*wall_clock_reader)(
+    clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
+  // Rewind the write cursor to the start of the thread's current buffer.
+  auto &TLD = getThreadLocalData();
+  TLD.RecordPtr = static_cast<char *>(TLD.Buffer.Data);
+  // Collect the preamble inputs; the clock reader is typically clock_gettime,
+  // but callers have injection ability.
+  const tid_t Tid = GetTid();
+  timespec TS{0, 0};
+  const pid_t Pid = internal_getpid();
+  wall_clock_reader(CLOCK_MONOTONIC, &TS);
+  writeNewBufferPreamble(Tid, TS, Pid);
+  TLD.NumConsecutiveFnEnters = TLD.NumTailCalls = 0;
+}
+
+static void incrementExtents(size_t Add) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData(); // Extents of this thread's buffer grow.
+  atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel);
+}
+
+static void decrementExtents(size_t Subtract) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData(); // Extents of this thread's buffer shrink.
+  atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel);
+}
+
+// Appends a NewCPUId metadata record carrying `CPU` and the full `TSC` to the
+// calling thread's buffer, advances the record cursor and the buffer extents,
+// and clears the rewind counters.
+static void writeNewCPUIdMetadata(uint16_t CPU,
+                                  uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+  auto &ThreadData = getThreadLocalData();
+
+  MetadataRecord Record;
+  Record.Type = uint8_t(RecordType::Metadata);
+  Record.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
+
+  // Payload layout (10 bytes in total):
+  //   - CPU ID   (uint16_t, 2 bytes)
+  //   - Full TSC (uint64_t, 8 bytes)
+  auto *Payload = Record.Data;
+  internal_memcpy(Payload, &CPU, sizeof(CPU));
+  internal_memcpy(Payload + sizeof(CPU), &TSC, sizeof(TSC));
+
+  // Copy the finished record into the buffer and account for it.
+  internal_memcpy(ThreadData.RecordPtr, &Record, sizeof(MetadataRecord));
+  ThreadData.RecordPtr += sizeof(MetadataRecord);
+  ThreadData.NumConsecutiveFnEnters = 0;
+  ThreadData.NumTailCalls = 0;
+  incrementExtents(sizeof(MetadataRecord));
+}
+
+// Appends a TSCWrap metadata record carrying the full `TSC` to the calling
+// thread's buffer, advances the record cursor and the buffer extents, and
+// clears the rewind counters.
+static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+  auto &ThreadData = getThreadLocalData();
+
+  MetadataRecord Record;
+  Record.Type = uint8_t(RecordType::Metadata);
+  Record.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
+
+  // Payload layout (8 bytes in total):
+  //   - Full TSC (uint64_t, 8 bytes)
+  internal_memcpy(&Record.Data, &TSC, sizeof(TSC));
+
+  // Copy the finished record into the buffer and account for it.
+  internal_memcpy(ThreadData.RecordPtr, &Record, sizeof(MetadataRecord));
+  ThreadData.RecordPtr += sizeof(MetadataRecord);
+  ThreadData.NumConsecutiveFnEnters = 0;
+  ThreadData.NumTailCalls = 0;
+  incrementExtents(sizeof(MetadataRecord));
+}
+
+// Appends a CallArgument metadata record holding the argument value `A` to the
+// calling thread's buffer. Arguments are stored in the order of their
+// appearance; holes are not supported by the buffer format. Unlike the other
+// metadata writers, this one leaves the rewind counters untouched.
+static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT {
+  auto &ThreadData = getThreadLocalData();
+
+  MetadataRecord Record;
+  Record.Type = uint8_t(RecordType::Metadata);
+  Record.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument);
+  internal_memcpy(Record.Data, &A, sizeof(A));
+
+  // Copy the finished record into the buffer and account for it.
+  internal_memcpy(ThreadData.RecordPtr, &Record, sizeof(MetadataRecord));
+  ThreadData.RecordPtr += sizeof(MetadataRecord);
+  incrementExtents(sizeof(MetadataRecord));
+}
+
+// Appends a FunctionRecord for `FuncId` to the calling thread's buffer and
+// updates the thread-local counters (NumConsecutiveFnEnters, NumTailCalls)
+// that rewindRecentCall() reads when deciding what can be erased.
+//
+//   FuncId    - function id; only the low 28 bits are stored in the record.
+//   TSCDelta  - value stored verbatim in the record's TSCDelta field.
+//   EntryType - selects the record kind and how the counters are updated.
+//               CUSTOM_EVENT and TYPED_EVENT are reported (once each) as
+//               internal errors and nothing is written for them.
+//
+// This function does not check that the record fits at TLD.RecordPtr.
+static void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
+                                XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT {
+  FunctionRecord FuncRecord;
+  FuncRecord.Type = uint8_t(RecordType::Function);
+  // Only take 28 bits of the function id.
+  FuncRecord.FuncId = FuncId & ~(0x0F << 28);
+  FuncRecord.TSCDelta = TSCDelta;
+
+  auto &TLD = getThreadLocalData();
+  switch (EntryType) {
+  case XRayEntryType::ENTRY:
+    // One more entry record that a matching exit may later rewind over.
+    ++TLD.NumConsecutiveFnEnters;
+    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
+    break;
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    // We should not rewind functions with logged args.
+    TLD.NumConsecutiveFnEnters = 0;
+    TLD.NumTailCalls = 0;
+    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
+    break;
+  case XRayEntryType::EXIT:
+    // If we've decided to log the function exit, we will never erase the log
+    // before it.
+    TLD.NumConsecutiveFnEnters = 0;
+    TLD.NumTailCalls = 0;
+    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
+    break;
+  case XRayEntryType::TAIL:
+    // If we just entered the function we're tail exiting from or erased every
+    // invocation since then, this function entry tail pair is a candidate to
+    // be erased when the child function exits.
+    if (TLD.NumConsecutiveFnEnters > 0) {
+      ++TLD.NumTailCalls;
+      TLD.NumConsecutiveFnEnters = 0;
+    } else {
+      // We will never be able to erase this tail call since we have logged
+      // something in between the function entry and tail exit.
+      TLD.NumTailCalls = 0;
+      TLD.NumConsecutiveFnEnters = 0;
+    }
+    FuncRecord.RecordKind =
+        uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
+    break;
+  case XRayEntryType::CUSTOM_EVENT: {
+    // This is a bug in patching, so we'll report it once and move on.
+    // The latch is flipped atomically so only the first caller reports.
+    static atomic_uint8_t ErrorLatch{0};
+    if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+      Report("Internal error: patched an XRay custom event call as a function; "
+             "func id = %d\n",
+             FuncId);
+    return;
+  }
+  case XRayEntryType::TYPED_EVENT: {
+    // Same one-shot reporting as for custom events, with its own latch.
+    static atomic_uint8_t ErrorLatch{0};
+    if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+      Report("Internal error: patched an XRay typed event call as a function; "
+             "func id = %d\n",
+             FuncId);
+    return;
+  }
+  }
+
+  // Copy the finished record into the buffer and account for it in the
+  // buffer extents.
+  internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord));
+  TLD.RecordPtr += sizeof(FunctionRecord);
+  incrementExtents(sizeof(FunctionRecord));
+}
+
+static atomic_uint64_t TicksPerSec{0};
+static atomic_uint64_t ThresholdTicks{0};
+
+// Re-point the thread local pointer into this thread's Buffer before the recent
+// "Function Entry" record and any "Tail Call Exit" records after that.
+//
+//   TSC                  - timestamp of the exit being processed; used to
+//                          measure tail-call durations against ThresholdTicks.
+//   LastTSC              - in/out; moved back by the TSCDelta of every record
+//                          that is erased.
+//   LastFunctionEntryTSC - in/out; moved back by the erased entry's TSCDelta
+//                          while entries remain, otherwise reset to 0.
+//   FuncId               - only used in debug checks against the erased entry.
+//
+// The buffer extents are decremented by the size of every erased record.
+static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
+                             uint64_t &LastFunctionEntryTSC, int32_t FuncId) {
+  auto &TLD = getThreadLocalData();
+  // Step back over the most recent record and read it back for validation.
+  TLD.RecordPtr -= FunctionRecSize;
+  decrementExtents(FunctionRecSize);
+  FunctionRecord FuncRecord;
+  internal_memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize);
+  DCHECK(FuncRecord.RecordKind ==
+             uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
+         "Expected to find function entry recording when rewinding.");
+  DCHECK(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) &&
+         "Expected matching function id when rewinding Exit");
+  --TLD.NumConsecutiveFnEnters;
+  LastTSC -= FuncRecord.TSCDelta;
+
+  // We unwound one call. Update the state and return without writing a log.
+  if (TLD.NumConsecutiveFnEnters != 0) {
+    LastFunctionEntryTSC -= FuncRecord.TSCDelta;
+    return;
+  }
+
+  // Otherwise we've rewound the stack of all function entries, we might be
+  // able to rewind further by erasing tail call functions that are being
+  // exited from via this exit.
+  LastFunctionEntryTSC = 0;
+  // Scratch cursor and timestamp used to inspect records without committing
+  // to erasing them; TLD.RecordPtr and LastTSC only move once a pair is
+  // known to be under the threshold.
+  auto RewindingTSC = LastTSC;
+  auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize;
+  while (TLD.NumTailCalls > 0) {
+    // Rewind the TSC back over the TAIL EXIT record.
+    FunctionRecord ExpectedTailExit;
+    internal_memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize);
+
+    DCHECK(ExpectedTailExit.RecordKind ==
+               uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) &&
+           "Expected to find tail exit when rewinding.");
+    // Move on to the record before the tail exit, expected to be its entry.
+    RewindingRecordPtr -= FunctionRecSize;
+    RewindingTSC -= ExpectedTailExit.TSCDelta;
+    FunctionRecord ExpectedFunctionEntry;
+    internal_memcpy(&ExpectedFunctionEntry, RewindingRecordPtr,
+                    FunctionRecSize);
+    DCHECK(ExpectedFunctionEntry.RecordKind ==
+               uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
+           "Expected to find function entry when rewinding tail call.");
+    DCHECK(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId &&
+           "Expected funcids to match when rewinding tail call.");
+
+    // This tail call exceeded the threshold duration. It will not be erased.
+    if ((TSC - RewindingTSC) >= atomic_load_relaxed(&ThresholdTicks)) {
+      TLD.NumTailCalls = 0;
+      return;
+    }
+
+    // We can erase a tail exit pair that we're exiting through since
+    // its duration is under threshold.
+    --TLD.NumTailCalls;
+    RewindingRecordPtr -= FunctionRecSize;
+    RewindingTSC -= ExpectedFunctionEntry.TSCDelta;
+    // Commit: drop both the entry and the tail exit record from the buffer.
+    TLD.RecordPtr -= 2 * FunctionRecSize;
+    LastTSC = RewindingTSC;
+    decrementExtents(2 * FunctionRecSize);
+  }
+}
+
+// Hands the calling thread's current buffer back to `BQArg`. Returns true on
+// success; on failure the error is reported and false is returned.
+static bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
+  auto &ThreadData = getThreadLocalData();
+  auto Status = BQArg.releaseBuffer(ThreadData.Buffer);
+  if (Status == BufferQueue::ErrorCode::Ok)
+    return true;
+
+  Report("Failed to release buffer at %p; error=%s\n", ThreadData.Buffer.Data,
+         BufferQueue::getErrorString(Status));
+  return false;
+}
+
+// Makes sure the calling thread's buffer has room for `MaxSize` more bytes.
+// If it does not, the current buffer is released to TLD.BQ, a new one is
+// acquired and set up, and a NewCPUId record for (`CPU`, `TSC`) is written.
+//
+// Returns false if releasing the old buffer or acquiring a new one fails;
+// true otherwise (including when no swap was necessary).
+static bool prepareBuffer(uint64_t TSC, unsigned char CPU,
+                          int (*wall_clock_reader)(clockid_t,
+                                                   struct timespec *),
+                          size_t MaxSize) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
+  char *BufferStart = static_cast<char *>(TLD.Buffer.Data);
+  // Swap buffers when writing MaxSize bytes would run past the buffer's end.
+  if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) {
+    if (!releaseThreadLocalBuffer(*TLD.BQ))
+      return false;
+    auto EC = TLD.BQ->getBuffer(TLD.Buffer);
+    if (EC != BufferQueue::ErrorCode::Ok) {
+      Report("Failed to prepare a buffer; error = '%s'\n",
+             BufferQueue::getErrorString(EC));
+      return false;
+    }
+    setupNewBuffer(wall_clock_reader);
 
-__sanitizer::SpinMutex FDROptionsMutex;
+    // Always write the CPU metadata as the first record in the buffer.
+    writeNewCPUIdMetadata(CPU, TSC);
+  }
+  return true;
+}
+
+// Checks that FDR logging is in the initialized state and that the calling
+// thread has a buffer to write into, acquiring and setting one up from `LBQ`
+// if TLD.Buffer.Data is null.
+//
+// Returns true when the caller may go on to write records. Returns false when
+// logging is not initialized, or when releasing or acquiring a buffer fails.
+// When logging is finalizing or finalized and the thread still has a record
+// cursor, the thread's buffer is released back to `LBQ` first.
+static bool
+isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU,
+                         int (*wall_clock_reader)(clockid_t, struct timespec *))
+    XRAY_NEVER_INSTRUMENT {
+  // Bail out right away if logging is not initialized yet.
+  // We should take the opportunity to release the buffer though.
+  auto Status = atomic_load(&LoggingStatus, memory_order_acquire);
+  auto &TLD = getThreadLocalData();
+  if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
+    if (TLD.RecordPtr != nullptr &&
+        (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
+         Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
+      if (!releaseThreadLocalBuffer(*LBQ))
+        return false;
+      TLD.RecordPtr = nullptr;
+      return false;
+    }
+    return false;
+  }
+
+  // Re-read the status and also consult the queue itself, since either may
+  // have changed since the load above.
+  // NOTE(review): execution continues past this block after a successful
+  // release; presumably releaseBuffer() clears TLD.Buffer.Data so the
+  // acquisition below runs - confirm against BufferQueue.
+  if (atomic_load(&LoggingStatus, memory_order_acquire) !=
+          XRayLogInitStatus::XRAY_LOG_INITIALIZED ||
+      LBQ->finalizing()) {
+    if (!releaseThreadLocalBuffer(*LBQ))
+      return false;
+    TLD.RecordPtr = nullptr;
+  }
+
+  // The thread has no buffer: get one and write its preamble.
+  if (TLD.Buffer.Data == nullptr) {
+    auto EC = LBQ->getBuffer(TLD.Buffer);
+    if (EC != BufferQueue::ErrorCode::Ok) {
+      // Failing to get a buffer while finalizing/finalized is not reported.
+      auto LS = atomic_load(&LoggingStatus, memory_order_acquire);
+      if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING &&
+          LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
+        Report("Failed to acquire a buffer; error = '%s'\n",
+               BufferQueue::getErrorString(EC));
+      return false;
+    }
+
+    setupNewBuffer(wall_clock_reader);
+
+    // Always write the CPU metadata as the first record in the buffer.
+    writeNewCPUIdMetadata(CPU, TSC);
+  }
+
+  // NOTE(review): when the buffer was just acquired above and CurrentCPU is
+  // still the sentinel, two NewCPUId records are written back to back.
+  if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) {
+    // This means this is the first CPU this thread has ever run on. We set
+    // the current CPU and record this as the first TSC we've seen.
+    TLD.CurrentCPU = CPU;
+    writeNewCPUIdMetadata(CPU, TSC);
+  }
+
+  return true;
+}
+
+// Compute the TSC difference between the time of measurement and the previous
+// event. There are a few interesting situations we need to account for:
+//
+//   - The thread has migrated to a different CPU. If this is the case, then
+//     we write down the following records:
+//
+//       1. A 'NewCPUId' Metadata record.
+//       2. A FunctionRecord with a 0 for the TSCDelta field.
+//
+//   - The TSC delta is greater than the 32 bits we can store in a
+//     FunctionRecord. In this case we write down the following records:
+//
+//       1. A 'TSCWrap' Metadata record.
+//       2. A FunctionRecord with a 0 for the TSCDelta field.
+//
+//   - The TSC delta is representable within the 32 bits we can store in a
+//     FunctionRecord. In this case we write down just a FunctionRecord with
+//     the correct TSC delta.
+static uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC,
+                                   uint8_t CPU) {
+  // A CPU migration is announced with a NewCPUId record that carries the full
+  // TSC, so the function record that follows gets a delta of 0.
+  const bool MovedToNewCPU = (CPU != TLD.CurrentCPU);
+  if (MovedToNewCPU) {
+    writeNewCPUIdMetadata(CPU, TSC);
+    return 0;
+  }
+
+  // If the delta does not fit in a uint32_t, write out a TSCWrap record with
+  // the full TSC instead, and let the function record's delta be 0.
+  const uint64_t TicksSinceLast = TSC - TLD.LastTSC;
+  if (TicksSinceLast > std::numeric_limits<uint32_t>::max()) {
+    writeTSCWrapMetadata(TSC);
+    return 0;
+  }
+
+  // The delta is representable as-is.
+  return TicksSinceLast;
+}
+
+// Releases the calling thread's buffer back to TLD.BQ and clears the record
+// cursor when the condition below holds. If the release fails, the record
+// cursor is left untouched.
+//
+// NOTE(review): MetadataRecSize appears on both sides of the comparison, so
+// the condition reduces to `TLD.RecordPtr <= BufferStart`. That does not look
+// like a "buffer is full" test; confirm the intended check.
+static void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
+  auto BufferStart = static_cast<char *>(TLD.Buffer.Data);
+  if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <=
+      ptrdiff_t{MetadataRecSize}) {
+    if (!releaseThreadLocalBuffer(*TLD.BQ))
+      return;
+    TLD.RecordPtr = nullptr;
+  }
+}
+
+thread_local atomic_uint8_t Running{0};
+
+/// Here's where the meat of the processing happens. The writer captures
+/// function entry, exit and tail exit points with a time and will create
+/// TSCWrap, NewCPUId and Function records as necessary. The writer might
+/// walk backward through its buffer and erase trivial functions to avoid
+/// polluting the log and may use the buffer queue to obtain or release a
+/// buffer.
+///
+///   FuncId            - id of the function the event belongs to.
+///   Entry             - kind of event (entry, exit, tail exit, ...).
+///   TSC, CPU          - timestamp and CPU id of the event; used to compute
+///                       the TSC delta and to detect CPU changes.
+///   Arg1              - written as a CallArgument record, only when Entry is
+///                       LOG_ARGS_ENTRY.
+///   wall_clock_reader - clock function forwarded to the buffer setup code.
+static void processFunctionHook(int32_t FuncId, XRayEntryType Entry,
+                                uint64_t TSC, unsigned char CPU, uint64_t Arg1,
+                                int (*wall_clock_reader)(clockid_t,
+                                                         struct timespec *))
+    XRAY_NEVER_INSTRUMENT {
+  // Assembly comment markers delimiting this function's body for llvm-mca.
+  __asm volatile("# LLVM-MCA-BEGIN processFunctionHook");
+  // Prevent signal handler recursion, so in case we're already in a log writing
+  // mode and the signal handler comes in (and is also instrumented) then we
+  // don't want to be clobbering potentially partial writes already happening in
+  // the thread. We use a simple thread_local latch to only allow one on-going
+  // invocation of this hook to happen at any given time.
+  RecursionGuard Guard{Running};
+  if (!Guard) {
+    DCHECK(atomic_load_relaxed(&Running) && "RecursionGuard is buggy!");
+    return;
+  }
+
+  auto &TLD = getThreadLocalData();
+
+  // Bind this thread to the global buffer queue if it has none yet.
+  if (TLD.BQ == nullptr)
+    TLD.BQ = BQ;
+
+  if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
+    return;
+
+  // Before we go setting up writing new function entries, we need to be really
+  // careful about the pointer math we're doing. This means we need to ensure
+  // that the record we are about to write is going to fit into the buffer,
+  // without overflowing the buffer.
+  //
+  // To do this properly, we use the following assumptions:
+  //
+  //   - The least number of bytes we will ever write is 8
+  //     (sizeof(FunctionRecord)) only if the delta between the previous entry
+  //     and this entry is within 32 bits.
+  //   - The most number of bytes we will ever write is 8 + 16 + 16 = 40.
+  //     This is computed by:
+  //
+  //       MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord)
+  //
+  //     These arise in the following cases:
+  //
+  //       1. When the delta between the TSC we get and the previous TSC for the
+  //          same CPU is outside of the uint32_t range, we end up having to
+  //          write a MetadataRecord to indicate a "tsc wrap" before the actual
+  //          FunctionRecord.
+  //       2. When we learn that we've moved CPUs, we need to write a
+  //          MetadataRecord to indicate a "cpu change", and thus write out the
+  //          current TSC for that CPU before writing out the actual
+  //          FunctionRecord.
+  //       3. When we learn about a new CPU ID, we need to write down a "new cpu
+  //          id" MetadataRecord before writing out the actual FunctionRecord.
+  //       4. The second MetadataRecord is the optional function call argument.
+  //
+  // So the math we need to do is to determine whether writing 40 bytes past the
+  // current pointer exceeds the buffer's maximum size. If we don't have enough
+  // space to write 40 bytes in the buffer, we need get a new Buffer, set it up
+  // properly before doing any further writing.
+  size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize;
+  if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) {
+    // Drop the queue binding so the next call re-reads the global queue.
+    TLD.BQ = nullptr;
+    return;
+  }
+
+  // By this point, we are now ready to write up to 40 bytes (explained above).
+  DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >=
+             static_cast<ptrdiff_t>(MetadataRecSize) &&
+         "Misconfigured BufferQueue provided; Buffer size not large enough.");
+
+  // May emit a NewCPUId or TSCWrap record; in both cases the delta is 0.
+  auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU);
+  TLD.LastTSC = TSC;
+  TLD.CurrentCPU = CPU;
+  switch (Entry) {
+  case XRayEntryType::ENTRY:
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    // Update the thread local state for the next invocation.
+    TLD.LastFunctionEntryTSC = TSC;
+    break;
+  case XRayEntryType::TAIL:
+  case XRayEntryType::EXIT:
+    // Break out and write the exit record if we can't erase any functions.
+    if (TLD.NumConsecutiveFnEnters == 0 ||
+        (TSC - TLD.LastFunctionEntryTSC) >=
+            atomic_load_relaxed(&ThresholdTicks))
+      break;
+    // The call was shorter than the threshold: erase its entry record (and
+    // possibly preceding tail-call pairs) instead of logging the exit.
+    rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId);
+    return; // without writing log.
+  case XRayEntryType::CUSTOM_EVENT: {
+    // This is a bug in patching, so we'll report it once and move on.
+    static atomic_uint8_t ErrorLatch{0};
+    if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+      Report("Internal error: patched an XRay custom event call as a function; "
+             "func id = %d\n",
+             FuncId);
+    return;
+  }
+  case XRayEntryType::TYPED_EVENT: {
+    // Same one-shot reporting as for custom events, with its own latch.
+    static atomic_uint8_t ErrorLatch{0};
+    if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
+      Report("Internal error: patched an XRay typed event call as a function; "
+             "func id = %d\n",
+             FuncId);
+    return;
+  }
+  }
+
+  writeFunctionRecord(FuncId, RecordTSCDelta, Entry);
+  if (Entry == XRayEntryType::LOG_ARGS_ENTRY)
+    writeCallArgumentMetadata(Arg1);
+
+  // If we've exhausted the buffer by this time, we then release the buffer to
+  // make sure that other threads may start using this buffer.
+  endBufferIfFull();
+  __asm volatile("# LLVM-MCA-END");
+}
+
+// Returns a reference to a function-local XRayFileHeader holding the fields
+// common to every FDR log (version, type, cycle frequency, TSC flags). The
+// header is filled in exactly once, on the first call, via pthread_once.
+static XRayFileHeader &fdrCommonHeaderInfo() {
+  static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderBytes;
+  static pthread_once_t InitOnce = PTHREAD_ONCE_INIT;
+  static bool HaveTSC = true;
+  static uint64_t Frequency = NanosecondsPerSecond;
+  pthread_once(&InitOnce, +[] {
+    auto &Hdr = reinterpret_cast<XRayFileHeader &>(HeaderBytes);
+    Hdr.Type = FileTypes::FDR_LOG;
+    // Version history:
+    //   2 - writes the extents of the buffer, instead of relying on an
+    //       end-of-buffer record.
+    //   3 - includes the PID metadata record.
+    Hdr.Version = 3;
+
+    // FIXME: Actually check whether we have 'constant_tsc' and
+    // 'nonstop_tsc' before setting the values in the header.
+    Hdr.ConstantTSC = 1;
+    Hdr.NonstopTSC = 1;
+
+    // Test for required CPU features and cache the cycle frequency. Without
+    // TSC support the frequency stays at its default value.
+    HaveTSC = probeRequiredCPUFeatures();
+    if (HaveTSC)
+      Frequency = getTSCFrequency();
+    Hdr.CycleFrequency = Frequency;
+  });
+  return reinterpret_cast<XRayFileHeader &>(HeaderBytes);
+}
+
+// This is the iterator implementation, which knows how to handle FDR-mode
+// specific buffers. This is used as an implementation of the iterator function
+// needed by __xray_set_buffer_iterator(...). It maintains a global state of the
+// buffer iteration for the currently installed FDR mode buffers. In particular:
+//
+//   - If the argument represents the initial state of XRayBuffer ({nullptr, 0})
+//     then the iterator returns the header information.
+//   - If the argument represents the header information ({address of header
+//     info, size of the header info}) then it returns the first FDR buffer's
+//     address and extents.
+//   - It will keep returning the next buffer and extents as there are more
+//     buffers to process. When the input represents the last buffer, it will
+//     return the initial state to signal completion ({nullptr, 0}).
+//
+// See xray/xray_log_interface.h for more details on the requirements for the
+// implementations of __xray_set_buffer_iterator(...) and
+// __xray_log_process_buffers(...).
+XRayBuffer fdrIterator(const XRayBuffer B) {
+  DCHECK(internal_strcmp(__xray_log_get_current_mode(), "xray-fdr") == 0);
+  // The null test must come first: the runtime check right below explicitly
+  // handles a null BQ, so the debug check must not dereference it.
+  DCHECK(BQ != nullptr && BQ->finalizing());
+
+  if (BQ == nullptr || !BQ->finalizing()) {
+    if (Verbosity())
+      Report(
+          "XRay FDR: Failed global buffer queue is null or not finalizing!\n");
+    return {nullptr, 0};
+  }
+
+  // We use a global scratch-pad for the header information, which only gets
+  // initialized the first time this function is called. We'll update one part
+  // of this information with some relevant data (in particular the number of
+  // buffers to expect).
+  static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage;
+  static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT;
+  pthread_once(&HeaderOnce, +[] {
+    reinterpret_cast<XRayFileHeader &>(HeaderStorage) = fdrCommonHeaderInfo();
+  });
+
+  // We use a convenience alias for code referring to Header from here on out.
+  auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage);
+  // Initial state ({nullptr, 0}): hand out the header.
+  if (B.Data == nullptr && B.Size == 0) {
+    Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
+    return XRayBuffer{static_cast<void *>(&Header), sizeof(Header)};
+  }
+
+  // Iteration state, kept across calls.
+  static BufferQueue::const_iterator It{};
+  static BufferQueue::const_iterator End{};
+  static void *CurrentBuffer{nullptr};
+  if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
+    // From this point on, we provide raw access to the raw buffer we're getting
+    // from the BufferQueue. We're relying on the iterators from the current
+    // Buffer queue.
+    It = BQ->cbegin();
+    End = BQ->cend();
+  }
+
+  // Free the serialized copy handed out by the previous call, if any.
+  if (CurrentBuffer != nullptr) {
+    InternalFree(CurrentBuffer);
+    CurrentBuffer = nullptr;
+  }
+
+  // No more buffers: signal completion with the initial state.
+  if (It == End)
+    return {nullptr, 0};
+
+  // Set up the current buffer to contain the extents like we would when writing
+  // out to disk. The difference here would be that we still write "empty"
+  // buffers, or at least go through the iterators faithfully to let the
+  // handlers see the empty buffers in the queue.
+  auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire);
+  auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+  CurrentBuffer = InternalAlloc(SerializedBufferSize);
+  if (CurrentBuffer == nullptr)
+    return {nullptr, 0};
+
+  // Write out the extents as a Metadata Record into the CurrentBuffer.
+  MetadataRecord ExtentsRecord;
+  ExtentsRecord.Type = uint8_t(RecordType::Metadata);
+  ExtentsRecord.RecordKind =
+      uint8_t(MetadataRecord::RecordKinds::BufferExtents);
+  internal_memcpy(ExtentsRecord.Data, &BufferSize, sizeof(BufferSize));
+  // The buffer's recorded bytes go right after the extents record.
+  auto AfterExtents =
+      static_cast<char *>(internal_memcpy(CurrentBuffer, &ExtentsRecord,
+                                          sizeof(MetadataRecord))) +
+      sizeof(MetadataRecord);
+  internal_memcpy(AfterExtents, It->Data, BufferSize);
+
+  XRayBuffer Result;
+  Result.Data = CurrentBuffer;
+  Result.Size = SerializedBufferSize;
+  ++It;
+  return Result;
+}
 
 // Must finalize before flushing.
 XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
-  if (__sanitizer::atomic_load(&LoggingStatus,
-                               __sanitizer::memory_order_acquire) !=
+  if (atomic_load(&LoggingStatus, memory_order_acquire) !=
       XRayLogInitStatus::XRAY_LOG_FINALIZED) {
-    if (__sanitizer::Verbosity())
+    if (Verbosity())
       Report("Not flushing log, implementation is not finalized.\n");
     return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
   }
 
   s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &LogFlushStatus, &Result, XRayLogFlushStatus::XRAY_LOG_FLUSHING,
-          __sanitizer::memory_order_release)) {
-
-    if (__sanitizer::Verbosity())
+  if (!atomic_compare_exchange_strong(&LogFlushStatus, &Result,
+                                      XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+                                      memory_order_release)) {
+    if (Verbosity())
       Report("Not flushing log, implementation is still finalizing.\n");
     return static_cast<XRayLogFlushStatus>(Result);
   }
 
   if (BQ == nullptr) {
-    if (__sanitizer::Verbosity())
+    if (Verbosity())
       Report("Cannot flush when global buffer queue is null.\n");
     return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
   }
 
   // We wait a number of milliseconds to allow threads to see that we've
   // finalised before attempting to flush the log.
-  __sanitizer::SleepForMillis(flags()->xray_fdr_log_grace_period_ms);
+  SleepForMillis(fdrFlags()->grace_period_ms);
+
+  // At this point, we're going to uninstall the iterator implementation, before
+  // we decide to do anything further with the global buffer queue.
+  __xray_log_remove_buffer_iterator();
+
+  // Once flushed, we should set the global status of the logging implementation
+  // to "uninitialized" to allow for FDR-logging multiple runs.
+  auto ResetToUnitialized = at_scope_exit([] {
+    atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                 memory_order_release);
+  });
+
+  auto CleanupBuffers = at_scope_exit([] {
+    if (BQ != nullptr) {
+      auto &TLD = getThreadLocalData();
+      if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
+        releaseThreadLocalBuffer(*TLD.BQ);
+      BQ->~BufferQueue();
+      InternalFree(BQ);
+      BQ = nullptr;
+    }
+  });
+
+  if (fdrFlags()->no_file_flush) {
+    if (Verbosity())
+      Report("XRay FDR: Not flushing to file, 'no_file_flush=true'.\n");
+
+    atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+                 memory_order_release);
+    return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+  }
 
   // We write out the file in the following format:
   //
@@ -85,35 +857,20 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
   //
   int Fd = -1;
   {
-    __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
+    // FIXME: Remove this section of the code, when we remove the struct-based
+    // configuration API.
+    SpinMutexLock Guard(&FDROptionsMutex);
     Fd = FDROptions.Fd;
   }
   if (Fd == -1)
     Fd = getLogFD();
   if (Fd == -1) {
     auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-    __sanitizer::atomic_store(&LogFlushStatus, Result,
-                              __sanitizer::memory_order_release);
+    atomic_store(&LogFlushStatus, Result, memory_order_release);
     return Result;
   }
 
-  // Test for required CPU features and cache the cycle frequency
-  static bool TSCSupported = probeRequiredCPUFeatures();
-  static uint64_t CycleFrequency =
-      TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
-
-  XRayFileHeader Header;
-
-  // Version 2 of the log writes the extents of the buffer, instead of relying
-  // on an end-of-buffer record.
-  Header.Version = 2;
-  Header.Type = FileTypes::FDR_LOG;
-  Header.CycleFrequency = CycleFrequency;
-
-  // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc'
-  // before setting the values in the header.
-  Header.ConstantTSC = 1;
-  Header.NonstopTSC = 1;
+  XRayFileHeader Header = fdrCommonHeaderInfo();
   Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
   retryingWriteAll(Fd, reinterpret_cast<char *>(&Header),
                    reinterpret_cast<char *>(&Header) + sizeof(Header));
@@ -121,39 +878,36 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
   BQ->apply([&](const BufferQueue::Buffer &B) {
     // Starting at version 2 of the FDR logging implementation, we only write
     // the records identified by the extents of the buffer. We use the Extents
-    // from the Buffer and write that out as the first record in the buffer.
-    // We still use a Metadata record, but fill in the extents instead for the
+    // from the Buffer and write that out as the first record in the buffer.  We
+    // still use a Metadata record, but fill in the extents instead for the
     // data.
     MetadataRecord ExtentsRecord;
-    auto BufferExtents = __sanitizer::atomic_load(
-        &B.Extents->Size, __sanitizer::memory_order_acquire);
-    assert(BufferExtents <= B.Size);
+    auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire);
+    DCHECK(BufferExtents <= B.Size);
     ExtentsRecord.Type = uint8_t(RecordType::Metadata);
     ExtentsRecord.RecordKind =
         uint8_t(MetadataRecord::RecordKinds::BufferExtents);
-    std::memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
+    internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
     if (BufferExtents > 0) {
       retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord),
                        reinterpret_cast<char *>(&ExtentsRecord) +
                            sizeof(MetadataRecord));
-      retryingWriteAll(Fd, reinterpret_cast<char *>(B.Buffer),
-                       reinterpret_cast<char *>(B.Buffer) + BufferExtents);
+      retryingWriteAll(Fd, reinterpret_cast<char *>(B.Data),
+                       reinterpret_cast<char *>(B.Data) + BufferExtents);
     }
   });
 
-  __sanitizer::atomic_store(&LogFlushStatus,
-                            XRayLogFlushStatus::XRAY_LOG_FLUSHED,
-                            __sanitizer::memory_order_release);
+  atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+               memory_order_release);
   return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
 }
 
 XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
   s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &LoggingStatus, &CurrentStatus,
-          XRayLogInitStatus::XRAY_LOG_FINALIZING,
-          __sanitizer::memory_order_release)) {
-    if (__sanitizer::Verbosity())
+  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
+                                      XRayLogInitStatus::XRAY_LOG_FINALIZING,
+                                      memory_order_release)) {
+    if (Verbosity())
       Report("Cannot finalize log, implementation not initialized.\n");
     return static_cast<XRayLogInitStatus>(CurrentStatus);
   }
@@ -162,39 +916,11 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
   // operations to be performed until re-initialized.
   BQ->finalize();
 
-  __sanitizer::atomic_store(&LoggingStatus,
-                            XRayLogInitStatus::XRAY_LOG_FINALIZED,
-                            __sanitizer::memory_order_release);
+  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
+               memory_order_release);
   return XRayLogInitStatus::XRAY_LOG_FINALIZED;
 }
 
-XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT {
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_FINALIZED;
-  if (__sanitizer::atomic_compare_exchange_strong(
-          &LoggingStatus, &CurrentStatus,
-          XRayLogInitStatus::XRAY_LOG_INITIALIZED,
-          __sanitizer::memory_order_release))
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-
-  // Release the in-memory buffer queue.
-  delete BQ;
-  BQ = nullptr;
-
-  // Spin until the flushing status is flushed.
-  s32 CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-  while (__sanitizer::atomic_compare_exchange_weak(
-      &LogFlushStatus, &CurrentFlushingStatus,
-      XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING,
-      __sanitizer::memory_order_release)) {
-    if (CurrentFlushingStatus == XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING)
-      break;
-    CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-  }
-
-  // At this point, we know that the status is flushed, and that we can assume
-  return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-}
-
 struct TSCAndCPU {
   uint64_t TSC = 0;
   unsigned char CPU = 0;
@@ -202,12 +928,14 @@ struct TSCAndCPU {
 
 static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
   // We want to get the TSC as early as possible, so that we can check whether
-  // we've seen this CPU before. We also do it before we load anything else, to
-  // allow for forward progress with the scheduling.
+  // we've seen this CPU before. We also do it before we load anything else,
+  // to allow for forward progress with the scheduling.
   TSCAndCPU Result;
 
   // Test once for required CPU features
-  static bool TSCSupported = probeRequiredCPUFeatures();
+  static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT;
+  static bool TSCSupported = true;
+  pthread_once(&OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); });
 
   if (TSCSupported) {
     Result.TSC = __xray::readTSC(Result.CPU);
@@ -228,20 +956,17 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
 void fdrLoggingHandleArg0(int32_t FuncId,
                           XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
   auto TC = getTimestamp();
-  __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0,
-                                           clock_gettime, BQ);
+  processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, clock_gettime);
 }
 
 void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry,
                           uint64_t Arg) XRAY_NEVER_INSTRUMENT {
   auto TC = getTimestamp();
-  __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg,
-                                           clock_gettime, BQ);
+  processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, clock_gettime);
 }
 
 void fdrLoggingHandleCustomEvent(void *Event,
                                  std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
-  using namespace __xray_fdr_internal;
   auto TC = getTimestamp();
   auto &TSC = TC.TSC;
   auto &CPU = TC.CPU;
@@ -249,13 +974,8 @@ void fdrLoggingHandleCustomEvent(void *Event,
   if (!Guard)
     return;
   if (EventSize > std::numeric_limits<int32_t>::max()) {
-    using Empty = struct {};
-    static Empty Once = [&] {
-      Report("Event size too large = %zu ; > max = %d\n", EventSize,
-             std::numeric_limits<int32_t>::max());
-      return Empty();
-    }();
-    (void)Once;
+    static pthread_once_t Once = PTHREAD_ONCE_INIT;
+    pthread_once(&Once, +[] { Report("Event size too large.\n"); });
   }
   int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
   auto &TLD = getThreadLocalData();
@@ -264,8 +984,8 @@ void fdrLoggingHandleCustomEvent(void *Event,
 
   // Here we need to prepare the log to handle:
   //   - The metadata record we're going to write. (16 bytes)
-  //   - The additional data we're going to write. Currently, that's the size of
-  //   the event we're going to dump into the log as free-form bytes.
+  //   - The additional data we're going to write. Currently, that's the size
+  //   of the event we're going to dump into the log as free-form bytes.
   if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
     TLD.BQ = nullptr;
     return;
@@ -280,90 +1000,207 @@ void fdrLoggingHandleCustomEvent(void *Event,
   CustomEvent.RecordKind =
       uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
   constexpr auto TSCSize = sizeof(TC.TSC);
-  std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
-  std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
-  std::memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
+  internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
+  internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+  internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
   TLD.RecordPtr += sizeof(CustomEvent);
-  std::memcpy(TLD.RecordPtr, Event, ReducedEventSize);
+  internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
+  incrementExtents(MetadataRecSize + EventSize);
+  endBufferIfFull();
+}
+
+void fdrLoggingHandleTypedEvent(
+    uint16_t EventType, const void *Event,
+    std::size_t EventSize) noexcept XRAY_NEVER_INSTRUMENT {
+  auto TC = getTimestamp();
+  auto &TSC = TC.TSC;
+  auto &CPU = TC.CPU;
+  RecursionGuard Guard{Running};
+  if (!Guard)
+    return;
+  if (EventSize > std::numeric_limits<int32_t>::max()) {
+    static pthread_once_t Once = PTHREAD_ONCE_INIT;
+    pthread_once(&Once, +[] { Report("Event size too large.\n"); });
+  }
+  int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+  auto &TLD = getThreadLocalData();
+  if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime))
+    return;
+
+  // Here we need to prepare the log to handle:
+  //   - The metadata record we're going to write. (16 bytes)
+  //   - The additional data we're going to write. Currently, that's the size
+  //   of the event we're going to dump into the log as free-form bytes.
+  if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
+    TLD.BQ = nullptr;
+    return;
+  }
+  // Write the typed event metadata record, which consists of the following
+  // information:
+  //   - 8 bytes (64-bits) for the full TSC when the event started.
+  //   - 4 bytes (32-bits) for the length of the data.
+  //   - 2 bytes (16-bits) for the event type. 3 bytes remain since one of the
+  //       bytes has the record type (Metadata Record) and kind (TypedEvent).
+  //       We'll log the error if the event type is greater than 2 bytes.
+  //       Event types are generated sequentially, so 2^16 is enough.
+  MetadataRecord TypedEvent;
+  TypedEvent.Type = uint8_t(RecordType::Metadata);
+  TypedEvent.RecordKind =
+      uint8_t(MetadataRecord::RecordKinds::TypedEventMarker);
+  constexpr auto TSCSize = sizeof(TC.TSC);
+  internal_memcpy(&TypedEvent.Data, &ReducedEventSize, sizeof(int32_t));
+  internal_memcpy(&TypedEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+  internal_memcpy(&TypedEvent.Data[sizeof(int32_t) + TSCSize], &EventType,
+                  sizeof(EventType));
+  internal_memcpy(TLD.RecordPtr, &TypedEvent, sizeof(TypedEvent));
+
+  TLD.RecordPtr += sizeof(TypedEvent);
+  internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
   incrementExtents(MetadataRecSize + EventSize);
   endBufferIfFull();
 }
 
-XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
+XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax,
                                  void *Options,
                                  size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  if (OptionsSize != sizeof(FDRLoggingOptions)) {
-    if (__sanitizer::Verbosity())
-      Report("Cannot initialize FDR logging; wrong size for options: %d\n",
-             OptionsSize);
-    return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
-        &LoggingStatus, __sanitizer::memory_order_acquire));
-  }
+  if (Options == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
   s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &LoggingStatus, &CurrentStatus,
-          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
-          __sanitizer::memory_order_release)) {
-    if (__sanitizer::Verbosity())
+  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
+                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+                                      memory_order_release)) {
+    if (Verbosity())
       Report("Cannot initialize already initialized implementation.\n");
     return static_cast<XRayLogInitStatus>(CurrentStatus);
   }
 
-  {
-    __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
-    memcpy(&FDROptions, Options, OptionsSize);
+  // Because of __xray_log_init_mode(...) which guarantees that this will be
+  // called with BufferSize == 0 and BufferMax == 0 we parse the configuration
+  // provided in the Options pointer as a string instead.
+  if (BufferSize == 0 && BufferMax == 0) {
+    if (Verbosity())
+      Report("Initializing FDR mode with options: %s\n",
+             static_cast<const char *>(Options));
+
+    // Parse into a local, FDR-specific flag set first. It is only published
+    // through fdrFlags() below, once every configuration source has been
+    // applied.
+    FlagParser FDRParser;
+    FDRFlags FDRFlags;
+    registerXRayFDRFlags(&FDRParser, &FDRFlags);
+    FDRFlags.setDefaults();
+
+    // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
+    // options until we migrate everyone to use the XRAY_FDR_OPTIONS
+    // compiler-provided options.
+    FDRParser.ParseString(useCompilerDefinedFlags());
+    FDRParser.ParseString(useCompilerDefinedFDRFlags());
+    auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
+    if (EnvOpts == nullptr)
+      EnvOpts = "";
+    FDRParser.ParseString(EnvOpts);
+
+    // FIXME: Remove this when we fully remove the deprecated flags.
+    if (internal_strlen(EnvOpts) == 0) {
+      FDRFlags.func_duration_threshold_us =
+          flags()->xray_fdr_log_func_duration_threshold_us;
+      FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
+    }
+
+    // The provided options should always override the compiler-provided and
+    // environment-variable defined options.
+    FDRParser.ParseString(static_cast<const char *>(Options));
+    *fdrFlags() = FDRFlags;
+    BufferSize = FDRFlags.buffer_size;
+    BufferMax = FDRFlags.buffer_max;
+    SpinMutexLock Guard(&FDROptionsMutex);
+    FDROptions.Fd = -1;
+    FDROptions.ReportErrors = true;
+  } else if (OptionsSize != sizeof(FDRLoggingOptions)) {
+    // FIXME: Struct-based init is deprecated and should be removed. Undo the
+    // INITIALIZING transition made above so a later init attempt can succeed.
+    if (Verbosity())
+      Report("Cannot initialize FDR logging; wrong size for options: %zu\n",
+             OptionsSize);
+    atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                 memory_order_release);
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  } else {
+    if (Verbosity())
+      Report("XRay FDR: struct-based init is deprecated, please use "
+             "string-based configuration instead.\n");
+    SpinMutexLock Guard(&FDROptionsMutex);
+    internal_memcpy(&FDROptions, Options, OptionsSize);
   }
 
   bool Success = false;
 
   if (BQ != nullptr) {
-    delete BQ;
+    BQ->~BufferQueue();
+    InternalFree(BQ);
     BQ = nullptr;
   }
 
-  if (BQ == nullptr)
-    BQ = new BufferQueue(BufferSize, BufferMax, Success);
+  if (BQ == nullptr) {
+    BQ = reinterpret_cast<BufferQueue *>(
+        InternalAlloc(sizeof(BufferQueue), nullptr, 64));
+    new (BQ) BufferQueue(BufferSize, BufferMax, Success);
+  }
 
   if (!Success) {
     Report("BufferQueue init failed.\n");
     if (BQ != nullptr) {
-      delete BQ;
+      BQ->~BufferQueue();
+      InternalFree(BQ);
       BQ = nullptr;
     }
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
   }
 
-  static bool UNUSED Once = [] {
-    pthread_key_create(&__xray_fdr_internal::Key, +[](void *) {
-      auto &TLD = __xray_fdr_internal::getThreadLocalData();
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  pthread_once(&OnceInit, +[] {
+    atomic_store(&TicksPerSec,
+                 probeRequiredCPUFeatures() ? getTSCFrequency()
+                                            : __xray::NanosecondsPerSecond,
+                 memory_order_release);
+    pthread_key_create(&Key, +[](void *TLDPtr) {
+      if (TLDPtr == nullptr)
+        return;
+      auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
       if (TLD.BQ == nullptr)
         return;
       auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
       if (EC != BufferQueue::ErrorCode::Ok)
         Report("At thread exit, failed to release buffer at %p; error=%s\n",
-               TLD.Buffer.Buffer, BufferQueue::getErrorString(EC));
+               TLD.Buffer.Data, BufferQueue::getErrorString(EC));
     });
-    return false;
-  }();
+  });
 
+  atomic_store(&ThresholdTicks,
+               atomic_load_relaxed(&TicksPerSec) *
+                   fdrFlags()->func_duration_threshold_us / 1000000,
+               memory_order_release);
   // Arg1 handler should go in first to avoid concurrent code accidentally
   // falling back to arg0 when it should have ran arg1.
   __xray_set_handler_arg1(fdrLoggingHandleArg1);
   // Install the actual handleArg0 handler after initialising the buffers.
   __xray_set_handler(fdrLoggingHandleArg0);
   __xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
+  __xray_set_typedevent_handler(fdrLoggingHandleTypedEvent);
+
+  // Install the buffer iterator implementation.
+  __xray_log_set_buffer_iterator(fdrIterator);
 
-  __sanitizer::atomic_store(&LoggingStatus,
-                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
-                            __sanitizer::memory_order_release);
+  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+               memory_order_release);
 
-  if (__sanitizer::Verbosity())
+  if (Verbosity())
     Report("XRay FDR init successful.\n");
   return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
 }
 
 bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
-  using namespace __xray;
   XRayLogImpl Impl{
       fdrLoggingInit,
       fdrLoggingFinalize,
@@ -372,11 +1209,10 @@ bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
   };
   auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl);
   if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
-      __sanitizer::Verbosity())
+      Verbosity())
     Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n",
            RegistrationResult);
-  if (flags()->xray_fdr_log ||
-      !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-fdr"))
+  if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr"))
     __xray_set_log_impl(Impl);
   return true;
 }
diff --git a/lib/xray/xray_fdr_logging_impl.h b/lib/xray/xray_fdr_logging_impl.h
deleted file mode 100644
index 59eab55b2573..000000000000
--- a/lib/xray/xray_fdr_logging_impl.h
+++ /dev/null
@@ -1,705 +0,0 @@
-//===-- xray_fdr_logging_impl.h ---------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Here we implement the thread local state management and record i/o for Flight
-// Data Recorder mode for XRay, where we use compact structures to store records
-// in memory as well as when writing out the data to files.
-//
-//===----------------------------------------------------------------------===//
-#ifndef XRAY_XRAY_FDR_LOGGING_IMPL_H
-#define XRAY_XRAY_FDR_LOGGING_IMPL_H
-
-#include <cassert>
-#include <cstddef>
-#include <cstring>
-#include <limits>
-#include <pthread.h>
-#include <sys/syscall.h>
-#include <time.h>
-#include <type_traits>
-#include <unistd.h>
-
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray/xray_log_interface.h"
-#include "xray_buffer_queue.h"
-#include "xray_defs.h"
-#include "xray_fdr_log_records.h"
-#include "xray_flags.h"
-#include "xray_tsc.h"
-
-namespace __xray {
-
-__sanitizer::atomic_sint32_t LoggingStatus = {
-    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-
-/// We expose some of the state transitions when FDR logging mode is operating
-/// such that we can simulate a series of log events that may occur without
-/// and test with determinism without worrying about the real CPU time.
-///
-/// Because the code uses thread_local allocation extensively as part of its
-/// design, callers that wish to test events occuring on different threads
-/// will actually have to run them on different threads.
-///
-/// This also means that it is possible to break invariants maintained by
-/// cooperation with xray_fdr_logging class, so be careful and think twice.
-namespace __xray_fdr_internal {
-
-/// Writes the new buffer record and wallclock time that begin a buffer for the
-/// current thread.
-static void writeNewBufferPreamble(pid_t Tid, timespec TS);
-
-/// Writes a Function Record to the buffer associated with the current thread.
-static void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
-                                XRayEntryType EntryType);
-
-/// Sets up a new buffer in thread_local storage and writes a preamble. The
-/// wall_clock_reader function is used to populate the WallTimeRecord entry.
-static void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
-                                                    struct timespec *));
-
-/// TSC Wrap records are written when a TSC delta encoding scheme overflows.
-static void writeTSCWrapMetadata(uint64_t TSC);
-
-// Group together thread-local-data in a struct, then hide it behind a function
-// call so that it can be initialized on first use instead of as a global. We
-// force the alignment to 64-bytes for x86 cache line alignment, as this
-// structure is used in the hot path of implementation.
-struct alignas(64) ThreadLocalData {
-  BufferQueue::Buffer Buffer;
-  char *RecordPtr = nullptr;
-  // The number of FunctionEntry records immediately preceding RecordPtr.
-  uint8_t NumConsecutiveFnEnters = 0;
-
-  // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
-  // records preceding RecordPtr.
-  uint8_t NumTailCalls = 0;
-
-  // We use a thread_local variable to keep track of which CPUs we've already
-  // run, and the TSC times for these CPUs. This allows us to stop repeating the
-  // CPU field in the function records.
-  //
-  // We assume that we'll support only 65536 CPUs for x86_64.
-  uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
-  uint64_t LastTSC = 0;
-  uint64_t LastFunctionEntryTSC = 0;
-
-  // Make sure a thread that's ever called handleArg0 has a thread-local
-  // live reference to the buffer queue for this particular instance of
-  // FDRLogging, and that we're going to clean it up when the thread exits.
-  BufferQueue *BQ = nullptr;
-};
-
-static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
-              "ThreadLocalData must be trivially destructible");
-
-static constexpr auto MetadataRecSize = sizeof(MetadataRecord);
-static constexpr auto FunctionRecSize = sizeof(FunctionRecord);
-
-// Use a global pthread key to identify thread-local data for logging.
-static pthread_key_t Key;
-
-// This function will initialize the thread-local data structure used by the FDR
-// logging implementation and return a reference to it. The implementation
-// details require a bit of care to maintain.
-//
-// First, some requirements on the implementation in general:
-//
-//   - XRay handlers should not call any memory allocation routines that may
-//     delegate to an instrumented implementation. This means functions like
-//     malloc() and free() should not be called while instrumenting.
-//
-//   - We would like to use some thread-local data initialized on first-use of
-//     the XRay instrumentation. These allow us to implement unsynchronized
-//     routines that access resources associated with the thread.
-//
-// The implementation here uses a few mechanisms that allow us to provide both
-// the requirements listed above. We do this by:
-//
-//   1. Using a thread-local aligned storage buffer for representing the
-//      ThreadLocalData struct. This data will be uninitialized memory by
-//      design.
-//
-//   2. Not requiring a thread exit handler/implementation, keeping the
-//      thread-local as purely a collection of references/data that do not
-//      require cleanup.
-//
-// We're doing this to avoid using a `thread_local` object that has a
-// non-trivial destructor, because the C++ runtime might call std::malloc(...)
-// to register calls to destructors. Deadlocks may arise when, for example, an
-// externally provided malloc implementation is XRay instrumented, and
-// initializing the thread-locals involves calling into malloc. A malloc
-// implementation that does global synchronization might be holding a lock for a
-// critical section, calling a function that might be XRay instrumented (and
-// thus in turn calling into malloc by virtue of registration of the
-// thread_local's destructor).
-static ThreadLocalData &getThreadLocalData() {
-  static_assert(alignof(ThreadLocalData) >= 64,
-                "ThreadLocalData must be cache line aligned.");
-  thread_local ThreadLocalData TLD;
-  thread_local bool UNUSED ThreadOnce = [] {
-    pthread_setspecific(Key, &TLD);
-    return false;
-  }();
-  return TLD;
-}
-
-//-----------------------------------------------------------------------------|
-// The rest of the file is implementation.                                     |
-//-----------------------------------------------------------------------------|
-// Functions are implemented in the header for inlining since we don't want    |
-// to grow the stack when we've hijacked the binary for logging.               |
-//-----------------------------------------------------------------------------|
-
-namespace {
-
-class RecursionGuard {
-  volatile bool &Running;
-  const bool Valid;
-
-public:
-  explicit RecursionGuard(volatile bool &R) : Running(R), Valid(!R) {
-    if (Valid)
-      Running = true;
-  }
-
-  RecursionGuard(const RecursionGuard &) = delete;
-  RecursionGuard(RecursionGuard &&) = delete;
-  RecursionGuard &operator=(const RecursionGuard &) = delete;
-  RecursionGuard &operator=(RecursionGuard &&) = delete;
-
-  explicit operator bool() const { return Valid; }
-
-  ~RecursionGuard() noexcept {
-    if (Valid)
-      Running = false;
-  }
-};
-
-} // namespace
-
-static void writeNewBufferPreamble(pid_t Tid,
-                                   timespec TS) XRAY_NEVER_INSTRUMENT {
-  static constexpr int InitRecordsCount = 2;
-  auto &TLD = getThreadLocalData();
-  MetadataRecord Metadata[InitRecordsCount];
-  {
-    // Write out a MetadataRecord to signify that this is the start of a new
-    // buffer, associated with a particular thread, with a new CPU.  For the
-    // data, we have 15 bytes to squeeze as much information as we can.  At this
-    // point we only write down the following bytes:
-    //   - Thread ID (pid_t, 4 bytes)
-    auto &NewBuffer = Metadata[0];
-    NewBuffer.Type = uint8_t(RecordType::Metadata);
-    NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer);
-    std::memcpy(&NewBuffer.Data, &Tid, sizeof(pid_t));
-  }
-
-  // Also write the WalltimeMarker record.
-  {
-    static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
-    auto &WalltimeMarker = Metadata[1];
-    WalltimeMarker.Type = uint8_t(RecordType::Metadata);
-    WalltimeMarker.RecordKind =
-        uint8_t(MetadataRecord::RecordKinds::WalltimeMarker);
-
-    // We only really need microsecond precision here, and enforce across
-    // platforms that we need 64-bit seconds and 32-bit microseconds encoded in
-    // the Metadata record.
-    int32_t Micros = TS.tv_nsec / 1000;
-    int64_t Seconds = TS.tv_sec;
-    std::memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds));
-    std::memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, sizeof(Micros));
-  }
-
-  TLD.NumConsecutiveFnEnters = 0;
-  TLD.NumTailCalls = 0;
-  if (TLD.BQ == nullptr || TLD.BQ->finalizing())
-    return;
-  std::memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
-  TLD.RecordPtr += sizeof(Metadata);
-  // Since we write out the extents as the first metadata record of the
-  // buffer, we need to write out the extents including the extents record.
-  __sanitizer::atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
-                            __sanitizer::memory_order_release);
-}
-
-inline void setupNewBuffer(int (*wall_clock_reader)(
-    clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  auto &B = TLD.Buffer;
-  TLD.RecordPtr = static_cast<char *>(B.Buffer);
-  pid_t Tid = syscall(SYS_gettid);
-  timespec TS{0, 0};
-  // This is typically clock_gettime, but callers have injection ability.
-  wall_clock_reader(CLOCK_MONOTONIC, &TS);
-  writeNewBufferPreamble(Tid, TS);
-  TLD.NumConsecutiveFnEnters = 0;
-  TLD.NumTailCalls = 0;
-}
-
-static void incrementExtents(size_t Add) {
-  auto &TLD = getThreadLocalData();
-  __sanitizer::atomic_fetch_add(&TLD.Buffer.Extents->Size, Add,
-                                __sanitizer::memory_order_acq_rel);
-}
-
-static void decrementExtents(size_t Subtract) {
-  auto &TLD = getThreadLocalData();
-  __sanitizer::atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract,
-                                __sanitizer::memory_order_acq_rel);
-}
-
-inline void writeNewCPUIdMetadata(uint16_t CPU,
-                                  uint64_t TSC) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  MetadataRecord NewCPUId;
-  NewCPUId.Type = uint8_t(RecordType::Metadata);
-  NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
-
-  // The data for the New CPU will contain the following bytes:
-  //   - CPU ID (uint16_t, 2 bytes)
-  //   - Full TSC (uint64_t, 8 bytes)
-  // Total = 10 bytes.
-  std::memcpy(&NewCPUId.Data, &CPU, sizeof(CPU));
-  std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC));
-  std::memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord));
-  TLD.RecordPtr += sizeof(MetadataRecord);
-  TLD.NumConsecutiveFnEnters = 0;
-  TLD.NumTailCalls = 0;
-  incrementExtents(sizeof(MetadataRecord));
-}
-
-inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  MetadataRecord TSCWrap;
-  TSCWrap.Type = uint8_t(RecordType::Metadata);
-  TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
-
-  // The data for the TSCWrap record contains the following bytes:
-  //   - Full TSC (uint64_t, 8 bytes)
-  // Total = 8 bytes.
-  std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC));
-  std::memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord));
-  TLD.RecordPtr += sizeof(MetadataRecord);
-  TLD.NumConsecutiveFnEnters = 0;
-  TLD.NumTailCalls = 0;
-  incrementExtents(sizeof(MetadataRecord));
-}
-
-// Call Argument metadata records store the arguments to a function in the
-// order of their appearance; holes are not supported by the buffer format.
-static inline void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  MetadataRecord CallArg;
-  CallArg.Type = uint8_t(RecordType::Metadata);
-  CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument);
-
-  std::memcpy(CallArg.Data, &A, sizeof(A));
-  std::memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord));
-  TLD.RecordPtr += sizeof(MetadataRecord);
-  incrementExtents(sizeof(MetadataRecord));
-}
-
-static inline void
-writeFunctionRecord(int FuncId, uint32_t TSCDelta,
-                    XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT {
-  FunctionRecord FuncRecord;
-  FuncRecord.Type = uint8_t(RecordType::Function);
-  // Only take 28 bits of the function id.
-  FuncRecord.FuncId = FuncId & ~(0x0F << 28);
-  FuncRecord.TSCDelta = TSCDelta;
-
-  auto &TLD = getThreadLocalData();
-  switch (EntryType) {
-  case XRayEntryType::ENTRY:
-    ++TLD.NumConsecutiveFnEnters;
-    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
-    break;
-  case XRayEntryType::LOG_ARGS_ENTRY:
-    // We should not rewind functions with logged args.
-    TLD.NumConsecutiveFnEnters = 0;
-    TLD.NumTailCalls = 0;
-    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
-    break;
-  case XRayEntryType::EXIT:
-    // If we've decided to log the function exit, we will never erase the log
-    // before it.
-    TLD.NumConsecutiveFnEnters = 0;
-    TLD.NumTailCalls = 0;
-    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
-    break;
-  case XRayEntryType::TAIL:
-    // If we just entered the function we're tail exiting from or erased every
-    // invocation since then, this function entry tail pair is a candidate to
-    // be erased when the child function exits.
-    if (TLD.NumConsecutiveFnEnters > 0) {
-      ++TLD.NumTailCalls;
-      TLD.NumConsecutiveFnEnters = 0;
-    } else {
-      // We will never be able to erase this tail call since we have logged
-      // something in between the function entry and tail exit.
-      TLD.NumTailCalls = 0;
-      TLD.NumConsecutiveFnEnters = 0;
-    }
-    FuncRecord.RecordKind =
-        uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
-    break;
-  case XRayEntryType::CUSTOM_EVENT: {
-    // This is a bug in patching, so we'll report it once and move on.
-    static bool Once = [&] {
-      Report("Internal error: patched an XRay custom event call as a function; "
-             "func id = %d\n",
-             FuncId);
-      return true;
-    }();
-    (void)Once;
-    return;
-  }
-  }
-
-  std::memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord));
-  TLD.RecordPtr += sizeof(FunctionRecord);
-  incrementExtents(sizeof(FunctionRecord));
-}
-
-static uint64_t thresholdTicks() {
-  static uint64_t TicksPerSec = probeRequiredCPUFeatures()
-                                    ? getTSCFrequency()
-                                    : __xray::NanosecondsPerSecond;
-  static const uint64_t ThresholdTicks =
-      TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000;
-  return ThresholdTicks;
-}
-
-// Re-point the thread local pointer into this thread's Buffer before the recent
-// "Function Entry" record and any "Tail Call Exit" records after that.
-static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
-                             uint64_t &LastFunctionEntryTSC, int32_t FuncId) {
-  auto &TLD = getThreadLocalData();
-  TLD.RecordPtr -= FunctionRecSize;
-  decrementExtents(FunctionRecSize);
-  FunctionRecord FuncRecord;
-  std::memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize);
-  assert(FuncRecord.RecordKind ==
-             uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
-         "Expected to find function entry recording when rewinding.");
-  assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) &&
-         "Expected matching function id when rewinding Exit");
-  --TLD.NumConsecutiveFnEnters;
-  LastTSC -= FuncRecord.TSCDelta;
-
-  // We unwound one call. Update the state and return without writing a log.
-  if (TLD.NumConsecutiveFnEnters != 0) {
-    LastFunctionEntryTSC -= FuncRecord.TSCDelta;
-    return;
-  }
-
-  // Otherwise we've rewound the stack of all function entries, we might be
-  // able to rewind further by erasing tail call functions that are being
-  // exited from via this exit.
-  LastFunctionEntryTSC = 0;
-  auto RewindingTSC = LastTSC;
-  auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize;
-  while (TLD.NumTailCalls > 0) {
-    // Rewind the TSC back over the TAIL EXIT record.
-    FunctionRecord ExpectedTailExit;
-    std::memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize);
-
-    assert(ExpectedTailExit.RecordKind ==
-               uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) &&
-           "Expected to find tail exit when rewinding.");
-    RewindingRecordPtr -= FunctionRecSize;
-    RewindingTSC -= ExpectedTailExit.TSCDelta;
-    FunctionRecord ExpectedFunctionEntry;
-    std::memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, FunctionRecSize);
-    assert(ExpectedFunctionEntry.RecordKind ==
-               uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
-           "Expected to find function entry when rewinding tail call.");
-    assert(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId &&
-           "Expected funcids to match when rewinding tail call.");
-
-    // This tail call exceeded the threshold duration. It will not be erased.
-    if ((TSC - RewindingTSC) >= thresholdTicks()) {
-      TLD.NumTailCalls = 0;
-      return;
-    }
-
-    // We can erase a tail exit pair that we're exiting through since
-    // its duration is under threshold.
-    --TLD.NumTailCalls;
-    RewindingRecordPtr -= FunctionRecSize;
-    RewindingTSC -= ExpectedFunctionEntry.TSCDelta;
-    TLD.RecordPtr -= 2 * FunctionRecSize;
-    LastTSC = RewindingTSC;
-    decrementExtents(2 * FunctionRecSize);
-  }
-}
-
-inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
-  auto &TLD = getThreadLocalData();
-  auto EC = BQArg.releaseBuffer(TLD.Buffer);
-  if (EC != BufferQueue::ErrorCode::Ok) {
-    Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Buffer,
-           BufferQueue::getErrorString(EC));
-    return false;
-  }
-  return true;
-}
-
-inline bool prepareBuffer(uint64_t TSC, unsigned char CPU,
-                          int (*wall_clock_reader)(clockid_t,
-                                                   struct timespec *),
-                          size_t MaxSize) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  char *BufferStart = static_cast<char *>(TLD.Buffer.Buffer);
-  if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) {
-    if (!releaseThreadLocalBuffer(*TLD.BQ))
-      return false;
-    auto EC = TLD.BQ->getBuffer(TLD.Buffer);
-    if (EC != BufferQueue::ErrorCode::Ok) {
-      Report("Failed to acquire a buffer; error=%s\n",
-             BufferQueue::getErrorString(EC));
-      return false;
-    }
-    setupNewBuffer(wall_clock_reader);
-
-    // Always write the CPU metadata as the first record in the buffer.
-    writeNewCPUIdMetadata(CPU, TSC);
-  }
-  return true;
-}
-
-inline bool
-isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU,
-                         int (*wall_clock_reader)(clockid_t, struct timespec *))
-    XRAY_NEVER_INSTRUMENT {
-  // Bail out right away if logging is not initialized yet.
-  // We should take the opportunity to release the buffer though.
-  auto Status = __sanitizer::atomic_load(&LoggingStatus,
-                                         __sanitizer::memory_order_acquire);
-  auto &TLD = getThreadLocalData();
-  if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
-    if (TLD.RecordPtr != nullptr &&
-        (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
-         Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
-      if (!releaseThreadLocalBuffer(*LBQ))
-        return false;
-      TLD.RecordPtr = nullptr;
-      return false;
-    }
-    return false;
-  }
-
-  if (__sanitizer::atomic_load(&LoggingStatus,
-                               __sanitizer::memory_order_acquire) !=
-          XRayLogInitStatus::XRAY_LOG_INITIALIZED ||
-      LBQ->finalizing()) {
-    if (!releaseThreadLocalBuffer(*LBQ))
-      return false;
-    TLD.RecordPtr = nullptr;
-  }
-
-  if (TLD.Buffer.Buffer == nullptr) {
-    auto EC = LBQ->getBuffer(TLD.Buffer);
-    if (EC != BufferQueue::ErrorCode::Ok) {
-      auto LS = __sanitizer::atomic_load(&LoggingStatus,
-                                         __sanitizer::memory_order_acquire);
-      if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING &&
-          LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
-        Report("Failed to acquire a buffer; error=%s\n",
-               BufferQueue::getErrorString(EC));
-      return false;
-    }
-
-    setupNewBuffer(wall_clock_reader);
-
-    // Always write the CPU metadata as the first record in the buffer.
-    writeNewCPUIdMetadata(CPU, TSC);
-  }
-
-  if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) {
-    // This means this is the first CPU this thread has ever run on. We set
-    // the current CPU and record this as the first TSC we've seen.
-    TLD.CurrentCPU = CPU;
-    writeNewCPUIdMetadata(CPU, TSC);
-  }
-
-  return true;
-} // namespace __xray_fdr_internal
-
-// Compute the TSC difference between the time of measurement and the previous
-// event. There are a few interesting situations we need to account for:
-//
-//   - The thread has migrated to a different CPU. If this is the case, then
-//     we write down the following records:
-//
-//       1. A 'NewCPUId' Metadata record.
-//       2. A FunctionRecord with a 0 for the TSCDelta field.
-//
-//   - The TSC delta is greater than the 32 bits we can store in a
-//     FunctionRecord. In this case we write down the following records:
-//
-//       1. A 'TSCWrap' Metadata record.
-//       2. A FunctionRecord with a 0 for the TSCDelta field.
-//
-//   - The TSC delta is representable within the 32 bits we can store in a
-//     FunctionRecord. In this case we write down just a FunctionRecord with
-//     the correct TSC delta.
-inline uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC,
-                                   uint8_t CPU) {
-  if (CPU != TLD.CurrentCPU) {
-    // We've moved to a new CPU.
-    writeNewCPUIdMetadata(CPU, TSC);
-    return 0;
-  }
-  // If the delta is greater than the range for a uint32_t, then we write out
-  // the TSC wrap metadata entry with the full TSC, and the TSC for the
-  // function record be 0.
-  uint64_t Delta = TSC - TLD.LastTSC;
-  if (Delta <= std::numeric_limits<uint32_t>::max())
-    return Delta;
-
-  writeTSCWrapMetadata(TSC);
-  return 0;
-}
-
-inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  auto BufferStart = static_cast<char *>(TLD.Buffer.Buffer);
-  if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <=
-      ptrdiff_t{MetadataRecSize}) {
-    if (!releaseThreadLocalBuffer(*TLD.BQ))
-      return;
-    TLD.RecordPtr = nullptr;
-  }
-}
-
-thread_local volatile bool Running = false;
-
-/// Here's where the meat of the processing happens. The writer captures
-/// function entry, exit and tail exit points with a time and will create
-/// TSCWrap, NewCPUId and Function records as necessary. The writer might
-/// walk backward through its buffer and erase trivial functions to avoid
-/// polluting the log and may use the buffer queue to obtain or release a
-/// buffer.
-inline void processFunctionHook(int32_t FuncId, XRayEntryType Entry,
-                                uint64_t TSC, unsigned char CPU, uint64_t Arg1,
-                                int (*wall_clock_reader)(clockid_t,
-                                                         struct timespec *),
-                                BufferQueue *BQ) XRAY_NEVER_INSTRUMENT {
-  // Prevent signal handler recursion, so in case we're already in a log writing
-  // mode and the signal handler comes in (and is also instrumented) then we
-  // don't want to be clobbering potentially partial writes already happening in
-  // the thread. We use a simple thread_local latch to only allow one on-going
-  // handleArg0 to happen at any given time.
-  RecursionGuard Guard{Running};
-  if (!Guard) {
-    assert(Running == true && "RecursionGuard is buggy!");
-    return;
-  }
-
-  auto &TLD = getThreadLocalData();
-
-  // In case the reference has been cleaned up before, we make sure we
-  // initialize it to the provided BufferQueue.
-  if (TLD.BQ == nullptr)
-    TLD.BQ = BQ;
-
-  if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
-    return;
-
-  // Before we go setting up writing new function entries, we need to be really
-  // careful about the pointer math we're doing. This means we need to ensure
-  // that the record we are about to write is going to fit into the buffer,
-  // without overflowing the buffer.
-  //
-  // To do this properly, we use the following assumptions:
-  //
-  //   - The least number of bytes we will ever write is 8
-  //     (sizeof(FunctionRecord)) only if the delta between the previous entry
-  //     and this entry is within 32 bits.
-  //   - The most number of bytes we will ever write is 8 + 16 + 16 = 40.
-  //     This is computed by:
-  //
-  //       MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord)
-  //
-  //     These arise in the following cases:
-  //
-  //       1. When the delta between the TSC we get and the previous TSC for the
-  //          same CPU is outside of the uint32_t range, we end up having to
-  //          write a MetadataRecord to indicate a "tsc wrap" before the actual
-  //          FunctionRecord.
-  //       2. When we learn that we've moved CPUs, we need to write a
-  //          MetadataRecord to indicate a "cpu change", and thus write out the
-  //          current TSC for that CPU before writing out the actual
-  //          FunctionRecord.
-  //       3. When we learn about a new CPU ID, we need to write down a "new cpu
-  //          id" MetadataRecord before writing out the actual FunctionRecord.
-  //       4. The second MetadataRecord is the optional function call argument.
-  //
-  // So the math we need to do is to determine whether writing 40 bytes past the
-  // current pointer exceeds the buffer's maximum size. If we don't have enough
-  // space to write 40 bytes in the buffer, we need get a new Buffer, set it up
-  // properly before doing any further writing.
-  size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize;
-  if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) {
-    TLD.BQ = nullptr;
-    return;
-  }
-
-  // By this point, we are now ready to write up to 40 bytes (explained above).
-  assert((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Buffer) >=
-             static_cast<ptrdiff_t>(MetadataRecSize) &&
-         "Misconfigured BufferQueue provided; Buffer size not large enough.");
-
-  auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU);
-  TLD.LastTSC = TSC;
-  TLD.CurrentCPU = CPU;
-  switch (Entry) {
-  case XRayEntryType::ENTRY:
-  case XRayEntryType::LOG_ARGS_ENTRY:
-    // Update the thread local state for the next invocation.
-    TLD.LastFunctionEntryTSC = TSC;
-    break;
-  case XRayEntryType::TAIL:
-  case XRayEntryType::EXIT:
-    // Break out and write the exit record if we can't erase any functions.
-    if (TLD.NumConsecutiveFnEnters == 0 ||
-        (TSC - TLD.LastFunctionEntryTSC) >= thresholdTicks())
-      break;
-    rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId);
-    return; // without writing log.
-  case XRayEntryType::CUSTOM_EVENT: {
-    // This is a bug in patching, so we'll report it once and move on.
-    static bool Once = [&] {
-      Report("Internal error: patched an XRay custom event call as a function; "
-             "func id = %d",
-             FuncId);
-      return true;
-    }();
-    (void)Once;
-    return;
-  }
-  }
-
-  writeFunctionRecord(FuncId, RecordTSCDelta, Entry);
-  if (Entry == XRayEntryType::LOG_ARGS_ENTRY)
-    writeCallArgumentMetadata(Arg1);
-
-  // If we've exhausted the buffer by this time, we then release the buffer to
-  // make sure that other threads may start using this buffer.
-  endBufferIfFull();
-}
-
-} // namespace __xray_fdr_internal
-} // namespace __xray
-
-#endif // XRAY_XRAY_FDR_LOGGING_IMPL_H
diff --git a/lib/xray/xray_flags.cc b/lib/xray/xray_flags.cc
index 1ee4d10d753c..b50b68666d80 100644
--- a/lib/xray/xray_flags.cc
+++ b/lib/xray/xray_flags.cc
@@ -30,7 +30,7 @@ void Flags::setDefaults() XRAY_NEVER_INSTRUMENT {
 #undef XRAY_FLAG
 }
 
-static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
+void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
 #define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
   RegisterFlag(P, #Name, Description, &F->Name);
 #include "xray_flags.inc"
@@ -42,15 +42,14 @@ static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
 // options that control XRay. This means users/deployments can tweak the
 // defaults that override the hard-coded defaults in the xray_flags.inc at
 // compile-time using the XRAY_DEFAULT_OPTIONS macro.
-static const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
 #ifdef XRAY_DEFAULT_OPTIONS
-// Do the double-layered string conversion to prevent badly crafted strings
-// provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues (or
-// changing the semantics of the implementation through the macro). This ensures
-// that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a string literal.
-#define XRAY_STRINGIZE(x) #x
-#define XRAY_STRINGIZE_OPTIONS(options) XRAY_STRINGIZE(options)
-  return XRAY_STRINGIZE_OPTIONS(XRAY_DEFAULT_OPTIONS);
+  // Do the double-layered string conversion to prevent badly crafted strings
+  // provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues
+  // (or changing the semantics of the implementation through the macro). This
+  // ensures that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a
+  // string literal.
+  return SANITIZER_STRINGIFY(XRAY_DEFAULT_OPTIONS);
 #else
   return "";
 #endif
diff --git a/lib/xray/xray_flags.h b/lib/xray/xray_flags.h
index 3ed5b8844cb4..7c1ba9458856 100644
--- a/lib/xray/xray_flags.h
+++ b/lib/xray/xray_flags.h
@@ -29,6 +29,8 @@ struct Flags {
 };
 
 extern Flags xray_flags_dont_use_directly;
+extern void registerXRayFlags(FlagParser *P, Flags *F);
+const char *useCompilerDefinedFlags();
 inline Flags *flags() { return &xray_flags_dont_use_directly; }
 
 void initializeFlags();
diff --git a/lib/xray/xray_flags.inc b/lib/xray/xray_flags.inc
index 29f1fce7d7f4..c87903963a36 100644
--- a/lib/xray/xray_flags.inc
+++ b/lib/xray/xray_flags.inc
@@ -27,23 +27,24 @@ XRAY_FLAG(uptr, xray_page_size_override, 0,
 XRAY_FLAG(bool, xray_naive_log, false,
           "DEPRECATED: Use xray_mode=xray-basic instead.")
 XRAY_FLAG(int, xray_naive_log_func_duration_threshold_us, 5,
-          "Naive logging will try to skip functions that execute for fewer "
-          "microseconds than this threshold.")
+          "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set "
+          "func_duration_threshold_us instead.")
 XRAY_FLAG(int, xray_naive_log_max_stack_depth, 64,
-          "Naive logging will keep track of at most this deep a call stack, "
-          "any more and the recordings will be droppped.")
+          "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set "
+          "max_stack_depth instead.")
 XRAY_FLAG(int, xray_naive_log_thread_buffer_size, 1024,
-          "The number of entries to keep on a per-thread buffer.")
+          "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set "
+          "thread_buffer_size instead.")
 
 // FDR (Flight Data Recorder) Mode logging options.
 XRAY_FLAG(bool, xray_fdr_log, false,
           "DEPRECATED: Use xray_mode=xray-fdr instead.")
 XRAY_FLAG(int, xray_fdr_log_func_duration_threshold_us, 5,
-          "FDR logging will try to skip functions that execute for fewer "
-          "microseconds than this threshold.")
+          "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set "
+          "func_duration_threshold_us instead.")
 XRAY_FLAG(int, xray_fdr_log_grace_period_us, 0,
-          "DEPRECATED: use xray_fdr_log_grace_period_ms instead.")
+          "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set "
+          "grace_period_ms instead.")
 XRAY_FLAG(int, xray_fdr_log_grace_period_ms, 100,
-          "FDR logging will wait this much time in microseconds before "
-          "actually flushing the log; this gives a chance for threads to "
-          "notice that the log has been finalized and clean up.")
+          "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set "
+          "grace_period_ms instead.")
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
new file mode 100644
index 000000000000..2acf14aa5625
--- /dev/null
+++ b/lib/xray/xray_function_call_trie.h
@@ -0,0 +1,455 @@
+//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a function call trie.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FUNCTION_CALL_TRIE_H
+#define XRAY_FUNCTION_CALL_TRIE_H
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
+#include <memory> // For placement new.
+#include <utility>
+
+namespace __xray {
+
+/// A FunctionCallTrie represents the stack traces of XRay instrumented
+/// functions that we've encountered, where a node corresponds to a function and
+/// the path from the root to the node its stack trace. Each node in the trie
+/// will contain some useful values, including:
+///
+///   * The cumulative amount of time spent in this particular node/stack.
+///   * The number of times this stack has appeared.
+///   * A histogram of latencies for that particular node.
+///
+/// Each node in the trie will also contain a list of callees, represented using
+/// an Array<NodeIdPair> -- each NodeIdPair instance will contain the function
+/// ID of the callee, and a pointer to the node.
+///
+/// If we visualise this data structure, we'll find the following potential
+/// representation:
+///
+///   [function id node] -> [callees] [cumulative time]
+///                         [call counter] [latency histogram]
+///
+/// As an example, when we have a function in this pseudocode:
+///
+///   func f(N) {
+///     g()
+///     h()
+///     for i := 1..N { j() }
+///   }
+///
+/// We may end up with a trie of the following form:
+///
+///   f -> [ g, h, j ] [...] [1] [...]
+///   g -> [ ... ] [...] [1] [...]
+///   h -> [ ... ] [...] [1] [...]
+///   j -> [ ... ] [...] [N] [...]
+///
+/// If for instance the function g() called j() like so:
+///
+///   func g() {
+///     for i := 1..10 { j() }
+///   }
+///
+/// We'll find the following updated trie:
+///
+///   f -> [ g, h, j ] [...] [1] [...]
+///   g -> [ j' ] [...] [1] [...]
+///   h -> [ ... ] [...] [1] [...]
+///   j -> [ ... ] [...] [N] [...]
+///   j' -> [ ... ] [...] [10] [...]
+///
+/// Note that we'll have a new node representing the path `f -> g -> j'` with
+/// isolated data. This isolation gives us a means of representing the stack
+/// traces as a path, as opposed to a key in a table. The alternative
+/// implementation here would be to use a separate table for the path, and use
+/// hashes of the path as an identifier to accumulate the information. We've
+/// moved away from this approach as it takes a lot of time to compute the hash
+/// every time we need to update a function's call information as we're handling
+/// the entry and exit events.
+///
+/// This approach allows us to maintain a shadow stack, which represents the
+/// currently executing path, and on function exits quickly compute the amount
+/// of time elapsed from the entry, then update the counters for the node
+/// already represented in the trie. This necessitates an efficient
+/// representation of the various data structures (the list of callees must be
+/// cache-aware and efficient to look up, and the histogram must be compact and
+/// quick to update) to enable us to keep the overheads of this implementation
+/// to the minimum.
+class FunctionCallTrie {
+public:
+  struct Node;
+
+  // We use a NodeIdPair type instead of a std::pair<...> to not rely on the
+  // standard library types in this header.
+  struct NodeIdPair {
+    Node *NodePtr;
+    int32_t FId;
+
+    // Constructor for inplace-construction.
+    NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
+  };
+
+  using NodeIdPairArray = Array<NodeIdPair>;
+  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
+
+  // A Node in the FunctionCallTrie holds a pointer to its parent node, the
+  // list of its callees, a counter of calls (CallCount), the time accumulated
+  // for this node (CumulativeLocalTime), and the ID of the XRay instrumented
+  // function that it is associated with. There is no separate field for a
+  // total that also covers the node's callees.
+  struct Node {
+    Node *Parent; // nullptr for the nodes that are roots of the trie.
+    NodeIdPairArray Callees;
+    int64_t CallCount;
+    int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+    int32_t FId;
+
+    // We add a constructor here to allow us to inplace-construct through
+    // Array<...>'s AppendEmplace.
+    Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+         int32_t F)
+        : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+          FId(F) {}
+
+    // TODO: Include the compact histogram.
+  };
+
+private:
+  struct ShadowStackEntry {
+    uint64_t EntryTSC;
+    Node *NodePtr;
+
+    // We add a constructor here to allow us to inplace-construct through
+    // Array<...>'s AppendEmplace.
+    ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
+  };
+
+  using NodeArray = Array<Node>;
+  using RootArray = Array<Node *>;
+  using ShadowStackArray = Array<ShadowStackEntry>;
+
+public:
+  // We collate the allocators we need into a single struct, as a convenience to
+  // allow us to initialize these as a group.
+  struct Allocators {
+    using NodeAllocatorType = NodeArray::AllocatorType;
+    using RootAllocatorType = RootArray::AllocatorType;
+    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;
+
+    NodeAllocatorType *NodeAllocator = nullptr;
+    RootAllocatorType *RootAllocator = nullptr;
+    ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
+    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+    Allocators() {}
+    Allocators(const Allocators &) = delete;
+    Allocators &operator=(const Allocators &) = delete;
+
+    Allocators(Allocators &&O)
+        : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
+          ShadowStackAllocator(O.ShadowStackAllocator),
+          NodeIdPairAllocator(O.NodeIdPairAllocator) {
+      O.NodeAllocator = nullptr;
+      O.RootAllocator = nullptr;
+      O.ShadowStackAllocator = nullptr;
+      O.NodeIdPairAllocator = nullptr;
+    }
+
+    Allocators &operator=(Allocators &&O) {
+      {
+        auto Tmp = O.NodeAllocator;
+        O.NodeAllocator = this->NodeAllocator;
+        this->NodeAllocator = Tmp;
+      }
+      {
+        auto Tmp = O.RootAllocator;
+        O.RootAllocator = this->RootAllocator;
+        this->RootAllocator = Tmp;
+      }
+      {
+        auto Tmp = O.ShadowStackAllocator;
+        O.ShadowStackAllocator = this->ShadowStackAllocator;
+        this->ShadowStackAllocator = Tmp;
+      }
+      {
+        auto Tmp = O.NodeIdPairAllocator;
+        O.NodeIdPairAllocator = this->NodeIdPairAllocator;
+        this->NodeIdPairAllocator = Tmp;
+      }
+      return *this;
+    }
+
+    ~Allocators() {
+      // Note that we cannot use delete on these pointers, as they need to be
+      // returned to the sanitizer_common library's internal memory tracking
+      // system.
+      if (NodeAllocator != nullptr) {
+        NodeAllocator->~NodeAllocatorType();
+        InternalFree(NodeAllocator);
+        NodeAllocator = nullptr;
+      }
+      if (RootAllocator != nullptr) {
+        RootAllocator->~RootAllocatorType();
+        InternalFree(RootAllocator);
+        RootAllocator = nullptr;
+      }
+      if (ShadowStackAllocator != nullptr) {
+        ShadowStackAllocator->~ShadowStackAllocatorType();
+        InternalFree(ShadowStackAllocator);
+        ShadowStackAllocator = nullptr;
+      }
+      if (NodeIdPairAllocator != nullptr) {
+        NodeIdPairAllocator->~NodeIdPairAllocatorType();
+        InternalFree(NodeIdPairAllocator);
+        NodeIdPairAllocator = nullptr;
+      }
+    }
+  };
+
+  // TODO: Support configuration of options through the arguments.
+  static Allocators InitAllocators() {
+    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
+  }
+
+  static Allocators InitAllocatorsCustom(uptr Max) {
+    Allocators A;
+    auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
+        InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
+    new (NodeAllocator) Allocators::NodeAllocatorType(Max);
+    A.NodeAllocator = NodeAllocator;
+
+    auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
+        InternalAlloc(sizeof(Allocators::RootAllocatorType)));
+    new (RootAllocator) Allocators::RootAllocatorType(Max);
+    A.RootAllocator = RootAllocator;
+
+    auto ShadowStackAllocator =
+        reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
+            InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
+    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
+    A.ShadowStackAllocator = ShadowStackAllocator;
+
+    auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+        InternalAlloc(sizeof(NodeIdPairAllocatorType)));
+    new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
+    A.NodeIdPairAllocator = NodeIdPairAllocator;
+    return A;
+  }
+
+private:
+  NodeArray Nodes;
+  RootArray Roots;
+  ShadowStackArray ShadowStack;
+  NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+public:
+  explicit FunctionCallTrie(const Allocators &A)
+      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+        ShadowStack(*A.ShadowStackAllocator),
+        NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+
+  void enterFunction(const int32_t FId, uint64_t TSC) {
+    DCHECK_NE(FId, 0);
+    // Push the node for FId onto the ShadowStack, together with the entry TSC,
+    // so that it is ready for when the matching exit event is encountered.
+    if (UNLIKELY(ShadowStack.empty())) { // Always adds a new root.
+      auto NewRoot =
+          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+      if (UNLIKELY(NewRoot == nullptr))
+        return; // No node could be allocated; this entry is not recorded.
+      Roots.Append(NewRoot);
+      ShadowStack.AppendEmplace(TSC, NewRoot);
+      return;
+    }
+
+    auto &Top = ShadowStack.back();
+    auto TopNode = Top.NodePtr;
+    DCHECK_NE(TopNode, nullptr);
+
+    // If we've seen this callee before, then we just access that node and place
+    // that on the top of the stack.
+    auto Callee = TopNode->Callees.find_element(
+        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
+    if (Callee != nullptr) {
+      CHECK_NE(Callee->NodePtr, nullptr);
+      ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+      return;
+    }
+
+    // We've never seen this stack before: create a new node under TopNode.
+    auto NewNode =
+        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+    if (UNLIKELY(NewNode == nullptr))
+      return; // No node could be allocated; this entry is not recorded.
+    DCHECK_NE(NewNode, nullptr);
+    TopNode->Callees.AppendEmplace(NewNode, FId);
+    ShadowStack.AppendEmplace(TSC, NewNode);
+    DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
+    return;
+  }
+
+  void exitFunction(int32_t FId, uint64_t TSC) {
+    // When we exit a function, we pop ShadowStack entries until the one for
+    // FId has been popped (or the stack is empty). Each popped node is
+    // credited with one call and with the time since its entry, less the time
+    // already credited to the entries popped before it during this exit.
+    uint64_t CalleeTreeTime = 0;
+    while (!ShadowStack.empty()) {
+      const auto &Top = ShadowStack.back();
+      auto TopNode = Top.NodePtr;
+      DCHECK_NE(TopNode, nullptr);
+      auto LocalTime = TSC - Top.EntryTSC;
+      TopNode->CallCount++;
+      TopNode->CumulativeLocalTime += LocalTime - CalleeTreeTime;
+      CalleeTreeTime = LocalTime; // Already spans every frame popped so far.
+      ShadowStack.trim(1);
+
+      // TODO: Update the histogram for the node.
+      if (TopNode->FId == FId)
+        break;
+    }
+  }
+
+  const RootArray &getRoots() const { return Roots; }
+
+  // The deepCopyInto operation will update the provided FunctionCallTrie by
+  // re-creating the contents of this particular FunctionCallTrie in the other
+  // FunctionCallTrie. It will do this using a Depth First Traversal from the
+  // roots, and while doing so recreating the traversal in the provided
+  // FunctionCallTrie.
+  //
+  // |O| must be empty (this is DCHECK'ed): existing state in |O| is neither
+  // cleared nor searched, so copying into a non-empty trie would produce
+  // duplicate entries. If a node cannot be allocated in |O| the copy stops
+  // early, leaving |O| with only the part copied so far.
+  //
+  // This function is *not* thread-safe, and may require external
+  // synchronisation of both "this" and |O|.
+  void deepCopyInto(FunctionCallTrie &O) const {
+    DCHECK(O.getRoots().empty());
+
+    // We then push the root into a stack, to use as the parent marker for new
+    // nodes we push in as we're traversing depth-first down the call tree.
+    struct NodeAndParent {
+      FunctionCallTrie::Node *Node;    // Node being copied from this trie.
+      FunctionCallTrie::Node *NewNode; // Its already-created copy in O.
+    };
+    using Stack = Array<NodeAndParent>;
+
+    typename Stack::AllocatorType StackAllocator(
+        profilingFlags()->stack_allocator_max);
+    Stack DFSStack(StackAllocator);
+
+    for (const auto Root : getRoots()) {
+      // Add a node in O for this root.
+      auto NewRoot = O.Nodes.AppendEmplace(
+          nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+          Root->CumulativeLocalTime, Root->FId);
+
+      // Because we cannot allocate more memory we should bail out right away.
+      if (UNLIKELY(NewRoot == nullptr))
+        return;
+
+      O.Roots.Append(NewRoot);
+
+      // TODO: Figure out what to do if we fail to allocate any more stack
+      // space. Maybe warn or report once?
+      DFSStack.AppendEmplace(Root, NewRoot);
+      while (!DFSStack.empty()) {
+        NodeAndParent NP = DFSStack.back();
+        DCHECK_NE(NP.Node, nullptr);
+        DCHECK_NE(NP.NewNode, nullptr);
+        DFSStack.trim(1);
+        for (const auto Callee : NP.Node->Callees) {
+          auto NewNode = O.Nodes.AppendEmplace(
+              NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
+              Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+          if (UNLIKELY(NewNode == nullptr))
+            return;
+          NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
+          DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
+        }
+      }
+    }
+  }
+
+  // The mergeInto operation walks this trie depth-first from its roots and
+  // adds each node's call count and cumulative local time to the node on the
+  // same path in |O|, creating that node (and its callee entry) in |O| when it
+  // doesn't exist yet. If any allocation in |O| fails, the merge stops early
+  // and |O| keeps whatever was merged up to that point.
+  //
+  // This function is *not* thread-safe, and may require external
+  // synchronisation of both "this" and |O|.
+  void mergeInto(FunctionCallTrie &O) const {
+    struct NodeAndTarget {
+      FunctionCallTrie::Node *OrigNode;   // Node in this trie.
+      FunctionCallTrie::Node *TargetNode; // Node on the same path in O.
+    };
+    using Stack = Array<NodeAndTarget>;
+    typename Stack::AllocatorType StackAllocator(
+        profilingFlags()->stack_allocator_max);
+    Stack DFSStack(StackAllocator);
+
+    for (const auto Root : getRoots()) {
+      Node *TargetRoot = nullptr;
+      auto R = O.Roots.find_element(
+          [&](const Node *N) { return N->FId == Root->FId; });
+      if (R == nullptr) {
+        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
+                                           0, Root->FId);
+        if (UNLIKELY(TargetRoot == nullptr))
+          return;
+
+        O.Roots.Append(TargetRoot);
+      } else {
+        TargetRoot = *R;
+      }
+
+      DFSStack.Append(NodeAndTarget{Root, TargetRoot});
+      while (!DFSStack.empty()) {
+        NodeAndTarget NT = DFSStack.back();
+        DCHECK_NE(NT.OrigNode, nullptr);
+        DCHECK_NE(NT.TargetNode, nullptr);
+        DFSStack.trim(1);
+        // TODO: Update the histogram as well when we have it ready.
+        NT.TargetNode->CallCount += NT.OrigNode->CallCount;
+        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
+        for (const auto Callee : NT.OrigNode->Callees) {
+          auto TargetCallee = NT.TargetNode->Callees.find_element(
+              [&](const FunctionCallTrie::NodeIdPair &C) {
+                return C.FId == Callee.FId;
+              });
+          if (TargetCallee == nullptr) {
+            auto NewTargetNode = O.Nodes.AppendEmplace(
+                NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+            if (UNLIKELY(NewTargetNode == nullptr))
+              return;
+            TargetCallee =
+                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
+            if (UNLIKELY(TargetCallee == nullptr)) // Append failed.
+              return;
+          }
+          DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
+        }
+      }
+    }
+  }
+};
+
+} // namespace __xray
+
+#endif // XRAY_FUNCTION_CALL_TRIE_H
diff --git a/lib/xray/xray_init.cc b/lib/xray/xray_init.cc
index 11892cb8b7a3..b4e069795195 100644
--- a/lib/xray/xray_init.cc
+++ b/lib/xray/xray_init.cc
@@ -38,32 +38,29 @@ using namespace __xray;
 //
 // FIXME: Support DSO instrumentation maps too. The current solution only works
 // for statically linked executables.
-__sanitizer::atomic_uint8_t XRayInitialized{0};
+atomic_uint8_t XRayInitialized{0};
 
 // This should always be updated before XRayInitialized is updated.
-__sanitizer::SpinMutex XRayInstrMapMutex;
+SpinMutex XRayInstrMapMutex;
 XRaySledMap XRayInstrMap;
 
 // Global flag to determine whether the flags have been initialized.
-__sanitizer::atomic_uint8_t XRayFlagsInitialized{0};
+atomic_uint8_t XRayFlagsInitialized{0};
 
 // A mutex to allow only one thread to initialize the XRay data structures.
-__sanitizer::SpinMutex XRayInitMutex;
+SpinMutex XRayInitMutex;
 
 // __xray_init() will do the actual loading of the current process' memory map
 // and then proceed to look for the .xray_instr_map section/segment.
 void __xray_init() XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayInitMutex);
+  SpinMutexLock Guard(&XRayInitMutex);
   // Short-circuit if we've already initialized XRay before.
-  if (__sanitizer::atomic_load(&XRayInitialized,
-                               __sanitizer::memory_order_acquire))
+  if (atomic_load(&XRayInitialized, memory_order_acquire))
     return;
 
-  if (!__sanitizer::atomic_load(&XRayFlagsInitialized,
-                                __sanitizer::memory_order_acquire)) {
+  if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) {
     initializeFlags();
-    __sanitizer::atomic_store(&XRayFlagsInitialized, true,
-                              __sanitizer::memory_order_release);
+    atomic_store(&XRayFlagsInitialized, true, memory_order_release);
   }
 
   if (__start_xray_instr_map == nullptr) {
@@ -73,14 +70,13 @@ void __xray_init() XRAY_NEVER_INSTRUMENT {
   }
 
   {
-    __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+    SpinMutexLock Guard(&XRayInstrMapMutex);
     XRayInstrMap.Sleds = __start_xray_instr_map;
     XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map;
     XRayInstrMap.SledsIndex = __start_xray_fn_idx;
     XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx;
   }
-  __sanitizer::atomic_store(&XRayInitialized, true,
-                            __sanitizer::memory_order_release);
+  atomic_store(&XRayInitialized, true, memory_order_release);
 
 #ifndef XRAY_NO_PREINIT
   if (flags()->patch_premain)
@@ -88,7 +84,13 @@ void __xray_init() XRAY_NEVER_INSTRUMENT {
 #endif
 }
 
-#if !defined(XRAY_NO_PREINIT) && SANITIZER_CAN_USE_PREINIT_ARRAY
+// FIXME: Make check-xray tests work on FreeBSD without
+// SANITIZER_CAN_USE_PREINIT_ARRAY.
+// See sanitizer_internal_defs.h where the macro is defined.
+// Calling unresolved PLT functions in .preinit_array can lead to deadlock on
+// FreeBSD but here it seems benign.
+#if !defined(XRAY_NO_PREINIT) &&                                               \
+    (SANITIZER_CAN_USE_PREINIT_ARRAY || SANITIZER_FREEBSD)
 // Only add the preinit array initialization if the sanitizers can.
 __attribute__((section(".preinit_array"),
                used)) void (*__local_xray_preinit)(void) = __xray_init;
diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc
index 766313e85c58..01bf6ddc607e 100644
--- a/lib/xray/xray_interface.cc
+++ b/lib/xray/xray_interface.cc
@@ -19,9 +19,12 @@
 #include <cstdio>
 #include <errno.h>
 #include <limits>
+#include <string.h>
 #include <sys/mman.h>
 
+#include "sanitizer_common/sanitizer_addrhashmap.h"
 #include "sanitizer_common/sanitizer_common.h"
+
 #include "xray_defs.h"
 #include "xray_flags.h"
 
@@ -48,26 +51,40 @@ static const int16_t cSledLength = 8;
 #endif /* CPU architecture */
 
 // This is the function to call when we encounter the entry or exit sleds.
-__sanitizer::atomic_uintptr_t XRayPatchedFunction{0};
+atomic_uintptr_t XRayPatchedFunction{0};
 
 // This is the function to call from the arg1-enabled sleds/trampolines.
-__sanitizer::atomic_uintptr_t XRayArgLogger{0};
+atomic_uintptr_t XRayArgLogger{0};
 
 // This is the function to call when we encounter a custom event log call.
-__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0};
+atomic_uintptr_t XRayPatchedCustomEvent{0};
+
+// This is the function to call when we encounter a typed event log call.
+atomic_uintptr_t XRayPatchedTypedEvent{0};
 
 // This is the global status to determine whether we are currently
 // patching/unpatching.
-__sanitizer::atomic_uint8_t XRayPatching{0};
+atomic_uint8_t XRayPatching{0};
+
+struct TypeDescription {
+  uint32_t type_id;
+  std::size_t description_string_length;
+};
+
+using TypeDescriptorMapType = AddrHashMap<TypeDescription, 11>;
+// An address map from immutable descriptors to type ids.
+TypeDescriptorMapType TypeDescriptorAddressMap{};
 
-// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo
-// any successful mprotect(...) changes. This is used to make a page writeable
-// and executable, and upon destruction if it was successful in doing so returns
-// the page into a read-only and executable page.
+atomic_uint32_t TypeEventDescriptorCounter{0};
+
+// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will
+// undo any successful mprotect(...) changes. This is used to make a page
+// writeable and executable, and upon destruction if it was successful in
+// doing so returns the page into a read-only and executable page.
 //
 // This is only used specifically for runtime-patching of the XRay
-// instrumentation points. This assumes that the executable pages are originally
-// read-and-execute only.
+// instrumentation points. This assumes that the executable pages are
+// originally read-and-execute only.
 class MProtectHelper {
   void *PageAlignedAddr;
   std::size_t MProtectLen;
@@ -116,6 +133,9 @@ bool patchSled(const XRaySledEntry &Sled, bool Enable,
   case XRayEntryType::CUSTOM_EVENT:
     Success = patchCustomEvent(Enable, FuncId, Sled);
     break;
+  case XRayEntryType::TYPED_EVENT:
+    Success = patchTypedEvent(Enable, FuncId, Sled);
+    break;
   default:
     Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind));
     return false;
@@ -125,19 +145,19 @@ bool patchSled(const XRaySledEntry &Sled, bool Enable,
 
 XRayPatchingStatus patchFunction(int32_t FuncId,
                                  bool Enable) XRAY_NEVER_INSTRUMENT {
-  if (!__sanitizer::atomic_load(&XRayInitialized,
-                                __sanitizer::memory_order_acquire))
+  if (!atomic_load(&XRayInitialized,
+                                memory_order_acquire))
     return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
 
   uint8_t NotPatching = false;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel))
+  if (!atomic_compare_exchange_strong(
+          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
     return XRayPatchingStatus::ONGOING; // Already patching.
 
   // Next, we look for the function index.
   XRaySledMap InstrMap;
   {
-    __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+    SpinMutexLock Guard(&XRayInstrMapMutex);
     InstrMap = XRayInstrMap;
   }
 
@@ -161,8 +181,8 @@ XRayPatchingStatus patchFunction(int32_t FuncId,
   while (f != e)
     SucceedOnce |= patchSled(*f++, Enable, FuncId);
 
-  __sanitizer::atomic_store(&XRayPatching, false,
-                            __sanitizer::memory_order_release);
+  atomic_store(&XRayPatching, false,
+                            memory_order_release);
 
   if (!SucceedOnce) {
     Report("Failed patching any sled for function '%d'.", FuncId);
@@ -176,26 +196,26 @@ XRayPatchingStatus patchFunction(int32_t FuncId,
 // implementation. |Enable| defines whether we're enabling or disabling the
 // runtime XRay instrumentation.
 XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
-  if (!__sanitizer::atomic_load(&XRayInitialized,
-                                __sanitizer::memory_order_acquire))
+  if (!atomic_load(&XRayInitialized,
+                                memory_order_acquire))
     return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
 
   uint8_t NotPatching = false;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel))
+  if (!atomic_compare_exchange_strong(
+          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
     return XRayPatchingStatus::ONGOING; // Already patching.
 
   uint8_t PatchingSuccess = false;
   auto XRayPatchingStatusResetter =
-      __sanitizer::at_scope_exit([&PatchingSuccess] {
+      at_scope_exit([&PatchingSuccess] {
         if (!PatchingSuccess)
-          __sanitizer::atomic_store(&XRayPatching, false,
-                                    __sanitizer::memory_order_release);
+          atomic_store(&XRayPatching, false,
+                                    memory_order_release);
       });
 
   XRaySledMap InstrMap;
   {
-    __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+    SpinMutexLock Guard(&XRayInstrMapMutex);
     InstrMap = XRayInstrMap;
   }
   if (InstrMap.Entries == 0)
@@ -251,8 +271,8 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
     }
     patchSled(Sled, Enable, FuncId);
   }
-  __sanitizer::atomic_store(&XRayPatching, false,
-                            __sanitizer::memory_order_release);
+  atomic_store(&XRayPatching, false,
+                            memory_order_release);
   PatchingSuccess = true;
   return XRayPatchingStatus::SUCCESS;
 }
@@ -261,7 +281,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId,
                                             bool Enable) XRAY_NEVER_INSTRUMENT {
   XRaySledMap InstrMap;
   {
-    __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+    SpinMutexLock Guard(&XRayInstrMapMutex);
     InstrMap = XRayInstrMap;
   }
 
@@ -318,12 +338,12 @@ using namespace __xray;
 
 int __xray_set_handler(void (*entry)(int32_t,
                                      XRayEntryType)) XRAY_NEVER_INSTRUMENT {
-  if (__sanitizer::atomic_load(&XRayInitialized,
-                               __sanitizer::memory_order_acquire)) {
+  if (atomic_load(&XRayInitialized,
+                               memory_order_acquire)) {
 
-    __sanitizer::atomic_store(&__xray::XRayPatchedFunction,
+    atomic_store(&__xray::XRayPatchedFunction,
                               reinterpret_cast<uintptr_t>(entry),
-                              __sanitizer::memory_order_release);
+                              memory_order_release);
     return 1;
   }
   return 0;
@@ -331,11 +351,23 @@ int __xray_set_handler(void (*entry)(int32_t,
 
 int __xray_set_customevent_handler(void (*entry)(void *, size_t))
     XRAY_NEVER_INSTRUMENT {
-  if (__sanitizer::atomic_load(&XRayInitialized,
-                               __sanitizer::memory_order_acquire)) {
-    __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent,
+  if (atomic_load(&XRayInitialized,
+                               memory_order_acquire)) {
+    atomic_store(&__xray::XRayPatchedCustomEvent,
+                              reinterpret_cast<uintptr_t>(entry),
+                              memory_order_release);
+    return 1;
+  }
+  return 0;
+}
+
+int __xray_set_typedevent_handler(void (*entry)(
+    uint16_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT {
+  if (atomic_load(&XRayInitialized,
+                               memory_order_acquire)) {
+    atomic_store(&__xray::XRayPatchedTypedEvent,
                               reinterpret_cast<uintptr_t>(entry),
-                              __sanitizer::memory_order_release);
+                              memory_order_release);
     return 1;
   }
   return 0;
@@ -349,6 +381,21 @@ int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT {
   return __xray_set_customevent_handler(nullptr);
 }
 
+int __xray_remove_typedevent_handler() XRAY_NEVER_INSTRUMENT {
+  return __xray_set_typedevent_handler(nullptr);
+}
+
+uint16_t __xray_register_event_type(
+    const char *const event_type) XRAY_NEVER_INSTRUMENT {
+  TypeDescriptorMapType::Handle h(&TypeDescriptorAddressMap, (uptr)event_type);
+  if (h.created()) {
+    h->type_id = atomic_fetch_add(
+        &TypeEventDescriptorCounter, 1, memory_order_acq_rel);
+    h->description_string_length = strnlen(event_type, 1024);
+  }
+  return h->type_id;
+}
+
 XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT {
   return controlPatching(true);
 }
@@ -367,22 +414,22 @@ __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
 }
 
 int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) {
-  if (!__sanitizer::atomic_load(&XRayInitialized,
-                                __sanitizer::memory_order_acquire))
+  if (!atomic_load(&XRayInitialized,
+                                memory_order_acquire))
     return 0;
 
   // A relaxed write might not be visible even if the current thread gets
   // scheduled on a different CPU/NUMA node.  We need to wait for everyone to
   // have this handler installed for consistency of collected data across CPUs.
-  __sanitizer::atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry),
-                            __sanitizer::memory_order_release);
+  atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry),
+                            memory_order_release);
   return 1;
 }
 
 int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); }
 
 uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+  SpinMutexLock Guard(&XRayInstrMapMutex);
   if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions)
     return 0;
   return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function
@@ -396,6 +443,6 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
 }
 
 size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex);
+  SpinMutexLock Guard(&XRayInstrMapMutex);
   return XRayInstrMap.Functions;
 }
diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h
index 5811e2b7300a..8ca87457437e 100644
--- a/lib/xray/xray_interface_internal.h
+++ b/lib/xray/xray_interface_internal.h
@@ -43,8 +43,8 @@ struct XRaySledEntry {
 };
 
 struct XRayFunctionSledIndex {
-  const XRaySledEntry* Begin;
-  const XRaySledEntry* End;
+  const XRaySledEntry *Begin;
+  const XRaySledEntry *End;
 };
 }
 
@@ -57,12 +57,13 @@ struct XRaySledMap {
   size_t Functions;
 };
 
-bool patchFunctionEntry(bool Enable, uint32_t FuncId,
-                        const XRaySledEntry &Sled, void (*Trampoline)());
+bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled,
+                        void (*Trampoline)());
 bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
 bool patchFunctionTailExit(bool Enable, uint32_t FuncId,
                            const XRaySledEntry &Sled);
 bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
+bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
 
 } // namespace __xray
 
@@ -74,6 +75,7 @@ extern void __xray_FunctionExit();
 extern void __xray_FunctionTailExit();
 extern void __xray_ArgLoggerEntry();
 extern void __xray_CustomEvent();
+extern void __xray_TypedEvent();
 }
 
 #endif
diff --git a/lib/xray/xray_log_interface.cc b/lib/xray/xray_log_interface.cc
index 783f004d292a..0886fd0d1210 100644
--- a/lib/xray/xray_log_interface.cc
+++ b/lib/xray/xray_log_interface.cc
@@ -18,9 +18,20 @@
 #include "xray/xray_interface.h"
 #include "xray_defs.h"
 
-__sanitizer::SpinMutex XRayImplMutex;
-XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr};
-XRayLogImpl *GlobalXRayImpl = nullptr;
+namespace __xray {
+static SpinMutex XRayImplMutex;
+static XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr};
+static XRayLogImpl *GlobalXRayImpl = nullptr;
+
+// This is the default implementation of a buffer iterator, which always yields
+// a null buffer.
+XRayBuffer NullBufferIterator(XRayBuffer) XRAY_NEVER_INSTRUMENT {
+  return {nullptr, 0};
+}
+
+// This is the global function responsible for iterating through given buffers.
+atomic_uintptr_t XRayBufferIterator{
+    reinterpret_cast<uintptr_t>(&NullBufferIterator)};
 
 // We use a linked list of Mode to XRayLogImpl mappings. This is a linked list
 // when it should be a map because we're avoiding having to depend on C++
@@ -31,9 +42,24 @@ struct ModeImpl {
   XRayLogImpl Impl;
 };
 
-ModeImpl SentinelModeImpl{
+static ModeImpl SentinelModeImpl{
     nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}};
-ModeImpl *ModeImpls = &SentinelModeImpl;
+static ModeImpl *ModeImpls = &SentinelModeImpl;
+static const ModeImpl *CurrentMode = nullptr;
+
+} // namespace __xray
+
+using namespace __xray;
+
+void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer))
+    XRAY_NEVER_INSTRUMENT {
+  atomic_store(&__xray::XRayBufferIterator,
+               reinterpret_cast<uintptr_t>(Iterator), memory_order_release);
+}
+
+void __xray_log_remove_buffer_iterator() XRAY_NEVER_INSTRUMENT {
+  __xray_log_set_buffer_iterator(&NullBufferIterator);
+}
 
 XRayLogRegisterStatus
 __xray_log_register_mode(const char *Mode,
@@ -42,16 +68,15 @@ __xray_log_register_mode(const char *Mode,
       Impl.log_finalize == nullptr || Impl.log_init == nullptr)
     return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL;
 
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   // First, look for whether the mode already has a registered implementation.
   for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) {
-    if (!__sanitizer::internal_strcmp(Mode, it->Mode))
+    if (!internal_strcmp(Mode, it->Mode))
       return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE;
   }
-  auto *NewModeImpl =
-      static_cast<ModeImpl *>(__sanitizer::InternalAlloc(sizeof(ModeImpl)));
+  auto *NewModeImpl = static_cast<ModeImpl *>(InternalAlloc(sizeof(ModeImpl)));
   NewModeImpl->Next = ModeImpls;
-  NewModeImpl->Mode = __sanitizer::internal_strdup(Mode);
+  NewModeImpl->Mode = internal_strdup(Mode);
   NewModeImpl->Impl = Impl;
   ModeImpls = NewModeImpl;
   return XRayLogRegisterStatus::XRAY_REGISTRATION_OK;
@@ -59,9 +84,10 @@ __xray_log_register_mode(const char *Mode,
 
 XRayLogRegisterStatus
 __xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) {
-    if (!__sanitizer::internal_strcmp(Mode, it->Mode)) {
+    if (!internal_strcmp(Mode, it->Mode)) {
+      CurrentMode = it;
       CurrentXRayImpl = it->Impl;
       GlobalXRayImpl = &CurrentXRayImpl;
       __xray_set_handler(it->Impl.handle_arg0);
@@ -71,24 +97,32 @@ __xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT {
   return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND;
 }
 
+// Returns the name of the currently selected mode, or nullptr when no mode is
+// selected. CurrentMode is read while holding XRayImplMutex.
+const char *__xray_log_get_current_mode() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  return CurrentMode != nullptr ? CurrentMode->Mode : nullptr;
+}
+
 void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT {
   if (Impl.log_init == nullptr || Impl.log_finalize == nullptr ||
       Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) {
-    __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+    SpinMutexLock Guard(&XRayImplMutex);
     GlobalXRayImpl = nullptr;
+    CurrentMode = nullptr;
     __xray_remove_handler();
     __xray_remove_handler_arg1();
     return;
   }
 
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   CurrentXRayImpl = Impl;
   GlobalXRayImpl = &CurrentXRayImpl;
   __xray_set_handler(Impl.handle_arg0);
 }
 
 void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   GlobalXRayImpl = nullptr;
   __xray_remove_handler();
   __xray_remove_handler_arg1();
@@ -97,22 +131,80 @@ void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT {
 XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers,
                                   void *Args,
                                   size_t ArgsSize) XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   if (!GlobalXRayImpl)
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
   return GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize);
 }
 
+// Initialises the selected mode from a string configuration. This fails with
+// XRAY_LOG_UNINITIALIZED unless Mode names the currently selected mode.
+XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config)
+    XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Mode is handed to internal_strcmp below, so a null Mode is rejected here
+  // together with a null Config instead of being dereferenced.
+  if (Mode == nullptr || Config == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // log_init takes a void *, so drop the constness of Config to pass it on.
+  return GlobalXRayImpl->log_init(
+      0, 0, const_cast<void *>(static_cast<const void *>(Config)), 0);
+}
+
+// Forwards Config and ConfigSize to log_init of the selected mode. This fails
+// with XRAY_LOG_UNINITIALIZED unless Mode names the currently selected mode.
+XRayLogInitStatus
+__xray_log_init_mode_bin(const char *Mode, const char *Config,
+                         size_t ConfigSize) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Mode is handed to internal_strcmp below, so a null Mode is rejected here
+  // together with a null Config instead of being dereferenced.
+  if (Mode == nullptr || Config == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // log_init takes a void *, so drop the constness of Config to pass it on.
+  return GlobalXRayImpl->log_init(
+      0, 0, const_cast<void *>(static_cast<const void *>(Config)), ConfigSize);
+}
+
 XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   if (!GlobalXRayImpl)
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
   return GlobalXRayImpl->log_finalize();
 }
 
 XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT {
-  __sanitizer::SpinMutexLock Guard(&XRayImplMutex);
+  SpinMutexLock Guard(&XRayImplMutex);
   if (!GlobalXRayImpl)
     return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
   return GlobalXRayImpl->flush_log();
 }
+
+XRayLogFlushStatus __xray_log_process_buffers(
+    void (*Processor)(const char *, XRayBuffer)) XRAY_NEVER_INSTRUMENT {
+  // NOTE(review): no lock is taken in this function. GlobalXRayImpl and
+  // CurrentMode are read as-is; only XRayBufferIterator is loaded atomically.
+  if (!GlobalXRayImpl)
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  auto Iterator = reinterpret_cast<XRayBuffer (*)(XRayBuffer)>(
+      atomic_load(&XRayBufferIterator, memory_order_acquire));
+  auto Buffer = (*Iterator)(XRayBuffer{nullptr, 0});
+  auto Mode = CurrentMode ? CurrentMode->Mode : nullptr;
+  while (Buffer.Data != nullptr) {
+    (*Processor)(Mode, Buffer);
+    Buffer = (*Iterator)(Buffer);
+  }
+  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
diff --git a/lib/xray/xray_mips.cc b/lib/xray/xray_mips.cc
index cd863304db29..6f8243828668 100644
--- a/lib/xray/xray_mips.cc
+++ b/lib/xray/xray_mips.cc
@@ -158,6 +158,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
   return false;
 }
 
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips?
+  return false;
+}
+
 } // namespace __xray
 
 extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_mips64.cc b/lib/xray/xray_mips64.cc
index fa8fdd5abccc..f1bdf1d7d22d 100644
--- a/lib/xray/xray_mips64.cc
+++ b/lib/xray/xray_mips64.cc
@@ -166,6 +166,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
   // FIXME: Implement in mips64?
   return false;
 }
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips64?
+  return false;
+}
 } // namespace __xray
 
 extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_powerpc64.cc b/lib/xray/xray_powerpc64.cc
index ab03cb10042f..5e4938361c0c 100644
--- a/lib/xray/xray_powerpc64.cc
+++ b/lib/xray/xray_powerpc64.cc
@@ -99,6 +99,12 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
   return false;
 }
 
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in powerpc64?
+  return false;
+}
+
 } // namespace __xray
 
 extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
new file mode 100644
index 000000000000..a43744d9a0cb
--- /dev/null
+++ b/lib/xray/xray_profile_collector.cc
@@ -0,0 +1,318 @@
+//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_profiling_flags.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+  tid_t TId;
+  FunctionCallTrie *Trie;
+};
+
+struct ProfileBuffer {
+  void *Data;
+  size_t Size;
+};
+
+struct BlockHeader {
+  u32 BlockSize;
+  u32 BlockNum;
+  u64 ThreadId;
+};
+
+// These need to be pointers that point to heap/internal-allocator-allocated
+// objects because these are accessed even at program exit.
+Vector<ThreadTrie> *ThreadTries = nullptr;
+Vector<ProfileBuffer> *ProfileBuffers = nullptr;
+FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+} // namespace
+
+// Stores a deep copy of T, tagged with TId, in the global ThreadTries vector.
+// The first call also creates GlobalAllocators, ThreadTries and ProfileBuffers
+// with the internal allocator.
+void post(const FunctionCallTrie &T, tid_t TId) {
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  pthread_once(&Once, +[] {
+    SpinMutexLock Lock(&GlobalMutex);
+    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+    new (GlobalAllocators) FunctionCallTrie::Allocators();
+    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+        profilingFlags()->global_allocator_max);
+    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+        InternalAlloc(sizeof(Vector<ThreadTrie>)));
+    new (ThreadTries) Vector<ThreadTrie>();
+    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+    new (ProfileBuffers) Vector<ProfileBuffer>();
+  });
+  DCHECK_NE(GlobalAllocators, nullptr);
+  DCHECK_NE(ThreadTries, nullptr);
+  DCHECK_NE(ProfileBuffers, nullptr);
+
+  // Everything below touches shared state, so hold GlobalMutex until return.
+  SpinMutexLock Lock(&GlobalMutex);
+  if (GlobalAllocators == nullptr)
+    return;
+
+  ThreadTrie *Item = ThreadTries->PushBack();
+  Item->TId = TId;
+
+  // The trie is allocated with the internal allocator, not the managed one:
+  //
+  // 1) The tries are tracked in a Vector (from sanitizer_common), which keeps
+  //    elements contiguous like a std::vector<...>. The segmented array
+  //    assumes trivially destructible elements, which FunctionCallTrie isn't.
+  //
+  // 2) A managed allocator would need its own lifetime management here. The
+  //    internal allocator has its own global state and is decoupled from the
+  //    lifetime management required by the managed allocator in XRay.
+  Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+      sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
+  DCHECK_NE(Item->Trie, nullptr);
+  new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+
+  // Copy while still holding GlobalMutex. Item points into ThreadTries'
+  // storage, which a concurrent post() may grow and move through PushBack()
+  // as soon as the lock is released, leaving Item dangling. Holding the lock
+  // also keeps serialize(), which takes the same mutex, from reading a trie
+  // that is only partially copied.
+  T.deepCopyInto(*Item->Trie);
+}
+
+// A PathArray represents the function id's representing a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+  using PathAllocator = typename PathArray::AllocatorType;
+
+  // The Path in this record is the function id's from the leaf to the root of
+  // the function call stack as represented from a FunctionCallTrie.
+  PathArray *Path = nullptr;
+  const FunctionCallTrie::Node *Node = nullptr;
+
+  // Constructor for in-place construction.
+  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+      : Path([&] {
+          auto P =
+              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
+          new (P) PathArray(A);
+          return P;
+        }()),
+        Node(N) {}
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Performs a depth-first walk from every root of Trie. Each visited node adds
+// one record to PRs holding the node and the function ids on its parent chain.
+static void populateRecords(ProfileRecordArray &PRs,
+                            ProfileRecord::PathAllocator &PA,
+                            const FunctionCallTrie &Trie) {
+  using NodeStack = Array<const FunctionCallTrie::Node *>;
+  using NodeStackAllocator = typename NodeStack::AllocatorType;
+  NodeStackAllocator PendingAlloc(profilingFlags()->stack_allocator_max);
+  NodeStack Pending(PendingAlloc);
+  for (const auto Root : Trie.getRoots()) {
+    Pending.Append(Root);
+    while (!Pending.empty()) {
+      auto Current = Pending.back();
+      Pending.trim(1);
+      // Give up on the whole walk once a record can no longer be appended.
+      auto Rec = PRs.AppendEmplace(PA, Current);
+      if (Rec == nullptr)
+        return;
+      DCHECK_NE(Rec, nullptr);
+
+      // Record the path by following Parent links, starting at this node.
+      for (auto Up = Current; Up != nullptr; Up = Up->Parent)
+        Rec->Path->Append(Up->FId);
+      DCHECK(!Rec->Path->empty());
+
+      for (const auto Callee : Current->Callees)
+        Pending.Append(Callee.NodePtr);
+    }
+  }
+}
+
+// Writes Header and then every record into Buffer->Data as raw bytes. Each
+// record is its path of function ids, a zero sentinel id, the node's
+// CallCount and the node's CumulativeLocalTime.
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+                             const ProfileRecordArray &ProfileRecords) {
+  auto Cursor = static_cast<char *>(Buffer->Data);
+  // Copies Size bytes from Src to Cursor, then moves Cursor past them.
+  auto Emit = [&Cursor](const void *Src, uptr Size) {
+    Cursor = static_cast<char *>(internal_memcpy(Cursor, Src, Size)) + Size;
+  };
+
+  Emit(&Header, sizeof(Header));
+  for (const auto &Record : ProfileRecords) {
+    // The function ids on the path come first.
+    for (const auto FId : *Record.Path)
+      Emit(&FId, sizeof(FId));
+
+    // A zero function id marks the end of the path.
+    constexpr int32_t SentinelFId = 0;
+    Cursor = static_cast<char *>(
+                 internal_memset(Cursor, SentinelFId, sizeof(SentinelFId))) +
+             sizeof(SentinelFId);
+
+    // The node data follows the sentinel.
+    Emit(&Record.Node->CallCount, sizeof(Record.Node->CallCount));
+    Emit(&Record.Node->CumulativeLocalTime,
+         sizeof(Record.Node->CumulativeLocalTime));
+  }
+
+  DCHECK_EQ(Cursor - static_cast<char *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+// Rebuilds the global ProfileBuffers from the tries posted so far. Any
+// previously serialized buffers are released first; then one buffer is
+// produced for every thread trie that has at least one root. Blocks are
+// numbered by their position in ProfileBuffers, which is what nextBuffer()
+// relies on to find the block that follows a given one.
+void serialize() {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Clear out the global ProfileBuffers.
+  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+    InternalFree((*ProfileBuffers)[I].Data);
+  ProfileBuffers->Reset();
+
+  if (ThreadTries->Size() == 0)
+    return;
+
+  // Then repopulate the global ProfileBuffers.
+  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+    ProfileRecord::PathAllocator PathAlloc(
+        profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need. We'll
+    // use a local allocator and an __xray::Array<...> to store the intermediary
+    // data, then compute the size as we're going along. Then we'll allocate the
+    // contiguous space to contain the thread buffer data.
+    const auto &Trie = *(*ThreadTries)[I].Trie;
+    if (Trie.getRoots().empty())
+      continue;
+    populateRecords(ProfileRecords, PathAlloc, Trie);
+
+    // populateRecords() gives up when it cannot append a record; if nothing at
+    // all was produced there is no block worth emitting for this thread.
+    if (ProfileRecords.empty())
+      continue;
+
+    // Go through each record, to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //   + block number (4 bytes)
+    //   + thread id (8 bytes)
+    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+    //   + call count (8 bytes)
+    //   + local time (8 bytes)
+    //
+    // Records are written back to back, with no delimiter between them.
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 20 + (4 * Record.Path->size());
+
+    // Threads with empty tries are skipped above, so the index into
+    // ThreadTries can run ahead of the index into ProfileBuffers. Number the
+    // block by the slot it is about to occupy, because nextBuffer() uses the
+    // block number as an index into ProfileBuffers.
+    const u32 BlockNum = static_cast<u32>(ProfileBuffers->Size());
+    BlockHeader Header{16 + CumulativeSizes, BlockNum, (*ThreadTries)[I].TId};
+    auto Buffer = ProfileBuffers->PushBack();
+    Buffer->Size = sizeof(Header) + CumulativeSizes;
+    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    DCHECK_NE(Buffer->Data, nullptr);
+    serializeRecords(Buffer, Header, ProfileRecords);
+
+    // Now clean up the ProfileRecords array, one at a time.
+    for (auto &Record : ProfileRecords) {
+      Record.Path->~PathArray();
+      InternalFree(Record.Path);
+    }
+  }
+}
+
+// Drops everything the collector owns -- the serialized buffers, the
+// per-thread tries and the global allocators -- and then recreates empty
+// containers and fresh allocators so the service can be used again right away.
+void reset() {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Serialized buffers: free each payload, then the vector itself.
+  if (ProfileBuffers != nullptr) {
+    for (uptr Idx = 0; Idx < ProfileBuffers->Size(); ++Idx)
+      InternalFree((*ProfileBuffers)[Idx].Data);
+    ProfileBuffers->Reset();
+    InternalFree(ProfileBuffers);
+    ProfileBuffers = nullptr;
+  }
+
+  // Per-thread tries: destroy and free each trie, then the vector itself.
+  if (ThreadTries != nullptr) {
+    for (uptr Idx = 0; Idx < ThreadTries->Size(); ++Idx) {
+      auto *Trie = (*ThreadTries)[Idx].Trie;
+      Trie->~FunctionCallTrie();
+      InternalFree(Trie);
+    }
+    ThreadTries->Reset();
+    InternalFree(ThreadTries);
+    ThreadTries = nullptr;
+  }
+
+  // Global allocators: tear down the old set, if there is one.
+  if (GlobalAllocators != nullptr) {
+    GlobalAllocators->~Allocators();
+    InternalFree(GlobalAllocators);
+    GlobalAllocators = nullptr;
+  }
+
+  // Rebuild all three pieces of state, each constructed in place inside
+  // storage obtained from the internal allocator.
+  void *AllocatorsMem = InternalAlloc(sizeof(FunctionCallTrie::Allocators));
+  GlobalAllocators = new (AllocatorsMem) FunctionCallTrie::Allocators();
+  *GlobalAllocators = FunctionCallTrie::InitAllocators();
+
+  void *TriesMem = InternalAlloc(sizeof(Vector<ThreadTrie>));
+  ThreadTries = new (TriesMem) Vector<ThreadTrie>();
+
+  void *BuffersMem = InternalAlloc(sizeof(Vector<ProfileBuffer>));
+  ProfileBuffers = new (BuffersMem) Vector<ProfileBuffer>();
+}
+
+// Iterator step over the serialized buffers. A buffer with null Data asks for
+// the first block; otherwise the block header at the start of `B.Data` is read
+// and the block stored one position after its block number is returned. A
+// {nullptr, 0} buffer signals that there is nothing (more) to return.
+XRayBuffer nextBuffer(XRayBuffer B) {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  const XRayBuffer EndOfBuffers{nullptr, 0};
+  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+    return EndOfBuffers;
+
+  if (B.Data == nullptr) {
+    const auto &First = (*ProfileBuffers)[0];
+    return {First.Data, First.Size};
+  }
+
+  // Copy the header out rather than reading through a cast pointer.
+  BlockHeader Current;
+  internal_memcpy(&Current, B.Data, sizeof(BlockHeader));
+  auto Following = Current.BlockNum + 1;
+  if (Following >= ProfileBuffers->Size())
+    return EndOfBuffers;
+
+  const auto &Next = (*ProfileBuffers)[Following];
+  return {Next.Data, Next.Size};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray
diff --git a/lib/xray/xray_profile_collector.h b/lib/xray/xray_profile_collector.h
new file mode 100644
index 000000000000..335043db9526
--- /dev/null
+++ b/lib/xray/xray_profile_collector.h
@@ -0,0 +1,88 @@
+//===-- xray_profile_collector.h -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a data collection service, for XRay
+// profiling. What we implement here is an in-process service where
+// FunctionCallTrie instances can be handed off by threads, to be
+// consolidated/collected.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H
+#define XRAY_XRAY_PROFILE_COLLECTOR_H
+
+#include "xray_function_call_trie.h"
+
+#include "xray/xray_log_interface.h"
+
+namespace __xray {
+
+/// The ProfileCollectorService implements a centralised mechanism for
+/// collecting FunctionCallTrie instances, indexed by thread ID. On demand, the
+/// ProfileCollectorService can be queried for the most recent state of the
+/// data, in a form that allows traversal.
+namespace profileCollectorService {
+
+/// Posts the FunctionCallTrie associated with a specific Thread ID. This
+/// will:
+///
+///   - Make a copy of the FunctionCallTrie and store that against the Thread
+///     ID. This will use the global allocator for the service-managed
+///     FunctionCallTrie instances.
+///   - Queue up a pointer to the FunctionCallTrie.
+///   - If the queue is long enough (longer than some arbitrary threshold) we
+///     then pre-calculate a single FunctionCallTrie for the whole process.
+///
+///
+/// We are making a copy of the FunctionCallTrie because the intent is to have
+/// this function be called at thread exit, or soon after the profiling
+/// handler is finalized through the XRay APIs. By letting threads each
+/// process their own thread-local FunctionCallTrie instances, we're removing
+/// the need for synchronisation across threads while we're profiling.
+/// However, once we're done profiling, we can then collect copies of these
+/// FunctionCallTrie instances and pay the cost of the copy.
+///
+/// NOTE: In the future, if this turns out to be more costly than "moving" the
+/// FunctionCallTrie instances from the owning thread to the collector
+/// service, then we can change the implementation to do it this way (moving)
+/// instead.
+void post(const FunctionCallTrie &T, tid_t TId);
+
+/// The serialize will process all FunctionCallTrie instances in memory, and
+/// turn those into specifically formatted blocks, each describing the
+/// function call trie's contents in a compact form. In memory, this looks
+/// like the following layout:
+///
+///   - block size (32 bits)
+///   - block number (32 bits)
+///   - thread id (64 bits)
+///   - list of records:
+///     - function ids in leaf to root order, terminated by
+///       0 (32 bits per function id)
+///     - call count (64 bit)
+///     - cumulative local time (64 bit)
+///     - (records are stored back to back; no delimiter is written)
+///
+void serialize();
+
+/// The reset function will clear out any internal memory held by the
+/// service. The intent is to have the resetting be done in calls to the
+/// initialization routine, or explicitly through the flush log API.
+void reset();
+
+/// This nextBuffer function is meant to implement the iterator functionality,
+/// provided in the XRay API.
+XRayBuffer nextBuffer(XRayBuffer B);
+
+} // namespace profileCollectorService
+
+} // namespace __xray
+
+#endif // XRAY_XRAY_PROFILE_COLLECTOR_H
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
new file mode 100644
index 000000000000..786084c77226
--- /dev/null
+++ b/lib/xray/xray_profiling.cc
@@ -0,0 +1,372 @@
+//===-- xray_profiling.cc ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This is the implementation of a profiling handler.
+//
+//===----------------------------------------------------------------------===//
+#include <memory>
+#include <time.h>
+
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_flags.h"
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+
+#include "xray_flags.h"
+#include "xray_profile_collector.h"
+#include "xray_profiling_flags.h"
+#include "xray_recursion_guard.h"
+#include "xray_tsc.h"
+#include "xray_utils.h"
+#include <pthread.h>
+
+namespace __xray {
+
+namespace {
+
+constexpr uptr XRayProfilingVersion = 0x20180424;
+
+// Header written once at the start of a profile file, ahead of the per-thread
+// blocks. profilingFlush() writes the bytes of this struct exactly as they are
+// laid out in memory.
+// NOTE(review): Version, Timestamp and PID are `uptr`, so their width -- and
+// with it the header layout -- presumably follows the target's pointer size;
+// confirm that readers of the file account for this.
+struct XRayProfilingFileHeader {
+  const u64 MagicBytes = 0x7872617970726f66; // Identifier for XRay profiling
+                                             // files 'xrayprof' in hex.
+  const uptr Version = XRayProfilingVersion;
+  uptr Timestamp = 0; // System time in nanoseconds.
+  uptr PID = 0;       // Process ID.
+};
+
+atomic_sint32_t ProfilerLogFlushStatus = {
+    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+
+atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+SpinMutex ProfilerOptionsMutex;
+
+// Per-thread profiling state: the allocators backing a thread's trie and the
+// trie itself. Both pointers are null until getThreadLocalData() allocates
+// them, and are reset to null again by cleanupTLD().
+struct alignas(64) ProfilingData {
+  FunctionCallTrie::Allocators *Allocators = nullptr;
+  FunctionCallTrie *FCT = nullptr;
+};
+
+static pthread_key_t ProfilingKey;
+
+// Raw per-thread storage in which the ProfilingData is constructed. The
+// alignment has to be requested explicitly: ProfilingData is alignas(64), and
+// aligned_storage only guarantees a default alignment when given a size alone.
+thread_local std::aligned_storage<sizeof(ProfilingData),
+                                  alignof(ProfilingData)>::type ThreadStorage{};
+
+// Returns the calling thread's ProfilingData. On a thread's first call the
+// data is constructed in ThreadStorage and registered under ProfilingKey. The
+// allocators and the trie are created lazily, and only while the profiler is
+// not finalizing or finalized -- so callers must be prepared for the returned
+// data to hold null pointers.
+static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+  thread_local auto ThreadOnce = [] {
+    new (&ThreadStorage) ProfilingData{};
+    pthread_setspecific(ProfilingKey, &ThreadStorage);
+    return false;
+  }();
+  (void)ThreadOnce;
+
+  auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+
+  // We need to check whether the global flag to finalizing/finalized has been
+  // switched. If it is, then we ought to not actually initialise the data.
+  auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
+  if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
+      Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)
+    return TLD;
+
+  // If we're live, then we (re-)initialize TLD if the pointers are null.
+  if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) {
+    TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+    new (TLD.Allocators) FunctionCallTrie::Allocators();
+    *TLD.Allocators = FunctionCallTrie::InitAllocators();
+    TLD.FCT = reinterpret_cast<FunctionCallTrie *>(
+        InternalAlloc(sizeof(FunctionCallTrie)));
+    new (TLD.FCT) FunctionCallTrie(*TLD.Allocators);
+  }
+
+  return TLD;
+}
+
+// Tears down the calling thread's trie and allocators and nulls both pointers.
+// Nothing happens unless both are currently set.
+static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
+  auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+  if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
+    return;
+
+  // Run the trie's destructor before the destructor of the allocators it was
+  // built on, and only then hand the storage of both back.
+  TLD.FCT->~FunctionCallTrie();
+  TLD.Allocators->~Allocators();
+  InternalFree(TLD.FCT);
+  InternalFree(TLD.Allocators);
+  TLD.FCT = nullptr;
+  TLD.Allocators = nullptr;
+}
+
+} // namespace
+
+// Returns the default profiling options baked in at build time: the
+// stringified value of XRAY_PROFILER_DEFAULT_OPTIONS when that macro is
+// defined, and the empty string otherwise.
+const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_PROFILER_DEFAULT_OPTIONS
+  return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS);
+#else
+  return "";
+#endif
+}
+
+atomic_sint32_t ProfileFlushStatus = {
+    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+
+// Writes the serialized profile out (unless the no_flush flag is set) and
+// releases the collected data.
+//
+// Returns XRAY_LOG_NOT_FLUSHING when profiling has not been finalized,
+// XRAY_LOG_FLUSHING when another flush is already in progress, and
+// XRAY_LOG_FLUSHED once this call has completed the flush. After a completed
+// flush the profiler is back in the uninitialized state.
+XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
+  if (atomic_load(&ProfilerLogStatus, memory_order_acquire) !=
+      XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+    if (Verbosity())
+      Report("Not flushing profiles, profiling not been finalized.\n");
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  // Claim the flush. An exchange (rather than a compare-exchange expecting
+  // NOT_FLUSHING) also lets a flush start when the previous one left the
+  // status at FLUSHED; only a flush that is still running blocks this one.
+  s32 Previous = atomic_exchange(&ProfilerLogFlushStatus,
+                                 XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+                                 memory_order_acq_rel);
+  if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) {
+    if (Verbosity())
+      Report("Not flushing profiles, implementation still finalizing.\n");
+    return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
+  }
+
+  // At this point, we'll create the file that will contain the profile, but
+  // only if the options say so.
+  if (!profilingFlags()->no_flush) {
+    // First check whether we have data in the profile collector service
+    // before we try and write anything down.
+    XRayBuffer B = profileCollectorService::nextBuffer({nullptr, 0});
+    if (B.Data == nullptr) {
+      if (Verbosity())
+        Report("profiling: No data to flush.\n");
+    } else {
+      int Fd = getLogFD();
+      if (Fd == -1) {
+        if (Verbosity())
+          Report("profiling: Failed to flush to file, dropping data.\n");
+      } else {
+        XRayProfilingFileHeader Header;
+        Header.Timestamp = NanoTime();
+        Header.PID = internal_getpid();
+        retryingWriteAll(Fd, reinterpret_cast<const char *>(&Header),
+                         reinterpret_cast<const char *>(&Header) +
+                             sizeof(Header));
+
+        // Now for each of the threads, write out the profile data as we would
+        // see it in memory, verbatim.
+        while (B.Data != nullptr && B.Size != 0) {
+          retryingWriteAll(Fd, reinterpret_cast<const char *>(B.Data),
+                           reinterpret_cast<const char *>(B.Data) + B.Size);
+          B = profileCollectorService::nextBuffer(B);
+        }
+        // Then we close out the file.
+        internal_close(Fd);
+      }
+    }
+  }
+
+  profileCollectorService::reset();
+
+  // Flush the current thread's local data structures as well.
+  cleanupTLD();
+
+  // Record the outcome in the flush status. The flush enumerators must not be
+  // stored into the init status: they belong to a different enumeration and
+  // would be read back as an unrelated init state.
+  atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+               memory_order_release);
+
+  // The collected data is gone, so the implementation may be initialized again.
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+               memory_order_release);
+
+  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
+
+namespace {
+
+thread_local atomic_uint8_t ReentranceGuard{0};
+
+// Hands the trie in `TLD` to the profile collector under the calling thread's
+// id, then releases the thread-local trie and allocators. Does nothing when
+// either pointer in `TLD` is null.
+// NOTE(review): cleanupTLD() always operates on the calling thread's
+// ThreadStorage, so `TLD` is presumably expected to be that same object --
+// confirm for the thread-exit handler path.
+static void postCurrentThreadFCT(ProfilingData &TLD) {
+  if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
+    return;
+
+  profileCollectorService::post(*TLD.FCT, GetTid());
+  cleanupTLD();
+}
+
+} // namespace
+
+// XRay handler for function entry/exit events. Reads the timestamp counter
+// first, then records the event in the calling thread's trie. Re-entrant calls
+// on the same thread are dropped. Once profiling is finalizing or finalized,
+// the thread's trie is posted to the collector instead of being updated.
+void profilingHandleArg0(int32_t FuncId,
+                         XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+  unsigned char CPU;
+  auto TSC = readTSC(CPU);
+  RecursionGuard G(ReentranceGuard);
+  if (!G)
+    return;
+
+  auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
+  auto &TLD = getThreadLocalData();
+  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED ||
+               Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) {
+    postCurrentThreadFCT(TLD);
+    return;
+  }
+
+  // getThreadLocalData() reads the status again and leaves the trie
+  // unallocated if finalization started after the load above, so the trie may
+  // be missing even though `Status` looked live. Drop the event in that case.
+  if (UNLIKELY(TLD.FCT == nullptr))
+    return;
+
+  switch (Entry) {
+  case XRayEntryType::ENTRY:
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    TLD.FCT->enterFunction(FuncId, TSC);
+    break;
+  case XRayEntryType::EXIT:
+  case XRayEntryType::TAIL:
+    TLD.FCT->exitFunction(FuncId, TSC);
+    break;
+  default:
+    // FIXME: Handle bugs.
+    break;
+  }
+}
+
+// Handler variant for events that carry one argument. The argument is ignored
+// and the event is forwarded to profilingHandleArg0.
+void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry,
+                         uint64_t) XRAY_NEVER_INSTRUMENT {
+  return profilingHandleArg0(FuncId, Entry);
+}
+
+// Moves the profiler from initialized to finalized: waits out the configured
+// grace period, posts the calling thread's trie, and serializes everything the
+// collector holds. If the profiler was not in the initialized state, the state
+// that was observed instead is returned and nothing else happens.
+XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
+  s32 Observed = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+  const bool NowFinalizing = atomic_compare_exchange_strong(
+      &ProfilerLogStatus, &Observed, XRayLogInitStatus::XRAY_LOG_FINALIZING,
+      memory_order_release);
+  if (!NowFinalizing) {
+    if (Verbosity())
+      Report("Cannot finalize profile, the profiling is not initialized.\n");
+    return static_cast<XRayLogInitStatus>(Observed);
+  }
+
+  // Give other threads a chance to notice that we are finalizing.
+  SleepForMillis(profilingFlags()->grace_period_ms);
+
+  // Hand over whatever the current thread has collected, if anything.
+  postCurrentThreadFCT(getThreadLocalData());
+
+  // Turn the collected tries into buffers.
+  profileCollectorService::serialize();
+
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
+               memory_order_release);
+  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+// Initializes the profiling mode.
+//
+// `BufferSize` and `BufferMax` must both be zero; non-zero values are
+// rejected. `Options` is parsed as a flag string after the compiler-defined
+// defaults and the XRAY_PROFILING_OPTIONS environment variable, so it has the
+// final say. `OptionsSize` is not used.
+//
+// On success the collector is reset, the thread-exit and process-exit hooks
+// are registered (once per process), the handlers are installed, and
+// XRAY_LOG_INITIALIZED is returned. If the profiler is not in the
+// uninitialized state, the state observed instead is returned.
+XRayLogInitStatus
+profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
+                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  if (BufferSize != 0 || BufferMax != 0) {
+    if (Verbosity())
+      Report("__xray_log_init() being used, and is unsupported. Use "
+             "__xray_log_init_mode(...) instead. Bailing out.\n");
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  }
+
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
+                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+                                      memory_order_release)) {
+    if (Verbosity())
+      Report("Cannot initialize already initialised profiling "
+             "implementation.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    FlagParser ConfigParser;
+    ProfilerFlags Flags;
+    Flags.setDefaults();
+    registerProfilerFlags(&ConfigParser, &Flags);
+    ConfigParser.ParseString(profilingCompilerDefinedFlags());
+    const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
+    if (Env == nullptr)
+      Env = "";
+    ConfigParser.ParseString(Env);
+
+    // Then parse the configuration string provided.
+    ConfigParser.ParseString(static_cast<const char *>(Options));
+    if (Verbosity())
+      ReportUnrecognizedFlags();
+    *profilingFlags() = Flags;
+  }
+
+  // We need to reset the profile data collection implementation now.
+  profileCollectorService::reset();
+
+  // We need to set up the exit handlers.
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  pthread_once(&Once, +[] {
+    pthread_key_create(&ProfilingKey, +[](void *P) {
+      // This is the thread-exit handler.
+      auto &TLD = *reinterpret_cast<ProfilingData *>(P);
+      if (TLD.Allocators == nullptr && TLD.FCT == nullptr)
+        return;
+
+      postCurrentThreadFCT(TLD);
+    });
+
+    // We also need to set up an exit handler, so that we can get the profile
+    // information at exit time. We use the C API to do this, to not rely on C++
+    // ABI functions for registering exit handlers.
+    Atexit(+[] {
+      // Finalize and flush.
+      if (profilingFinalize() != XRAY_LOG_FINALIZED) {
+        cleanupTLD();
+        return;
+      }
+      if (profilingFlush() != XRAY_LOG_FLUSHED) {
+        cleanupTLD();
+        return;
+      }
+      if (Verbosity())
+        Report("XRay Profile flushed at exit.\n");
+    });
+  });
+
+  __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
+  __xray_set_handler(profilingHandleArg0);
+  __xray_set_handler_arg1(profilingHandleArg1);
+
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+               memory_order_release);
+  if (Verbosity())
+    Report("XRay Profiling init successful.\n");
+
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+// Registers the profiling implementation under the mode name
+// "xray-profiling" and selects it when the xray_mode flag asks for that mode.
+// Returns false if the registration is rejected, true otherwise.
+bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+  // Set up the flag defaults from the static defaults and the
+  // compiler-provided defaults.
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    auto *F = profilingFlags();
+    F->setDefaults();
+    FlagParser ProfilingParser;
+    registerProfilerFlags(&ProfilingParser, F);
+    ProfilingParser.ParseString(profilingCompilerDefinedFlags());
+  }
+
+  XRayLogImpl Impl{
+      profilingLoggingInit,
+      profilingFinalize,
+      profilingHandleArg0,
+      profilingFlush,
+  };
+  auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl);
+  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+    if (Verbosity())
+      Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = "
+             "%d\n",
+             RegistrationResult);
+    return false;
+  }
+
+  // The name selected here has to be the exact name the mode was registered
+  // under above (with a dash, not an underscore).
+  if (!internal_strcmp(flags()->xray_mode, "xray-profiling"))
+    __xray_log_select_mode("xray-profiling");
+  return true;
+}
+
+} // namespace __xray
+
+static auto UNUSED Unused = __xray::profilingDynamicInitializer();
diff --git a/lib/xray/xray_profiling_flags.cc b/lib/xray/xray_profiling_flags.cc
new file mode 100644
index 000000000000..593e66a78ad2
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.cc
@@ -0,0 +1,40 @@
+//===-- xray_profiling_flags.cc ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiling runtime flags.
+//===----------------------------------------------------------------------===//
+
+#include "xray_profiling_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+// Storage for the profiling flags.
+ProfilerFlags xray_profiling_flags_dont_use_directly;
+
+// Assigns every flag its default value. The assignments are generated by
+// expanding each XRAY_FLAG entry of xray_profiling_flags.inc into
+// `Name = DefaultValue;`.
+void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+// Registers every profiling flag with parser `P`, binding each flag's name and
+// description to the matching member of `F`. One RegisterFlag call is
+// generated per XRAY_FLAG entry of xray_profiling_flags.inc.
+void registerProfilerFlags(FlagParser *P,
+                           ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_profiling_flags.h b/lib/xray/xray_profiling_flags.h
new file mode 100644
index 000000000000..2f9a7514799a
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.h
@@ -0,0 +1,39 @@
+//===-- xray_profiling_flags.h ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiling runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_PROFILER_FLAGS_H
+#define XRAY_PROFILER_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+// The set of profiling flags. One data member is generated per XRAY_FLAG entry
+// of xray_profiling_flags.inc, using the entry's type and name.
+struct ProfilerFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+
+  // Sets every flag to the default value listed in xray_profiling_flags.inc.
+  void setDefaults();
+};
+
+// Storage for the process-wide profiling flags. As the name says, go through
+// profilingFlags() instead of touching this object directly.
+extern ProfilerFlags xray_profiling_flags_dont_use_directly;
+
+// Returns a pointer to the process-wide profiling flags.
+inline ProfilerFlags *profilingFlags() {
+  return &xray_profiling_flags_dont_use_directly;
+}
+
+// Registers all profiling flags of `F` with the parser `P`.
+void registerProfilerFlags(FlagParser *P, ProfilerFlags *F);
+
+} // namespace __xray
+
+#endif // XRAY_PROFILER_FLAGS_H
diff --git a/lib/xray/xray_profiling_flags.inc b/lib/xray/xray_profiling_flags.inc
new file mode 100644
index 000000000000..e9230ae64187
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.inc
@@ -0,0 +1,29 @@
+//===-- xray_profiling_flags.inc --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay profiling runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+// Each entry is XRAY_FLAG(Type, Name, DefaultValue, Description); the
+// including file decides what an entry expands to.
+
+// Default: 2 << 20 bytes, i.e. 2 MiB.
+XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+          "Maximum size of any single per-thread allocator.")
+// Default: 2 << 24 bytes, i.e. 32 MiB.
+XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
+          "Maximum size of the global allocator for profile storage.")
+// Default: 2 << 20 bytes, i.e. 2 MiB.
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 20,
+          "Maximum size of the traversal stack allocator.")
+XRAY_FLAG(int, grace_period_ms, 1,
+          "Profile collection will wait this much time in milliseconds before "
+          "resetting the global state. This gives a chance to threads to "
+          "notice that the profiler has been finalized and clean up.")
+XRAY_FLAG(bool, no_flush, false,
+          "Set to true if we want the profiling implementation to not write "
+          "out files.")
diff --git a/lib/xray/xray_recursion_guard.h b/lib/xray/xray_recursion_guard.h
new file mode 100644
index 000000000000..6edadea563bc
--- /dev/null
+++ b/lib/xray/xray_recursion_guard.h
@@ -0,0 +1,57 @@
+//===-- xray_recursion_guard.h ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_RECURSION_GUARD_H
+#define XRAY_XRAY_RECURSION_GUARD_H
+
+#include "sanitizer_common/sanitizer_atomic.h"
+
+namespace __xray {
+
+/// The RecursionGuard is useful for guarding against signal handlers which are
+/// also potentially calling XRay-instrumented functions. To use the
+/// RecursionGuard, you'll typically need a thread_local atomic_uint8_t:
+///
+///   thread_local atomic_uint8_t Guard{0};
+///
+///   // In a handler function:
+///   void handleArg0(int32_t F, XRayEntryType T) {
+///     RecursionGuard G(Guard);
+///     if (!G)
+///       return;  // Failed to acquire the guard.
+///     ...
+///   }
+///
+class RecursionGuard {
+  // The flag being guarded; 1 while some guard holds it, 0 otherwise.
+  atomic_uint8_t &Running;
+  // True only if this guard found the flag clear and therefore owns it.
+  const bool Valid;
+
+public:
+  // Sets the flag to 1 unconditionally. The guard is valid only if the value
+  // it replaced was 0, i.e. nobody else was holding the flag.
+  explicit inline RecursionGuard(atomic_uint8_t &R)
+      : Running(R), Valid(!atomic_exchange(&R, 1, memory_order_acq_rel)) {}
+
+  // A guard is tied to the scope that created it: no copying, no moving.
+  inline RecursionGuard(const RecursionGuard &) = delete;
+  inline RecursionGuard(RecursionGuard &&) = delete;
+  inline RecursionGuard &operator=(const RecursionGuard &) = delete;
+  inline RecursionGuard &operator=(RecursionGuard &&) = delete;
+
+  // True if the guard was acquired.
+  explicit inline operator bool() const { return Valid; }
+
+  // Clears the flag, but only if this guard is the one that acquired it; a
+  // guard that failed to acquire leaves the flag set for its owner.
+  inline ~RecursionGuard() noexcept {
+    if (Valid)
+      atomic_store(&Running, 0, memory_order_release);
+  }
+};
+
+} // namespace __xray
+
+#endif // XRAY_XRAY_RECURSION_GUARD_H
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
new file mode 100644
index 000000000000..11dd794fa520
--- /dev/null
+++ b/lib/xray/xray_segmented_array.h
@@ -0,0 +1,375 @@
+//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Defines the implementation of a segmented array, with fixed-size blocks
+// backing the segments.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_SEGMENTED_ARRAY_H
+#define XRAY_SEGMENTED_ARRAY_H
+
+#include "sanitizer_common/sanitizer_allocator.h"
+#include "xray_allocator.h"
+#include "xray_utils.h"
+#include <cassert>
+#include <type_traits>
+#include <utility>
+
+namespace __xray {
+
+/// The Array type provides an interface similar to std::vector<...> but never
+/// releases its backing storage. Elements can be appended, and trimmed off the
+/// end with trim(), which keeps the emptied segments on a freelist for re-use.
+/// The implementation is heavily dependent on the contract provided by the
+/// Allocator type, in that all memory will be released when the Allocator is
+/// destroyed; the Array itself never frees memory or runs element destructors.
+template <class T> class Array {
+  struct SegmentBase {
+    SegmentBase *Prev;
+    SegmentBase *Next;
+  };
+
+  // We want each segment of the array to be cache-line aligned, and elements of
+  // the array be offset from the beginning of the segment.
+  struct Segment : SegmentBase {
+    char Data[1];
+  };
+
+public:
+  // Each segment of the array will be laid out with the following assumptions:
+  //
+  //   - Each segment will be on a cache-line address boundary (kCacheLineSize
+  //     aligned).
+  //
+  //   - The elements will be accessed through an aligned pointer, dependent on
+  //     the alignment of T.
+  //
+  //   - Each element is at least two-pointers worth from the beginning of the
+  //     Segment, aligned properly, and the rest of the elements are accessed
+  //     through appropriate alignment.
+  //
+  // We then compute the size of the segment to follow this logic:
+  //
+  //   - Compute the number of elements that can fit within
+  //     kCacheLineSize-multiple segments, minus the size of two pointers.
+  //
+  //   - Request cacheline-multiple sized elements from the allocator.
+  static constexpr size_t AlignedElementStorageSize =
+      sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type);
+
+  static constexpr size_t SegmentSize =
+      nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize);
+
+  using AllocatorType = Allocator<SegmentSize>;
+
+  static constexpr size_t ElementsPerSegment =
+      (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T));
+
+  static_assert(ElementsPerSegment > 0,
+                "Must have at least 1 element per segment.");
+
+  static SegmentBase SentinelSegment;
+
+private:
+  AllocatorType *Alloc;
+  SegmentBase *Head = &SentinelSegment;
+  SegmentBase *Tail = &SentinelSegment;
+  size_t Size = 0;
+
+  // Here we keep track of segments in the freelist, to allow us to re-use
+  // segments when elements are trimmed off the end.
+  SegmentBase *Freelist = &SentinelSegment;
+
+  Segment *NewSegment() {
+    // We need to handle the case in which enough elements have been trimmed to
+    // allow us to re-use segments we've allocated before. For this we look into
+    // the Freelist, to see whether we need to actually allocate new blocks or
+    // just re-use blocks we've already seen before.
+    if (Freelist != &SentinelSegment) {
+      auto *FreeSegment = Freelist;
+      Freelist = FreeSegment->Next;
+      FreeSegment->Next = &SentinelSegment;
+      Freelist->Prev = &SentinelSegment;
+      return static_cast<Segment *>(FreeSegment);
+    }
+
+    auto SegmentBlock = Alloc->Allocate();
+    if (SegmentBlock.Data == nullptr)
+      return nullptr;
+
+    // Placement-new the Segment element at the beginning of the SegmentBlock.
+    auto S = reinterpret_cast<Segment *>(SegmentBlock.Data);
+    new (S) SegmentBase{&SentinelSegment, &SentinelSegment};
+    return S;
+  }
+
+  Segment *InitHeadAndTail() {
+    DCHECK_EQ(Head, &SentinelSegment);
+    DCHECK_EQ(Tail, &SentinelSegment);
+    auto Segment = NewSegment();
+    if (Segment == nullptr)
+      return nullptr;
+    DCHECK_EQ(Segment->Next, &SentinelSegment);
+    DCHECK_EQ(Segment->Prev, &SentinelSegment);
+    Head = Tail = static_cast<SegmentBase *>(Segment);
+    return Segment;
+  }
+
+  Segment *AppendNewSegment() {
+    auto S = NewSegment();
+    if (S == nullptr)
+      return nullptr;
+    DCHECK_NE(Tail, &SentinelSegment);
+    DCHECK_EQ(Tail->Next, &SentinelSegment);
+    DCHECK_EQ(S->Prev, &SentinelSegment);
+    DCHECK_EQ(S->Next, &SentinelSegment);
+    Tail->Next = S;
+    S->Prev = Tail;
+    Tail = S;
+    return static_cast<Segment *>(Tail);
+  }
+
+  // This Iterator models a BidirectionalIterator.
+  template <class U> class Iterator {
+    SegmentBase *S = &SentinelSegment;
+    size_t Offset = 0;
+    size_t Size = 0;
+
+  public:
+    Iterator(SegmentBase *IS, size_t Off, size_t S)
+        : S(IS), Offset(Off), Size(S) {}
+    Iterator(const Iterator &) noexcept = default;
+    Iterator() noexcept = default;
+    Iterator(Iterator &&) noexcept = default;
+    Iterator &operator=(const Iterator &) = default;
+    Iterator &operator=(Iterator &&) = default;
+    ~Iterator() = default;
+
+    Iterator &operator++() {
+      if (++Offset % ElementsPerSegment || Offset == Size)
+        return *this;
+
+      // At this point, we know that Offset % N == 0, so we must advance the
+      // segment pointer.
+      DCHECK_EQ(Offset % ElementsPerSegment, 0);
+      DCHECK_NE(Offset, Size);
+      DCHECK_NE(S, &SentinelSegment);
+      DCHECK_NE(S->Next, &SentinelSegment);
+      S = S->Next;
+      DCHECK_NE(S, &SentinelSegment);
+      return *this;
+    }
+
+    Iterator &operator--() {
+      DCHECK_NE(S, &SentinelSegment);
+      DCHECK_GT(Offset, 0);
+
+      auto PreviousOffset = Offset--;
+      if (PreviousOffset != Size && PreviousOffset % ElementsPerSegment == 0) {
+        DCHECK_NE(S->Prev, &SentinelSegment);
+        S = S->Prev;
+      }
+
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      Iterator Copy(*this);
+      ++(*this);
+      return Copy;
+    }
+
+    Iterator operator--(int) {
+      Iterator Copy(*this);
+      --(*this);
+      return Copy;
+    }
+
+    template <class V, class W>
+    friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
+      return L.S == R.S && L.Offset == R.Offset;
+    }
+
+    template <class V, class W>
+    friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) {
+      return !(L == R);
+    }
+
+    U &operator*() const {
+      DCHECK_NE(S, &SentinelSegment);
+      auto RelOff = Offset % ElementsPerSegment;
+
+      // We need to compute the character-aligned pointer, offset from the
+      // segment's Data location to get the element in the position of Offset.
+      auto Base = static_cast<Segment *>(S)->Data;
+      auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize);
+      return *reinterpret_cast<U *>(AlignedOffset);
+    }
+
+    U *operator->() const { return &(**this); }
+  };
+
+public:
+  explicit Array(AllocatorType &A) : Alloc(&A) {}
+
+  Array(const Array &) = delete;
+  // Steals O's segments, including its freelist, and leaves O empty.
+  Array(Array &&O) NOEXCEPT : Alloc(O.Alloc), Head(O.Head), Tail(O.Tail),
+                              Size(O.Size), Freelist(O.Freelist) {
+    O.Head = &SentinelSegment;
+    O.Tail = &SentinelSegment;
+    O.Size = 0;
+    O.Freelist = &SentinelSegment;
+  }
+
+  bool empty() const { return Size == 0; }
+
+  AllocatorType &allocator() const {
+    DCHECK_NE(Alloc, nullptr);
+    return *Alloc;
+  }
+
+  size_t size() const { return Size; }
+
+  // Copy-constructs E at the end; returns nullptr if no segment is available.
+  T *Append(const T &E) {
+    if (UNLIKELY(Head == &SentinelSegment))
+      if (InitHeadAndTail() == nullptr)
+        return nullptr;
+
+    auto Offset = Size % ElementsPerSegment;
+    if (UNLIKELY(Size != 0 && Offset == 0))
+      if (AppendNewSegment() == nullptr)
+        return nullptr;
+
+    // The slot is raw storage, so construct in place instead of assigning.
+    auto Base = static_cast<Segment *>(Tail)->Data;
+    auto Position = new (Base + (Offset * AlignedElementStorageSize)) T(E);
+    ++Size;
+    return Position;
+  }
+
+  template <class... Args> T *AppendEmplace(Args &&... args) {
+    if (UNLIKELY(Head == &SentinelSegment))
+      if (InitHeadAndTail() == nullptr)
+        return nullptr;
+
+    auto Offset = Size % ElementsPerSegment;
+    auto *LatestSegment = Tail;
+    if (UNLIKELY(Size != 0 && Offset == 0)) {
+      LatestSegment = AppendNewSegment();
+      if (LatestSegment == nullptr)
+        return nullptr;
+    }
+
+    DCHECK_NE(Tail, &SentinelSegment);
+    auto Base = static_cast<Segment *>(LatestSegment)->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    auto Position = reinterpret_cast<T *>(AlignedOffset);
+
+    // In-place construct at Position.
+    new (Position) T{std::forward<Args>(args)...};
+    ++Size;
+    return reinterpret_cast<T *>(Position);
+  }
+
+  // Returns the element at index Offset, walking the segment list from Head.
+  // The walk is linear in the number of segments preceding the element.
+  T &operator[](size_t Offset) const {
+    DCHECK_LT(Offset, Size); // Offset == Size would be one past the end.
+    auto S = Head;
+    while (Offset >= ElementsPerSegment) {
+      S = S->Next;
+      Offset -= ElementsPerSegment;
+      DCHECK_NE(S, &SentinelSegment);
+    }
+    auto Base = static_cast<Segment *>(S)->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    return *reinterpret_cast<T *>(AlignedOffset);
+  }
+
+  T &front() const {
+    DCHECK_NE(Head, &SentinelSegment);
+    DCHECK_NE(Size, 0u);
+    return *begin();
+  }
+
+  T &back() const {
+    DCHECK_NE(Tail, &SentinelSegment);
+    DCHECK_NE(Size, 0u);
+    auto It = end();
+    --It;
+    return *It;
+  }
+
+  template <class Predicate> T *find_element(Predicate P) const {
+    if (empty())
+      return nullptr;
+
+    auto E = end();
+    for (auto I = begin(); I != E; ++I)
+      if (P(*I))
+        return &(*I);
+
+    return nullptr;
+  }
+
+  /// Remove N Elements from the end. This leaves the blocks behind, so adding
+  /// new elements after trimming does not require allocating new blocks.
+  void trim(size_t Elements) {
+    DCHECK_LE(Elements, Size);
+    DCHECK_GT(Size, 0);
+    auto OldSize = Size;
+    Size -= Elements;
+
+    DCHECK_NE(Head, &SentinelSegment);
+    DCHECK_NE(Tail, &SentinelSegment);
+
+    for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) -
+                                nearest_boundary(Size, ElementsPerSegment)) /
+                               ElementsPerSegment;
+         SegmentsToTrim > 0; --SegmentsToTrim) {
+      DCHECK_NE(Head, &SentinelSegment);
+      DCHECK_NE(Tail, &SentinelSegment);
+      // Put the tail into the Freelist.
+      auto *FreeSegment = Tail;
+      Tail = Tail->Prev;
+      if (Tail == &SentinelSegment)
+        Head = Tail;
+      else
+        Tail->Next = &SentinelSegment;
+
+      DCHECK_EQ(Tail->Next, &SentinelSegment);
+      FreeSegment->Next = Freelist;
+      FreeSegment->Prev = &SentinelSegment;
+      if (Freelist != &SentinelSegment)
+        Freelist->Prev = FreeSegment;
+      Freelist = FreeSegment;
+    }
+  }
+
+  // Provide iterators.
+  Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
+  Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
+  Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
+  Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
+};
+
+// We need to have this storage definition out-of-line so that the compiler can
+// ensure that storage for the SentinelSegment is defined and has a single
+// address.
+template <class T>
+typename Array<T>::SegmentBase Array<T>::SentinelSegment{
+    &Array<T>::SentinelSegment, &Array<T>::SentinelSegment};
+
+} // namespace __xray
+
+#endif // XRAY_SEGMENTED_ARRAY_H
diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S
index 350afd9265fd..99ad3966ee3a 100644
--- a/lib/xray/xray_trampoline_x86_64.S
+++ b/lib/xray/xray_trampoline_x86_64.S
@@ -19,47 +19,56 @@
 
 
 .macro SAVE_REGISTERS
-	subq $192, %rsp
-	CFI_DEF_CFA_OFFSET(200)
-	// At this point, the stack pointer should be aligned to an 8-byte boundary,
-	// because any call instructions that come after this will add another 8
-	// bytes and therefore align it to 16-bytes.
-	movq %rbp, 184(%rsp)
-	movupd	%xmm0, 168(%rsp)
-	movupd	%xmm1, 152(%rsp)
-	movupd	%xmm2, 136(%rsp)
-	movupd	%xmm3, 120(%rsp)
-	movupd	%xmm4, 104(%rsp)
-	movupd	%xmm5, 88(%rsp)
-	movupd	%xmm6, 72(%rsp)
-	movupd	%xmm7, 56(%rsp)
-	movq	%rdi, 48(%rsp)
-	movq	%rax, 40(%rsp)
-	movq	%rdx, 32(%rsp)
-	movq	%rsi, 24(%rsp)
-	movq	%rcx, 16(%rsp)
-	movq	%r8, 8(%rsp)
-	movq	%r9, 0(%rsp)
+	subq $240, %rsp
+	CFI_DEF_CFA_OFFSET(248)
+	movq %rbp, 232(%rsp)
+	movupd	%xmm0, 216(%rsp)
+	movupd	%xmm1, 200(%rsp)
+	movupd	%xmm2, 184(%rsp)
+	movupd	%xmm3, 168(%rsp)
+	movupd	%xmm4, 152(%rsp)
+	movupd	%xmm5, 136(%rsp)
+	movupd	%xmm6, 120(%rsp)
+	movupd	%xmm7, 104(%rsp)
+	movq	%rdi, 96(%rsp)
+	movq	%rax, 88(%rsp)
+	movq	%rdx, 80(%rsp)
+	movq	%rsi, 72(%rsp)
+	movq	%rcx, 64(%rsp)
+	movq	%r8, 56(%rsp)
+	movq	%r9, 48(%rsp)
+	movq  %r10, 40(%rsp)
+	movq  %r11, 32(%rsp)
+	movq  %r12, 24(%rsp)
+	movq  %r13, 16(%rsp)
+	movq  %r14, 8(%rsp)
+	movq  %r15, 0(%rsp)
 .endm
 
 .macro RESTORE_REGISTERS
-	movq  184(%rsp), %rbp
-	movupd	168(%rsp), %xmm0
-	movupd	152(%rsp), %xmm1
-	movupd	136(%rsp), %xmm2
-	movupd	120(%rsp), %xmm3
-	movupd	104(%rsp), %xmm4
-	movupd	88(%rsp), %xmm5
-	movupd	72(%rsp) , %xmm6
-	movupd	56(%rsp) , %xmm7
-	movq	48(%rsp), %rdi
-	movq	40(%rsp), %rax
-	movq	32(%rsp), %rdx
-	movq	24(%rsp), %rsi
-	movq	16(%rsp), %rcx
-	movq	8(%rsp), %r8
-	movq	0(%rsp), %r9
-	addq	$192, %rsp
+	movq  232(%rsp), %rbp
+	movupd	216(%rsp), %xmm0
+	movupd	200(%rsp), %xmm1
+	movupd	184(%rsp), %xmm2
+	movupd	168(%rsp), %xmm3
+	movupd	152(%rsp), %xmm4
+	movupd	136(%rsp), %xmm5
+	movupd	120(%rsp) , %xmm6
+	movupd	104(%rsp) , %xmm7
+	movq	96(%rsp), %rdi
+	movq	88(%rsp), %rax
+	movq	80(%rsp), %rdx
+	movq	72(%rsp), %rsi
+	movq	64(%rsp), %rcx
+	movq	56(%rsp), %r8
+	movq	48(%rsp), %r9
+	movq  40(%rsp), %r10
+	movq  32(%rsp), %r11
+	movq  24(%rsp), %r12
+	movq  16(%rsp), %r13
+	movq  8(%rsp), %r14
+	movq  0(%rsp), %r15
+	addq	$240, %rsp
 	CFI_DEF_CFA_OFFSET(8)
 .endm
 
@@ -90,6 +99,7 @@
 	.globl ASM_SYMBOL(__xray_FunctionEntry)
 	.align 16, 0x90
 	ASM_TYPE_FUNCTION(__xray_FunctionEntry)
+# LLVM-MCA-BEGIN __xray_FunctionEntry
 ASM_SYMBOL(__xray_FunctionEntry):
 	CFI_STARTPROC
 	SAVE_REGISTERS
@@ -100,7 +110,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
 	testq	%rax, %rax
 	je	.Ltmp0
 
-	// The patched function prolog puts its xray_instr_map index into %r10d.
+	// The patched function prologue puts its xray_instr_map index into %r10d.
 	movl	%r10d, %edi
 	xor	%esi,%esi
 	ALIGNED_CALL_RAX
@@ -108,6 +118,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
 .Ltmp0:
 	RESTORE_REGISTERS
 	retq
+# LLVM-MCA-END
 	ASM_SIZE(__xray_FunctionEntry)
 	CFI_ENDPROC
 
@@ -116,6 +127,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
 	.globl ASM_SYMBOL(__xray_FunctionExit)
 	.align 16, 0x90
 	ASM_TYPE_FUNCTION(__xray_FunctionExit)
+# LLVM-MCA-BEGIN __xray_FunctionExit
 ASM_SYMBOL(__xray_FunctionExit):
 	CFI_STARTPROC
 	// Save the important registers first. Since we're assuming that this
@@ -146,6 +158,7 @@ ASM_SYMBOL(__xray_FunctionExit):
 	addq	$56, %rsp
 	CFI_DEF_CFA_OFFSET(8)
 	retq
+# LLVM-MCA-END
 	ASM_SIZE(__xray_FunctionExit)
 	CFI_ENDPROC
 
@@ -154,6 +167,7 @@ ASM_SYMBOL(__xray_FunctionExit):
 	.globl ASM_SYMBOL(__xray_FunctionTailExit)
 	.align 16, 0x90
 	ASM_TYPE_FUNCTION(__xray_FunctionTailExit)
+# LLVM-MCA-BEGIN __xray_FunctionTailExit
 ASM_SYMBOL(__xray_FunctionTailExit):
 	CFI_STARTPROC
 	SAVE_REGISTERS
@@ -170,6 +184,7 @@ ASM_SYMBOL(__xray_FunctionTailExit):
 .Ltmp4:
 	RESTORE_REGISTERS
 	retq
+# LLVM-MCA-END
 	ASM_SIZE(__xray_FunctionTailExit)
 	CFI_ENDPROC
 
@@ -178,6 +193,7 @@ ASM_SYMBOL(__xray_FunctionTailExit):
 	.globl ASM_SYMBOL(__xray_ArgLoggerEntry)
 	.align 16, 0x90
 	ASM_TYPE_FUNCTION(__xray_ArgLoggerEntry)
+# LLVM-MCA-BEGIN __xray_ArgLoggerEntry
 ASM_SYMBOL(__xray_ArgLoggerEntry):
 	CFI_STARTPROC
 	SAVE_REGISTERS
@@ -207,6 +223,7 @@ ASM_SYMBOL(__xray_ArgLoggerEntry):
 .Larg1entryFail:
 	RESTORE_REGISTERS
 	retq
+# LLVM-MCA-END
 	ASM_SIZE(__xray_ArgLoggerEntry)
 	CFI_ENDPROC
 
@@ -215,13 +232,13 @@ ASM_SYMBOL(__xray_ArgLoggerEntry):
 	.global ASM_SYMBOL(__xray_CustomEvent)
 	.align 16, 0x90
 	ASM_TYPE_FUNCTION(__xray_CustomEvent)
+# LLVM-MCA-BEGIN __xray_CustomEvent
 ASM_SYMBOL(__xray_CustomEvent):
 	CFI_STARTPROC
 	SAVE_REGISTERS
 
 	// We take two arguments to this trampoline, which should be in rdi	and rsi
-	// already. We also make sure that we stash %rax because we use that register
-	// to call the logging handler.
+	// already.
 	movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax
 	testq %rax,%rax
 	je .LcustomEventCleanup
@@ -231,7 +248,35 @@ ASM_SYMBOL(__xray_CustomEvent):
 .LcustomEventCleanup:
 	RESTORE_REGISTERS
 	retq
+# LLVM-MCA-END
 	ASM_SIZE(__xray_CustomEvent)
 	CFI_ENDPROC
 
+//===----------------------------------------------------------------------===//
+
+	.global ASM_SYMBOL(__xray_TypedEvent)
+	.align 16, 0x90
+	ASM_TYPE_FUNCTION(__xray_TypedEvent)
+# LLVM-MCA-BEGIN __xray_TypedEvent
+ASM_SYMBOL(__xray_TypedEvent):
+	CFI_STARTPROC
+	SAVE_REGISTERS
+
+	// We pass three arguments to this trampoline, which should be in rdi, rsi
+	// and rdx without our intervention.
+	movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax
+	testq %rax,%rax
+	je .LtypedEventCleanup
+
+	ALIGNED_CALL_RAX
+
+.LtypedEventCleanup:
+	RESTORE_REGISTERS
+	retq
+# LLVM-MCA-END
+	ASM_SIZE(__xray_TypedEvent)
+	CFI_ENDPROC
+
+//===----------------------------------------------------------------------===//
+
 NO_EXEC_STACK_DIRECTIVE
diff --git a/lib/xray/xray_utils.cc b/lib/xray/xray_utils.cc
index cf800d3aeaf8..68f4e8c1094c 100644
--- a/lib/xray/xray_utils.cc
+++ b/lib/xray/xray_utils.cc
@@ -15,11 +15,11 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "xray_defs.h"
 #include "xray_flags.h"
-#include <stdlib.h>
 #include <cstdio>
 #include <errno.h>
 #include <fcntl.h>
 #include <iterator>
+#include <stdlib.h>
 #include <sys/types.h>
 #include <tuple>
 #include <unistd.h>
@@ -31,7 +31,7 @@ void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT {
   fprintf(stderr, "%s", Buffer);
 }
 
-void retryingWriteAll(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
+void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
   if (Begin == End)
     return;
   auto TotalBytes = std::distance(Begin, End);
@@ -82,7 +82,7 @@ bool readValueFromFile(const char *Filename,
   if (!Success)
     return false;
   close(Fd);
-  char *End = nullptr;
+  const char *End = nullptr;
   long long Tmp = internal_simple_strtoll(Line, &End, 10);
   bool Result = false;
   if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
@@ -94,10 +94,10 @@ bool readValueFromFile(const char *Filename,
 
 int getLogFD() XRAY_NEVER_INSTRUMENT {
   // Open a temporary file once for the log.
-  static char TmpFilename[256] = {};
-  static char TmpWildcardPattern[] = "XXXXXX";
-  auto Argv = GetArgv();
-  const char *Progname = Argv[0] == nullptr ? "(unknown)" : Argv[0];
+  char TmpFilename[256] = {};
+  char TmpWildcardPattern[] = "XXXXXX";
+  auto **Argv = GetArgv();
+  const char *Progname = !Argv ? "(unknown)" : Argv[0];
   const char *LastSlash = internal_strrchr(Progname, '/');
 
   if (LastSlash != nullptr)
@@ -117,7 +117,7 @@ int getLogFD() XRAY_NEVER_INSTRUMENT {
            TmpFilename);
     return -1;
   }
-  if (__sanitizer::Verbosity())
+  if (Verbosity())
     Report("XRay: Log file in '%s'\n", TmpFilename);
 
   return Fd;
diff --git a/lib/xray/xray_utils.h b/lib/xray/xray_utils.h
index 1ecc74a2dce8..eafa16e1a9d5 100644
--- a/lib/xray/xray_utils.h
+++ b/lib/xray/xray_utils.h
@@ -15,6 +15,8 @@
 #ifndef XRAY_UTILS_H
 #define XRAY_UTILS_H
 
+#include <cstddef>
+#include <cstdint>
 #include <sys/types.h>
 #include <utility>
 
@@ -24,7 +26,7 @@ namespace __xray {
 void printToStdErr(const char *Buffer);
 
 // EINTR-safe write routine, provided a file descriptor and a character range.
-void retryingWriteAll(int Fd, char *Begin, char *End);
+void retryingWriteAll(int Fd, const char *Begin, const char *End);
 
 // Reads a long long value from a provided file.
 bool readValueFromFile(const char *Filename, long long *Value);
@@ -36,6 +38,32 @@ std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End);
 // file.
 int getLogFD();
 
+// Greatest common divisor via Euclid's algorithm; gcd(a, 0) yields a.
+constexpr size_t gcd(size_t a, size_t b) { return b == 0 ? a : gcd(b, a % b); }
+
+// Least common multiple. Divides before multiplying to avoid needless overflow.
+constexpr size_t lcm(size_t a, size_t b) { return a / gcd(a, b) * b; }
+
+constexpr size_t nearest_boundary(size_t number, size_t multiple) {
+  return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+}
+
+// Smallest 2^k (k >= acc) that is >= num; shifts a size_t, not a 32-bit 1u.
+constexpr size_t next_pow2_helper(size_t num, size_t acc) {
+  return (size_t{1} << acc) >= num ? (size_t{1} << acc)
+                                   : next_pow2_helper(num, acc + 1);
+}
+// Starts the search at 2^1, so the result is never smaller than 2.
+constexpr size_t next_pow2(size_t number) { return next_pow2_helper(number, 1); }
+
+template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; }
+
+template <class T> constexpr T &min(T &A, T &B) { return A <= B ? A : B; }
+
+constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) {
+  return max(A, B) - min(A, B);
+}
+
 } // namespace __xray
 
 #endif // XRAY_UTILS_H
diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc
index e17f00ac3a62..51dc4ce43b1c 100644
--- a/lib/xray/xray_x86_64.cc
+++ b/lib/xray/xray_x86_64.cc
@@ -3,6 +3,15 @@
 #include "xray_defs.h"
 #include "xray_interface_internal.h"
 
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#include <sys/types.h>
+#if SANITIZER_OPENBSD
+#include <sys/time.h>
+#include <machine/cpu.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+
 #include <atomic>
 #include <cstdint>
 #include <errno.h>
@@ -14,6 +23,7 @@
 
 namespace __xray {
 
+#if SANITIZER_LINUX
 static std::pair<ssize_t, bool>
 retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
   auto BytesToRead = std::distance(Begin, End);
@@ -47,7 +57,7 @@ static bool readValueFromFile(const char *Filename,
   close(Fd);
   if (!Success)
     return false;
-  char *End = nullptr;
+  const char *End = nullptr;
   long long Tmp = internal_simple_strtoll(Line, &End, 10);
   bool Result = false;
   if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
@@ -71,6 +81,31 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
   }
   return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
 }
+#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+  // Ask the kernel for the TSC frequency; OpenBSD uses a MIB, others a name.
+  long long TSCFrequency = -1;
+  size_t tscfreqsz = sizeof(TSCFrequency);
+#if SANITIZER_OPENBSD
+  int Mib[2] = {CTL_MACHDEP, CPU_TSCFREQ};
+  const int Rc = sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0);
+#else
+  const int Rc =
+      sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz, NULL, 0);
+#endif
+  if (Rc != -1)
+    return static_cast<uint64_t>(TSCFrequency);
+
+  // The query failed, so report it and fall back to 0.
+  Report("Unable to determine CPU frequency for TSC accounting.\n");
+  return 0;
+}
+#else
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+    /* Not supported */
+    return 0;
+}
+#endif
 
 static constexpr uint8_t CallOpCode = 0xe8;
 static constexpr uint16_t MovR10Seq = 0xba41;
@@ -184,8 +219,8 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
       reinterpret_cast<int64_t>(__xray_FunctionTailExit) -
       (static_cast<int64_t>(Sled.Address) + 11);
   if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
-    Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
-           __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address));
+    Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
+           __xray_FunctionTailExit, reinterpret_cast<void *>(Sled.Address));
     return false;
   }
   if (Enable) {
@@ -251,6 +286,37 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
   return false;
 }
 
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // This patches a typed event sled, which starts out as:
+  //
+  // xray_sled_n:
+  //   jmp +20          // 2 byte instruction
+  //   ...
+  //
+  // Enabling rewrites that leading instruction to:
+  //
+  //   nopw             // 2 bytes
+  //   ...
+  //
+  // and disabling turns the 'nopw' back into a 'jmp +20'.
+  //
+  // The 20 byte sled stashes three argument registers, calls the trampoline,
+  // unstashes the registers and returns. If the arguments are already in
+  // the correct registers, the stashing and unstashing become equivalently
+  // sized nops.
+  //
+  // Both sequences are two bytes wide, so one 16-bit release store suffices.
+  auto *Patch = reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address);
+  if (!Enable)
+    std::atomic_store_explicit(Patch, Jmp20Seq, std::memory_order_release);
+  else
+    std::atomic_store_explicit(Patch, NopwSeq, std::memory_order_release);
+  // NOTE(review): false is returned on both paths, as in the original code;
+  // confirm callers do not interpret it as a patching failure.
+  return false;
+}
+
 // We determine whether the CPU we're running on has the correct features we
 // need. In x86_64 this will be rdtscp support.
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
@@ -259,7 +325,8 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
   // We check whether rdtscp support is enabled. According to the x86_64 manual,
   // level should be set at 0x80000001, and we should have a look at bit 27 in
   // EDX. That's 0x8000000 (or 1u << 27).
-  __get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX);
+  __asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
+    : "0"(0x80000001));
   if (!(EDX & (1u << 27))) {
     Report("Missing rdtscp support.\n");
     return false;
diff --git a/lib/xray/xray_x86_64.inc b/lib/xray/xray_x86_64.inc
index 4ad3f9810946..b3c475f9110c 100644
--- a/lib/xray/xray_x86_64.inc
+++ b/lib/xray/xray_x86_64.inc
@@ -21,9 +21,10 @@ namespace __xray {
 
 ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
   unsigned LongCPU;
-  uint64_t TSC = __rdtscp(&LongCPU);
+  unsigned long Rax, Rdx;
+  __asm__ __volatile__("rdtscp\n" : "=a"(Rax), "=d"(Rdx), "=c"(LongCPU) ::);
   CPU = LongCPU;
-  return TSC;
+  return (Rdx << 32) + Rax;
 }
 
 uint64_t getTSCFrequency();