Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp')
-rw-r--r-- | contrib/llvm-project/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp | 1350
1 file changed, 657 insertions, 693 deletions
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index a21da9c81c6f..fd9441dfcb53 100644
--- a/contrib/llvm-project/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -16,6 +16,7 @@
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_file.h"
+#include "sanitizer_common/sanitizer_interface_internal.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_stackdepot.h"
@@ -28,29 +29,28 @@
 #include "tsan_symbolize.h"
 #include "ubsan/ubsan_init.h"

-#ifdef __SSE3__
-// <emmintrin.h> transitively includes <stdlib.h>,
-// and it's prohibited to include std headers into tsan runtime.
-// So we do this dirty trick.
-#define _MM_MALLOC_H_INCLUDED
-#define __MM_MALLOC_H
-#include <emmintrin.h>
-typedef __m128i m128;
-#endif
-
 volatile int __tsan_resumed = 0;

 extern "C" void __tsan_resume() {
   __tsan_resumed = 1;
 }

+SANITIZER_WEAK_DEFAULT_IMPL
+void __tsan_test_only_on_fork() {}
+
 namespace __tsan {

-#if !SANITIZER_GO && !SANITIZER_MAC
+#if !SANITIZER_GO
+void (*on_initialize)(void);
+int (*on_finalize)(int);
+#endif
+
+#if !SANITIZER_GO && !SANITIZER_APPLE
 __attribute__((tls_model("initial-exec")))
-THREADLOCAL char cur_thread_placeholder[sizeof(ThreadState)] ALIGNED(64);
+THREADLOCAL char cur_thread_placeholder[sizeof(ThreadState)] ALIGNED(
+    SANITIZER_CACHE_LINE_SIZE);
 #endif
-static char ctx_placeholder[sizeof(Context)] ALIGNED(64);
+static char ctx_placeholder[sizeof(Context)] ALIGNED(SANITIZER_CACHE_LINE_SIZE);
 Context *ctx;

 // Can be overriden by a front-end.
@@ -58,113 +58,404 @@ Context *ctx;
 bool OnFinalize(bool failed);
 void OnInitialize();
 #else
-#include <dlfcn.h>
 SANITIZER_WEAK_CXX_DEFAULT_IMPL
 bool OnFinalize(bool failed) {
-#if !SANITIZER_GO
-  if (auto *ptr = dlsym(RTLD_DEFAULT, "__tsan_on_finalize"))
-    return reinterpret_cast<decltype(&__tsan_on_finalize)>(ptr)(failed);
-#endif
+#  if !SANITIZER_GO
+  if (on_finalize)
+    return on_finalize(failed);
+#  endif
   return failed;
 }
+
 SANITIZER_WEAK_CXX_DEFAULT_IMPL
 void OnInitialize() {
-#if !SANITIZER_GO
-  if (auto *ptr = dlsym(RTLD_DEFAULT, "__tsan_on_initialize")) {
-    return reinterpret_cast<decltype(&__tsan_on_initialize)>(ptr)();
-  }
+#  if !SANITIZER_GO
+  if (on_initialize)
+    on_initialize();
+#  endif
+}
 #endif
+
+static TracePart* TracePartAlloc(ThreadState* thr) {
+  TracePart* part = nullptr;
+  {
+    Lock lock(&ctx->slot_mtx);
+    uptr max_parts = Trace::kMinParts + flags()->history_size;
+    Trace* trace = &thr->tctx->trace;
+    if (trace->parts_allocated == max_parts ||
+        ctx->trace_part_finished_excess) {
+      part = ctx->trace_part_recycle.PopFront();
+      DPrintf("#%d: TracePartAlloc: part=%p\n", thr->tid, part);
+      if (part && part->trace) {
+        Trace* trace1 = part->trace;
+        Lock trace_lock(&trace1->mtx);
+        part->trace = nullptr;
+        TracePart* part1 = trace1->parts.PopFront();
+        CHECK_EQ(part, part1);
+        if (trace1->parts_allocated > trace1->parts.Size()) {
+          ctx->trace_part_finished_excess +=
+              trace1->parts_allocated - trace1->parts.Size();
+          trace1->parts_allocated = trace1->parts.Size();
+        }
+      }
+    }
+    if (trace->parts_allocated < max_parts) {
+      trace->parts_allocated++;
+      if (ctx->trace_part_finished_excess)
+        ctx->trace_part_finished_excess--;
+    }
+    if (!part)
+      ctx->trace_part_total_allocated++;
+    else if (ctx->trace_part_recycle_finished)
+      ctx->trace_part_recycle_finished--;
+  }
+  if (!part)
+    part = new (MmapOrDie(sizeof(*part), "TracePart")) TracePart();
+  return part;
+}
+
+static void TracePartFree(TracePart* part) SANITIZER_REQUIRES(ctx->slot_mtx) {
+  DCHECK(part->trace);
+  part->trace = nullptr;
+  ctx->trace_part_recycle.PushFront(part);
+}
+
+void TraceResetForTesting() {
+  Lock lock(&ctx->slot_mtx);
+  while (auto* part = ctx->trace_part_recycle.PopFront()) {
+    if (auto trace = part->trace)
+      CHECK_EQ(trace->parts.PopFront(), part);
+    UnmapOrDie(part, sizeof(*part));
+  }
+  ctx->trace_part_total_allocated = 0;
+  ctx->trace_part_recycle_finished = 0;
+  ctx->trace_part_finished_excess = 0;
+}
+
+static void DoResetImpl(uptr epoch) {
+  ThreadRegistryLock lock0(&ctx->thread_registry);
+  Lock lock1(&ctx->slot_mtx);
+  CHECK_EQ(ctx->global_epoch, epoch);
+  ctx->global_epoch++;
+  CHECK(!ctx->resetting);
+  ctx->resetting = true;
+  for (u32 i = ctx->thread_registry.NumThreadsLocked(); i--;) {
+    ThreadContext* tctx = (ThreadContext*)ctx->thread_registry.GetThreadLocked(
+        static_cast<Tid>(i));
+    // Potentially we could purge all ThreadStatusDead threads from the
+    // registry. Since we reset all shadow, they can't race with anything
+    // anymore. However, their tid's can still be stored in some aux places
+    // (e.g. tid of thread that created something).
+    auto trace = &tctx->trace;
+    Lock lock(&trace->mtx);
+    bool attached = tctx->thr && tctx->thr->slot;
+    auto parts = &trace->parts;
+    bool local = false;
+    while (!parts->Empty()) {
+      auto part = parts->Front();
+      local = local || part == trace->local_head;
+      if (local)
+        CHECK(!ctx->trace_part_recycle.Queued(part));
+      else
+        ctx->trace_part_recycle.Remove(part);
+      if (attached && parts->Size() == 1) {
+        // The thread is running and this is the last/current part.
+        // Set the trace position to the end of the current part
+        // to force the thread to call SwitchTracePart and re-attach
+        // to a new slot and allocate a new trace part.
+        // Note: the thread is concurrently modifying the position as well,
+        // so this is only best-effort. The thread can only modify position
+        // within this part, because switching parts is protected by
+        // slot/trace mutexes that we hold here.
+        atomic_store_relaxed(
+            &tctx->thr->trace_pos,
+            reinterpret_cast<uptr>(&part->events[TracePart::kSize]));
+        break;
+      }
+      parts->Remove(part);
+      TracePartFree(part);
+    }
+    CHECK_LE(parts->Size(), 1);
+    trace->local_head = parts->Front();
+    if (tctx->thr && !tctx->thr->slot) {
+      atomic_store_relaxed(&tctx->thr->trace_pos, 0);
+      tctx->thr->trace_prev_pc = 0;
+    }
+    if (trace->parts_allocated > trace->parts.Size()) {
+      ctx->trace_part_finished_excess +=
+          trace->parts_allocated - trace->parts.Size();
+      trace->parts_allocated = trace->parts.Size();
+    }
+  }
+  while (ctx->slot_queue.PopFront()) {
+  }
+  for (auto& slot : ctx->slots) {
+    slot.SetEpoch(kEpochZero);
+    slot.journal.Reset();
+    slot.thr = nullptr;
+    ctx->slot_queue.PushBack(&slot);
+  }
+
+  DPrintf("Resetting shadow...\n");
+  auto shadow_begin = ShadowBeg();
+  auto shadow_end = ShadowEnd();
+#if SANITIZER_GO
+  CHECK_NE(0, ctx->mapped_shadow_begin);
+  shadow_begin = ctx->mapped_shadow_begin;
+  shadow_end = ctx->mapped_shadow_end;
+  VPrintf(2, "shadow_begin-shadow_end: (0x%zx-0x%zx)\n",
+          shadow_begin, shadow_end);
+#endif
-static ALIGNED(64) char thread_registry_placeholder[sizeof(ThreadRegistry)];
-
-static ThreadContextBase *CreateThreadContext(u32 tid) {
-  // Map thread trace when context is created.
-  char name[50];
-  internal_snprintf(name, sizeof(name), "trace %u", tid);
-  MapThreadTrace(GetThreadTrace(tid), TraceSize() * sizeof(Event), name);
-  const uptr hdr = GetThreadTraceHeader(tid);
-  internal_snprintf(name, sizeof(name), "trace header %u", tid);
-  MapThreadTrace(hdr, sizeof(Trace), name);
-  new((void*)hdr) Trace();
-  // We are going to use only a small part of the trace with the default
-  // value of history_size. However, the constructor writes to the whole trace.
-  // Release the unused part.
-  uptr hdr_end = hdr + sizeof(Trace);
-  hdr_end -= sizeof(TraceHeader) * (kTraceParts - TraceParts());
-  hdr_end = RoundUp(hdr_end, GetPageSizeCached());
-  if (hdr_end < hdr + sizeof(Trace)) {
-    ReleaseMemoryPagesToOS(hdr_end, hdr + sizeof(Trace));
-    uptr unused = hdr + sizeof(Trace) - hdr_end;
-    if (hdr_end != (uptr)MmapFixedNoAccess(hdr_end, unused)) {
-      Report("ThreadSanitizer: failed to mprotect(%p, %p)\n",
-             hdr_end, unused);
-      CHECK("unable to mprotect" && 0);
+#if SANITIZER_WINDOWS
+  auto resetFailed =
+      !ZeroMmapFixedRegion(shadow_begin, shadow_end - shadow_begin);
+#else
+  auto resetFailed =
+      !MmapFixedSuperNoReserve(shadow_begin, shadow_end-shadow_begin, "shadow");
+#  if !SANITIZER_GO
+  DontDumpShadow(shadow_begin, shadow_end - shadow_begin);
+#  endif
+#endif
+  if (resetFailed) {
+    Printf("failed to reset shadow memory\n");
+    Die();
+  }
+  DPrintf("Resetting meta shadow...\n");
+  ctx->metamap.ResetClocks();
+  StoreShadow(&ctx->last_spurious_race, Shadow::kEmpty);
+  ctx->resetting = false;
+}
+
+// Clang does not understand locking all slots in the loop:
+// error: expecting mutex 'slot.mtx' to be held at start of each loop
+void DoReset(ThreadState* thr, uptr epoch) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  for (auto& slot : ctx->slots) {
+    slot.mtx.Lock();
+    if (UNLIKELY(epoch == 0))
+      epoch = ctx->global_epoch;
+    if (UNLIKELY(epoch != ctx->global_epoch)) {
+      // Epoch can't change once we've locked the first slot.
+      CHECK_EQ(slot.sid, 0);
+      slot.mtx.Unlock();
+      return;
+    }
+  }
+  DPrintf("#%d: DoReset epoch=%lu\n", thr ? thr->tid : -1, epoch);
+  DoResetImpl(epoch);
+  for (auto& slot : ctx->slots) slot.mtx.Unlock();
+}
+
+void FlushShadowMemory() { DoReset(nullptr, 0); }
+
+static TidSlot* FindSlotAndLock(ThreadState* thr)
+    SANITIZER_ACQUIRE(thr->slot->mtx) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  CHECK(!thr->slot);
+  TidSlot* slot = nullptr;
+  for (;;) {
+    uptr epoch;
+    {
+      Lock lock(&ctx->slot_mtx);
+      epoch = ctx->global_epoch;
+      if (slot) {
+        // This is an exhausted slot from the previous iteration.
+        if (ctx->slot_queue.Queued(slot))
+          ctx->slot_queue.Remove(slot);
+        thr->slot_locked = false;
+        slot->mtx.Unlock();
+      }
+      for (;;) {
+        slot = ctx->slot_queue.PopFront();
+        if (!slot)
+          break;
+        if (slot->epoch() != kEpochLast) {
+          ctx->slot_queue.PushBack(slot);
+          break;
+        }
+      }
+    }
+    if (!slot) {
+      DoReset(thr, epoch);
+      continue;
     }
+    slot->mtx.Lock();
+    CHECK(!thr->slot_locked);
+    thr->slot_locked = true;
+    if (slot->thr) {
+      DPrintf("#%d: preempting sid=%d tid=%d\n", thr->tid, (u32)slot->sid,
+              slot->thr->tid);
+      slot->SetEpoch(slot->thr->fast_state.epoch());
+      slot->thr = nullptr;
+    }
+    if (slot->epoch() != kEpochLast)
+      return slot;
   }
-  void *mem = internal_alloc(MBlockThreadContex, sizeof(ThreadContext));
-  return new(mem) ThreadContext(tid);
 }

+void SlotAttachAndLock(ThreadState* thr) {
+  TidSlot* slot = FindSlotAndLock(thr);
+  DPrintf("#%d: SlotAttach: slot=%u\n", thr->tid, static_cast<int>(slot->sid));
+  CHECK(!slot->thr);
+  CHECK(!thr->slot);
+  slot->thr = thr;
+  thr->slot = slot;
+  Epoch epoch = EpochInc(slot->epoch());
+  CHECK(!EpochOverflow(epoch));
+  slot->SetEpoch(epoch);
+  thr->fast_state.SetSid(slot->sid);
+  thr->fast_state.SetEpoch(epoch);
+  if (thr->slot_epoch != ctx->global_epoch) {
+    thr->slot_epoch = ctx->global_epoch;
+    thr->clock.Reset();
 #if !SANITIZER_GO
-static const u32 kThreadQuarantineSize = 16;
-#else
-static const u32 kThreadQuarantineSize = 64;
+    thr->last_sleep_stack_id = kInvalidStackID;
+    thr->last_sleep_clock.Reset();
+#endif
+  }
+  thr->clock.Set(slot->sid, epoch);
+  slot->journal.PushBack({thr->tid, epoch});
+}
+
+static void SlotDetachImpl(ThreadState* thr, bool exiting) {
+  TidSlot* slot = thr->slot;
+  thr->slot = nullptr;
+  if (thr != slot->thr) {
+    slot = nullptr;  // we don't own the slot anymore
+    if (thr->slot_epoch != ctx->global_epoch) {
+      TracePart* part = nullptr;
+      auto* trace = &thr->tctx->trace;
+      {
+        Lock l(&trace->mtx);
+        auto* parts = &trace->parts;
+        // The trace can be completely empty in an unlikely event
+        // the thread is preempted right after it acquired the slot
+        // in ThreadStart and did not trace any events yet.
+        CHECK_LE(parts->Size(), 1);
+        part = parts->PopFront();
+        thr->tctx->trace.local_head = nullptr;
+        atomic_store_relaxed(&thr->trace_pos, 0);
+        thr->trace_prev_pc = 0;
+      }
+      if (part) {
+        Lock l(&ctx->slot_mtx);
+        TracePartFree(part);
+      }
+    }
+    return;
+  }
+  CHECK(exiting || thr->fast_state.epoch() == kEpochLast);
+  slot->SetEpoch(thr->fast_state.epoch());
+  slot->thr = nullptr;
+}
+
+void SlotDetach(ThreadState* thr) {
+  Lock lock(&thr->slot->mtx);
+  SlotDetachImpl(thr, true);
+}
+
+void SlotLock(ThreadState* thr) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  DCHECK(!thr->slot_locked);
+#if SANITIZER_DEBUG
+  // Check these mutexes are not locked.
+  // We can call DoReset from SlotAttachAndLock, which will lock
+  // these mutexes, but it happens only every once in a while.
+  { ThreadRegistryLock lock(&ctx->thread_registry); }
+  { Lock lock(&ctx->slot_mtx); }
 #endif
+  TidSlot* slot = thr->slot;
+  slot->mtx.Lock();
+  thr->slot_locked = true;
+  if (LIKELY(thr == slot->thr && thr->fast_state.epoch() != kEpochLast))
+    return;
+  SlotDetachImpl(thr, false);
+  thr->slot_locked = false;
+  slot->mtx.Unlock();
+  SlotAttachAndLock(thr);
+}
+
+void SlotUnlock(ThreadState* thr) {
+  DCHECK(thr->slot_locked);
+  thr->slot_locked = false;
+  thr->slot->mtx.Unlock();
+}

 Context::Context()
     : initialized(),
       report_mtx(MutexTypeReport),
       nreported(),
-      nmissed_expected(),
-      thread_registry(new (thread_registry_placeholder) ThreadRegistry(
-          CreateThreadContext, kMaxTid, kThreadQuarantineSize, kMaxTidReuse)),
+      thread_registry([](Tid tid) -> ThreadContextBase* {
+        return new (Alloc(sizeof(ThreadContext))) ThreadContext(tid);
+      }),
       racy_mtx(MutexTypeRacy),
       racy_stacks(),
-      racy_addresses(),
       fired_suppressions_mtx(MutexTypeFired),
-      clock_alloc(LINKER_INITIALIZED, "clock allocator") {
+      slot_mtx(MutexTypeSlots),
+      resetting() {
   fired_suppressions.reserve(8);
+  for (uptr i = 0; i < ARRAY_SIZE(slots); i++) {
+    TidSlot* slot = &slots[i];
+    slot->sid = static_cast<Sid>(i);
+    slot_queue.PushBack(slot);
+  }
+  global_epoch = 1;
 }

+TidSlot::TidSlot() : mtx(MutexTypeSlot) {}
+
 // The objects are allocated in TLS, so one may rely on zero-initialization.
-ThreadState::ThreadState(Context *ctx, u32 tid, int unique_id, u64 epoch,
-                         unsigned reuse_count, uptr stk_addr, uptr stk_size,
-                         uptr tls_addr, uptr tls_size)
-    : fast_state(tid, epoch)
-      // Do not touch these, rely on zero initialization,
-      // they may be accessed before the ctor.
-      // , ignore_reads_and_writes()
-      // , ignore_interceptors()
-      ,
-      clock(tid, reuse_count)
-#if !SANITIZER_GO
-      ,
-      jmp_bufs()
-#endif
-      ,
-      tid(tid),
-      unique_id(unique_id),
-      stk_addr(stk_addr),
-      stk_size(stk_size),
-      tls_addr(tls_addr),
-      tls_size(tls_size)
+ThreadState::ThreadState(Tid tid)
+    // Do not touch these, rely on zero initialization,
+    // they may be accessed before the ctor.
+    // ignore_reads_and_writes()
+    // ignore_interceptors()
+    : tid(tid) {
+  CHECK_EQ(reinterpret_cast<uptr>(this) % SANITIZER_CACHE_LINE_SIZE, 0);
 #if !SANITIZER_GO
-      ,
-      last_sleep_clock(tid)
+  // C/C++ uses fixed size shadow stack.
+  const int kInitStackSize = kShadowStackSize;
+  shadow_stack = static_cast<uptr*>(
+      MmapNoReserveOrDie(kInitStackSize * sizeof(uptr), "shadow stack"));
+  SetShadowRegionHugePageMode(reinterpret_cast<uptr>(shadow_stack),
+                              kInitStackSize * sizeof(uptr));
+#else
+  // Go uses malloc-allocated shadow stack with dynamic size.
+  const int kInitStackSize = 8;
+  shadow_stack = static_cast<uptr*>(Alloc(kInitStackSize * sizeof(uptr)));
 #endif
-{
+  shadow_stack_pos = shadow_stack;
+  shadow_stack_end = shadow_stack + kInitStackSize;
 }

 #if !SANITIZER_GO
-static void MemoryProfiler(Context *ctx, fd_t fd, int i) {
-  uptr n_threads;
-  uptr n_running_threads;
-  ctx->thread_registry->GetNumberOfThreads(&n_threads, &n_running_threads);
+void MemoryProfiler(u64 uptime) {
+  if (ctx->memprof_fd == kInvalidFd)
+    return;
   InternalMmapVector<char> buf(4096);
-  WriteMemoryProfile(buf.data(), buf.size(), n_threads, n_running_threads);
-  WriteToFile(fd, buf.data(), internal_strlen(buf.data()));
+  WriteMemoryProfile(buf.data(), buf.size(), uptime);
+  WriteToFile(ctx->memprof_fd, buf.data(), internal_strlen(buf.data()));
+}
+
+static bool InitializeMemoryProfiler() {
+  ctx->memprof_fd = kInvalidFd;
+  const char *fname = flags()->profile_memory;
+  if (!fname || !fname[0])
+    return false;
+  if (internal_strcmp(fname, "stdout") == 0) {
+    ctx->memprof_fd = 1;
+  } else if (internal_strcmp(fname, "stderr") == 0) {
+    ctx->memprof_fd = 2;
+  } else {
+    InternalScopedString filename;
+    filename.AppendF("%s.%d", fname, (int)internal_getpid());
+    ctx->memprof_fd = OpenFile(filename.data(), WrOnly);
+    if (ctx->memprof_fd == kInvalidFd) {
+      Printf("ThreadSanitizer: failed to open memory profile file '%s'\n",
+             filename.data());
+      return false;
+    }
+  }
+  MemoryProfiler(0);
+  return true;
 }

 static void *BackgroundThread(void *arg) {
@@ -172,64 +463,43 @@ static void *BackgroundThread(void *arg) {
   // We don't use ScopedIgnoreInterceptors, because we want ignores to be
   // enabled even when the thread function exits (e.g. during pthread thread
   // shutdown code).
-  cur_thread_init();
-  cur_thread()->ignore_interceptors++;
+  cur_thread_init()->ignore_interceptors++;
   const u64 kMs2Ns = 1000 * 1000;
+  const u64 start = NanoTime();

-  fd_t mprof_fd = kInvalidFd;
-  if (flags()->profile_memory && flags()->profile_memory[0]) {
-    if (internal_strcmp(flags()->profile_memory, "stdout") == 0) {
-      mprof_fd = 1;
-    } else if (internal_strcmp(flags()->profile_memory, "stderr") == 0) {
-      mprof_fd = 2;
-    } else {
-      InternalScopedString filename;
-      filename.append("%s.%d", flags()->profile_memory, (int)internal_getpid());
-      fd_t fd = OpenFile(filename.data(), WrOnly);
-      if (fd == kInvalidFd) {
-        Printf("ThreadSanitizer: failed to open memory profile file '%s'\n",
-               filename.data());
-      } else {
-        mprof_fd = fd;
-      }
-    }
-  }
-
-  u64 last_flush = NanoTime();
+  u64 last_flush = start;
   uptr last_rss = 0;
-  for (int i = 0;
-      atomic_load(&ctx->stop_background_thread, memory_order_relaxed) == 0;
-      i++) {
+  while (!atomic_load_relaxed(&ctx->stop_background_thread)) {
     SleepForMillis(100);
     u64 now = NanoTime();

     // Flush memory if requested.
     if (flags()->flush_memory_ms > 0) {
       if (last_flush + flags()->flush_memory_ms * kMs2Ns < now) {
-        VPrintf(1, "ThreadSanitizer: periodic memory flush\n");
+        VReport(1, "ThreadSanitizer: periodic memory flush\n");
         FlushShadowMemory();
-        last_flush = NanoTime();
+        now = last_flush = NanoTime();
       }
     }
-    // GetRSS can be expensive on huge programs, so don't do it every 100ms.
     if (flags()->memory_limit_mb > 0) {
       uptr rss = GetRSS();
       uptr limit = uptr(flags()->memory_limit_mb) << 20;
-      VPrintf(1, "ThreadSanitizer: memory flush check"
-                 " RSS=%llu LAST=%llu LIMIT=%llu\n",
+      VReport(1,
+              "ThreadSanitizer: memory flush check"
+              " RSS=%llu LAST=%llu LIMIT=%llu\n",
              (u64)rss >> 20, (u64)last_rss >> 20, (u64)limit >> 20);
       if (2 * rss > limit + last_rss) {
-        VPrintf(1, "ThreadSanitizer: flushing memory due to RSS\n");
+        VReport(1, "ThreadSanitizer: flushing memory due to RSS\n");
         FlushShadowMemory();
         rss = GetRSS();
-        VPrintf(1, "ThreadSanitizer: memory flushed RSS=%llu\n", (u64)rss>>20);
+        now = NanoTime();
+        VReport(1, "ThreadSanitizer: memory flushed RSS=%llu\n",
+                (u64)rss >> 20);
       }
       last_rss = rss;
     }

-    // Write memory profile if requested.
-    if (mprof_fd != kInvalidFd)
-      MemoryProfiler(ctx, mprof_fd, i);
+    MemoryProfiler(now - start);

     // Flush symbolizer cache if requested.
     if (flags()->flush_symbolizer_ms > 0) {
@@ -260,31 +530,96 @@ static void StopBackgroundThread() {
 #endif

 void DontNeedShadowFor(uptr addr, uptr size) {
-  ReleaseMemoryPagesToOS(MemToShadow(addr), MemToShadow(addr + size));
+  ReleaseMemoryPagesToOS(reinterpret_cast<uptr>(MemToShadow(addr)),
+                         reinterpret_cast<uptr>(MemToShadow(addr + size)));
 }

 #if !SANITIZER_GO
+// We call UnmapShadow before the actual munmap, at that point we don't yet
+// know if the provided address/size are sane. We can't call UnmapShadow
+// after the actual munmap becuase at that point the memory range can
+// already be reused for something else, so we can't rely on the munmap
+// return value to understand is the values are sane.
+// While calling munmap with insane values (non-canonical address, negative
+// size, etc) is an error, the kernel won't crash. We must also try to not
+// crash as the failure mode is very confusing (paging fault inside of the
+// runtime on some derived shadow address).
+static bool IsValidMmapRange(uptr addr, uptr size) {
+  if (size == 0)
+    return true;
+  if (static_cast<sptr>(size) < 0)
+    return false;
+  if (!IsAppMem(addr) || !IsAppMem(addr + size - 1))
+    return false;
+  // Check that if the start of the region belongs to one of app ranges,
+  // end of the region belongs to the same region.
+  const uptr ranges[][2] = {
+      {LoAppMemBeg(), LoAppMemEnd()},
+      {MidAppMemBeg(), MidAppMemEnd()},
+      {HiAppMemBeg(), HiAppMemEnd()},
+  };
+  for (auto range : ranges) {
+    if (addr >= range[0] && addr < range[1])
+      return addr + size <= range[1];
+  }
+  return false;
+}
+
 void UnmapShadow(ThreadState *thr, uptr addr, uptr size) {
-  if (size == 0) return;
+  if (size == 0 || !IsValidMmapRange(addr, size))
+    return;
   DontNeedShadowFor(addr, size);
   ScopedGlobalProcessor sgp;
-  ctx->metamap.ResetRange(thr->proc(), addr, size);
+  SlotLocker locker(thr, true);
+  ctx->metamap.ResetRange(thr->proc(), addr, size, true);
 }
 #endif

 void MapShadow(uptr addr, uptr size) {
+  // Ensure thead registry lock held, so as to synchronize
+  // with DoReset, which also access the mapped_shadow_* ctxt fields.
+  ThreadRegistryLock lock0(&ctx->thread_registry);
+  static bool data_mapped = false;
+
+#if !SANITIZER_GO
   // Global data is not 64K aligned, but there are no adjacent mappings,
   // so we can get away with unaligned mapping.
   // CHECK_EQ(addr, addr & ~((64 << 10) - 1));  // windows wants 64K alignment
   const uptr kPageSize = GetPageSizeCached();
   uptr shadow_begin = RoundDownTo((uptr)MemToShadow(addr), kPageSize);
   uptr shadow_end = RoundUpTo((uptr)MemToShadow(addr + size), kPageSize);
-  if (!MmapFixedSuperNoReserve(shadow_begin, shadow_end - shadow_begin,
-                               "shadow"))
+  if (!MmapFixedNoReserve(shadow_begin, shadow_end - shadow_begin, "shadow"))
     Die();
+#else
+  uptr shadow_begin = RoundDownTo((uptr)MemToShadow(addr), (64 << 10));
+  uptr shadow_end = RoundUpTo((uptr)MemToShadow(addr + size), (64 << 10));
+  VPrintf(2, "MapShadow for (0x%zx-0x%zx), begin/end: (0x%zx-0x%zx)\n",
+          addr, addr + size, shadow_begin, shadow_end);
+
+  if (!data_mapped) {
+    // First call maps data+bss.
+    if (!MmapFixedSuperNoReserve(shadow_begin, shadow_end - shadow_begin, "shadow"))
+      Die();
+  } else {
+    VPrintf(2, "ctx->mapped_shadow_{begin,end} = (0x%zx-0x%zx)\n",
+            ctx->mapped_shadow_begin, ctx->mapped_shadow_end);
+    // Second and subsequent calls map heap.
+    if (shadow_end <= ctx->mapped_shadow_end)
+      return;
+    if (!ctx->mapped_shadow_begin || ctx->mapped_shadow_begin > shadow_begin)
+      ctx->mapped_shadow_begin = shadow_begin;
+    if (shadow_begin < ctx->mapped_shadow_end)
+      shadow_begin = ctx->mapped_shadow_end;
+    VPrintf(2, "MapShadow begin/end = (0x%zx-0x%zx)\n",
+            shadow_begin, shadow_end);
+    if (!MmapFixedSuperNoReserve(shadow_begin, shadow_end - shadow_begin,
+                                 "shadow"))
+      Die();
+    ctx->mapped_shadow_end = shadow_end;
+  }
+#endif

   // Meta shadow is 2:1, so tread carefully.
-  static bool data_mapped = false;
   static uptr mapped_meta_end = 0;
   uptr meta_begin = (uptr)MemToMeta(addr);
   uptr meta_end = (uptr)MemToMeta(addr + size);
@@ -297,12 +632,11 @@ void MapShadow(uptr addr, uptr size) {
                                  "meta shadow"))
       Die();
   } else {
-    // Mapping continous heap.
+    // Mapping continuous heap.
     // Windows wants 64K alignment.
     meta_begin = RoundDownTo(meta_begin, 64 << 10);
     meta_end = RoundUpTo(meta_end, 64 << 10);
-    if (meta_end <= mapped_meta_end)
-      return;
+    CHECK_GT(meta_end, mapped_meta_end);
     if (meta_begin < mapped_meta_end)
       meta_begin = mapped_meta_end;
     if (!MmapFixedSuperNoReserve(meta_begin, meta_end - meta_begin,
@@ -310,56 +644,8 @@ void MapShadow(uptr addr, uptr size) {
       Die();
     mapped_meta_end = meta_end;
   }
-  VPrintf(2, "mapped meta shadow for (%p-%p) at (%p-%p)\n",
-      addr, addr+size, meta_begin, meta_end);
-}
-
-void MapThreadTrace(uptr addr, uptr size, const char *name) {
-  DPrintf("#0: Mapping trace at %p-%p(0x%zx)\n", addr, addr + size, size);
-  CHECK_GE(addr, TraceMemBeg());
-  CHECK_LE(addr + size, TraceMemEnd());
-  CHECK_EQ(addr, addr & ~((64 << 10) - 1));  // windows wants 64K alignment
-  if (!MmapFixedSuperNoReserve(addr, size, name)) {
-    Printf("FATAL: ThreadSanitizer can not mmap thread trace (%p/%p)\n",
-        addr, size);
-    Die();
-  }
-}
-
-static void CheckShadowMapping() {
-  uptr beg, end;
-  for (int i = 0; GetUserRegion(i, &beg, &end); i++) {
-    // Skip cases for empty regions (heap definition for architectures that
-    // do not use 64-bit allocator).
-    if (beg == end)
-      continue;
-    VPrintf(3, "checking shadow region %p-%p\n", beg, end);
-    uptr prev = 0;
-    for (uptr p0 = beg; p0 <= end; p0 += (end - beg) / 4) {
-      for (int x = -(int)kShadowCell; x <= (int)kShadowCell; x += kShadowCell) {
-        const uptr p = RoundDown(p0 + x, kShadowCell);
-        if (p < beg || p >= end)
-          continue;
-        const uptr s = MemToShadow(p);
-        const uptr m = (uptr)MemToMeta(p);
-        VPrintf(3, "  checking pointer %p: shadow=%p meta=%p\n", p, s, m);
-        CHECK(IsAppMem(p));
-        CHECK(IsShadowMem(s));
-        CHECK_EQ(p, ShadowToMem(s));
-        CHECK(IsMetaMem(m));
-        if (prev) {
-          // Ensure that shadow and meta mappings are linear within a single
-          // user range. Lots of code that processes memory ranges assumes it.
-          const uptr prev_s = MemToShadow(prev);
-          const uptr prev_m = (uptr)MemToMeta(prev);
-          CHECK_EQ(s - prev_s, (p - prev) * kShadowMultiplier);
-          CHECK_EQ((m - prev_m) / kMetaShadowSize,
-                   (p - prev) / kMetaShadowCell);
-        }
-        prev = p;
-      }
-    }
-  }
+  VPrintf(2, "mapped meta shadow for (0x%zx-0x%zx) at (0x%zx-0x%zx)\n", addr,
+          addr + size, meta_begin, meta_end);
 }

 #if !SANITIZER_GO
@@ -380,15 +666,19 @@ void CheckUnwind() {
   // since we are going to die soon.
   ScopedIgnoreInterceptors ignore;
 #if !SANITIZER_GO
-  cur_thread()->ignore_sync++;
-  cur_thread()->ignore_reads_and_writes++;
+  ThreadState* thr = cur_thread();
+  thr->nomalloc = false;
+  thr->ignore_sync++;
+  thr->ignore_reads_and_writes++;
+  atomic_store_relaxed(&thr->in_signal_handler, 0);
 #endif
   PrintCurrentStackSlow(StackTrace::GetCurrentPc());
 }

+bool is_initialized;
+
 void Initialize(ThreadState *thr) {
   // Thread safe because done before all threads exist.
-  static bool is_initialized = false;
   if (is_initialized)
     return;
   is_initialized = true;
@@ -409,9 +699,6 @@ void Initialize(ThreadState *thr) {
   __tsan::InitializePlatformEarly();

 #if !SANITIZER_GO
-  // Re-exec ourselves if we need to set additional env or command line args.
-  MaybeReexec();
-
   InitializeAllocator();
   ReplaceSystemMalloc();
 #endif
@@ -420,7 +707,6 @@ void Initialize(ThreadState *thr) {
   Processor *proc = ProcCreate();
   ProcWire(proc, thr);
   InitializeInterceptors();
-  CheckShadowMapping();
   InitializePlatform();
   InitializeDynamicAnnotations();
 #if !SANITIZER_GO
@@ -436,21 +722,23 @@ void Initialize(ThreadState *thr) {
   Symbolizer::GetOrInit()->AddHooks(EnterSymbolizer, ExitSymbolizer);
 #endif

-  VPrintf(1, "***** Running under ThreadSanitizer v2 (pid %d) *****\n",
+  VPrintf(1, "***** Running under ThreadSanitizer v3 (pid %d) *****\n",
           (int)internal_getpid());

   // Initialize thread 0.
-  int tid = ThreadCreate(thr, 0, 0, true);
-  CHECK_EQ(tid, 0);
+  Tid tid = ThreadCreate(nullptr, 0, 0, true);
+  CHECK_EQ(tid, kMainTid);
   ThreadStart(thr, tid, GetTid(), ThreadType::Regular);
 #if TSAN_CONTAINS_UBSAN
   __ubsan::InitAsPlugin();
 #endif
-  ctx->initialized = true;

 #if !SANITIZER_GO
   Symbolizer::LateInitialize();
+  if (InitializeMemoryProfiler() || flags()->force_background_thread)
+    MaybeSpawnBackgroundThread();
 #endif
+  ctx->initialized = true;

   if (flags()->stop_on_start) {
     Printf("ThreadSanitizer is suspended at startup (pid %d)."
@@ -476,20 +764,21 @@ void MaybeSpawnBackgroundThread() {
 #endif
 }

-
 int Finalize(ThreadState *thr) {
   bool failed = false;

+#if !SANITIZER_GO
   if (common_flags()->print_module_map == 1)
     DumpProcessMap();
+#endif

   if (flags()->atexit_sleep_ms > 0 && ThreadCount(thr) > 1)
-    SleepForMillis(flags()->atexit_sleep_ms);
+    internal_usleep(u64(flags()->atexit_sleep_ms) * 1000);

-  // Wait for pending reports.
-  ctx->report_mtx.Lock();
-  { ScopedErrorReportLock l; }
-  ctx->report_mtx.Unlock();
+  {
+    // Wait for pending reports.
+    ScopedErrorReportLock lock;
+  }

 #if !SANITIZER_GO
   if (Verbosity()) AllocatorPrintStats();
@@ -506,18 +795,8 @@ int Finalize(ThreadState *thr) {
 #endif
   }

-  if (ctx->nmissed_expected) {
-    failed = true;
-    Printf("ThreadSanitizer: missed %d expected races\n",
-        ctx->nmissed_expected);
-  }
-
   if (common_flags()->print_suppressions)
     PrintMatchedSuppressions();
-#if !SANITIZER_GO
-  if (flags()->print_benign)
-    PrintMatchedBenignRaces();
-#endif

   failed = OnFinalize(failed);

@@ -525,10 +804,16 @@ int Finalize(ThreadState *thr) {
 }

 #if !SANITIZER_GO
-void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
-  ctx->thread_registry->Lock();
-  ctx->report_mtx.Lock();
+void ForkBefore(ThreadState* thr, uptr pc) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  GlobalProcessorLock();
+  // Detaching from the slot makes OnUserFree skip writing to the shadow.
+  // The slot will be locked so any attempts to use it will deadlock anyway.
+  SlotDetach(thr);
+  for (auto& slot : ctx->slots) slot.mtx.Lock();
+  ctx->thread_registry.Lock();
+  ctx->slot_mtx.Lock();
   ScopedErrorReportLock::Lock();
+  AllocatorLock();
   // Suppress all reports in the pthread_atfork callbacks.
   // Reports will deadlock on the report_mtx.
   // We could ignore sync operations as well,
@@ -537,36 +822,48 @@ void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
   thr->suppress_reports++;
   // On OS X, REAL(fork) can call intercepted functions (OSSpinLockLock), and
   // we'll assert in CheckNoLocks() unless we ignore interceptors.
+  // On OS X libSystem_atfork_prepare/parent/child callbacks are called
+  // after/before our callbacks and they call free.
   thr->ignore_interceptors++;
+  // Disables memory write in OnUserAlloc/Free.
+  thr->ignore_reads_and_writes++;
+
+  __tsan_test_only_on_fork();
 }

-void ForkParentAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
+static void ForkAfter(ThreadState* thr) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
   thr->suppress_reports--;  // Enabled in ForkBefore.
   thr->ignore_interceptors--;
+  thr->ignore_reads_and_writes--;
+  AllocatorUnlock();
   ScopedErrorReportLock::Unlock();
-  ctx->report_mtx.Unlock();
-  ctx->thread_registry->Unlock();
+  ctx->slot_mtx.Unlock();
+  ctx->thread_registry.Unlock();
+  for (auto& slot : ctx->slots) slot.mtx.Unlock();
+  SlotAttachAndLock(thr);
+  SlotUnlock(thr);
+  GlobalProcessorUnlock();
 }

-void ForkChildAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
-  thr->suppress_reports--;  // Enabled in ForkBefore.
-  thr->ignore_interceptors--;
-  ScopedErrorReportLock::Unlock();
-  ctx->report_mtx.Unlock();
-  ctx->thread_registry->Unlock();
+void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr); }

-  uptr nthread = 0;
-  ctx->thread_registry->GetNumberOfThreads(0, 0, &nthread /* alive threads */);
-  VPrintf(1, "ThreadSanitizer: forked new process with pid %d,"
-      " parent had %d threads\n", (int)internal_getpid(), (int)nthread);
+void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) {
+  ForkAfter(thr);
+  u32 nthread = ctx->thread_registry.OnFork(thr->tid);
+  VPrintf(1,
+          "ThreadSanitizer: forked new process with pid %d,"
+          " parent had %d threads\n",
+          (int)internal_getpid(), (int)nthread);
   if (nthread == 1) {
-    StartBackgroundThread();
+    if (start_thread)
+      StartBackgroundThread();
   } else {
     // We've just forked a multi-threaded process. We cannot reasonably function
     // after that (some mutexes may be locked before fork). So just enable
     // ignores for everything in the hope that we will exec soon.
     ctx->after_multithreaded_fork = true;
     thr->ignore_interceptors++;
+    thr->suppress_reports++;
     ThreadIgnoreBegin(thr, pc);
     ThreadIgnoreSyncBegin(thr, pc);
   }
@@ -578,19 +875,20 @@
 NOINLINE
 void GrowShadowStack(ThreadState *thr) {
   const int sz = thr->shadow_stack_end - thr->shadow_stack;
   const int newsz = 2 * sz;
-  uptr *newstack = (uptr*)internal_alloc(MBlockShadowStack,
-      newsz * sizeof(uptr));
+  auto *newstack = (uptr *)Alloc(newsz * sizeof(uptr));
   internal_memcpy(newstack, thr->shadow_stack, sz * sizeof(uptr));
-  internal_free(thr->shadow_stack);
+  Free(thr->shadow_stack);
   thr->shadow_stack = newstack;
   thr->shadow_stack_pos = newstack + sz;
   thr->shadow_stack_end = newstack + newsz;
 }
 #endif

-u32 CurrentStackId(ThreadState *thr, uptr pc) {
+StackID CurrentStackId(ThreadState *thr, uptr pc) {
+#if !SANITIZER_GO
   if (!thr->is_inited)  // May happen during bootstrap.
-    return 0;
+    return kInvalidStackID;
+#endif
   if (pc != 0) {
 #if !SANITIZER_GO
     DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -601,486 +899,149 @@ u32 CurrentStackId(ThreadState *thr, uptr pc) {
     thr->shadow_stack_pos[0] = pc;
     thr->shadow_stack_pos++;
   }
-  u32 id = StackDepotPut(
+  StackID id = StackDepotPut(
       StackTrace(thr->shadow_stack, thr->shadow_stack_pos - thr->shadow_stack));
   if (pc != 0)
     thr->shadow_stack_pos--;
   return id;
 }

-void TraceSwitch(ThreadState *thr) {
-#if !SANITIZER_GO
-  if (ctx->after_multithreaded_fork)
-    return;
-#endif
-  thr->nomalloc++;
-  Trace *thr_trace = ThreadTrace(thr->tid);
-  Lock l(&thr_trace->mtx);
-  unsigned trace = (thr->fast_state.epoch() / kTracePartSize) % TraceParts();
-  TraceHeader *hdr = &thr_trace->headers[trace];
-  hdr->epoch0 = thr->fast_state.epoch();
-  ObtainCurrentStack(thr, 0, &hdr->stack0);
-  hdr->mset0 = thr->mset;
-  thr->nomalloc--;
-}
-
-Trace *ThreadTrace(int tid) {
-  return (Trace*)GetThreadTraceHeader(tid);
-}
-
-uptr TraceTopPC(ThreadState *thr) {
-  Event *events = (Event*)GetThreadTrace(thr->tid);
-  uptr pc = events[thr->fast_state.GetTracePos()];
-  return pc;
-}
-
-uptr TraceSize() {
-  return (uptr)(1ull << (kTracePartSizeBits + flags()->history_size + 1));
-}
-
-uptr TraceParts() {
-  return TraceSize() / kTracePartSize;
-}
-
-#if !SANITIZER_GO
-extern "C" void __tsan_trace_switch() {
-  TraceSwitch(cur_thread());
-}
-
-extern "C" void __tsan_report_race() {
-  ReportRace(cur_thread());
-}
-#endif
-
-ALWAYS_INLINE
-Shadow LoadShadow(u64 *p) {
-  u64 raw = atomic_load((atomic_uint64_t*)p, memory_order_relaxed);
-  return Shadow(raw);
-}
-
-ALWAYS_INLINE
-void StoreShadow(u64 *sp, u64 s) {
-  atomic_store((atomic_uint64_t*)sp, s, memory_order_relaxed);
-}
-
-ALWAYS_INLINE
-void StoreIfNotYetStored(u64 *sp, u64 *s) {
-  StoreShadow(sp, *s);
-  *s = 0;
-}
-
-ALWAYS_INLINE
-void HandleRace(ThreadState *thr, u64 *shadow_mem,
-                Shadow cur, Shadow old) {
-  thr->racy_state[0] = cur.raw();
-  thr->racy_state[1] = old.raw();
-  thr->racy_shadow_addr = shadow_mem;
-#if !SANITIZER_GO
-  HACKY_CALL(__tsan_report_race);
-#else
-  ReportRace(thr);
-#endif
-}
-
-static inline bool HappensBefore(Shadow old, ThreadState *thr) {
-  return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
-}
-
-ALWAYS_INLINE
-void MemoryAccessImpl1(ThreadState *thr, uptr addr,
-    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
-    u64 *shadow_mem, Shadow cur) {
-
-  // This potentially can live in an MMX/SSE scratch register.
-  // The required intrinsics are:
-  // __m128i _mm_move_epi64(__m128i*);
-  // _mm_storel_epi64(u64*, __m128i);
-  u64 store_word = cur.raw();
-  bool stored = false;
-
-  // scan all the shadow values and dispatch to 4 categories:
-  // same, replace, candidate and race (see comments below).
-  // we consider only 3 cases regarding access sizes:
-  // equal, intersect and not intersect. initially I considered
-  // larger and smaller as well, it allowed to replace some
-  // 'candidates' with 'same' or 'replace', but I think
-  // it's just not worth it (performance- and complexity-wise).
-
-  Shadow old(0);
-
-  // It release mode we manually unroll the loop,
-  // because empirically gcc generates better code this way.
-  // However, we can't afford unrolling in debug mode, because the function
-  // consumes almost 4K of stack. Gtest gives only 4K of stack to death test
-  // threads, which is not enough for the unrolled loop.
-#if SANITIZER_DEBUG
-  for (int idx = 0; idx < 4; idx++) {
-#include "tsan_update_shadow_word_inl.h"
-  }
-#else
-  int idx = 0;
-#include "tsan_update_shadow_word_inl.h"
-  idx = 1;
-  if (stored) {
-#include "tsan_update_shadow_word_inl.h"
-  } else {
-#include "tsan_update_shadow_word_inl.h"
-  }
-  idx = 2;
-  if (stored) {
-#include "tsan_update_shadow_word_inl.h"
-  } else {
-#include "tsan_update_shadow_word_inl.h"
-  }
-  idx = 3;
-  if (stored) {
-#include "tsan_update_shadow_word_inl.h"
-  } else {
-#include "tsan_update_shadow_word_inl.h"
-  }
-#endif
-
-  // we did not find any races and had already stored
-  // the current access info, so we are done
-  if (LIKELY(stored))
-    return;
-  // choose a random candidate slot and replace it
-  StoreShadow(shadow_mem + (cur.epoch() % kShadowCnt), store_word);
-  return;
- RACE:
-  HandleRace(thr, shadow_mem, cur, old);
-  return;
-}
-
-void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr,
-    int size, bool kAccessIsWrite, bool kIsAtomic) {
-  while (size) {
-    int size1 = 1;
-    int kAccessSizeLog = kSizeLog1;
-    if (size >= 8 && (addr & ~7) == ((addr + 7) & ~7)) {
-      size1 = 8;
-      kAccessSizeLog = kSizeLog8;
-    } else if (size >= 4 && (addr & ~7) == ((addr + 3) & ~7)) {
-      size1 = 4;
-      kAccessSizeLog = kSizeLog4;
-    } else if (size >= 2 && (addr & ~7) == ((addr + 1) & ~7)) {
-      size1 = 2;
-      kAccessSizeLog = kSizeLog2;
-    }
-    MemoryAccess(thr, pc, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic);
-    addr += size1;
-    size -= size1;
-  }
-}
-
-ALWAYS_INLINE
-bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
-  Shadow cur(a);
-  for (uptr i = 0; i < kShadowCnt; i++) {
-    Shadow old(LoadShadow(&s[i]));
-    if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
-        old.TidWithIgnore() == cur.TidWithIgnore() &&
-        old.epoch() > sync_epoch &&
-        old.IsAtomic() == cur.IsAtomic() &&
-        old.IsRead() <= cur.IsRead())
-      return true;
+static bool TraceSkipGap(ThreadState* thr) {
+  Trace *trace = &thr->tctx->trace;
+  Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos));
+  DCHECK_EQ(reinterpret_cast<uptr>(pos + 1) & TracePart::kAlignment, 0);
+  auto *part = trace->parts.Back();
+  DPrintf("#%d: TraceSwitchPart enter trace=%p parts=%p-%p pos=%p\n", thr->tid,
+          trace, trace->parts.Front(), part, pos);
+  if (!part)
+    return false;
+  // We can get here when we still have space in the current trace part.
+  // The fast-path check in TraceAcquire has false positives in the middle of
+  // the part. Check if we are indeed at the end of the current part or not,
+  // and fill any gaps with NopEvent's.
+  Event* end = &part->events[TracePart::kSize];
+  DCHECK_GE(pos, &part->events[0]);
+  DCHECK_LE(pos, end);
+  if (pos + 1 < end) {
+    if ((reinterpret_cast<uptr>(pos) & TracePart::kAlignment) ==
+        TracePart::kAlignment)
+      *pos++ = NopEvent;
+    *pos++ = NopEvent;
+    DCHECK_LE(pos + 2, end);
+    atomic_store_relaxed(&thr->trace_pos, reinterpret_cast<uptr>(pos));
+    return true;
   }
+  // We are indeed at the end.
+  for (; pos < end; pos++) *pos = NopEvent;
   return false;
 }

-#if defined(__SSE3__)
-#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
-    _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
-    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
-ALWAYS_INLINE
-bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
-  // This is an optimized version of ContainsSameAccessSlow.
-  // load current access into access[0:63]
-  const m128 access = _mm_cvtsi64_si128(a);
-  // duplicate high part of access in addr0:
-  // addr0[0:31] = access[32:63]
-  // addr0[32:63] = access[32:63]
-  // addr0[64:95] = access[32:63]
-  // addr0[96:127] = access[32:63]
-  const m128 addr0 = SHUF(access, access, 1, 1, 1, 1);
-  // load 4 shadow slots
-  const m128 shadow0 = _mm_load_si128((__m128i*)s);
-  const m128 shadow1 = _mm_load_si128((__m128i*)s + 1);
-  // load high parts of 4 shadow slots into addr_vect:
-  // addr_vect[0:31] = shadow0[32:63]
-  // addr_vect[32:63] = shadow0[96:127]
-  // addr_vect[64:95] = shadow1[32:63]
-  // addr_vect[96:127] = shadow1[96:127]
-  m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);
-  if (!is_write) {
-    // set IsRead bit in addr_vect
-    const m128 rw_mask1 = _mm_cvtsi64_si128(1<<15);
-    const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
-    addr_vect = _mm_or_si128(addr_vect, rw_mask);
-  }
-  // addr0 == addr_vect?
-  const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect);
-  // epoch1[0:63] = sync_epoch
-  const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch);
-  // epoch[0:31] = sync_epoch[0:31]
-  // epoch[32:63] = sync_epoch[0:31]
-  // epoch[64:95] = sync_epoch[0:31]
-  // epoch[96:127] = sync_epoch[0:31]
-  const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);
-  // load low parts of shadow cell epochs into epoch_vect:
-  // epoch_vect[0:31] = shadow0[0:31]
-  // epoch_vect[32:63] = shadow0[64:95]
-  // epoch_vect[64:95] = shadow1[0:31]
-  // epoch_vect[96:127] = shadow1[64:95]
-  const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
-  // epoch_vect >= sync_epoch?
-  const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch);
-  // addr_res & epoch_res
-  const m128 res = _mm_and_si128(addr_res, epoch_res);
-  // mask[0] = res[7]
-  // mask[1] = res[15]
-  // ...
-  // mask[15] = res[127]
-  const int mask = _mm_movemask_epi8(res);
-  return mask != 0;
-}
-#endif
-
-ALWAYS_INLINE
-bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
-#if defined(__SSE3__)
-  bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
-  // NOTE: this check can fail if the shadow is concurrently mutated
-  // by other threads. But it still can be useful if you modify
-  // ContainsSameAccessFast and want to ensure that it's not completely broken.
-  // DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
-  return res;
-#else
-  return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
-#endif
-}
-
-ALWAYS_INLINE USED
-void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
-    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) {
-  u64 *shadow_mem = (u64*)MemToShadow(addr);
-  DPrintf2("#%d: MemoryAccess: @%p %p size=%d"
-      " is_write=%d shadow_mem=%p {%zx, %zx, %zx, %zx}\n",
-      (int)thr->fast_state.tid(), (void*)pc, (void*)addr,
-      (int)(1 << kAccessSizeLog), kAccessIsWrite, shadow_mem,
-      (uptr)shadow_mem[0], (uptr)shadow_mem[1],
-      (uptr)shadow_mem[2], (uptr)shadow_mem[3]);
-#if SANITIZER_DEBUG
-  if (!IsAppMem(addr)) {
-    Printf("Access to non app mem %zx\n", addr);
-    DCHECK(IsAppMem(addr));
-  }
-  if (!IsShadowMem((uptr)shadow_mem)) {
-    Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr);
-    DCHECK(IsShadowMem((uptr)shadow_mem));
-  }
-#endif
-
-  if (!SANITIZER_GO && !kAccessIsWrite && *shadow_mem == kShadowRodata) {
-    // Access to .rodata section, no races here.
-    // Measurements show that it can be 10-20% of all memory accesses.
-    return;
-  }
-
-  FastState fast_state = thr->fast_state;
-  if (UNLIKELY(fast_state.GetIgnoreBit())) {
-    return;
-  }
-
-  Shadow cur(fast_state);
-  cur.SetAddr0AndSizeLog(addr & 7, kAccessSizeLog);
-  cur.SetWrite(kAccessIsWrite);
-  cur.SetAtomic(kIsAtomic);
-
-  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
-      thr->fast_synch_epoch, kAccessIsWrite))) {
-    return;
-  }
-
-  if (kCollectHistory) {
-    fast_state.IncrementEpoch();
-    thr->fast_state = fast_state;
-    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
-    cur.IncrementEpoch();
-  }
-
-  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
-      shadow_mem, cur);
-}
-
-// Called by MemoryAccessRange in tsan_rtl_thread.cpp
-ALWAYS_INLINE USED
-void MemoryAccessImpl(ThreadState *thr, uptr addr,
-    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
-    u64 *shadow_mem, Shadow cur) {
-  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
-      thr->fast_synch_epoch, kAccessIsWrite))) {
-    return;
-  }
-
-  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
-      shadow_mem, cur);
-}
-
-static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size,
-                           u64 val) {
-  (void)thr;
-  (void)pc;
-  if (size == 0)
+NOINLINE
+void TraceSwitchPart(ThreadState* thr) {
+  if (TraceSkipGap(thr))
     return;
-  // FIXME: fix me.
-  uptr offset = addr % kShadowCell;
-  if (offset) {
-    offset = kShadowCell - offset;
-    if (size <= offset)
+#if !SANITIZER_GO
+  if (ctx->after_multithreaded_fork) {
+    // We just need to survive till exec.
+    TracePart* part = thr->tctx->trace.parts.Back();
+    if (part) {
+      atomic_store_relaxed(&thr->trace_pos,
+                           reinterpret_cast<uptr>(&part->events[0]));
       return;
-    addr += offset;
-    size -= offset;
-  }
-  DCHECK_EQ(addr % 8, 0);
-  // If a user passes some insane arguments (memset(0)),
-  // let it just crash as usual.
-  if (!IsAppMem(addr) || !IsAppMem(addr + size - 1))
-    return;
-  // Don't want to touch lots of shadow memory.
-  // If a program maps 10MB stack, there is no need reset the whole range.
-  size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1);
-  // UnmapOrDie/MmapFixedNoReserve does not work on Windows.
-  if (SANITIZER_WINDOWS || size < common_flags()->clear_shadow_mmap_threshold) {
-    u64 *p = (u64*)MemToShadow(addr);
-    CHECK(IsShadowMem((uptr)p));
-    CHECK(IsShadowMem((uptr)(p + size * kShadowCnt / kShadowCell - 1)));
-    // FIXME: may overwrite a part outside the region
-    for (uptr i = 0; i < size / kShadowCell * kShadowCnt;) {
-      p[i++] = val;
-      for (uptr j = 1; j < kShadowCnt; j++)
-        p[i++] = 0;
-    }
-  } else {
-    // The region is big, reset only beginning and end.
-    const uptr kPageSize = GetPageSizeCached();
-    u64 *begin = (u64*)MemToShadow(addr);
-    u64 *end = begin + size / kShadowCell * kShadowCnt;
-    u64 *p = begin;
-    // Set at least first kPageSize/2 to page boundary.
-    while ((p < begin + kPageSize / kShadowSize / 2) || ((uptr)p % kPageSize)) {
-      *p++ = val;
-      for (uptr j = 1; j < kShadowCnt; j++)
-        *p++ = 0;
-    }
-    // Reset middle part.
-    u64 *p1 = p;
-    p = RoundDown(end, kPageSize);
-    if (!MmapFixedSuperNoReserve((uptr)p1, (uptr)p - (uptr)p1))
-      Die();
-    // Set the ending.
-    while (p < end) {
-      *p++ = val;
-      for (uptr j = 1; j < kShadowCnt; j++)
-        *p++ = 0;
     }
   }
+#endif
+  TraceSwitchPartImpl(thr);
 }

-void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size) {
-  MemoryRangeSet(thr, pc, addr, size, 0);
-}
-
-void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size) {
-  // Processing more than 1k (4k of shadow) is expensive,
-  // can cause excessive memory consumption (user does not necessary touch
-  // the whole range) and most likely unnecessary.
-  if (size > 1024)
-    size = 1024;
-  CHECK_EQ(thr->is_freeing, false);
-  thr->is_freeing = true;
-  MemoryAccessRange(thr, pc, addr, size, true);
-  thr->is_freeing = false;
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc);
+void TraceSwitchPartImpl(ThreadState* thr) {
+  SlotLocker locker(thr, true);
+  Trace* trace = &thr->tctx->trace;
+  TracePart* part = TracePartAlloc(thr);
+  part->trace = trace;
+  thr->trace_prev_pc = 0;
+  TracePart* recycle = nullptr;
+  // Keep roughly half of parts local to the thread
+  // (not queued into the recycle queue).
+  uptr local_parts = (Trace::kMinParts + flags()->history_size + 1) / 2;
+  {
+    Lock lock(&trace->mtx);
+    if (trace->parts.Empty())
+      trace->local_head = part;
+    if (trace->parts.Size() >= local_parts) {
+      recycle = trace->local_head;
+      trace->local_head = trace->parts.Next(recycle);
+    }
+    trace->parts.PushBack(part);
+    atomic_store_relaxed(&thr->trace_pos,
+                         reinterpret_cast<uptr>(&part->events[0]));
   }
-  Shadow s(thr->fast_state);
-  s.ClearIgnoreBit();
-  s.MarkAsFreed();
-  s.SetWrite(true);
-  s.SetAddr0AndSizeLog(0, 3);
-  MemoryRangeSet(thr, pc, addr, size, s.raw());
-}
-
-void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size) {
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc);
+  // Make this part self-sufficient by restoring the current stack
+  // and mutex set in the beginning of the trace.
+  TraceTime(thr);
+  {
+    // Pathologically large stacks may not fit into the part.
+    // In these cases we log only fixed number of top frames.
+    const uptr kMaxFrames = 1000;
+    // Check that kMaxFrames won't consume the whole part.
+    static_assert(kMaxFrames < TracePart::kSize / 2, "kMaxFrames is too big");
+    uptr* pos = Max(&thr->shadow_stack[0], thr->shadow_stack_pos - kMaxFrames);
+    for (; pos < thr->shadow_stack_pos; pos++) {
+      if (TryTraceFunc(thr, *pos))
+        continue;
+      CHECK(TraceSkipGap(thr));
+      CHECK(TryTraceFunc(thr, *pos));
+    }
   }
-  Shadow s(thr->fast_state);
-  s.ClearIgnoreBit();
-  s.SetWrite(true);
-  s.SetAddr0AndSizeLog(0, 3);
-  MemoryRangeSet(thr, pc, addr, size, s.raw());
-}
-
-void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
-                                         uptr size) {
-  if (thr->ignore_reads_and_writes == 0)
-    MemoryRangeImitateWrite(thr, pc, addr, size);
-  else
-    MemoryResetRange(thr, pc, addr, size);
-}
-
-ALWAYS_INLINE USED
-void FuncEntry(ThreadState *thr, uptr pc) {
-  DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.tid(), (void*)pc);
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeFuncEnter, pc);
+  for (uptr i = 0; i < thr->mset.Size(); i++) {
+    MutexSet::Desc d = thr->mset.Get(i);
+    for (uptr i = 0; i < d.count; i++)
+      TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0,
+                     d.addr, d.stack_id);
   }
-
-  // Shadow stack maintenance can be replaced with
-  // stack unwinding during trace switch (which presumably must be faster).
-  DCHECK_GE(thr->shadow_stack_pos, thr->shadow_stack);
-#if !SANITIZER_GO
-  DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
-#else
-  if (thr->shadow_stack_pos == thr->shadow_stack_end)
-    GrowShadowStack(thr);
-#endif
-  thr->shadow_stack_pos[0] = pc;
-  thr->shadow_stack_pos++;
-}
-
-ALWAYS_INLINE USED
-void FuncExit(ThreadState *thr) {
-  DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.tid());
-  if (kCollectHistory) {
-    thr->fast_state.IncrementEpoch();
-    TraceAddEvent(thr, thr->fast_state, EventTypeFuncExit, 0);
+  // Callers of TraceSwitchPart expect that TraceAcquire will always succeed
+  // after the call. It's possible that TryTraceFunc/TraceMutexLock above
+  // filled the trace part exactly up to the TracePart::kAlignment gap
+  // and the next TraceAcquire won't succeed. Skip the gap to avoid that.
+  EventFunc *ev;
+  if (!TraceAcquire(thr, &ev)) {
+    CHECK(TraceSkipGap(thr));
+    CHECK(TraceAcquire(thr, &ev));
   }
-
-  DCHECK_GT(thr->shadow_stack_pos, thr->shadow_stack);
-#if !SANITIZER_GO
-  DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
-#endif
-  thr->shadow_stack_pos--;
+  {
+    Lock lock(&ctx->slot_mtx);
+    // There is a small chance that the slot may be not queued at this point.
+    // This can happen if the slot has kEpochLast epoch and another thread
+    // in FindSlotAndLock discovered that it's exhausted and removed it from
+    // the slot queue. kEpochLast can happen in 2 cases: (1) if TraceSwitchPart
+    // was called with the slot locked and epoch already at kEpochLast,
+    // or (2) if we've acquired a new slot in SlotLock in the beginning
+    // of the function and the slot was at kEpochLast - 1, so after increment
+    // in SlotAttachAndLock it become kEpochLast.
+    if (ctx->slot_queue.Queued(thr->slot)) {
+      ctx->slot_queue.Remove(thr->slot);
+      ctx->slot_queue.PushBack(thr->slot);
+    }
+    if (recycle)
+      ctx->trace_part_recycle.PushBack(recycle);
+  }
+  DPrintf("#%d: TraceSwitchPart exit parts=%p-%p pos=0x%zx\n", thr->tid,
+          trace->parts.Front(), trace->parts.Back(),
+          atomic_load_relaxed(&thr->trace_pos));
 }

-void ThreadIgnoreBegin(ThreadState *thr, uptr pc, bool save_stack) {
+void ThreadIgnoreBegin(ThreadState* thr, uptr pc) {
   DPrintf("#%d: ThreadIgnoreBegin\n", thr->tid);
   thr->ignore_reads_and_writes++;
   CHECK_GT(thr->ignore_reads_and_writes, 0);
   thr->fast_state.SetIgnoreBit();
 #if !SANITIZER_GO
-  if (save_stack && !ctx->after_multithreaded_fork)
+  if (pc && !ctx->after_multithreaded_fork)
     thr->mop_ignore_set.Add(CurrentStackId(thr, pc));
 #endif
 }

-void ThreadIgnoreEnd(ThreadState *thr, uptr pc) {
+void ThreadIgnoreEnd(ThreadState *thr) {
   DPrintf("#%d: ThreadIgnoreEnd\n", thr->tid);
   CHECK_GT(thr->ignore_reads_and_writes, 0);
   thr->ignore_reads_and_writes--;
@@ -1100,17 +1061,17 @@ uptr __tsan_testonly_shadow_stack_current_size() {
 }
 #endif

-void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc, bool save_stack) {
+void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc) {
   DPrintf("#%d: ThreadIgnoreSyncBegin\n", thr->tid);
   thr->ignore_sync++;
   CHECK_GT(thr->ignore_sync, 0);
 #if !SANITIZER_GO
-  if (save_stack && !ctx->after_multithreaded_fork)
+  if (pc && !ctx->after_multithreaded_fork)
     thr->sync_ignore_set.Add(CurrentStackId(thr, pc));
 #endif
 }

-void ThreadIgnoreSyncEnd(ThreadState *thr, uptr pc) {
+void ThreadIgnoreSyncEnd(ThreadState *thr) {
   DPrintf("#%d: ThreadIgnoreSyncEnd\n", thr->tid);
   CHECK_GT(thr->ignore_sync, 0);
   thr->ignore_sync--;
@@ -1129,7 +1090,6 @@ void build_consistency_debug() {}
 #else
 void build_consistency_release() {}
 #endif
-
 }  // namespace __tsan

 #if SANITIZER_CHECK_DEADLOCKS
@@ -1137,23 +1097,27 @@ namespace __sanitizer {
 using namespace __tsan;
 MutexMeta mutex_meta[] = {
     {MutexInvalid, "Invalid", {}},
-    {MutexThreadRegistry, "ThreadRegistry", {}},
-    {MutexTypeTrace, "Trace", {MutexLeaf}},
-    {MutexTypeReport, "Report", {MutexTypeSyncVar}},
-    {MutexTypeSyncVar, "SyncVar", {}},
+    {MutexThreadRegistry,
+     "ThreadRegistry",
+     {MutexTypeSlots, MutexTypeTrace, MutexTypeReport}},
+    {MutexTypeReport, "Report", {MutexTypeTrace}},
+    {MutexTypeSyncVar, "SyncVar", {MutexTypeReport, MutexTypeTrace}},
     {MutexTypeAnnotations, "Annotations", {}},
-    {MutexTypeAtExit, "AtExit", {MutexTypeSyncVar}},
+    {MutexTypeAtExit, "AtExit", {}},
     {MutexTypeFired, "Fired", {MutexLeaf}},
     {MutexTypeRacy, "Racy", {MutexLeaf}},
-    {MutexTypeGlobalProc, "GlobalProc", {}},
+    {MutexTypeGlobalProc, "GlobalProc", {MutexTypeSlot, MutexTypeSlots}},
+    {MutexTypeInternalAlloc, "InternalAlloc", {MutexLeaf}},
+    {MutexTypeTrace, "Trace", {}},
+    {MutexTypeSlot,
+     "Slot",
+     {MutexMulti, MutexTypeTrace, MutexTypeSyncVar, MutexThreadRegistry,
+      MutexTypeSlots}},
+    {MutexTypeSlots, "Slots", {MutexTypeTrace, MutexTypeReport}},
     {},
 };

 void PrintMutexPC(uptr pc) { StackTrace(&pc, 1).Print(); }
-}  // namespace __sanitizer
-#endif

-#if !SANITIZER_GO
-// Must be included in this file to make sure everything is inlined.
-# include "tsan_interface_inl.h"
+}  // namespace __sanitizer
 #endif
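
A note on the dlfcn change above: the weak OnInitialize()/OnFinalize() defaults no longer probe dlsym(RTLD_DEFAULT, ...) for __tsan_on_initialize/__tsan_on_finalize; a front-end is now expected to assign the __tsan::on_initialize/on_finalize function pointers that this patch introduces. A minimal sketch of how an embedding tool could use them; the hook bodies and the constructor-based registration are illustrative assumptions, not part of the patch:

// Pointer declarations matching the ones this patch adds to the runtime.
namespace __tsan {
extern void (*on_initialize)(void);
extern int (*on_finalize)(int);
}

static void MyToolOnInit() {
  // Runs once at the end of tsan's Initialize().
}

static int MyToolOnFinalize(int failed) {
  // Returning non-zero forces a failing exit status, mirroring OnFinalize().
  return failed;
}

// Hypothetical registration point; it has to run before __tsan_init fires,
// e.g. from an early constructor in the front-end.
__attribute__((constructor)) static void RegisterTsanHooks() {
  __tsan::on_initialize = MyToolOnInit;
  __tsan::on_finalize = MyToolOnFinalize;
}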
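Similarly, the __tsan_test_only_on_fork() hook near the top of the diff uses the sanitizer weak-default idiom: the runtime ships an empty weak definition and calls the hook from ForkBefore(), so a test binary can substitute a strong definition at link time. Roughly, assuming SANITIZER_WEAK_DEFAULT_IMPL expands to a weak extern "C" definition (the exact macro lives in sanitizer_common):

// Runtime side: weak no-op default, invoked at the end of ForkBefore().
extern "C" __attribute__((weak)) void __tsan_test_only_on_fork() {}

// Test side: a strong definition silently replaces the weak one at link time.
extern "C" void __tsan_test_only_on_fork() {
  // e.g. acquire and release extra locks to exercise the atfork ordering
}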