Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 1518 |
1 file changed, 1063 insertions, 455 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index f55501a05d85..e1e0d50979dc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -20,16 +20,18 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Analysis/StackLifetime.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/circular_raw_ostream.h" #include "llvm/Support/OptimizedStructLayout.h" +#include "llvm/Support/circular_raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" @@ -41,6 +43,13 @@ using namespace llvm; // "coro-frame", which results in leaner debug spew. #define DEBUG_TYPE "coro-suspend-crossing" +static cl::opt<bool> EnableReuseStorageInFrame( + "reuse-storage-in-coroutine-frame", cl::Hidden, + cl::desc( + "Enable the optimization which would reuse the storage in the coroutine \ + frame for allocas whose liferanges are not overlapped, for testing purposes"), + llvm::cl::init(false)); + enum { SmallVectorThreshold = 32 }; // Provides two way mapping between the blocks and numbers. @@ -126,10 +135,10 @@ struct SuspendCrossingInfo { BasicBlock *UseBB = I->getParent(); - // As a special case, treat uses by an llvm.coro.suspend.retcon - // as if they were uses in the suspend's single predecessor: the - // uses conceptually occur before the suspend. - if (isa<CoroSuspendRetconInst>(I)) { + // As a special case, treat uses by an llvm.coro.suspend.retcon or an + // llvm.coro.suspend.async as if they were uses in the suspend's single + // predecessor: the uses conceptually occur before the suspend. + if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) { UseBB = UseBB->getSinglePredecessor(); assert(UseBB && "should have split coro.suspend into its own block"); } @@ -283,71 +292,96 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) #undef DEBUG_TYPE // "coro-suspend-crossing" #define DEBUG_TYPE "coro-frame" -// We build up the list of spills for every case where a use is separated -// from the definition by a suspend point. - -static const unsigned InvalidFieldIndex = ~0U; - namespace { -class Spill { - Value *Def = nullptr; - Instruction *User = nullptr; - unsigned FieldNo = InvalidFieldIndex; - -public: - Spill(Value *Def, llvm::User *U) : Def(Def), User(cast<Instruction>(U)) {} - - Value *def() const { return Def; } - Instruction *user() const { return User; } - BasicBlock *userBlock() const { return User->getParent(); } +class FrameTypeBuilder; +// Mapping from the to-be-spilled value to all the users that need reload. 
+using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>; +struct AllocaInfo { + AllocaInst *Alloca; + DenseMap<Instruction *, llvm::Optional<APInt>> Aliases; + bool MayWriteBeforeCoroBegin; + AllocaInfo(AllocaInst *Alloca, + DenseMap<Instruction *, llvm::Optional<APInt>> Aliases, + bool MayWriteBeforeCoroBegin) + : Alloca(Alloca), Aliases(std::move(Aliases)), + MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {} +}; +struct FrameDataInfo { + // All the values (that are not allocas) that needs to be spilled to the + // frame. + SpillInfo Spills; + // Allocas contains all values defined as allocas that need to live in the + // frame. + SmallVector<AllocaInfo, 8> Allocas; + + SmallVector<Value *, 8> getAllDefs() const { + SmallVector<Value *, 8> Defs; + for (const auto &P : Spills) + Defs.push_back(P.first); + for (const auto &A : Allocas) + Defs.push_back(A.Alloca); + return Defs; + } - // Note that field index is stored in the first SpillEntry for a particular - // definition. Subsequent mentions of a defintion do not have fieldNo - // assigned. This works out fine as the users of Spills capture the info about - // the definition the first time they encounter it. Consider refactoring - // SpillInfo into two arrays to normalize the spill representation. - unsigned fieldIndex() const { - assert(FieldNo != InvalidFieldIndex && "Accessing unassigned field"); - return FieldNo; + uint32_t getFieldIndex(Value *V) const { + auto Itr = FieldIndexMap.find(V); + assert(Itr != FieldIndexMap.end() && + "Value does not have a frame field index"); + return Itr->second; } - void setFieldIndex(unsigned FieldNumber) { - assert(FieldNo == InvalidFieldIndex && "Reassigning field number"); - FieldNo = FieldNumber; + + void setFieldIndex(Value *V, uint32_t Index) { + assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) && + "Cannot set the index for the same field twice."); + FieldIndexMap[V] = Index; } + + // Remap the index of every field in the frame, using the final layout index. + void updateLayoutIndex(FrameTypeBuilder &B); + +private: + // LayoutIndexUpdateStarted is used to avoid updating the index of any field + // twice by mistake. + bool LayoutIndexUpdateStarted = false; + // Map from values to their slot indexes on the frame. They will be first set + // with their original insertion field index. After the frame is built, their + // indexes will be updated into the final layout index. + DenseMap<Value *, uint32_t> FieldIndexMap; }; } // namespace -// Note that there may be more than one record with the same value of Def in -// the SpillInfo vector. 
-using SpillInfo = SmallVector<Spill, 8>; - #ifndef NDEBUG -static void dump(StringRef Title, SpillInfo const &Spills) { +static void dumpSpills(StringRef Title, const SpillInfo &Spills) { dbgs() << "------------- " << Title << "--------------\n"; - Value *CurrentValue = nullptr; - for (auto const &E : Spills) { - if (CurrentValue != E.def()) { - CurrentValue = E.def(); - CurrentValue->dump(); - } + for (const auto &E : Spills) { + E.first->dump(); dbgs() << " user: "; - E.user()->dump(); + for (auto *I : E.second) + I->dump(); + } +} + +static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) { + dbgs() << "------------- Allocas --------------\n"; + for (const auto &A : Allocas) { + A.Alloca->dump(); } } #endif namespace { +using FieldIDType = size_t; // We cannot rely solely on natural alignment of a type when building a // coroutine frame and if the alignment specified on the Alloca instruction // differs from the natural alignment of the alloca type we will need to insert // padding. class FrameTypeBuilder { +private: struct Field { uint64_t Size; uint64_t Offset; - Spill *ForSpill; Type *Ty; - unsigned FieldIndex; + FieldIDType LayoutFieldIndex; Align Alignment; Align TyAlignment; }; @@ -363,19 +397,12 @@ class FrameTypeBuilder { public: FrameTypeBuilder(LLVMContext &Context, DataLayout const &DL) - : DL(DL), Context(Context) {} - - class FieldId { - size_t Value; - explicit FieldId(size_t Value) : Value(Value) {} - - friend class FrameTypeBuilder; - }; + : DL(DL), Context(Context) {} /// Add a field to this structure for the storage of an `alloca` /// instruction. - FieldId addFieldForAlloca(AllocaInst *AI, Spill *ForSpill = nullptr, - bool IsHeader = false) { + LLVM_NODISCARD FieldIDType addFieldForAlloca(AllocaInst *AI, + bool IsHeader = false) { Type *Ty = AI->getAllocatedType(); // Make an array type if this is a static array allocation. @@ -386,13 +413,42 @@ public: report_fatal_error("Coroutines cannot handle non static allocas yet"); } - return addField(Ty, AI->getAlign(), ForSpill, IsHeader); + return addField(Ty, AI->getAlign(), IsHeader); } + /// We want to put the allocas whose lifetime-ranges are not overlapped + /// into one slot of coroutine frame. + /// Consider the example at:https://bugs.llvm.org/show_bug.cgi?id=45566 + /// + /// cppcoro::task<void> alternative_paths(bool cond) { + /// if (cond) { + /// big_structure a; + /// process(a); + /// co_await something(); + /// } else { + /// big_structure b; + /// process2(b); + /// co_await something(); + /// } + /// } + /// + /// We want to put variable a and variable b in the same slot to + /// reduce the size of coroutine frame. + /// + /// This function use StackLifetime algorithm to partition the AllocaInsts in + /// Spills to non-overlapped sets in order to put Alloca in the same + /// non-overlapped set into the same slot in the Coroutine Frame. Then add + /// field for the allocas in the same non-overlapped set by using the largest + /// type as the field type. + /// + /// Side Effects: Because We sort the allocas, the order of allocas in the + /// frame may be different with the order in the source code. + void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData, + coro::Shape &Shape); + /// Add a field to this structure. 
- FieldId addField(Type *Ty, MaybeAlign FieldAlignment, - Spill *ForSpill = nullptr, - bool IsHeader = false) { + LLVM_NODISCARD FieldIDType addField(Type *Ty, MaybeAlign FieldAlignment, + bool IsHeader = false) { assert(!IsFinished && "adding fields to a finished builder"); assert(Ty && "must provide a type for a field"); @@ -415,9 +471,8 @@ public: Offset = OptimizedStructLayoutField::FlexibleOffset; } - Fields.push_back({FieldSize, Offset, ForSpill, Ty, 0, - *FieldAlignment, TyAlignment}); - return FieldId(Fields.size() - 1); + Fields.push_back({FieldSize, Offset, Ty, 0, *FieldAlignment, TyAlignment}); + return Fields.size() - 1; } /// Finish the layout and set the body on the given type. @@ -433,13 +488,162 @@ public: return StructAlign; } - unsigned getFieldIndex(FieldId Id) const { + FieldIDType getLayoutFieldIndex(FieldIDType Id) const { assert(IsFinished && "not yet finished!"); - return Fields[Id.Value].FieldIndex; + return Fields[Id].LayoutFieldIndex; } }; } // namespace +void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) { + auto Updater = [&](Value *I) { + setFieldIndex(I, B.getLayoutFieldIndex(getFieldIndex(I))); + }; + LayoutIndexUpdateStarted = true; + for (auto &S : Spills) + Updater(S.first); + for (const auto &A : Allocas) + Updater(A.Alloca); + LayoutIndexUpdateStarted = false; +} + +void FrameTypeBuilder::addFieldForAllocas(const Function &F, + FrameDataInfo &FrameData, + coro::Shape &Shape) { + DenseMap<AllocaInst *, unsigned int> AllocaIndex; + using AllocaSetType = SmallVector<AllocaInst *, 4>; + SmallVector<AllocaSetType, 4> NonOverlapedAllocas; + + // We need to add field for allocas at the end of this function. However, this + // function has multiple exits, so we use this helper to avoid redundant code. + struct RTTIHelper { + std::function<void()> func; + RTTIHelper(std::function<void()> &&func) : func(func) {} + ~RTTIHelper() { func(); } + } Helper([&]() { + for (auto AllocaList : NonOverlapedAllocas) { + auto *LargestAI = *AllocaList.begin(); + FieldIDType Id = addFieldForAlloca(LargestAI); + for (auto *Alloca : AllocaList) + FrameData.setFieldIndex(Alloca, Id); + } + }); + + if (!Shape.ReuseFrameSlot && !EnableReuseStorageInFrame) { + for (const auto &A : FrameData.Allocas) { + AllocaInst *Alloca = A.Alloca; + AllocaIndex[Alloca] = NonOverlapedAllocas.size(); + NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); + } + return; + } + + // Because there are pathes from the lifetime.start to coro.end + // for each alloca, the liferanges for every alloca is overlaped + // in the blocks who contain coro.end and the successor blocks. + // So we choose to skip there blocks when we calculates the liferange + // for each alloca. It should be reasonable since there shouldn't be uses + // in these blocks and the coroutine frame shouldn't be used outside the + // coroutine body. + // + // Note that the user of coro.suspend may not be SwitchInst. However, this + // case seems too complex to handle. And it is harmless to skip these + // patterns since it just prevend putting the allocas to live in the same + // slot. 
+ DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest; + for (auto CoroSuspendInst : Shape.CoroSuspends) { + for (auto U : CoroSuspendInst->users()) { + if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) { + auto *SWI = const_cast<SwitchInst *>(ConstSWI); + DefaultSuspendDest[SWI] = SWI->getDefaultDest(); + SWI->setDefaultDest(SWI->getSuccessor(1)); + } + } + } + + auto ExtractAllocas = [&]() { + AllocaSetType Allocas; + Allocas.reserve(FrameData.Allocas.size()); + for (const auto &A : FrameData.Allocas) + Allocas.push_back(A.Alloca); + return Allocas; + }; + StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(), + StackLifetime::LivenessType::May); + StackLifetimeAnalyzer.run(); + auto IsAllocaInferenre = [&](const AllocaInst *AI1, const AllocaInst *AI2) { + return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps( + StackLifetimeAnalyzer.getLiveRange(AI2)); + }; + auto GetAllocaSize = [&](const AllocaInfo &A) { + Optional<TypeSize> RetSize = A.Alloca->getAllocationSizeInBits(DL); + assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n"); + assert(!RetSize->isScalable() && "Scalable vectors are not yet supported"); + return RetSize->getFixedSize(); + }; + // Put larger allocas in the front. So the larger allocas have higher + // priority to merge, which can save more space potentially. Also each + // AllocaSet would be ordered. So we can get the largest Alloca in one + // AllocaSet easily. + sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) { + return GetAllocaSize(Iter1) > GetAllocaSize(Iter2); + }); + for (const auto &A : FrameData.Allocas) { + AllocaInst *Alloca = A.Alloca; + bool Merged = false; + // Try to find if the Alloca is not inferenced with any existing + // NonOverlappedAllocaSet. If it is true, insert the alloca to that + // NonOverlappedAllocaSet. + for (auto &AllocaSet : NonOverlapedAllocas) { + assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n"); + bool NoInference = none_of(AllocaSet, [&](auto Iter) { + return IsAllocaInferenre(Alloca, Iter); + }); + // If the alignment of A is multiple of the alignment of B, the address + // of A should satisfy the requirement for aligning for B. + // + // There may be other more fine-grained strategies to handle the alignment + // infomation during the merging process. But it seems hard to handle + // these strategies and benefit little. + bool Alignable = [&]() -> bool { + auto *LargestAlloca = *AllocaSet.begin(); + return LargestAlloca->getAlign().value() % Alloca->getAlign().value() == + 0; + }(); + bool CouldMerge = NoInference && Alignable; + if (!CouldMerge) + continue; + AllocaIndex[Alloca] = AllocaIndex[*AllocaSet.begin()]; + AllocaSet.push_back(Alloca); + Merged = true; + break; + } + if (!Merged) { + AllocaIndex[Alloca] = NonOverlapedAllocas.size(); + NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); + } + } + // Recover the default target destination for each Switch statement + // reserved. + for (auto SwitchAndDefaultDest : DefaultSuspendDest) { + SwitchInst *SWI = SwitchAndDefaultDest.first; + BasicBlock *DestBB = SwitchAndDefaultDest.second; + SWI->setDefaultDest(DestBB); + } + // This Debug Info could tell us which allocas are merged into one slot. 
+ LLVM_DEBUG(for (auto &AllocaSet + : NonOverlapedAllocas) { + if (AllocaSet.size() > 1) { + dbgs() << "In Function:" << F.getName() << "\n"; + dbgs() << "Find Union Set " + << "\n"; + dbgs() << "\tAllocas are \n"; + for (auto Alloca : AllocaSet) + dbgs() << "\t\t" << *Alloca << "\n"; + } + }); +} + void FrameTypeBuilder::finish(StructType *Ty) { assert(!IsFinished && "already finished!"); @@ -491,13 +695,8 @@ void FrameTypeBuilder::finish(StructType *Ty) { Offset - LastOffset)); } - // Record the layout information into both the Field and the - // original Spill, if there is one. F.Offset = Offset; - F.FieldIndex = FieldTypes.size(); - if (F.ForSpill) { - F.ForSpill->setFieldIndex(F.FieldIndex); - } + F.LayoutFieldIndex = FieldTypes.size(); FieldTypes.push_back(F.Ty); LastOffset = Offset + F.Size; @@ -509,8 +708,8 @@ void FrameTypeBuilder::finish(StructType *Ty) { // Check that the IR layout matches the offsets we expect. auto Layout = DL.getStructLayout(Ty); for (auto &F : Fields) { - assert(Ty->getElementType(F.FieldIndex) == F.Ty); - assert(Layout->getElementOffset(F.FieldIndex) == F.Offset); + assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty); + assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset); } #endif @@ -526,7 +725,7 @@ void FrameTypeBuilder::finish(StructType *Ty) { // ... spills ... // }; static StructType *buildFrameType(Function &F, coro::Shape &Shape, - SpillInfo &Spills) { + FrameDataInfo &FrameData) { LLVMContext &C = F.getContext(); const DataLayout &DL = F.getParent()->getDataLayout(); StructType *FrameTy = [&] { @@ -538,8 +737,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, FrameTypeBuilder B(C, DL); AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); - Optional<FrameTypeBuilder::FieldId> PromiseFieldId; - Optional<FrameTypeBuilder::FieldId> SwitchIndexFieldId; + Optional<FieldIDType> SwitchIndexFieldId; if (Shape.ABI == coro::ABI::Switch) { auto *FramePtrTy = FrameTy->getPointerTo(); @@ -549,14 +747,15 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, // Add header fields for the resume and destroy functions. // We can rely on these being perfectly packed. - B.addField(FnPtrTy, None, nullptr, /*header*/ true); - B.addField(FnPtrTy, None, nullptr, /*header*/ true); + (void)B.addField(FnPtrTy, None, /*header*/ true); + (void)B.addField(FnPtrTy, None, /*header*/ true); - // Add a header field for the promise if there is one. - if (PromiseAlloca) { - PromiseFieldId = - B.addFieldForAlloca(PromiseAlloca, nullptr, /*header*/ true); - } + // PromiseAlloca field needs to be explicitly added here because it's + // a header field with a fixed offset based on its alignment. Hence it + // needs special handling and cannot be added to FrameData.Allocas. + if (PromiseAlloca) + FrameData.setFieldIndex( + PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true)); // Add a field to store the suspend index. This doesn't need to // be in the header. @@ -568,40 +767,40 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); } - Value *CurrentDef = nullptr; - + // Because multiple allocas may own the same field slot, + // we add allocas to field here. + B.addFieldForAllocas(F, FrameData, Shape); + // Add PromiseAlloca to Allocas list so that + // 1. updateLayoutIndex could update its index after + // `performOptimizedStructLayout` + // 2. it is processed in insertSpills. 
+ if (Shape.ABI == coro::ABI::Switch && PromiseAlloca) + // We assume that the promise alloca won't be modified before + // CoroBegin and no alias will be create before CoroBegin. + FrameData.Allocas.emplace_back( + PromiseAlloca, DenseMap<Instruction *, llvm::Optional<APInt>>{}, false); // Create an entry for every spilled value. - for (auto &S : Spills) { - // We can have multiple entries in Spills for a single value, but - // they should form a contiguous run. Ignore all but the first. - if (CurrentDef == S.def()) - continue; - - CurrentDef = S.def(); - - assert(CurrentDef != PromiseAlloca && - "recorded spill use of promise alloca?"); - - if (auto *AI = dyn_cast<AllocaInst>(CurrentDef)) { - B.addFieldForAlloca(AI, &S); - } else { - Type *Ty = CurrentDef->getType(); - B.addField(Ty, None, &S); - } + for (auto &S : FrameData.Spills) { + Type *FieldType = S.first->getType(); + // For byval arguments, we need to store the pointed value in the frame, + // instead of the pointer itself. + if (const Argument *A = dyn_cast<Argument>(S.first)) + if (A->hasByValAttr()) + FieldType = FieldType->getPointerElementType(); + FieldIDType Id = B.addField(FieldType, None); + FrameData.setFieldIndex(S.first, Id); } B.finish(FrameTy); + FrameData.updateLayoutIndex(B); Shape.FrameAlign = B.getStructAlign(); Shape.FrameSize = B.getStructSize(); switch (Shape.ABI) { - // In the switch ABI, remember the field indices for the promise and - // switch-index fields. case coro::ABI::Switch: + // In the switch ABI, remember the switch-index field. Shape.SwitchLowering.IndexField = - B.getFieldIndex(*SwitchIndexFieldId); - Shape.SwitchLowering.PromiseField = - (PromiseAlloca ? B.getFieldIndex(*PromiseFieldId) : 0); + B.getLayoutFieldIndex(*SwitchIndexFieldId); // Also round the frame size up to a multiple of its alignment, as is // generally expected in C/C++. @@ -617,65 +816,246 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, B.getStructAlign() <= Id->getStorageAlignment()); break; } + case coro::ABI::Async: { + Shape.AsyncLowering.FrameOffset = + alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign); + // Also make the final context size a multiple of the context alignment to + // make allocation easier for allocators. + Shape.AsyncLowering.ContextSize = + alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize, + Shape.AsyncLowering.getContextAlignment()); + if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) { + report_fatal_error( + "The alignment requirment of frame variables cannot be higher than " + "the alignment of the async function context"); + } + break; + } } return FrameTy; } -// We use a pointer use visitor to discover if there are any writes into an -// alloca that dominates CoroBegin. If that is the case, insertSpills will copy -// the value from the alloca into the coroutine frame spill slot corresponding -// to that alloca. +// We use a pointer use visitor to track how an alloca is being used. +// The goal is to be able to answer the following three questions: +// 1. Should this alloca be allocated on the frame instead. +// 2. Could the content of the alloca be modified prior to CoroBegn, which would +// require copying the data from alloca to the frame after CoroBegin. +// 3. Is there any alias created for this alloca prior to CoroBegin, but used +// after CoroBegin. In that case, we will need to recreate the alias after +// CoroBegin based off the frame. To answer question 1, we track two things: +// a. 
List of all BasicBlocks that use this alloca or any of the aliases of +// the alloca. In the end, we check if there exists any two basic blocks that +// cross suspension points. If so, this alloca must be put on the frame. b. +// Whether the alloca or any alias of the alloca is escaped at some point, +// either by storing the address somewhere, or the address is used in a +// function call that might capture. If it's ever escaped, this alloca must be +// put on the frame conservatively. +// To answer quetion 2, we track through the variable MayWriteBeforeCoroBegin. +// Whenever a potential write happens, either through a store instruction, a +// function call or any of the memory intrinsics, we check whether this +// instruction is prior to CoroBegin. To answer question 3, we track the offsets +// of all aliases created for the alloca prior to CoroBegin but used after +// CoroBegin. llvm::Optional is used to be able to represent the case when the +// offset is unknown (e.g. when you have a PHINode that takes in different +// offset values). We cannot handle unknown offsets and will assert. This is the +// potential issue left out. An ideal solution would likely require a +// significant redesign. namespace { struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { using Base = PtrUseVisitor<AllocaUseVisitor>; AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, - const CoroBeginInst &CB) - : PtrUseVisitor(DL), DT(DT), CoroBegin(CB) {} + const CoroBeginInst &CB, const SuspendCrossingInfo &Checker) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker) {} - // We are only interested in uses that dominate coro.begin. void visit(Instruction &I) { - if (DT.dominates(&I, &CoroBegin)) - Base::visit(I); + UserBBs.insert(I.getParent()); + Base::visit(I); + // If the pointer is escaped prior to CoroBegin, we have to assume it would + // be written into before CoroBegin as well. + if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) { + MayWriteBeforeCoroBegin = true; + } } // We need to provide this overload as PtrUseVisitor uses a pointer based // visiting function. void visit(Instruction *I) { return visit(*I); } - void visitLoadInst(LoadInst &) {} // Good. Nothing to do. + void visitPHINode(PHINode &I) { + enqueueUsers(I); + handleAlias(I); + } + + void visitSelectInst(SelectInst &I) { + enqueueUsers(I); + handleAlias(I); + } + + void visitStoreInst(StoreInst &SI) { + // Regardless whether the alias of the alloca is the value operand or the + // pointer operand, we need to assume the alloca is been written. + handleMayWrite(SI); + + if (SI.getValueOperand() != U->get()) + return; + + // We are storing the pointer into a memory location, potentially escaping. + // As an optimization, we try to detect simple cases where it doesn't + // actually escape, for example: + // %ptr = alloca .. + // %addr = alloca .. + // store %ptr, %addr + // %x = load %addr + // .. + // If %addr is only used by loading from it, we could simply treat %x as + // another alias of %ptr, and not considering %ptr being escaped. + auto IsSimpleStoreThenLoad = [&]() { + auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand()); + // If the memory location we are storing to is not an alloca, it + // could be an alias of some other memory locations, which is difficult + // to analyze. + if (!AI) + return false; + // StoreAliases contains aliases of the memory location stored into. 
+ SmallVector<Instruction *, 4> StoreAliases = {AI}; + while (!StoreAliases.empty()) { + Instruction *I = StoreAliases.pop_back_val(); + for (User *U : I->users()) { + // If we are loading from the memory location, we are creating an + // alias of the original pointer. + if (auto *LI = dyn_cast<LoadInst>(U)) { + enqueueUsers(*LI); + handleAlias(*LI); + continue; + } + // If we are overriding the memory location, the pointer certainly + // won't escape. + if (auto *S = dyn_cast<StoreInst>(U)) + if (S->getPointerOperand() == I) + continue; + if (auto *II = dyn_cast<IntrinsicInst>(U)) + if (II->isLifetimeStartOrEnd()) + continue; + // BitCastInst creats aliases of the memory location being stored + // into. + if (auto *BI = dyn_cast<BitCastInst>(U)) { + StoreAliases.push_back(BI); + continue; + } + return false; + } + } + + return true; + }; + + if (!IsSimpleStoreThenLoad()) + PI.setEscaped(&SI); + } - // If the use is an operand, the pointer escaped and anything can write into - // that memory. If the use is the pointer, we are definitely writing into the - // alloca and therefore we need to copy. - void visitStoreInst(StoreInst &SI) { PI.setAborted(&SI); } + // All mem intrinsics modify the data. + void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); } - // Any other instruction that is not filtered out by PtrUseVisitor, will - // result in the copy. - void visitInstruction(Instruction &I) { PI.setAborted(&I); } + void visitBitCastInst(BitCastInst &BC) { + Base::visitBitCastInst(BC); + handleAlias(BC); + } + + void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { + Base::visitAddrSpaceCastInst(ASC); + handleAlias(ASC); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + // The base visitor will adjust Offset accordingly. + Base::visitGetElementPtrInst(GEPI); + handleAlias(GEPI); + } + + void visitCallBase(CallBase &CB) { + for (unsigned Op = 0, OpCount = CB.getNumArgOperands(); Op < OpCount; ++Op) + if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op)) + PI.setEscaped(&CB); + handleMayWrite(CB); + } + + bool getShouldLiveOnFrame() const { + if (!ShouldLiveOnFrame) + ShouldLiveOnFrame = computeShouldLiveOnFrame(); + return ShouldLiveOnFrame.getValue(); + } + + bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; } + + DenseMap<Instruction *, llvm::Optional<APInt>> getAliasesCopy() const { + assert(getShouldLiveOnFrame() && "This method should only be called if the " + "alloca needs to live on the frame."); + for (const auto &P : AliasOffetMap) + if (!P.second) + report_fatal_error("Unable to handle an alias with unknown offset " + "created before CoroBegin."); + return AliasOffetMap; + } private: const DominatorTree &DT; const CoroBeginInst &CoroBegin; -}; -} // namespace -static bool mightWriteIntoAllocaPtr(AllocaInst &A, const DominatorTree &DT, - const CoroBeginInst &CB) { - const DataLayout &DL = A.getModule()->getDataLayout(); - AllocaUseVisitor Visitor(DL, DT, CB); - auto PtrI = Visitor.visitPtr(A); - if (PtrI.isEscaped() || PtrI.isAborted()) { - auto *PointerEscapingInstr = PtrI.getEscapingInst() - ? PtrI.getEscapingInst() - : PtrI.getAbortingInst(); - if (PointerEscapingInstr) { - LLVM_DEBUG( - dbgs() << "AllocaInst copy was triggered by instruction: " - << *PointerEscapingInstr << "\n"); + const SuspendCrossingInfo &Checker; + // All alias to the original AllocaInst, created before CoroBegin and used + // after CoroBegin. Each entry contains the instruction and the offset in the + // original Alloca. 
They need to be recreated after CoroBegin off the frame. + DenseMap<Instruction *, llvm::Optional<APInt>> AliasOffetMap{}; + SmallPtrSet<BasicBlock *, 2> UserBBs{}; + bool MayWriteBeforeCoroBegin{false}; + + mutable llvm::Optional<bool> ShouldLiveOnFrame{}; + + bool computeShouldLiveOnFrame() const { + if (PI.isEscaped()) + return true; + + for (auto *BB1 : UserBBs) + for (auto *BB2 : UserBBs) + if (Checker.hasPathCrossingSuspendPoint(BB1, BB2)) + return true; + + return false; + } + + void handleMayWrite(const Instruction &I) { + if (!DT.dominates(&CoroBegin, &I)) + MayWriteBeforeCoroBegin = true; + } + + bool usedAfterCoroBegin(Instruction &I) { + for (auto &U : I.uses()) + if (DT.dominates(&CoroBegin, U)) + return true; + return false; + } + + void handleAlias(Instruction &I) { + // We track all aliases created prior to CoroBegin but used after. + // These aliases may need to be recreated after CoroBegin if the alloca + // need to live on the frame. + if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I)) + return; + + if (!IsOffsetKnown) { + AliasOffetMap[&I].reset(); + } else { + auto Itr = AliasOffetMap.find(&I); + if (Itr == AliasOffetMap.end()) { + AliasOffetMap[&I] = Offset; + } else if (Itr->second.hasValue() && Itr->second.getValue() != Offset) { + // If we have seen two different possible values for this alias, we set + // it to empty. + AliasOffetMap[&I].reset(); + } } - return true; } - return false; -} +}; +} // namespace // We need to make room to insert a spill after initial PHIs, but before // catchswitch instruction. Placing it before violates the requirement that @@ -720,7 +1100,8 @@ static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { // whatever // // -static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { +static Instruction *insertSpills(const FrameDataInfo &FrameData, + coro::Shape &Shape) { auto *CB = Shape.CoroBegin; LLVMContext &C = CB->getContext(); IRBuilder<> Builder(CB->getNextNode()); @@ -729,32 +1110,13 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { auto *FramePtr = cast<Instruction>(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr")); DominatorTree DT(*CB->getFunction()); - - Value *CurrentValue = nullptr; - BasicBlock *CurrentBlock = nullptr; - Value *CurrentReload = nullptr; - - // Proper field number will be read from field definition. - unsigned Index = InvalidFieldIndex; - - // We need to keep track of any allocas that need "spilling" - // since they will live in the coroutine frame now, all access to them - // need to be changed, not just the access across suspend points - // we remember allocas and their indices to be handled once we processed - // all the spills. - SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas; - - // Promise alloca (if present) doesn't show in the spills and has a - // special field number. - if (auto *PromiseAlloca = Shape.getPromiseAlloca()) { - assert(Shape.ABI == coro::ABI::Switch); - Allocas.emplace_back(PromiseAlloca, Shape.getPromiseField()); - } + SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> DbgPtrAllocaCache; // Create a GEP with the given index into the coroutine frame for the original // value Orig. Appends an extra 0 index for array-allocas, preserving the // original type. 
- auto GetFramePointer = [&](uint32_t Index, Value *Orig) -> Value * { + auto GetFramePointer = [&](Value *Orig) -> Value * { + FieldIDType Index = FrameData.getFieldIndex(Orig); SmallVector<Value *, 3> Indices = { ConstantInt::get(Type::getInt32Ty(C), 0), ConstantInt::get(Type::getInt32Ty(C), Index), @@ -771,147 +1133,160 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { } } - return Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices); - }; - - // Create a load instruction to reload the spilled value from the coroutine - // frame. Populates the Value pointer reference provided with the frame GEP. - auto CreateReload = [&](Instruction *InsertBefore, Value *&G) { - assert(Index != InvalidFieldIndex && "accessing unassigned field number"); - Builder.SetInsertPoint(InsertBefore); - - G = GetFramePointer(Index, CurrentValue); - G->setName(CurrentValue->getName() + Twine(".reload.addr")); - - return isa<AllocaInst>(CurrentValue) - ? G - : Builder.CreateLoad(FrameTy->getElementType(Index), G, - CurrentValue->getName() + Twine(".reload")); + auto GEP = cast<GetElementPtrInst>( + Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices)); + if (isa<AllocaInst>(Orig)) { + // If the type of GEP is not equal to the type of AllocaInst, it implies + // that the AllocaInst may be reused in the Frame slot of other + // AllocaInst. So We cast GEP to the AllocaInst here to re-use + // the Frame storage. + // + // Note: If we change the strategy dealing with alignment, we need to refine + // this casting. + if (GEP->getResultElementType() != Orig->getType()) + return Builder.CreateBitCast(GEP, Orig->getType(), + Orig->getName() + Twine(".cast")); + } + return GEP; }; - Value *GEP = nullptr, *CurrentGEP = nullptr; - for (auto const &E : Spills) { - // If we have not seen the value, generate a spill. - if (CurrentValue != E.def()) { - CurrentValue = E.def(); - CurrentBlock = nullptr; - CurrentReload = nullptr; - - Index = E.fieldIndex(); - - if (auto *AI = dyn_cast<AllocaInst>(CurrentValue)) { - // Spilled AllocaInst will be replaced with GEP from the coroutine frame - // there is no spill required. - Allocas.emplace_back(AI, Index); - if (!AI->isStaticAlloca()) - report_fatal_error("Coroutines cannot handle non static allocas yet"); + for (auto const &E : FrameData.Spills) { + Value *Def = E.first; + // Create a store instruction storing the value into the + // coroutine frame. + Instruction *InsertPt = nullptr; + bool NeedToCopyArgPtrValue = false; + if (auto *Arg = dyn_cast<Argument>(Def)) { + // For arguments, we will place the store instruction right after + // the coroutine frame pointer instruction, i.e. bitcast of + // coro.begin from i8* to %f.frame*. + InsertPt = FramePtr->getNextNode(); + + // If we're spilling an Argument, make sure we clear 'nocapture' + // from the coroutine function. + Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture); + + if (Arg->hasByValAttr()) + NeedToCopyArgPtrValue = true; + + } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) { + // Don't spill immediately after a suspend; splitting assumes + // that the suspend will be followed by a branch. + InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI(); + } else { + auto *I = cast<Instruction>(Def); + if (!DT.dominates(CB, I)) { + // If it is not dominated by CoroBegin, then spill should be + // inserted immediately after CoroFrame is computed. 
+ InsertPt = FramePtr->getNextNode(); + } else if (auto *II = dyn_cast<InvokeInst>(I)) { + // If we are spilling the result of the invoke instruction, split + // the normal edge and insert the spill in the new block. + auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest()); + InsertPt = NewBB->getTerminator(); + } else if (isa<PHINode>(I)) { + // Skip the PHINodes and EH pads instructions. + BasicBlock *DefBlock = I->getParent(); + if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) + InsertPt = splitBeforeCatchSwitch(CSI); + else + InsertPt = &*DefBlock->getFirstInsertionPt(); } else { - // Otherwise, create a store instruction storing the value into the - // coroutine frame. - - Instruction *InsertPt = nullptr; - if (auto Arg = dyn_cast<Argument>(CurrentValue)) { - // For arguments, we will place the store instruction right after - // the coroutine frame pointer instruction, i.e. bitcast of - // coro.begin from i8* to %f.frame*. - InsertPt = FramePtr->getNextNode(); - - // If we're spilling an Argument, make sure we clear 'nocapture' - // from the coroutine function. - Arg->getParent()->removeParamAttr(Arg->getArgNo(), - Attribute::NoCapture); - - } else if (auto *II = dyn_cast<InvokeInst>(CurrentValue)) { - // If we are spilling the result of the invoke instruction, split the - // normal edge and insert the spill in the new block. - auto NewBB = SplitEdge(II->getParent(), II->getNormalDest()); - InsertPt = NewBB->getTerminator(); - } else if (isa<PHINode>(CurrentValue)) { - // Skip the PHINodes and EH pads instructions. - BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent(); - if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) - InsertPt = splitBeforeCatchSwitch(CSI); - else - InsertPt = &*DefBlock->getFirstInsertionPt(); - } else if (auto CSI = dyn_cast<AnyCoroSuspendInst>(CurrentValue)) { - // Don't spill immediately after a suspend; splitting assumes - // that the suspend will be followed by a branch. - InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI(); - } else { - auto *I = cast<Instruction>(E.def()); - assert(!I->isTerminator() && "unexpected terminator"); - // For all other values, the spill is placed immediately after - // the definition. - if (DT.dominates(CB, I)) { - InsertPt = I->getNextNode(); - } else { - // Unless, it is not dominated by CoroBegin, then it will be - // inserted immediately after CoroFrame is computed. - InsertPt = FramePtr->getNextNode(); - } - } - - Builder.SetInsertPoint(InsertPt); - auto *G = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, Index, - CurrentValue->getName() + Twine(".spill.addr")); - Builder.CreateStore(CurrentValue, G); + assert(!I->isTerminator() && "unexpected terminator"); + // For all other values, the spill is placed immediately after + // the definition. + InsertPt = I->getNextNode(); } } - // If we have not seen the use block, generate a reload in it. - if (CurrentBlock != E.userBlock()) { - CurrentBlock = E.userBlock(); - CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt(), GEP); + auto Index = FrameData.getFieldIndex(Def); + Builder.SetInsertPoint(InsertPt); + auto *G = Builder.CreateConstInBoundsGEP2_32( + FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr")); + if (NeedToCopyArgPtrValue) { + // For byval arguments, we need to store the pointed value in the frame, + // instead of the pointer itself. 
+ auto *Value = + Builder.CreateLoad(Def->getType()->getPointerElementType(), Def); + Builder.CreateStore(Value, G); + } else { + Builder.CreateStore(Def, G); } - // If we have a single edge PHINode, remove it and replace it with a reload - // from the coroutine frame. (We already took care of multi edge PHINodes - // by rewriting them in the rewritePHIs function). - if (auto *PN = dyn_cast<PHINode>(E.user())) { - assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " - "values in the PHINode"); - PN->replaceAllUsesWith(CurrentReload); - PN->eraseFromParent(); - continue; - } + BasicBlock *CurrentBlock = nullptr; + Value *CurrentReload = nullptr; + for (auto *U : E.second) { + // If we have not seen the use block, create a load instruction to reload + // the spilled value from the coroutine frame. Populates the Value pointer + // reference provided with the frame GEP. + if (CurrentBlock != U->getParent()) { + CurrentBlock = U->getParent(); + Builder.SetInsertPoint(&*CurrentBlock->getFirstInsertionPt()); + + auto *GEP = GetFramePointer(E.first); + GEP->setName(E.first->getName() + Twine(".reload.addr")); + if (NeedToCopyArgPtrValue) + CurrentReload = GEP; + else + CurrentReload = Builder.CreateLoad( + FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP, + E.first->getName() + Twine(".reload")); + + TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(Def); + for (DbgDeclareInst *DDI : DIs) { + bool AllowUnresolved = false; + // This dbg.declare is preserved for all coro-split function + // fragments. It will be unreachable in the main function, and + // processed by coro::salvageDebugInfo() by CoroCloner. + DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved) + .insertDeclare(CurrentReload, DDI->getVariable(), + DDI->getExpression(), DDI->getDebugLoc(), + &*Builder.GetInsertPoint()); + // This dbg.declare is for the main function entry point. It + // will be deleted in all coro-split functions. + coro::salvageDebugInfo(DbgPtrAllocaCache, DDI); + } + } - // If we have not seen this GEP instruction, migrate any dbg.declare from - // the alloca to it. - if (CurrentGEP != GEP) { - CurrentGEP = GEP; - TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(CurrentValue); - if (!DIs.empty()) - DIBuilder(*CurrentBlock->getParent()->getParent(), - /*AllowUnresolved*/ false) - .insertDeclare(CurrentGEP, DIs.front()->getVariable(), - DIs.front()->getExpression(), - DIs.front()->getDebugLoc(), DIs.front()); - } + // If we have a single edge PHINode, remove it and replace it with a + // reload from the coroutine frame. (We already took care of multi edge + // PHINodes by rewriting them in the rewritePHIs function). + if (auto *PN = dyn_cast<PHINode>(U)) { + assert(PN->getNumIncomingValues() == 1 && + "unexpected number of incoming " + "values in the PHINode"); + PN->replaceAllUsesWith(CurrentReload); + PN->eraseFromParent(); + continue; + } - // Replace all uses of CurrentValue in the current instruction with reload. - E.user()->replaceUsesOfWith(CurrentValue, CurrentReload); + // Replace all uses of CurrentValue in the current instruction with + // reload. 
+ U->replaceUsesOfWith(Def, CurrentReload); + } } BasicBlock *FramePtrBB = FramePtr->getParent(); auto SpillBlock = - FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); + FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); Shape.AllocaSpillBlock = SpillBlock; // retcon and retcon.once lowering assumes all uses have been sunk. - if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce) { + if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce || + Shape.ABI == coro::ABI::Async) { // If we found any allocas, replace all of their remaining uses with Geps. Builder.SetInsertPoint(&SpillBlock->front()); - for (auto &P : Allocas) { - auto *G = GetFramePointer(P.second, P.first); + for (const auto &P : FrameData.Allocas) { + AllocaInst *Alloca = P.Alloca; + auto *G = GetFramePointer(Alloca); // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) // here, as we are changing location of the instruction. - G->takeName(P.first); - P.first->replaceAllUsesWith(G); - P.first->eraseFromParent(); + G->takeName(Alloca); + Alloca->replaceAllUsesWith(G); + Alloca->eraseFromParent(); } return FramePtr; } @@ -921,49 +1296,63 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { // we also delete the original dbg.declare and replace other uses with undef. // Note: We cannot replace the alloca with GEP instructions indiscriminately, // as some of the uses may not be dominated by CoroBegin. - bool MightNeedToCopy = false; Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); SmallVector<Instruction *, 4> UsersToUpdate; - for (auto &P : Allocas) { - AllocaInst *const A = P.first; - - for (auto *DI : FindDbgDeclareUses(A)) - DI->eraseFromParent(); - replaceDbgUsesWithUndef(A); - + for (const auto &A : FrameData.Allocas) { + AllocaInst *Alloca = A.Alloca; UsersToUpdate.clear(); - for (User *U : A->users()) { + for (User *U : Alloca->users()) { auto *I = cast<Instruction>(U); if (DT.dominates(CB, I)) UsersToUpdate.push_back(I); - else - MightNeedToCopy = true; - } - if (!UsersToUpdate.empty()) { - auto *G = GetFramePointer(P.second, A); - G->takeName(A); - for (Instruction *I : UsersToUpdate) - I->replaceUsesOfWith(A, G); } - } - // If we discovered such uses not dominated by CoroBegin, see if any of them - // preceed coro begin and have instructions that can modify the - // value of the alloca and therefore would require a copying the value into - // the spill slot in the coroutine frame. 
- if (MightNeedToCopy) { - Builder.SetInsertPoint(FramePtr->getNextNode()); - - for (auto &P : Allocas) { - AllocaInst *const A = P.first; - if (mightWriteIntoAllocaPtr(*A, DT, *CB)) { - if (A->isArrayAllocation()) - report_fatal_error( - "Coroutines cannot handle copying of array allocas yet"); + if (UsersToUpdate.empty()) + continue; + auto *G = GetFramePointer(Alloca); + G->setName(Alloca->getName() + Twine(".reload.addr")); + + SmallPtrSet<BasicBlock *, 4> SeenDbgBBs; + TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(Alloca); + if (!DIs.empty()) + DIBuilder(*Alloca->getModule(), + /*AllowUnresolved*/ false) + .insertDeclare(G, DIs.front()->getVariable(), + DIs.front()->getExpression(), + DIs.front()->getDebugLoc(), DIs.front()); + for (auto *DI : FindDbgDeclareUses(Alloca)) + DI->eraseFromParent(); + replaceDbgUsesWithUndef(Alloca); - auto *G = GetFramePointer(P.second, A); - auto *Value = Builder.CreateLoad(A->getAllocatedType(), A); - Builder.CreateStore(Value, G); - } + for (Instruction *I : UsersToUpdate) + I->replaceUsesOfWith(Alloca, G); + } + Builder.SetInsertPoint(FramePtr->getNextNode()); + for (const auto &A : FrameData.Allocas) { + AllocaInst *Alloca = A.Alloca; + if (A.MayWriteBeforeCoroBegin) { + // isEscaped really means potentially modified before CoroBegin. + if (Alloca->isArrayAllocation()) + report_fatal_error( + "Coroutines cannot handle copying of array allocas yet"); + + auto *G = GetFramePointer(Alloca); + auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca); + Builder.CreateStore(Value, G); + } + // For each alias to Alloca created before CoroBegin but used after + // CoroBegin, we recreate them after CoroBegin by appplying the offset + // to the pointer in the frame. + for (const auto &Alias : A.Aliases) { + auto *FramePtr = GetFramePointer(Alloca); + auto *FramePtrRaw = + Builder.CreateBitCast(FramePtr, Type::getInt8PtrTy(C)); + auto *AliasPtr = Builder.CreateGEP( + FramePtrRaw, + ConstantInt::get(Type::getInt64Ty(C), Alias.second.getValue())); + auto *AliasPtrTyped = + Builder.CreateBitCast(AliasPtr, Alias.first->getType()); + Alias.first->replaceUsesWithIf( + AliasPtrTyped, [&](Use &U) { return DT.dominates(CB, U); }); } } return FramePtr; @@ -984,15 +1373,14 @@ static void setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { // Replaces all uses of OldPred with the NewPred block in all PHINodes in a // block. static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred, - BasicBlock *NewPred, - PHINode *LandingPadReplacement) { + BasicBlock *NewPred, PHINode *Until = nullptr) { unsigned BBIdx = 0; for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); // We manually update the LandingPadReplacement PHINode and it is the last // PHI Node. So, if we find it, we are done. - if (LandingPadReplacement == PN) + if (Until == PN) break; // Reuse the previous value of BBIdx if it lines up. In cases where we @@ -1041,6 +1429,101 @@ static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ, return NewBB; } +// Moves the values in the PHIs in SuccBB that correspong to PredBB into a new +// PHI in InsertedBB. 
+static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB, + BasicBlock *InsertedBB, + BasicBlock *PredBB, + PHINode *UntilPHI = nullptr) { + auto *PN = cast<PHINode>(&SuccBB->front()); + do { + int Index = PN->getBasicBlockIndex(InsertedBB); + Value *V = PN->getIncomingValue(Index); + PHINode *InputV = PHINode::Create( + V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName(), + &InsertedBB->front()); + InputV->addIncoming(V, PredBB); + PN->setIncomingValue(Index, InputV); + PN = dyn_cast<PHINode>(PN->getNextNode()); + } while (PN != UntilPHI); +} + +// Rewrites the PHI Nodes in a cleanuppad. +static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB, + CleanupPadInst *CleanupPad) { + // For every incoming edge to a CleanupPad we will create a new block holding + // all incoming values in single-value PHI nodes. We will then create another + // block to act as a dispather (as all unwind edges for related EH blocks + // must be the same). + // + // cleanuppad: + // %2 = phi i32[%0, %catchswitch], [%1, %catch.1] + // %3 = cleanuppad within none [] + // + // It will create: + // + // cleanuppad.corodispatch + // %2 = phi i8[0, %catchswitch], [1, %catch.1] + // %3 = cleanuppad within none [] + // switch i8 % 2, label %unreachable + // [i8 0, label %cleanuppad.from.catchswitch + // i8 1, label %cleanuppad.from.catch.1] + // cleanuppad.from.catchswitch: + // %4 = phi i32 [%0, %catchswitch] + // br %label cleanuppad + // cleanuppad.from.catch.1: + // %6 = phi i32 [%1, %catch.1] + // br %label cleanuppad + // cleanuppad: + // %8 = phi i32 [%4, %cleanuppad.from.catchswitch], + // [%6, %cleanuppad.from.catch.1] + + // Unreachable BB, in case switching on an invalid value in the dispatcher. + auto *UnreachBB = BasicBlock::Create( + CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent()); + IRBuilder<> Builder(UnreachBB); + Builder.CreateUnreachable(); + + // Create a new cleanuppad which will be the dispatcher. + auto *NewCleanupPadBB = + BasicBlock::Create(CleanupPadBB->getContext(), + CleanupPadBB->getName() + Twine(".corodispatch"), + CleanupPadBB->getParent(), CleanupPadBB); + Builder.SetInsertPoint(NewCleanupPadBB); + auto *SwitchType = Builder.getInt8Ty(); + auto *SetDispatchValuePN = + Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB)); + CleanupPad->removeFromParent(); + CleanupPad->insertAfter(SetDispatchValuePN); + auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB, + pred_size(CleanupPadBB)); + + int SwitchIndex = 0; + SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB)); + for (BasicBlock *Pred : Preds) { + // Create a new cleanuppad and move the PHI values to there. + auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(), + CleanupPadBB->getName() + + Twine(".from.") + Pred->getName(), + CleanupPadBB->getParent(), CleanupPadBB); + updatePhiNodes(CleanupPadBB, Pred, CaseBB); + CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") + + Pred->getName()); + Builder.SetInsertPoint(CaseBB); + Builder.CreateBr(CleanupPadBB); + movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB); + + // Update this Pred to the new unwind point. + setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB); + + // Setup the switch in the dispatcher. 
+ auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex); + SetDispatchValuePN->addIncoming(SwitchConstant, Pred); + SwitchOnDispatch->addCase(SwitchConstant, CaseBB); + SwitchIndex++; + } +} + static void rewritePHIs(BasicBlock &BB) { // For every incoming edge we will create a block holding all // incoming values in a single PHI nodes. @@ -1063,6 +1546,24 @@ static void rewritePHIs(BasicBlock &BB) { // TODO: Simplify PHINodes in the basic block to remove duplicate // predecessors. + // Special case for CleanupPad: all EH blocks must have the same unwind edge + // so we need to create an additional "dispatcher" block. + if (auto *CleanupPad = + dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) { + SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); + for (BasicBlock *Pred : Preds) { + if (CatchSwitchInst *CS = + dyn_cast<CatchSwitchInst>(Pred->getTerminator())) { + // CleanupPad with a CatchSwitch predecessor: therefore this is an + // unwind destination that needs to be handle specially. + assert(CS->getUnwindDest() == &BB); + (void)CS; + rewritePHIsForCleanupPad(&BB, CleanupPad); + return; + } + } + } + LandingPadInst *LandingPad = nullptr; PHINode *ReplPHI = nullptr; if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) { @@ -1076,22 +1577,14 @@ static void rewritePHIs(BasicBlock &BB) { // ehAwareSplitEdge cloned it in the transition blocks. } - SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB)); + SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); for (BasicBlock *Pred : Preds) { auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI); IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName()); - auto *PN = cast<PHINode>(&BB.front()); - do { - int Index = PN->getBasicBlockIndex(IncomingBB); - Value *V = PN->getIncomingValue(Index); - PHINode *InputV = PHINode::Create( - V->getType(), 1, V->getName() + Twine(".") + BB.getName(), - &IncomingBB->front()); - InputV->addIncoming(V, Pred); - PN->setIncomingValue(Index, InputV); - PN = dyn_cast<PHINode>(PN->getNextNode()); - } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced - // the landing pad. + + // Stop the moving of values at ReplPHI, as this is either null or the PHI + // that replaced the landing pad. + movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI); } if (LandingPad) { @@ -1130,39 +1623,32 @@ static bool isCoroutineStructureIntrinsic(Instruction &I) { // For every use of the value that is across suspend point, recreate that value // after a suspend point. static void rewriteMaterializableInstructions(IRBuilder<> &IRB, - SpillInfo const &Spills) { - BasicBlock *CurrentBlock = nullptr; - Instruction *CurrentMaterialization = nullptr; - Instruction *CurrentDef = nullptr; - - for (auto const &E : Spills) { - // If it is a new definition, update CurrentXXX variables. - if (CurrentDef != E.def()) { - CurrentDef = cast<Instruction>(E.def()); - CurrentBlock = nullptr; - CurrentMaterialization = nullptr; - } - - // If we have not seen this block, materialize the value. 
- if (CurrentBlock != E.userBlock()) { - CurrentBlock = E.userBlock(); - CurrentMaterialization = cast<Instruction>(CurrentDef)->clone(); - CurrentMaterialization->setName(CurrentDef->getName()); - CurrentMaterialization->insertBefore( - &*CurrentBlock->getFirstInsertionPt()); - } - - if (auto *PN = dyn_cast<PHINode>(E.user())) { - assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " - "values in the PHINode"); - PN->replaceAllUsesWith(CurrentMaterialization); - PN->eraseFromParent(); - continue; + const SpillInfo &Spills) { + for (const auto &E : Spills) { + Value *Def = E.first; + BasicBlock *CurrentBlock = nullptr; + Instruction *CurrentMaterialization = nullptr; + for (Instruction *U : E.second) { + // If we have not seen this block, materialize the value. + if (CurrentBlock != U->getParent()) { + CurrentBlock = U->getParent(); + CurrentMaterialization = cast<Instruction>(Def)->clone(); + CurrentMaterialization->setName(Def->getName()); + CurrentMaterialization->insertBefore( + &*CurrentBlock->getFirstInsertionPt()); + } + if (auto *PN = dyn_cast<PHINode>(U)) { + assert(PN->getNumIncomingValues() == 1 && + "unexpected number of incoming " + "values in the PHINode"); + PN->replaceAllUsesWith(CurrentMaterialization); + PN->eraseFromParent(); + continue; + } + // Replace all uses of Def in the current instruction with the + // CurrentMaterialization for the block. + U->replaceUsesOfWith(Def, CurrentMaterialization); } - - // Replace all uses of CurrentDef in the current instruction with the - // CurrentMaterialization for the block. - E.user()->replaceUsesOfWith(CurrentDef, CurrentMaterialization); } } @@ -1501,7 +1987,8 @@ static void eliminateSwiftError(Function &F, coro::Shape &Shape) { /// retcon and retcon.once conventions assume that all spill uses can be sunk /// after the coro.begin intrinsic. -static void sinkSpillUsesAfterCoroBegin(Function &F, const SpillInfo &Spills, +static void sinkSpillUsesAfterCoroBegin(Function &F, + const FrameDataInfo &FrameData, CoroBeginInst *CoroBegin) { DominatorTree Dom(F); @@ -1509,9 +1996,8 @@ static void sinkSpillUsesAfterCoroBegin(Function &F, const SpillInfo &Spills, SmallVector<Instruction *, 32> Worklist; // Collect all users that precede coro.begin. - for (auto const &Entry : Spills) { - auto *SpillDef = Entry.def(); - for (User *U : SpillDef->users()) { + for (auto *Def : FrameData.getAllDefs()) { + for (User *U : Def->users()) { auto Inst = cast<Instruction>(U); if (Inst->getParent() != CoroBegin->getParent() || Dom.dominates(CoroBegin, Inst)) @@ -1522,8 +2008,7 @@ static void sinkSpillUsesAfterCoroBegin(Function &F, const SpillInfo &Spills, } // Recursively collect users before coro.begin. while (!Worklist.empty()) { - auto *Def = Worklist.back(); - Worklist.pop_back(); + auto *Def = Worklist.pop_back_val(); for (User *U : Def->users()) { auto Inst = cast<Instruction>(U); if (Dom.dominates(CoroBegin, Inst)) @@ -1535,17 +2020,14 @@ static void sinkSpillUsesAfterCoroBegin(Function &F, const SpillInfo &Spills, // Sort by dominance. SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end()); - std::sort(InsertionList.begin(), InsertionList.end(), - [&Dom](Instruction *A, Instruction *B) -> bool { - // If a dominates b it should preceed (<) b. - return Dom.dominates(A, B); - }); + llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool { + // If a dominates b it should preceed (<) b. 
+ return Dom.dominates(A, B); + }); Instruction *InsertPt = CoroBegin->getNextNode(); for (Instruction *Inst : InsertionList) Inst->moveBefore(InsertPt); - - return; } /// For each local variable that all of its user are only used inside one of @@ -1567,56 +2049,186 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, } for (Instruction &I : instructions(F)) { - if (!isa<AllocaInst>(&I)) + AllocaInst* AI = dyn_cast<AllocaInst>(&I); + if (!AI) continue; for (BasicBlock *DomBB : DomSet) { bool Valid = true; - SmallVector<Instruction *, 1> BCInsts; + SmallVector<Instruction *, 1> Lifetimes; - auto isUsedByLifetimeStart = [&](Instruction *I) { - if (isa<BitCastInst>(I) && I->hasOneUse()) - if (auto *IT = dyn_cast<IntrinsicInst>(I->user_back())) - return IT->getIntrinsicID() == Intrinsic::lifetime_start; + auto isLifetimeStart = [](Instruction* I) { + if (auto* II = dyn_cast<IntrinsicInst>(I)) + return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; }; - for (User *U : I.users()) { + auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) { + if (isLifetimeStart(U)) { + Lifetimes.push_back(U); + return true; + } + if (!U->hasOneUse() || U->stripPointerCasts() != AI) + return false; + if (isLifetimeStart(U->user_back())) { + Lifetimes.push_back(U->user_back()); + return true; + } + return false; + }; + + for (User *U : AI->users()) { Instruction *UI = cast<Instruction>(U); // For all users except lifetime.start markers, if they are all // dominated by one of the basic blocks and do not cross // suspend points as well, then there is no need to spill the // instruction. if (!DT.dominates(DomBB, UI->getParent()) || - Checker.isDefinitionAcrossSuspend(DomBB, U)) { - // Skip bitcast used by lifetime.start markers. - if (isUsedByLifetimeStart(UI)) { - BCInsts.push_back(UI); + Checker.isDefinitionAcrossSuspend(DomBB, UI)) { + // Skip lifetime.start, GEP and bitcast used by lifetime.start + // markers. + if (collectLifetimeStart(UI, AI)) continue; - } Valid = false; break; } } // Sink lifetime.start markers to dominate block when they are // only used outside the region. - if (Valid && BCInsts.size() != 0) { - auto *NewBitcast = BCInsts[0]->clone(); - auto *NewLifetime = cast<Instruction>(BCInsts[0]->user_back())->clone(); - NewLifetime->replaceUsesOfWith(BCInsts[0], NewBitcast); - NewBitcast->insertBefore(DomBB->getTerminator()); + if (Valid && Lifetimes.size() != 0) { + // May be AI itself, when the type of AI is i8* + auto *NewBitCast = [&](AllocaInst *AI) -> Value* { + if (isa<AllocaInst>(Lifetimes[0]->getOperand(1))) + return AI; + auto *Int8PtrTy = Type::getInt8PtrTy(F.getContext()); + return CastInst::Create(Instruction::BitCast, AI, Int8PtrTy, "", + DomBB->getTerminator()); + }(AI); + + auto *NewLifetime = Lifetimes[0]->clone(); + NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), NewBitCast); NewLifetime->insertBefore(DomBB->getTerminator()); // All the outsided lifetime.start markers are no longer necessary. - for (Instruction *S : BCInsts) { - S->user_back()->eraseFromParent(); - } + for (Instruction *S : Lifetimes) + S->eraseFromParent(); + break; } } } } +static void collectFrameAllocas(Function &F, coro::Shape &Shape, + const SuspendCrossingInfo &Checker, + SmallVectorImpl<AllocaInfo> &Allocas) { + // Collect lifetime.start info for each alloca. 
+ using LifetimeStart = SmallPtrSet<Instruction *, 2>; + llvm::DenseMap<AllocaInst *, std::unique_ptr<LifetimeStart>> LifetimeMap; + for (Instruction &I : instructions(F)) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II || II->getIntrinsicID() != Intrinsic::lifetime_start) + continue; + + if (auto *OpInst = dyn_cast<Instruction>(II->getOperand(1))) { + if (auto *AI = dyn_cast<AllocaInst>(OpInst->stripPointerCasts())) { + + if (LifetimeMap.find(AI) == LifetimeMap.end()) + LifetimeMap[AI] = std::make_unique<LifetimeStart>(); + LifetimeMap[AI]->insert(isa<AllocaInst>(OpInst) ? II : OpInst); + } + } + } + + for (Instruction &I : instructions(F)) { + auto *AI = dyn_cast<AllocaInst>(&I); + if (!AI) + continue; + // The PromiseAlloca will be specially handled since it needs to be in a + // fixed position in the frame. + if (AI == Shape.SwitchLowering.PromiseAlloca) { + continue; + } + bool ShouldLiveOnFrame = false; + auto Iter = LifetimeMap.find(AI); + if (Iter != LifetimeMap.end()) { + // Check against lifetime.start if the instruction has the info. + for (User *U : I.users()) { + for (auto *S : *Iter->second) + if ((ShouldLiveOnFrame = Checker.isDefinitionAcrossSuspend(*S, U))) + break; + if (ShouldLiveOnFrame) + break; + } + if (!ShouldLiveOnFrame) + continue; + } + // At this point, either ShouldLiveOnFrame is true or we didn't have + // lifetime information. We will need to rely on more precise pointer + // tracking. + DominatorTree DT(F); + AllocaUseVisitor Visitor{F.getParent()->getDataLayout(), DT, + *Shape.CoroBegin, Checker}; + Visitor.visitPtr(*AI); + if (!Visitor.getShouldLiveOnFrame()) + continue; + Allocas.emplace_back(AI, Visitor.getAliasesCopy(), + Visitor.getMayWriteBeforeCoroBegin()); + } +} + +void coro::salvageDebugInfo( + SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache, + DbgDeclareInst *DDI, bool LoadFromFramePtr) { + Function *F = DDI->getFunction(); + IRBuilder<> Builder(F->getContext()); + auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); + while (isa<IntrinsicInst>(InsertPt)) + ++InsertPt; + Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt); + DIExpression *Expr = DDI->getExpression(); + // Follow the pointer arithmetic all the way to the incoming + // function argument and convert into a DIExpression. + Value *Storage = DDI->getAddress(); + while (Storage) { + if (auto *LdInst = dyn_cast<LoadInst>(Storage)) { + Storage = LdInst->getOperand(0); + } else if (auto *StInst = dyn_cast<StoreInst>(Storage)) { + Storage = StInst->getOperand(0); + } else if (auto *GEPInst = dyn_cast<GetElementPtrInst>(Storage)) { + Expr = llvm::salvageDebugInfoImpl(*GEPInst, Expr, + /*WithStackValue=*/false); + Storage = GEPInst->getOperand(0); + } else if (auto *BCInst = dyn_cast<llvm::BitCastInst>(Storage)) + Storage = BCInst->getOperand(0); + else + break; + } + // Store a pointer to the coroutine frame object in an alloca so it + // is available throughout the function when producing unoptimized + // code. Extending the lifetime this way is correct because the + // variable has been declared by a dbg.declare intrinsic. + if (auto Arg = dyn_cast_or_null<llvm::Argument>(Storage)) { + auto &Cached = DbgPtrAllocaCache[Storage]; + if (!Cached) { + Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr, + Arg->getName() + ".debug"); + Builder.CreateStore(Storage, Cached); + } + Storage = Cached; + } + // The FramePtr object adds one extra layer of indirection that + // needs to be unwrapped. 
+ if (LoadFromFramePtr) + Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore); + auto &VMContext = DDI->getFunction()->getContext(); + DDI->setOperand( + 0, MetadataAsValue::get(VMContext, ValueAsMetadata::get(Storage))); + DDI->setOperand(2, MetadataAsValue::get(VMContext, Expr)); + if (auto *InsertPt = dyn_cast_or_null<Instruction>(Storage)) + DDI->moveAfter(InsertPt); +} + void coro::buildCoroutineFrame(Function &F, Shape &Shape) { eliminateSwiftError(F, Shape); @@ -1635,9 +2247,26 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } // Put CoroEnds into their own blocks. - for (CoroEndInst *CE : Shape.CoroEnds) + for (AnyCoroEndInst *CE : Shape.CoroEnds) { splitAround(CE, "CoroEnd"); + // Emit the musttail call function in a new block before the CoroEnd. + // We do this here so that the right suspend crossing info is computed for + // the uses of the musttail call function call. (Arguments to the coro.end + // instructions would be ignored) + if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) { + auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction(); + if (!MustTailCallFn) + continue; + IRBuilder<> Builder(AsyncEnd); + SmallVector<Value *, 8> Args(AsyncEnd->args()); + auto Arguments = ArrayRef<Value *>(Args).drop_front(3); + auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn, + Arguments, Builder); + splitAround(Call, "MustTailCall.Before.CoroEnd"); + } + } + // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will // never has its definition separated from the PHI by the suspend point. rewritePHIs(F); @@ -1646,51 +2275,40 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { SuspendCrossingInfo Checker(F, Shape); IRBuilder<> Builder(F.getContext()); - SpillInfo Spills; + FrameDataInfo FrameData; SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas; SmallVector<Instruction*, 4> DeadInstructions; - for (int Repeat = 0; Repeat < 4; ++Repeat) { - // See if there are materializable instructions across suspend points. - for (Instruction &I : instructions(F)) - if (materializable(I)) - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) - Spills.emplace_back(&I, U); - - if (Spills.empty()) - break; + { + SpillInfo Spills; + for (int Repeat = 0; Repeat < 4; ++Repeat) { + // See if there are materializable instructions across suspend points. + for (Instruction &I : instructions(F)) + if (materializable(I)) + for (User *U : I.users()) + if (Checker.isDefinitionAcrossSuspend(I, U)) + Spills[&I].push_back(cast<Instruction>(U)); + + if (Spills.empty()) + break; - // Rewrite materializable instructions to be materialized at the use point. - LLVM_DEBUG(dump("Materializations", Spills)); - rewriteMaterializableInstructions(Builder, Spills); - Spills.clear(); + // Rewrite materializable instructions to be materialized at the use + // point. + LLVM_DEBUG(dumpSpills("Materializations", Spills)); + rewriteMaterializableInstructions(Builder, Spills); + Spills.clear(); + } } sinkLifetimeStartMarkers(F, Shape, Checker); - // Collect lifetime.start info for each alloca. 
-  using LifetimeStart = SmallPtrSet<Instruction *, 2>;
-  llvm::DenseMap<Instruction *, std::unique_ptr<LifetimeStart>> LifetimeMap;
-  for (Instruction &I : instructions(F)) {
-    auto *II = dyn_cast<IntrinsicInst>(&I);
-    if (!II || II->getIntrinsicID() != Intrinsic::lifetime_start)
-      continue;
-
-    if (auto *OpInst = dyn_cast<BitCastInst>(I.getOperand(1)))
-      if (auto *AI = dyn_cast<AllocaInst>(OpInst->getOperand(0))) {
-
-        if (LifetimeMap.find(AI) == LifetimeMap.end())
-          LifetimeMap[AI] = std::make_unique<LifetimeStart>();
-
-        LifetimeMap[AI]->insert(OpInst);
-      }
-  }
+  collectFrameAllocas(F, Shape, Checker, FrameData.Allocas);
+  LLVM_DEBUG(dumpAllocas(FrameData.Allocas));
 
   // Collect the spills for arguments and other not-materializable values.
   for (Argument &A : F.args())
     for (User *U : A.users())
       if (Checker.isDefinitionAcrossSuspend(A, U))
-        Spills.emplace_back(&A, U);
+        FrameData.Spills[&A].push_back(cast<Instruction>(U));
 
   for (Instruction &I : instructions(F)) {
     // Values returned from coroutine structure intrinsics should not be part
@@ -1721,43 +2339,33 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
 
       for (User *U : Alloc->users()) {
        if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
-          Spills.emplace_back(Alloc, U);
+          FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
      }
      continue;
    }
 
     // Ignore alloca.get; we process this as part of coro.alloca.alloc.
-    if (isa<CoroAllocaGetInst>(I)) {
+    if (isa<CoroAllocaGetInst>(I))
       continue;
-    }
 
-    auto Iter = LifetimeMap.find(&I);
-    for (User *U : I.users()) {
-      bool NeedSpill = false;
-
-      // Check against lifetime.start if the instruction has the info.
-      if (Iter != LifetimeMap.end())
-        for (auto *S : *Iter->second) {
-          if ((NeedSpill = Checker.isDefinitionAcrossSuspend(*S, U)))
-            break;
-        }
-      else
-        NeedSpill = Checker.isDefinitionAcrossSuspend(I, U);
+    if (isa<AllocaInst>(I))
+      continue;
 
-      if (NeedSpill) {
+    for (User *U : I.users())
+      if (Checker.isDefinitionAcrossSuspend(I, U)) {
        // We cannot spill a token.
        if (I.getType()->isTokenTy())
          report_fatal_error(
              "token definition is separated from the use by a suspend point");
-        Spills.emplace_back(&I, U);
+        FrameData.Spills[&I].push_back(cast<Instruction>(U));
      }
-    }
  }
 
-  LLVM_DEBUG(dump("Spills", Spills));
-  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce)
-    sinkSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
-  Shape.FrameTy = buildFrameType(F, Shape, Spills);
-  Shape.FramePtr = insertSpills(Spills, Shape);
+  LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
+  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
+      Shape.ABI == coro::ABI::Async)
+    sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
+  Shape.FrameTy = buildFrameType(F, Shape, FrameData);
+  Shape.FramePtr = insertSpills(FrameData, Shape);
   lowerLocalAllocas(LocalAllocas, DeadInstructions);
 
   for (auto I : DeadInstructions)