1 files changed, 334 insertions, 165 deletions
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6bbe5979316d..d456e3d9b94d 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief Insert wait instructions for memory reads and writes.
+/// Insert wait instructions for memory reads and writes.
 ///
 /// Memory reads and writes are issued asynchronously, so we need to insert
 /// S_WAITCNT instructions when we want to access any of their results or
@@ -40,6 +40,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -50,9 +51,21 @@
 #include <utility>
 #include <vector>
 
+using namespace llvm;
+
 #define DEBUG_TYPE "si-insert-waitcnts"
 
-using namespace llvm;
+DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
+              "Force emit s_waitcnt expcnt(0) instrs");
+DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
+              "Force emit s_waitcnt lgkmcnt(0) instrs");
+DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
+              "Force emit s_waitcnt vmcnt(0) instrs");
+
+static cl::opt<unsigned> ForceEmitZeroFlag(
+    "amdgpu-waitcnt-forcezero", cl::init(0), cl::Hidden,
+    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) "
+             "expcnt(0) lgkmcnt(0)"));
 
 namespace {
 
@@ -115,15 +128,15 @@ enum RegisterMapping {
        (w) = (enum WaitEventType)((w) + 1))
 
 // This is a per-basic-block object that maintains current score brackets
-// of each wait-counter, and a per-register scoreboard for each wait-couner.
+// of each wait counter, and a per-register scoreboard for each wait counter.
 // We also maintain the latest score for every event type that can change the
 // waitcnt in order to know if there are multiple types of events within
 // the brackets. When multiple types of event happen in the bracket,
-// wait-count may get decreased out of order, therefore we need to put in
+// wait count may get decreased out of order, therefore we need to put in
 // "s_waitcnt 0" before use.
 class BlockWaitcntBrackets {
 public:
-  BlockWaitcntBrackets() {
+  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
@@ -301,6 +314,7 @@ public:
   void dump() { print(dbgs()); }
 
 private:
+  const GCNSubtarget *ST = nullptr;
   bool WaitAtBeginning = false;
   bool RevisitLoop = false;
   bool MixedExpTypes = false;
@@ -332,14 +346,12 @@ public:
 
   void incIterCnt() { IterCnt++; }
   void resetIterCnt() { IterCnt = 0; }
-  int32_t getIterCnt() { return IterCnt; }
+  unsigned getIterCnt() { return IterCnt; }
 
   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
 
-  void print() {
-    DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
-  }
+  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
 
 private:
   // s_waitcnt added at the end of loop footer to stablize wait scores
@@ -352,7 +364,7 @@ private:
 
 class SIInsertWaitcnts : public MachineFunctionPass {
 private:
-  const SISubtarget *ST = nullptr;
+  const GCNSubtarget *ST = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
@@ -361,22 +373,31 @@ private:
   AMDGPUAS AMDGPUASI;
 
   DenseSet<MachineBasicBlock *> BlockVisitedSet;
-  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+  DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
 
   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
       BlockWaitcntBracketsMap;
 
-  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
 
   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
 
   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
 
+  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
+  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
+  bool ForceEmitZeroWaitcnts;
+  bool ForceEmitWaitcnt[NUM_INST_CNTS];
+
 public:
   static char ID;
 
-  SIInsertWaitcnts() : MachineFunctionPass(ID) {}
+  SIInsertWaitcnts() : MachineFunctionPass(ID) {
+    (void)ForceExpCounter;  // setForceEmitWaitcnt() reads the counters only
+    (void)ForceLgkmCounter; // under #ifndef NDEBUG; reference them here so
+    (void)ForceVMCounter;   // they are also used when NDEBUG is defined.
+  }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -397,15 +418,53 @@ public:
         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
   }
 
+  bool isForceEmitWaitcnt() const {
+    // True as soon as any counter type has its force-emit flag set.
+    bool AnyForced = false;
+    for (int T = VM_CNT; T < NUM_INST_CNTS && !AnyForced; ++T)
+      AnyForced = ForceEmitWaitcnt[T];
+    return AnyForced;
+  }
+
+  /// Refresh the per-counter force-emit flags from the debug counters.
+  ///
+  /// When NDEBUG is not defined, the flag of a counter type is set only if
+  /// its debug counter has been set and DebugCounter::shouldExecute()
+  /// returns true for it; otherwise the flag is cleared.
+  ///
+  /// When NDEBUG is defined the body is compiled out and ForceEmitWaitcnt is
+  /// left untouched; it is expected to have been initialized to false
+  /// already.
+  void setForceEmitWaitcnt() {
+#ifndef NDEBUG
+    // shouldExecute() is only consulted for counters that were set, and the
+    // counters are queried in the order EXP, LGKM, VM.
+    const bool ForceExp = DebugCounter::isCounterSet(ForceExpCounter) &&
+                          DebugCounter::shouldExecute(ForceExpCounter);
+    ForceEmitWaitcnt[EXP_CNT] = ForceExp;
+
+    const bool ForceLgkm = DebugCounter::isCounterSet(ForceLgkmCounter) &&
+                           DebugCounter::shouldExecute(ForceLgkmCounter);
+    ForceEmitWaitcnt[LGKM_CNT] = ForceLgkm;
+
+    const bool ForceVM = DebugCounter::isCounterSet(ForceVMCounter) &&
+                         DebugCounter::shouldExecute(ForceVMCounter);
+    ForceEmitWaitcnt[VM_CNT] = ForceVM;
+#endif // NDEBUG
+  }
+
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
-  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
-                                           BlockWaitcntBrackets *ScoreBrackets);
-  void updateEventWaitCntAfter(MachineInstr &Inst,
+  void generateWaitcntInstBefore(MachineInstr &MI,
+                                  BlockWaitcntBrackets *ScoreBrackets);
+  void updateEventWaitcntAfter(MachineInstr &Inst,
                                BlockWaitcntBrackets *ScoreBrackets);
   void mergeInputScoreBrackets(MachineBasicBlock &Block);
-  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+  unsigned countNumBottomBlocks(const MachineLoop *Loop);
   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
+  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
 };
 
 } // end anonymous namespace
@@ -459,7 +518,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                        const MachineRegisterInfo *MRI,
                                        unsigned OpNo, int32_t Val) {
   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
-  DEBUG({
+  LLVM_DEBUG({
     const MachineOperand &Opnd = MI->getOperand(OpNo);
     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
   });
@@ -681,14 +740,17 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
   const int32_t LB = getScoreLB(T);
   const int32_t UB = getScoreUB(T);
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
-    if (T == VM_CNT && hasPendingFlat()) {
-      // If there is a pending FLAT operation, and this is a VM waitcnt,
-      // then we need to force a waitcnt 0 for VM.
+    if ((T == VM_CNT || T == LGKM_CNT) &&
+        hasPendingFlat() &&
+        !ST->hasFlatLgkmVMemCountInOrder()) {
+      // If there is a pending FLAT operation, and this is a VMem or LGKM
+      // waitcnt and the target can report early completion, then we need
+      // to force a waitcnt 0.
       NeedWait = CNT_MASK(T);
       setScoreLB(T, getScoreUB(T));
     } else if (counterOutOfOrder(T)) {
       // Counter can get decremented out-of-order when there
-      // are multiple types event in the brack. Also emit an s_wait counter
+      // are multiple types event in the bracket. Also emit an s_wait counter
       // with a conservative value of 0 for the counter.
       NeedWait = CNT_MASK(T);
       setScoreLB(T, getScoreUB(T));
@@ -789,7 +851,30 @@ static bool readsVCCZ(const MachineInstr &MI) {
          !MI.getOperand(1).isUndef();
 }
 
-///  \brief Generate s_waitcnt instruction to be placed before cur_Inst.
+/// Given wait count encodings, checks that LHS is at least as strong as RHS,
+/// i.e. no counter decoded from LHS exceeds the same counter in RHS.
+bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
+  const bool VmOk =
+      AMDGPU::decodeVmcnt(IV, LHS) <= AMDGPU::decodeVmcnt(IV, RHS);
+  const bool LgkmOk =
+      AMDGPU::decodeLgkmcnt(IV, LHS) <= AMDGPU::decodeLgkmcnt(IV, RHS);
+  return VmOk && LgkmOk &&
+         AMDGPU::decodeExpcnt(IV, LHS) <= AMDGPU::decodeExpcnt(IV, RHS);
+}
+
+/// Merge two wait count encodings into one that is at least as strong as
+/// each input, by taking the per-counter minimum of the decoded values.
+unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
+  const unsigned MinVm =
+      std::min(AMDGPU::decodeVmcnt(IV, LHS), AMDGPU::decodeVmcnt(IV, RHS));
+  const unsigned MinExp =
+      std::min(AMDGPU::decodeExpcnt(IV, LHS), AMDGPU::decodeExpcnt(IV, RHS));
+  const unsigned MinLgkm =
+      std::min(AMDGPU::decodeLgkmcnt(IV, LHS), AMDGPU::decodeLgkmcnt(IV, RHS));
+  return AMDGPU::encodeWaitcnt(IV, MinVm, MinExp, MinLgkm);
+}
+
+///  Generate s_waitcnt instruction to be placed before cur_Inst.
 ///  Instructions of a given type are returned in order,
 ///  but instructions of different types can complete out of order.
 ///  We rely on this in-order completion
@@ -799,23 +884,29 @@ static bool readsVCCZ(const MachineInstr &MI) {
 ///  and if so what the value of each counter is.
 ///  The "score bracket" is bound by the lower bound and upper bound
 ///  scores (*_score_LB and *_score_ub respectively).
-MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+void SIInsertWaitcnts::generateWaitcntInstBefore(
     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
   // To emit, or not to emit - that's the question!
   // Start with an assumption that there is no need to emit.
-  unsigned int EmitSwaitcnt = 0;
-  // s_waitcnt instruction to return; default is NULL.
-  MachineInstr *SWaitInst = nullptr;
+  unsigned int EmitWaitcnt = 0;
+
   // No need to wait before phi. If a phi-move exists, then the wait should
   // has been inserted before the move. If a phi-move does not exist, then
   // wait should be inserted before the real use. The same is true for
   // sc-merge. It is not a coincident that all these cases correspond to the
   // instructions that are skipped in the assembling loop.
   bool NeedLineMapping = false; // TODO: Check on this.
-  if (MI.isDebugValue() &&
+
+  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
+  bool ForceEmitZeroWaitcnt = false;
+
+  setForceEmitWaitcnt();
+  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
+
+  if (MI.isDebugInstr() &&
       // TODO: any other opcode?
       !NeedLineMapping) {
-    return SWaitInst;
+    return;
   }
 
   // See if an s_waitcnt is forced at block entry, or is needed at
@@ -826,7 +917,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     ScoreBrackets->clearWaitAtBeginning();
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
-      EmitSwaitcnt |= CNT_MASK(T);
+      EmitWaitcnt |= CNT_MASK(T);
       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     }
   }
@@ -836,21 +927,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
-    EmitSwaitcnt |=
+    EmitWaitcnt |=
         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
   }
 
   // All waits must be resolved at call return.
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
-  if (MI.getOpcode() == AMDGPU::RETURN ||
-      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-        EmitSwaitcnt |= CNT_MASK(T);
+        EmitWaitcnt |= CNT_MASK(T);
       }
     }
   }
@@ -861,7 +951,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
             AMDGPU::SendMsg::ID_GS_DONE)) {
     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-      EmitSwaitcnt |= CNT_MASK(VM_CNT);
+      EmitWaitcnt |= CNT_MASK(VM_CNT);
     }
   }
 #if 0 // TODO: the following blocks of logic when we have fence.
@@ -879,11 +969,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
         case SCMEM_LDS:
           if (group_is_multi_wave ||
             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
-            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                                ScoreBrackets->getScoreUB(LGKM_CNT));
             // LDS may have to wait for VM_CNT after buffer load to LDS
             if (target_info->HasBufferLoadToLDS()) {
-              EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                  ScoreBrackets->getScoreUB(VM_CNT));
             }
           }
@@ -891,9 +981,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
 
         case SCMEM_GDS:
           if (group_is_multi_wave || fence_is_global) {
-            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
               ScoreBrackets->getScoreUB(EXP_CNT));
-            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
               ScoreBrackets->getScoreUB(LGKM_CNT));
           }
           break;
@@ -903,9 +993,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
         case SCMEM_RING:
         case SCMEM_SCATTER:
           if (group_is_multi_wave || fence_is_global) {
-            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
               ScoreBrackets->getScoreUB(EXP_CNT));
-            EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
               ScoreBrackets->getScoreUB(VM_CNT));
           }
           break;
@@ -926,13 +1016,13 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
       // Export and GDS are tracked individually, either may trigger a waitcnt
       // for EXEC.
-      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+      EmitWaitcnt |= ScoreBrackets->updateByWait(
           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
-      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+      EmitWaitcnt |= ScoreBrackets->updateByWait(
           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
-      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+      EmitWaitcnt |= ScoreBrackets->updateByWait(
           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
-      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+      EmitWaitcnt |= ScoreBrackets->updateByWait(
           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
     }
 
@@ -947,7 +1037,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
       if (ScoreBrackets->getScoreUB(EXP_CNT) >
         ScoreBrackets->getScoreLB(EXP_CNT)) {
         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-        EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+        EmitWaitcnt |= CNT_MASK(EXP_CNT);
       }
     }
 #endif
@@ -965,7 +1055,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
         continue;
       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
       // VM_CNT is only relevant to vgpr or LDS.
-      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+      EmitWaitcnt |= ScoreBrackets->updateByWait(
           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
     }
 
@@ -977,10 +1067,10 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Op.getReg())) {
           // VM_CNT is only relevant to vgpr or LDS.
-          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EmitWaitcnt |= ScoreBrackets->updateByWait(
               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
         }
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
       }
     }
@@ -999,9 +1089,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
         if (AS != AMDGPUASI.LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
       }
     }
@@ -1012,38 +1102,35 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Def.getReg())) {
-          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EmitWaitcnt |= ScoreBrackets->updateByWait(
               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EmitWaitcnt |= ScoreBrackets->updateByWait(
               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
         }
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
       }
     } // End of for loop that looks at all dest operands.
   }
 
-  // TODO: Tie force zero to a compiler triage option.
-  bool ForceZero = false;
-
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
   // occurs before the instruction. Doing it here prevents any additional
   // S_WAITCNTs from being emitted if the instruction was marked as
   // requiring a WAITCNT beforehand.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier()) {
-    EmitSwaitcnt |=
+    EmitWaitcnt |=
         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-    EmitSwaitcnt |= ScoreBrackets->updateByWait(
+    EmitWaitcnt |= ScoreBrackets->updateByWait(
         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-    EmitSwaitcnt |= ScoreBrackets->updateByWait(
+    EmitWaitcnt |= ScoreBrackets->updateByWait(
         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
   //       after fixing the scheduler. Also, the Shader Compiler code is
   //       independent of target.
-  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
             ScoreBrackets->getScoreUB(LGKM_CNT) &&
         ScoreBrackets->hasPendingSMEM()) {
@@ -1052,17 +1139,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
       // block, so if we only wait on LGKM here, we might end up with
       // another s_waitcnt inserted right after this if there are non-LGKM
       // instructions still outstanding.
-      ForceZero = true;
-      EmitSwaitcnt = true;
+      // FIXME: this is too conservative / the comment is wrong.
+      // We don't wait on everything at the end of the block and we combine
+      // waitcnts so we should never have back-to-back waitcnts.
+      ForceEmitZeroWaitcnt = true;
+      EmitWaitcnt = true;
     }
   }
 
   // Does this operand processing indicate s_wait counter update?
-  if (EmitSwaitcnt) {
+  if (EmitWaitcnt || IsForceEmitWaitcnt) {
     int CntVal[NUM_INST_CNTS];
 
     bool UseDefaultWaitcntStrategy = true;
-    if (ForceZero) {
+    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
       // Force all waitcnts to 0.
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
@@ -1077,7 +1167,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     if (UseDefaultWaitcntStrategy) {
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
-        if (EmitSwaitcnt & CNT_MASK(T)) {
+        if (EmitWaitcnt & CNT_MASK(T)) {
           int Delta =
               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
@@ -1087,7 +1177,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
               ScoreBrackets->setScoreLB(
                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
             }
-            EmitSwaitcnt &= ~CNT_MASK(T);
+            EmitWaitcnt &= ~CNT_MASK(T);
           }
           CntVal[T] = Delta;
         } else {
@@ -1099,10 +1189,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     }
 
     // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitSwaitcnt != 0) {
+    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-      if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+      if (!OldWaitcnt ||
+          (AMDGPU::decodeVmcnt(IV, Imm) !=
                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
           (AMDGPU::decodeExpcnt(IV, Imm) !=
            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@@ -1114,39 +1205,80 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
           BlockWaitcntBrackets *ScoreBracket =
               BlockWaitcntBracketsMap[TBB].get();
           if (!ScoreBracket) {
-            assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+            assert(!BlockVisitedSet.count(TBB));
             BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>();
+                llvm::make_unique<BlockWaitcntBrackets>(ST);
             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
           }
           ScoreBracket->setRevisitLoop(true);
-          DEBUG(dbgs() << "set-revisit: block"
-                       << ContainingLoop->getHeader()->getNumber() << '\n';);
+          LLVM_DEBUG(dbgs()
+                         << "set-revisit2: Block"
+                         << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
 
       // Update an existing waitcount, or make a new one.
-      MachineFunction &MF = *MI.getParent()->getParent();
-      if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
-        SWaitInst = OldWaitcnt;
-      } else {
-        SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
-                                          MI.getDebugLoc());
-        CompilerGeneratedWaitcntSet.insert(SWaitInst);
-      }
+      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
+                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
+                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
+      // We don't remove waitcnts that existed prior to the waitcnt
+      // pass. Check if the waitcnt to-be-inserted can be avoided
+      // or if the prev waitcnt can be updated.
+      bool insertSWaitInst = true;
+      for (MachineBasicBlock::iterator I = MI.getIterator(),
+                                       B = MI.getParent()->begin();
+           insertSWaitInst && I != B; --I) {
+        if (I == MI.getIterator())
+          continue;
 
-      const MachineOperand &Op =
-          MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
-              IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
-      SWaitInst->addOperand(MF, Op);
+        switch (I->getOpcode()) {
+        case AMDGPU::S_WAITCNT:
+          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+            insertSWaitInst = false;
+          else if (!OldWaitcnt) {
+            OldWaitcnt = &*I;
+            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
+          }
+          break;
+        // TODO: skip over instructions which never require wait.
+        }
+        break;
+      }
+      if (insertSWaitInst) {
+        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
+          if (ForceEmitZeroWaitcnts)
+            LLVM_DEBUG(
+                dbgs()
+                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+          if (IsForceEmitWaitcnt)
+            LLVM_DEBUG(dbgs()
+                       << "Force emit a s_waitcnt due to debug counter\n");
+
+          OldWaitcnt->getOperand(0).setImm(Enc);
+          if (!OldWaitcnt->getParent())
+            MI.getParent()->insert(MI, OldWaitcnt);
+
+          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+                            << "Old Instr: " << MI << '\n'
+                            << "New Instr: " << *OldWaitcnt << '\n');
+        } else {
+            auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+                               MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                             .addImm(Enc);
+            TrackedWaitcntSet.insert(SWaitInst);
+
+            LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                              << "Old Instr: " << MI << '\n'
+                              << "New Instr: " << *SWaitInst << '\n');
+        }
+      }
 
       if (CntVal[EXP_CNT] == 0) {
         ScoreBrackets->setMixedExpTypes(false);
       }
     }
   }
-
-  return SWaitInst;
 }
 
 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
@@ -1180,7 +1312,7 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
   return false;
 }
 
-void SIInsertWaitcnts::updateEventWaitCntAfter(
+void SIInsertWaitcnts::updateEventWaitcntAfter(
     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
@@ -1214,7 +1346,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
-    if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+    if (ST->vmemWriteNeedsExpWaitcnt() &&
         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
@@ -1247,27 +1379,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
   }
 }
 
+// Merge the score brackets of the Block's predecessors. The merged score
+// bracket is then used when adding waitcnts to the Block.
 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
   int32_t MaxPending[NUM_INST_CNTS] = {0};
   int32_t MaxFlat[NUM_INST_CNTS] = {0};
   bool MixedExpTypes = false;
 
-  // Clear the score bracket state.
-  ScoreBrackets->clear();
-
-  // Compute the number of pending elements on block entry.
+  // For single basic block loops, we need to retain the Block's
+  // score bracket to have accurate Pred info. So, make a copy of Block's
+  // score bracket, clear() it (which retains several important bits of info),
+  // populate, and then replace en masse. For non-single basic block loops,
+  // just clear Block's current score bracket and repopulate in-place.
+  bool IsSelfPred;
+  std::unique_ptr<BlockWaitcntBrackets> S;
+
+  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
+    != Block.pred_end();
+  if (IsSelfPred) {
+    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+    ScoreBrackets = S.get();
+  }
 
-  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
-  // need to handle single BBs with backedges to themselves. This means that
-  // they will need to retain and not clear their initial state.
+  ScoreBrackets->clear();
 
   // See if there are any uninitialized predecessors. If so, emit an
   // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *pred : Block.predecessors()) {
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+        BlockWaitcntBracketsMap[Pred].get();
+    bool Visited = BlockVisitedSet.count(Pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1306,7 +1448,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(Pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1354,7 +1496,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
 
   // Set the register scoreboard.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1468,7 +1610,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
   // sequencing predecessors, because changes to EXEC require waitcnts due to
   // the delayed nature of these operations.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1496,17 +1638,36 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
       }
     }
   }
+
+  // If a single-block loop, update the score brackets. Not needed for other
+  // blocks, as we did this in-place.
+  if (IsSelfPred) {
+    BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+  }
 }
 
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
-  MachineBasicBlock *Bottom = Loop->getHeader();
-  for (MachineBasicBlock *MBB : Loop->blocks())
-    if (MBB->getNumber() > Bottom->getNumber())
-      Bottom = MBB;
-  return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+                                    const MachineBasicBlock *Block) {
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+  unsigned Count = 0;
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    if (MBB->isSuccessor(Loop->getHeader())) {
+      Count++;
+    }
+  }
+  return Count;
 }
 
 // Generate s_waitcnt instructions where needed.
@@ -1517,8 +1678,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
 
-  DEBUG({
-    dbgs() << "Block" << Block.getNumber();
+  LLVM_DEBUG({
+    dbgs() << "*** Block" << Block.getNumber() << " ***";
     ScoreBrackets->dump();
   });
 
@@ -1528,16 +1689,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     MachineInstr &Inst = *Iter;
     // Remove any previously existing waitcnts.
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      // TODO: Register the old waitcnt and optimize the following waitcnts.
-      // Leaving the previously existing waitcnts is conservatively correct.
-      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
-          CompilerGeneratedWaitcntSet.end())
+      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
+      // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
+      // as needed.
+      if (!TrackedWaitcntSet.count(&Inst))
         ++Iter;
       else {
-        ScoreBrackets->setWaitcnt(&Inst);
         ++Iter;
         Inst.removeFromParent();
       }
+      ScoreBrackets->setWaitcnt(&Inst);
       continue;
     }
 
@@ -1550,29 +1711,20 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     bool VCCZBugWorkAround = false;
     if (readsVCCZ(Inst) &&
-        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+        (!VCCZBugHandledSet.count(&Inst))) {
       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
               ScoreBrackets->getScoreUB(LGKM_CNT) &&
           ScoreBrackets->hasPendingSMEM()) {
-        if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
           VCCZBugWorkAround = true;
       }
     }
 
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
-    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
-
-    if (SWaitInst) {
-      Block.insert(Inst, SWaitInst);
-      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
-        DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                     << "Old Instr: " << Inst << '\n'
-                     << "New Instr: " << *SWaitInst << '\n';);
-      }
-    }
+    generateWaitcntInstBefore(Inst, ScoreBrackets);
 
-    updateEventWaitCntAfter(Inst, ScoreBrackets);
+    updateEventWaitcntAfter(Inst, ScoreBrackets);
 
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
     // If this instruction generates a S_SETVSKIP because it is an
@@ -1587,10 +1739,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     ScoreBrackets->clearWaitcnt();
 
-    if (SWaitInst) {
-      DEBUG({ SWaitInst->print(dbgs() << '\n'); });
-    }
-    DEBUG({
+    LLVM_DEBUG({
       Inst.print(dbgs());
       ScoreBrackets->dump();
     });
@@ -1627,21 +1776,22 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
   // Check if we need to force convergence at loop footer.
   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
-  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
     WaitcntData->print();
-    DEBUG(dbgs() << '\n';);
+    LLVM_DEBUG(dbgs() << '\n';);
 
     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement and doesn't always guarantee convergence for a loop. Each
-    // loop should take at most 2 iterations for it to converge naturally.
-    // When this max is reached and result doesn't converge, we force
-    // convergence by inserting a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > 2) {
+    // placement, but doesn't guarantee convergence for a loop. Each
+    // loop should take at most (n+1) iterations for it to converge naturally,
+    // where n is the number of bottom blocks. If this threshold is reached and
+    // the result hasn't converged, then we force convergence by inserting
+    // an s_waitcnt at the end of the loop footer.
+    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
       // To ensure convergence, need to make wait events at loop footer be no
       // more than those from the previous iteration.
-      // As a simplification, Instead of tracking individual scores and
-      // generate the precise wait count, just wait on 0.
+      // As a simplification, instead of tracking individual scores and
+      // generating the precise wait count, just wait on 0.
       bool HasPending = false;
       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1649,16 +1799,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
           HasPending = true;
+          break;
         }
       }
 
       if (HasPending) {
         if (!SWaitInst) {
-          SWaitInst = Block.getParent()->CreateMachineInstr(
-              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
-          CompilerGeneratedWaitcntSet.insert(SWaitInst);
-          const MachineOperand &Op = MachineOperand::CreateImm(0);
-          SWaitInst->addOperand(MF, Op);
+          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                              .addImm(0);
+          TrackedWaitcntSet.insert(SWaitInst);
 #if 0 // TODO: Format the debug output
           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
           OutputTransformAdd(SWaitInst, context);
@@ -1670,7 +1820,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       }
 
       if (SWaitInst) {
-        DEBUG({
+        LLVM_DEBUG({
           SWaitInst->print(dbgs());
           dbgs() << "\nAdjusted score board:";
           ScoreBrackets->dump();
@@ -1678,7 +1828,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
         // Add this waitcnt to the block. It is either newly created or
         // created in previous iterations and added back since block traversal
-        // always remove waitcnt.
+        // always removes waitcnts.
         insertWaitcntBeforeCF(Block, SWaitInst);
         WaitcntData->setWaitcnt(SWaitInst);
       }
@@ -1687,7 +1837,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
-  ST = &MF.getSubtarget<SISubtarget>();
+  ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
@@ -1696,6 +1846,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   AMDGPUASI = ST->getAMDGPUAS();
 
+  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
+  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+       T = (enum InstCounterType)(T + 1))
+    ForceEmitWaitcnt[T] = false;
+
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
@@ -1712,6 +1867,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   RegisterEncoding.SGPRL =
       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
 
+  TrackedWaitcntSet.clear();
+  BlockVisitedSet.clear();
+  VCCZBugHandledSet.clear();
+  LoopWaitcntDataMap.clear();
+  BlockWaitcntProcessedSet.clear();
+
   // Walk over the blocks in reverse post-dominator order, inserting
   // s_waitcnt where needed.
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1726,7 +1887,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
     if (!ScoreBrackets) {
-      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
+      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
     }
     ScoreBrackets->setPostOrder(MBB.getNumber());
@@ -1737,22 +1898,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     // If we are walking into the block from before the loop, then guarantee
     // at least 1 re-walk over the loop to propagate the information, even if
     // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
-        (BlockWaitcntProcessedSet.find(&MBB) ==
-         BlockWaitcntProcessedSet.end())) {
-      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-      DEBUG(dbgs() << "set-revisit: block"
-                   << ContainingLoop->getHeader()->getNumber() << '\n';);
+    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+      unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+      // If the loop has multiple back-edges, and so more than one "bottom"
+      // basic block, we have to guarantee a re-walk over every block.
+      if ((std::count(BlockWaitcntProcessedSet.begin(),
+                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
+        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
+                          << ContainingLoop->getHeader()->getNumber() << '\n';);
+      }
     }
 
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
-    // Flag that waitcnts have been processed at least once.
-    BlockWaitcntProcessedSet.insert(&MBB);
+    // Record that waitcnts have been processed at least once for this block.
+    BlockWaitcntProcessedSet.push_back(&MBB);
 
-    // See if we want to revisit the loop.
-    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+    // See if we want to revisit the loop. If a loop has multiple back-edges,
+    // we shouldn't revisit the same "bottom" basic block.
+    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+        std::count(BlockWaitcntProcessedSet.begin(),
+                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
       if (EntrySB && EntrySB->getRevisitLoop()) {
@@ -1772,7 +1941,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
         }
         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
         WaitcntData->incIterCnt();
-        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
         continue;
       } else {
         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
@@ -1837,7 +2006,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
   if (!MFI->isEntryFunction()) {
     // Wait for any outstanding memory operations that the input registers may
-    // depend on. We can't track them and it's better to to the wait after the
+    // depend on. We can't track them and it's better to do the wait after the
     // costly call sequence.
 
     // TODO: Could insert earlier and schedule more liberally with operations