Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 110
1 file changed, 72 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index aae2a54c198b..3e9fdcb1618e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@ private:
   const GCNSubtarget *ST;
   bool IsPixelShader;
 
+  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+                        Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
   return B.CreateSelect(Cond, LHS, RHS);
 }
 
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+                                             AtomicRMWInst::BinOp Op, Value *V,
+                                             Value *const Identity) const {
+  Type *const Ty = V->getType();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *UpdateDPP =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+  // Reduce within each row of 16 lanes.
+  for (unsigned Idx = 0; Idx < 4; Idx++) {
+    V = buildNonAtomicBinOp(
+        B, Op, V,
+        B.CreateCall(UpdateDPP,
+                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+  }
+
+  // Reduce within each pair of rows (i.e. 32 lanes).
+  assert(ST->hasPermLaneX16());
+  V = buildNonAtomicBinOp(
+      B, Op, V,
+      B.CreateIntrinsic(
+          Intrinsic::amdgcn_permlanex16, {},
+          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+  if (ST->isWave32())
+    return V;
+
+  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+  // combine them with a scalar operation.
+  Function *ReadLane =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
+
 // Use the builder to create an inclusive scan of V across the wavefront, with
 // all lanes active.
 Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -287,10 +328,6 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-  Function *PermLaneX16 =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
-  Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
 
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
@@ -317,9 +354,10 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
 
   // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
   // 48..63).
-  Value *const PermX =
-      B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
-                                 B.getFalse(), B.getFalse()});
+  assert(ST->hasPermLaneX16());
+  Value *const PermX = B.CreateIntrinsic(
+      Intrinsic::amdgcn_permlanex16, {},
+      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(
       B, Op, V,
       B.CreateCall(UpdateDPP,
@@ -327,7 +365,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                     B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
   if (!ST->isWave32()) {
     // Combine lane 31 into lanes 32..63.
-    Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+    Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                            {V, B.getInt32(31)});
     V = buildNonAtomicBinOp(
         B, Op, V,
         B.CreateCall(UpdateDPP,
@@ -346,10 +385,6 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-  Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
-  Function *WriteLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
 
   if (ST->hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
@@ -357,6 +392,11 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                    {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                     B.getInt32(0xf), B.getFalse()});
   } else {
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
     Value *Old = V;
@@ -480,6 +520,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
+  const bool NeedResult = !I.use_empty();
+
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
@@ -489,35 +531,27 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
 
     const AtomicRMWInst::BinOp ScanOp =
         Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    NewV = buildScan(B, ScanOp, NewV, Identity);
-    ExclScan = buildShiftRight(B, NewV, Identity);
-
-    // Read the value from the last lane, which has accumulated the values of
-    // each active lane in the wavefront. This will be our new value which we
-    // will provide to the atomic operation.
-    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-    if (TyBitWidth == 64) {
-      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
-      CallInst *const ReadLaneLo = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
-      CallInst *const ReadLaneHi = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
-      Value *const PartialInsert = B.CreateInsertElement(
-          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
-      NewV = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
+    if (!NeedResult && ST->hasPermLaneX16()) {
+      // On GFX10 the permlanex16 instruction helps us build a reduction without
+      // too many readlanes and writelanes, which are generally bad for
+      // performance.
+      NewV = buildReduction(B, ScanOp, NewV, Identity);
+    } else {
+      NewV = buildScan(B, ScanOp, NewV, Identity);
+      if (NeedResult)
+        ExclScan = buildShiftRight(B, NewV, Identity);
+
+      // Read the value from the last lane, which has accumulated the values
+      // of each active lane in the wavefront. This will be our new value
+      // which we will provide to the atomic operation.
+      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+      assert(TyBitWidth == 32);
       NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                {NewV, LastLaneIdx});
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
     }
 
     // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -583,7 +617,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // original instruction.
   B.SetInsertPoint(&I);
 
-  const bool NeedResult = !I.use_empty();
   if (NeedResult) {
     // Create a PHI node to get our new atomic result into the exit block.
     PHINode *const PHI = B.CreatePHI(Ty, 2);
@@ -621,7 +654,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // from the first lane, to get our lane's index into the atomic result.
   Value *LaneOffset = nullptr;
   if (ValDivergent) {
-    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+    LaneOffset =
+        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
   } else {
     switch (Op) {
     default:
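
Note (editor's illustration, not part of the patch): the reduction tree that buildReduction emits can be checked lane by lane on the host. The sketch below assumes the semantics described by the code's own comments: row_xmask:<mask> makes each lane read lane (lane ^ mask) within its row of 16, and permlanex16 with both select operands -1 makes each lane read lane 15 of the other row in its 32-lane pair (any lane of that row would do here, since after the butterfly every lane of a row already holds the row's total). All 64 lanes are assumed active, as the WWM section guarantees.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

using Wave = std::vector<uint32_t>; // one element per lane of a wave64

// DPP row_xmask:<Mask>: each lane reads lane (lane ^ Mask) within its own
// row of 16 lanes.
static Wave rowXMask(const Wave &V, unsigned Mask) {
  Wave R(V.size());
  for (unsigned Lane = 0; Lane < V.size(); ++Lane)
    R[Lane] = V[(Lane & ~15u) | ((Lane & 15u) ^ Mask)];
  return R;
}

// permlanex16 with select operands -1, -1: each lane reads lane 15 of the
// other row in its 32-lane pair.
static Wave permLaneX16(const Wave &V) {
  Wave R(V.size());
  for (unsigned Lane = 0; Lane < V.size(); ++Lane)
    R[Lane] = V[((Lane & ~15u) ^ 16u) | 15u];
  return R;
}

int main() {
  Wave V(64);
  std::iota(V.begin(), V.end(), 1); // lane i holds i + 1

  // The ROW_XMASK0 | 1 << Idx loop: a butterfly that leaves every lane of a
  // 16-lane row holding that row's full reduction.
  for (unsigned Idx = 0; Idx < 4; ++Idx) {
    Wave S = rowXMask(V, 1u << Idx);
    for (unsigned Lane = 0; Lane < V.size(); ++Lane)
      V[Lane] += S[Lane];
  }

  // The permlanex16 step: combine the two rows of each 32-lane pair.
  Wave S = permLaneX16(V);
  for (unsigned Lane = 0; Lane < V.size(); ++Lane)
    V[Lane] += S[Lane];

  // Wave64 only: each half now holds its own 32-lane total, so the two
  // readlanes plus one scalar op produce the wavefront total.
  std::cout << V[0] + V[32] << '\n'; // prints 2080 == 64 * 65 / 2
}

For wave64 that is four DPP ops, one permlanex16 and two readlanes in total, which is what the added comment means by avoiding "too many readlanes and writelanes": the scan lowering needs an extra shift-right pass that, on GFX10, crosses each row boundary with a readlane/writelane pair.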
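
A similar host-side sketch (again illustrative, with made-up lane values) shows why NeedResult gates the choice of lowering: the hardware atomic itself only ever needs the single combined operand, but any lane that uses the atomic's return value must also reconstruct its own old value from an exclusive prefix, which is exactly what the buildScan plus buildShiftRight pair provides.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint32_t> V = {3, 1, 4, 1, 5, 9, 2, 6}; // per-lane operands
  uint32_t Base = 100; // value in memory before the atomic

  // Result unused (the new buildReduction path): one combined value is all
  // the single atomic needs.
  uint32_t Total = 0;
  for (uint32_t X : V)
    Total += X;
  std::cout << "atomic add of " << Total << '\n';

  // Result used (the scan path): lane i's old value is Base plus the
  // exclusive prefix over lanes 0..i-1, recovered from the strict_wwm
  // exclusive scan plus the value read back by the atomic.
  uint32_t Prefix = 0;
  for (size_t I = 0; I < V.size(); ++I) {
    std::cout << "lane " << I << " sees old value " << Base + Prefix << '\n';
    Prefix += V[I];
  }
}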