path: root/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp  110
1 file changed, 72 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index aae2a54c198b..3e9fdcb1618e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@ private:
const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+ // Reduce within each row of 16 lanes.
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+
+ // Reduce within each pair of rows (i.e. 32 lanes).
+ assert(ST->hasPermLaneX16());
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+ if (ST->isWave32())
+ return V;
+
+ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+ // combine them with a scalar operation.
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+ Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
+
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
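For reference, here is a minimal scalar model of the lane pattern the new buildReduction emits, assuming a wave64 integer add with every lane active. The array stands in for the wavefront and all identifiers are made up for illustration; this is a sketch, not code from the patch.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned WaveSize = 64;
  std::array<uint32_t, WaveSize> V;
  for (unsigned L = 0; L != WaveSize; ++L)
    V[L] = L + 1; // arbitrary per-lane inputs

  // The four row_xmask:1/2/4/8 DPP steps form a butterfly within each row of
  // 16 lanes, so afterwards every lane holds its row's reduction.
  for (unsigned Idx = 0; Idx != 4; ++Idx) {
    std::array<uint32_t, WaveSize> Prev = V;
    for (unsigned L = 0; L != WaveSize; ++L)
      V[L] = Prev[L] + Prev[L ^ (1u << Idx)];
  }

  // permlanex16 with both selects set to -1 makes each lane read lane 15 of
  // the other row in its 32-lane group; the rows are already uniform, so the
  // combine gives every lane the reduction of its 32-lane half.
  std::array<uint32_t, WaveSize> Prev = V;
  for (unsigned L = 0; L != WaveSize; ++L)
    V[L] = Prev[L] + Prev[(L ^ 16u) | 15u];

  // Wave64 only: readlane from lanes 0 and 32 and combine with a scalar add.
  uint32_t Result = V[0] + V[32];
  printf("reduction = %u (expected %u)\n", Result,
         WaveSize * (WaveSize + 1) / 2);
}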
@@ -287,10 +328,6 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
- Function *PermLaneX16 =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
for (unsigned Idx = 0; Idx < 4; Idx++) {
V = buildNonAtomicBinOp(
@@ -317,9 +354,10 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- Value *const PermX =
- B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
- B.getFalse(), B.getFalse()});
+ assert(ST->hasPermLaneX16());
+ Value *const PermX = B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(
B, Op, V,
B.CreateCall(UpdateDPP,
@@ -327,7 +365,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
- Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+ Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {V, B.getInt32(31)});
V = buildNonAtomicBinOp(
B, Op, V,
B.CreateCall(UpdateDPP,
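The two cross-row steps of the scan can be checked with a similar scalar model. Again this is an illustrative sketch (plain wave64 add, all lanes active, made-up names), not code from the patch; the readlane-31 broadcast shown last corresponds to the update_dpp call truncated by the hunk context above.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned WaveSize = 64;
  std::array<uint32_t, WaveSize> In, V;
  for (unsigned L = 0; L != WaveSize; ++L)
    In[L] = L + 1;

  // Stand-in for the row_shr DPP steps: an inclusive scan within each row of
  // 16 lanes.
  for (unsigned L = 0; L != WaveSize; ++L)
    V[L] = (L % 16 ? V[L - 1] : 0) + In[L];

  // permlanex16 (selects -1) supplies lane 15 of the other row; with row_mask
  // 0xa only rows 1 and 3 are updated, i.e. lane 15 is combined into lanes
  // 16..31 and lane 47 into lanes 48..63.
  for (unsigned L = 0; L != WaveSize; ++L)
    if ((L / 16) & 1)
      V[L] += V[(L & ~31u) | 15u];

  // readlane 31 broadcast into rows 2 and 3: combine lane 31 into lanes 32..63.
  uint32_t Lane31 = V[31];
  for (unsigned L = 32; L != WaveSize; ++L)
    V[L] += Lane31;

  // Every lane now holds the inclusive scan of all lanes up to and including it.
  printf("lane 63 = %u (expected %u)\n", V[63],
         WaveSize * (WaveSize + 1) / 2);
}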
@@ -346,10 +385,6 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
if (ST->hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
@@ -357,6 +392,11 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
Value *Old = V;
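The readlane/writelane fallback implements a one-lane right shift of the wavefront. As a rough scalar sketch of what it computes (wave64, integer add identity, all lanes active; identifiers are illustrative and the cross-row fixups assumed here are the ones the fallback performs, not lines shown in this hunk):

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned WaveSize = 64;
  constexpr uint32_t Identity = 0;
  std::array<uint32_t, WaveSize> Old, V;
  for (unsigned L = 0; L != WaveSize; ++L)
    Old[L] = L + 1;

  // row_shr:1 shifts lanes right within each row of 16; lane 0 of every row
  // receives the identity because it has no source lane inside its row.
  for (unsigned L = 0; L != WaveSize; ++L)
    V[L] = (L % 16) ? Old[L - 1] : Identity;

  // Cross-row fixups with readlane/writelane: copy the old lanes 15, 31 and 47
  // into the new lanes 16, 32 and 48.
  V[16] = Old[15];
  V[32] = Old[31];
  V[48] = Old[47];

  // Every lane now holds the previous lane's value (identity in lane 0).
  printf("lane 0 = %u, lane 16 = %u, lane 63 = %u\n", V[0], V[16], V[63]);
}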
@@ -480,6 +520,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *ExclScan = nullptr;
Value *NewV = nullptr;
+ const bool NeedResult = !I.use_empty();
+
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
@@ -489,35 +531,27 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
const AtomicRMWInst::BinOp ScanOp =
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
- NewV = buildScan(B, ScanOp, NewV, Identity);
- ExclScan = buildShiftRight(B, NewV, Identity);
-
- // Read the value from the last lane, which has accumlated the values of
- // each active lane in the wavefront. This will be our new value which we
- // will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
- CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
- CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
- NewV = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
+ if (!NeedResult && ST->hasPermLaneX16()) {
+ // On GFX10 the permlanex16 instruction helps us build a reduction without
+ // too many readlanes and writelanes, which are generally bad for
+ // performance.
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
+
+ // Read the value from the last lane, which has accumulated the values of
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ assert(TyBitWidth == 32);
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, LastLaneIdx});
- } else {
- llvm_unreachable("Unhandled atomic bit width");
}
// Finally mark the readlanes in the WWM section.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
} else {
switch (Op) {
default:
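Putting the pieces together, a scalar sketch of what the divergent-value path computes per wavefront (plain add, all lanes active, made-up names and values): the inclusive scan's last lane supplies the single atomic operand, the exclusive scan gives each lane its offset into the returned memory value, and when no lane reads the result only the total is needed, which is exactly what the cheaper buildReduction provides.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned WaveSize = 32;
  std::array<uint32_t, WaveSize> V, Incl, Excl;
  for (unsigned L = 0; L != WaveSize; ++L)
    V[L] = L + 1;

  // buildScan: lane L ends up with the sum of lanes 0..L.
  uint32_t Acc = 0;
  for (unsigned L = 0; L != WaveSize; ++L)
    Incl[L] = (Acc += V[L]);

  // buildShiftRight: the exclusive scan, identity in lane 0.
  for (unsigned L = 0; L != WaveSize; ++L)
    Excl[L] = L ? Incl[L - 1] : 0;

  // One atomic for the whole wavefront, using the last lane's inclusive value
  // (the same total a reduction would produce when the result is unused).
  uint32_t AtomicOperand = Incl[WaveSize - 1];

  // Each lane reconstructs its own result from the value the atomic returned
  // plus its exclusive-scan offset.
  uint32_t OldMemoryValue = 1000; // hypothetical value returned by the atomic
  unsigned Lane = 5;
  printf("atomic operand = %u, lane %u result = %u\n", AtomicOperand, Lane,
         OldMemoryValue + Excl[Lane]);
}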
@@ -583,7 +617,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// original instruction.
B.SetInsertPoint(&I);
- const bool NeedResult = !I.use_empty();
if (NeedResult) {
// Create a PHI node to get our new atomic result into the exit block.
PHINode *const PHI = B.CreatePHI(Ty, 2);
@@ -621,7 +654,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// from the first lane, to get our lane's index into the atomic result.
Value *LaneOffset = nullptr;
if (ValDivergent) {
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ LaneOffset =
+ B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
} else {
switch (Op) {
default: