Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 110
1 file changed, 72 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index aae2a54c198b..3e9fdcb1618e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@ private:
   const GCNSubtarget *ST;
   bool IsPixelShader;
 
+  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+                        Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
   return B.CreateSelect(Cond, LHS, RHS);
 }
 
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+                                             AtomicRMWInst::BinOp Op, Value *V,
+                                             Value *const Identity) const {
+  Type *const Ty = V->getType();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *UpdateDPP =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+  // Reduce within each row of 16 lanes.
+  for (unsigned Idx = 0; Idx < 4; Idx++) {
+    V = buildNonAtomicBinOp(
+        B, Op, V,
+        B.CreateCall(UpdateDPP,
+                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+  }
+
+  // Reduce within each pair of rows (i.e. 32 lanes).
+  assert(ST->hasPermLaneX16());
+  V = buildNonAtomicBinOp(
+      B, Op, V,
+      B.CreateIntrinsic(
+          Intrinsic::amdgcn_permlanex16, {},
+          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+  if (ST->isWave32())
+    return V;
+
+  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+  // combine them with a scalar operation.
+  Function *ReadLane =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
+
 // Use the builder to create an inclusive scan of V across the wavefront, with
 // all lanes active.
 Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -287,10 +328,6 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-  Function *PermLaneX16 =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
-  Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
 
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
@@ -317,9 +354,10 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
 
   // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
   // 48..63).
-  Value *const PermX =
-      B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
-                                 B.getFalse(), B.getFalse()});
+  assert(ST->hasPermLaneX16());
+  Value *const PermX = B.CreateIntrinsic(
+      Intrinsic::amdgcn_permlanex16, {},
+      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(
       B, Op, V,
       B.CreateCall(UpdateDPP,
@@ -327,7 +365,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                     B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
   if (!ST->isWave32()) {
     // Combine lane 31 into lanes 32..63.
-    Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+    Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                            {V, B.getInt32(31)});
     V = buildNonAtomicBinOp(
         B, Op, V,
         B.CreateCall(UpdateDPP,
@@ -346,10 +385,6 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-  Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
-  Function *WriteLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
 
   if (ST->hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
@@ -357,6 +392,11 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                    {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                     B.getInt32(0xf), B.getFalse()});
   } else {
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
     Value *Old = V;
@@ -480,6 +520,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
+  const bool NeedResult = !I.use_empty();
+
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
@@ -489,35 +531,27 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
 
     const AtomicRMWInst::BinOp ScanOp =
         Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    NewV = buildScan(B, ScanOp, NewV, Identity);
-    ExclScan = buildShiftRight(B, NewV, Identity);
-
-    // Read the value from the last lane, which has accumulated the values of
-    // each active lane in the wavefront. This will be our new value which we
-    // will provide to the atomic operation.
-    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-    if (TyBitWidth == 64) {
-      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
-      CallInst *const ReadLaneLo = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
-      CallInst *const ReadLaneHi = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
-      Value *const PartialInsert = B.CreateInsertElement(
-          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
-      NewV = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
+    if (!NeedResult && ST->hasPermLaneX16()) {
+      // On GFX10 the permlanex16 instruction helps us build a reduction without
+      // too many readlanes and writelanes, which are generally bad for
+      // performance.
+      NewV = buildReduction(B, ScanOp, NewV, Identity);
+    } else {
+      NewV = buildScan(B, ScanOp, NewV, Identity);
+      if (NeedResult)
+        ExclScan = buildShiftRight(B, NewV, Identity);
+
+      // Read the value from the last lane, which has accumulated the values
+      // of each active lane in the wavefront. This will be our new value
+      // which we will provide to the atomic operation.
+      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+      assert(TyBitWidth == 32);
       NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                {NewV, LastLaneIdx});
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
     }
 
     // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -583,7 +617,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // original instruction.
   B.SetInsertPoint(&I);
 
-  const bool NeedResult = !I.use_empty();
   if (NeedResult) {
     // Create a PHI node to get our new atomic result into the exit block.
     PHINode *const PHI = B.CreatePHI(Ty, 2);
@@ -621,7 +654,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // from the first lane, to get our lane's index into the atomic result.
   Value *LaneOffset = nullptr;
   if (ValDivergent) {
-    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+    LaneOffset =
+        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
   } else {
     switch (Op) {
     default:
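
Note (editor's illustration, not part of the patch): the reduction tree that buildReduction emits can be checked lane by lane on the host. The sketch below assumes the semantics described by the code's own comments: row_xmask:<mask> makes each lane read lane (lane ^ mask) within its row of 16, and permlanex16 with both select operands -1 makes each lane read lane 15 of the other row in its 32-lane pair (any lane of that row would do here, since after the butterfly every lane of a row already holds the row's total). All 64 lanes are assumed active, as the WWM section guarantees.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

using Wave = std::vector<uint32_t>; // one element per lane of a wave64

// DPP row_xmask:<Mask>: each lane reads lane (lane ^ Mask) within its own
// row of 16 lanes.
static Wave rowXMask(const Wave &V, unsigned Mask) {
  Wave R(V.size());
  for (unsigned Lane = 0; Lane < V.size(); ++Lane)
    R[Lane] = V[(Lane & ~15u) | ((Lane & 15u) ^ Mask)];
  return R;
}

// permlanex16 with select operands -1, -1: each lane reads lane 15 of the
// other row in its 32-lane pair.
static Wave permLaneX16(const Wave &V) {
  Wave R(V.size());
  for (unsigned Lane = 0; Lane < V.size(); ++Lane)
    R[Lane] = V[((Lane & ~15u) ^ 16u) | 15u];
  return R;
}

int main() {
  Wave V(64);
  std::iota(V.begin(), V.end(), 1); // lane i holds i + 1

  // The ROW_XMASK0 | 1 << Idx loop: a butterfly that leaves every lane of a
  // 16-lane row holding that row's full reduction.
  for (unsigned Idx = 0; Idx < 4; ++Idx) {
    Wave S = rowXMask(V, 1u << Idx);
    for (unsigned Lane = 0; Lane < V.size(); ++Lane)
      V[Lane] += S[Lane];
  }

  // The permlanex16 step: combine the two rows of each 32-lane pair.
  Wave S = permLaneX16(V);
  for (unsigned Lane = 0; Lane < V.size(); ++Lane)
    V[Lane] += S[Lane];

  // Wave64 only: each half now holds its own 32-lane total, so the two
  // readlanes plus one scalar op produce the wavefront total.
  std::cout << V[0] + V[32] << '\n'; // prints 2080 == 64 * 65 / 2
}

For wave64 that is four DPP ops, one permlanex16 and two readlanes in total, which is what the added comment means by avoiding "too many readlanes and writelanes": the scan lowering needs an extra shift-right pass that, on GFX10, crosses each row boundary with a readlane/writelane pair.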
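
A similar host-side sketch (again illustrative, with made-up lane values) shows why NeedResult gates the choice of lowering: the hardware atomic itself only ever needs the single combined operand, but any lane that uses the atomic's return value must also reconstruct its own old value from an exclusive prefix, which is exactly what the buildScan plus buildShiftRight pair provides.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint32_t> V = {3, 1, 4, 1, 5, 9, 2, 6}; // per-lane operands
  uint32_t Base = 100; // value in memory before the atomic

  // Result unused (the new buildReduction path): one combined value is all
  // the single atomic needs.
  uint32_t Total = 0;
  for (uint32_t X : V)
    Total += X;
  std::cout << "atomic add of " << Total << '\n';

  // Result used (the scan path): lane i's old value is Base plus the
  // exclusive prefix over lanes 0..i-1, recovered from the strict_wwm
  // exclusive scan plus the value read back by the atomic.
  uint32_t Prefix = 0;
  for (size_t I = 0; I < V.size(); ++I) {
    std::cout << "lane " << I << " sees old value " << Base + Prefix << '\n';
    Prefix += V[I];
  }
}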