diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 47 |
1 files changed, 43 insertions, 4 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 09dbd2150db6..a9f1e9bd0996 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -74,10 +74,10 @@ public: private: struct MemAccessInfo { - const Value *V; - const Value *Base; - int64_t Offset; - MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {} + const Value *V = nullptr; + const Value *Base = nullptr; + int64_t Offset = 0; + MemAccessInfo() = default; bool isLargeStride(MemAccessInfo &Reference) const; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) Printable print() const { @@ -116,6 +116,7 @@ private: bool isGlobalAddr(const Value *V) const; bool isLocalAddr(const Value *V) const; + bool isGlobalLoadUsedInBB(const Instruction &) const; }; static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( @@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { return false; } +// Returns true if the global load `I` is used in its own basic block. +bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const { + const auto *Ld = dyn_cast<LoadInst>(&I); + if (!Ld) + return false; + if (!isGlobalAddr(Ld->getPointerOperand())) + return false; + + for (const User *Usr : Ld->users()) { + if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) { + if (UsrInst->getParent() == I.getParent()) + return true; + } + } + + return false; +} + AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F]; @@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); + unsigned UsedGlobalLoadsInBB = 0; for (auto &I : B) { if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); + // TODO: Check if the global load and its user are close to each other + // instead (Or do this analysis in GCNSchedStrategy?). + if (isGlobalLoadUsedInBB(I)) + UsedGlobalLoadsInBB += Size; if (isIndirectAccess(&I)) FI.IAMInstCost += Size; if (isLargeStride(&I)) @@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { ++FI.InstCost; } } + + if (!FI.HasDenseGlobalMemAcc) { + unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size(); + if (GlobalMemAccPercentage > 50) { + LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since " + << B.getName() << " has " << GlobalMemAccPercentage + << "% global memory access\n"); + FI.HasDenseGlobalMemAcc = true; + } + } } return &FI; @@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + // Reverting optimal scheduling in favour of occupancy with basic block(s) + // having dense global memory access can potentially hurt performance. + if (FI.HasDenseGlobalMemAcc) + return true; + return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh; } |