path: root/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2021-07-29 20:15:26 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2021-07-29 20:15:26 +0000
commit    344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree      f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
parent    b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
download  src-344a3780b2e33f6ca763666c380202b18aab72a3.tar.gz
          src-344a3780b2e33f6ca763666c380202b18aab72a3.zip
the upstream release/13.x branch was created.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp  514
1 file changed, 514 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
new file mode 100644
index 000000000000..ef46e53b7460
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -0,0 +1,514 @@
+//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+/// The results of this analysis are used to fill the register usage, flat
+/// usage, etc. into hardware registers.
+///
+/// The analysis takes callees into account. E.g. if a function A that needs 10
+/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
+/// will return 20.
+/// It is assumed that an indirect call can go into any function except
+/// hardware-entrypoints. Therefore the register usage of functions with
+/// indirect calls is estimated as the maximum of all non-entrypoint functions
+/// in the module.
+///
+//===----------------------------------------------------------------------===//
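[Editorial illustration, not part of this commit: a minimal sketch of the
propagation rule described in the comment above, using a made-up Usage record.
The real pass tracks this through SIFunctionResourceInfo and the
MaxVGPR/std::max bookkeeping further down in this file.]

    #include <algorithm>
    #include <vector>

    struct Usage { int NumVGPR; };

    // A function's reported VGPR count is the maximum of its own needs and
    // every callee's needs, so A(10) calling B(20) reports 20.
    static int reportedVGPRs(const Usage &Caller,
                             const std::vector<Usage> &Callees) {
      int Max = Caller.NumVGPR;
      for (const Usage &C : Callees)
        Max = std::max(Max, C.NumVGPR);
      return Max;
    }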
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+#define DEBUG_TYPE "amdgpu-resource-usage"
+
+char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
+char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
+
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+ "amdgpu-assume-external-call-stack-size",
+ cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
+ cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+ "amdgpu-assume-dynamic-stack-object-size",
+ cl::desc("Assumed extra stack use if there are any "
+ "variable sized objects (in bytes)"),
+ cl::Hidden, cl::init(4096));
+
+INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
+ "Function register usage analysis", true, true)
+
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+ if (Op.isImm()) {
+ assert(Op.getImm() == 0);
+ return nullptr;
+ }
+
+ return cast<Function>(Op.getGlobal());
+}
+
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, unsigned Reg) {
+ for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+ if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+ return true;
+ }
+
+ return false;
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR +
+ IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
+ ST.getTargetID().isXnackOnOrAny());
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ if (ST.hasGFX90AInsts() && NumAGPR)
+ return alignTo(NumVGPR, 4) + NumAGPR;
+ return std::max(NumVGPR, NumAGPR);
+}
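[Editorial illustration, not part of this commit: a worked instance of
getTotalNumVGPRs above with made-up counts. On GFX90A-class subtargets AGPRs
and VGPRs share one aligned allocation; otherwise the two files overlap.]

    // NumVGPR = 10, NumAGPR = 6
    //   GFX90A path: alignTo(10, 4) + 6 = 12 + 6 = 18
    //   otherwise:   std::max(10, 6)         = 10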
+
+bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ bool HasIndirectCall = false;
+
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+
+ auto CI = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = CI.first->second;
+ assert(CI.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF, TM);
+ HasIndirectCall |= Info.HasIndirectCall;
+ }
+
+ if (HasIndirectCall)
+ propagateIndirectCallRegisterUsage();
+
+ return false;
+}
+
+AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
+ const MachineFunction &MF, const TargetMachine &TM) const {
+ SIFunctionResourceInfo Info;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
+ MRI.isLiveIn(MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer. Inline assembly may
+ // need it though.
+ //
+ // If we only have implicit uses of flat_scr on flat instructions, it is not
+ // really needed.
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+ (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+ Info.UsesFlatScratch = false;
+ }
+
+ Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+ // Assume a big number if there are any unknown sized objects.
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ if (Info.HasDynamicallySizedStack)
+ Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
+
+ Info.UsesVCC =
+ MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+ // A tail call isn't considered a call for MachineFrameInfo's purposes.
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
+ }
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestAGPRReg) + 1;
+ }
+
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get
+ // the number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
+ }
+
+ int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
+ int32_t MaxSGPR = -1;
+ uint64_t CalleeFrameSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: Check regmasks? Do they occur anywhere except calls?
+ for (const MachineOperand &MO : MI.operands()) {
+ unsigned Width = 0;
+ bool IsSGPR = false;
+ bool IsAGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ case AMDGPU::M0_LO16:
+ case AMDGPU::M0_HI16:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::MODE:
+ continue;
+
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
+ case AMDGPU::NoRegister:
+ assert(MI.isDebugInstr() &&
+ "Instruction uses invalid noreg register");
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
+ Info.UsesVCC = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ continue;
+
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
+ default:
+ break;
+ }
+
+ if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 5;
+ } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
+ } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 7;
+ } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 7;
+ } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 7;
+ } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 32;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ unsigned HWReg = TRI.getHWRegIndex(Reg);
+ int MaxUsed = HWReg + Width - 1;
+ if (IsSGPR) {
+ MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+ } else {
+ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+ }
+ }
+
+ if (MI.isCall()) {
+ // Pseudo used just to encode the underlying global. Is there a better
+ // way to track this?
+
+ const MachineOperand *CalleeOp =
+ TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+
+ const Function *Callee = getCalleeFunction(*CalleeOp);
+ DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+ CallGraphResourceInfo.end();
+
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
+ bool IsIndirect = !Callee || Callee->isDeclaration();
+ if (!IsIndirect)
+ I = CallGraphResourceInfo.find(Callee);
+
+ if (IsIndirect || I == CallGraphResourceInfo.end()) {
+ CalleeFrameSize =
+ std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
+ // Register usage of indirect calls gets handled later
+ Info.UsesVCC = true;
+ Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+ Info.HasDynamicallySizedStack = true;
+ Info.HasIndirectCall = true;
+ } else {
+ // We force CodeGen to run in SCC order, so the callee's register
+ // usage etc. should be the cumulative usage of all callees.
+ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+ MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
+ CalleeFrameSize =
+ std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+ Info.UsesVCC |= I->second.UsesVCC;
+ Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+ Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+ Info.HasRecursion |= I->second.HasRecursion;
+ Info.HasIndirectCall |= I->second.HasIndirectCall;
+ }
+
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse())
+ Info.HasRecursion = true;
+ }
+ }
+ }
+
+ Info.NumExplicitSGPR = MaxSGPR + 1;
+ Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
+ Info.PrivateSegmentSize += CalleeFrameSize;
+
+ return Info;
+}
+
+void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
+ // Collect the maximum number of registers from non-hardware-entrypoints.
+ // All these functions are potential targets for indirect calls.
+ int32_t NonKernelMaxSGPRs = 0;
+ int32_t NonKernelMaxVGPRs = 0;
+ int32_t NonKernelMaxAGPRs = 0;
+
+ for (const auto &I : CallGraphResourceInfo) {
+ if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
+ auto &Info = I.getSecond();
+ NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
+ NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
+ NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
+ }
+ }
+
+ // Add register usage for functions with indirect calls.
+ // For calls to unknown functions, we assume the maximum register usage of
+ // all non-hardware-entrypoints in the current module.
+ for (auto &I : CallGraphResourceInfo) {
+ auto &Info = I.getSecond();
+ if (Info.HasIndirectCall) {
+ Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
+ Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
+ Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+ }
+ }
+}
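
[Editorial illustration, not part of this commit: how a consumer pass might
read the results. A minimal sketch assuming the companion header
AMDGPUResourceUsageAnalysis.h exposes a getResourceInfo(const Function *)
accessor over CallGraphResourceInfo, as AMDGPUAsmPrinter does in this release;
treat the exact accessor signature and the pass name SomeConsumerPass as
assumptions.]

    #include "AMDGPUResourceUsageAnalysis.h"
    #include "GCNSubtarget.h"

    // Hypothetical consumer; its getAnalysisUsage must addRequired
    // AMDGPUResourceUsageAnalysis for getAnalysis<> to succeed.
    bool SomeConsumerPass::runOnMachineFunction(MachineFunction &MF) {
      const auto &RUA = getAnalysis<AMDGPUResourceUsageAnalysis>();
      const auto &Info = RUA.getResourceInfo(&MF.getFunction());

      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      int32_t SGPRs = Info.getTotalNumSGPRs(ST); // explicit SGPRs + VCC/flat_scratch/XNACK extras
      int32_t VGPRs = Info.getTotalNumVGPRs(ST); // folds AGPRs per the GFX90A rule above
      uint64_t StackBytes = Info.PrivateSegmentSize;
      (void)SGPRs; (void)VGPRs; (void)StackBytes;
      return false;
    }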