path: root/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2021-07-29 20:15:26 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2021-07-29 20:15:26 +0000
commit    344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree      f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
parent    b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
download  src-344a3780b2e33f6ca763666c380202b18aab72a3.tar.gz
          src-344a3780b2e33f6ca763666c380202b18aab72a3.zip
the upstream release/13.x branch was created.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp  514
1 file changed, 514 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
new file mode 100644
index 000000000000..ef46e53b7460
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -0,0 +1,514 @@
+//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+/// The results of this analysis are used to fill the register usage, flat
+/// usage, etc. into hardware registers.
+///
+/// The analysis takes callees into account. E.g. if a function A that needs 10
+/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
+/// will return 20.
+/// It is assumed that an indirect call can go into any function except
+/// hardware-entrypoints. Therefore the register usage of functions with
+/// indirect calls is estimated as the maximum of all non-entrypoint functions
+/// in the module.
+///
+//===----------------------------------------------------------------------===//
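[Editorial illustration, not part of this commit: a minimal sketch of the
propagation rule described in the comment above, using a made-up Usage record.
The real pass tracks this through SIFunctionResourceInfo and the
MaxVGPR/std::max bookkeeping further down in this file.]

    #include <algorithm>
    #include <vector>

    struct Usage { int NumVGPR; };

    // A function's reported VGPR count is the maximum of its own needs and
    // every callee's needs, so A(10) calling B(20) reports 20.
    static int reportedVGPRs(const Usage &Caller,
                             const std::vector<Usage> &Callees) {
      int Max = Caller.NumVGPR;
      for (const Usage &C : Callees)
        Max = std::max(Max, C.NumVGPR);
      return Max;
    }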
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+#define DEBUG_TYPE "amdgpu-resource-usage"
+
+char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
+char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
+
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+ "amdgpu-assume-external-call-stack-size",
+ cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
+ cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+ "amdgpu-assume-dynamic-stack-object-size",
+ cl::desc("Assumed extra stack use if there are any "
+ "variable sized objects (in bytes)"),
+ cl::Hidden, cl::init(4096));
+
+INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
+ "Function register usage analysis", true, true)
+
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+ if (Op.isImm()) {
+ assert(Op.getImm() == 0);
+ return nullptr;
+ }
+
+ return cast<Function>(Op.getGlobal());
+}
+
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, unsigned Reg) {
+ for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+ if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+ return true;
+ }
+
+ return false;
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR +
+ IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
+ ST.getTargetID().isXnackOnOrAny());
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ if (ST.hasGFX90AInsts() && NumAGPR)
+ return alignTo(NumVGPR, 4) + NumAGPR;
+ return std::max(NumVGPR, NumAGPR);
+}
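[Editorial illustration, not part of this commit: a worked instance of
getTotalNumVGPRs above with made-up counts. On GFX90A-class subtargets AGPRs
and VGPRs share one aligned allocation; otherwise the two files overlap.]

    // NumVGPR = 10, NumAGPR = 6
    //   GFX90A path: alignTo(10, 4) + 6 = 12 + 6 = 18
    //   otherwise:   std::max(10, 6)         = 10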
+
+bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ bool HasIndirectCall = false;
+
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+
+ auto CI = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = CI.first->second;
+ assert(CI.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF, TM);
+ HasIndirectCall |= Info.HasIndirectCall;
+ }
+
+ if (HasIndirectCall)
+ propagateIndirectCallRegisterUsage();
+
+ return false;
+}
+
+AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
+ const MachineFunction &MF, const TargetMachine &TM) const {
+ SIFunctionResourceInfo Info;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
+ MRI.isLiveIn(MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer. Inline assembly may
+ // need it though.
+ //
+ // If we only have implicit uses of flat_scr on flat instructions, it is not
+ // really needed.
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+ (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+ Info.UsesFlatScratch = false;
+ }
+
+ Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+ // Assume a big number if there are any unknown sized objects.
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ if (Info.HasDynamicallySizedStack)
+ Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
+
+ Info.UsesVCC =
+ MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+ // A tail call isn't considered a call for MachineFrameInfo's purposes.
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
+ }
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestAGPRReg) + 1;
+ }
+
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get
+ // the number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
+ }
+
+ int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
+ int32_t MaxSGPR = -1;
+ uint64_t CalleeFrameSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: Check regmasks? Do they occur anywhere except calls?
+ for (const MachineOperand &MO : MI.operands()) {
+ unsigned Width = 0;
+ bool IsSGPR = false;
+ bool IsAGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ case AMDGPU::M0_LO16:
+ case AMDGPU::M0_HI16:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::MODE:
+ continue;
+
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
+ case AMDGPU::NoRegister:
+ assert(MI.isDebugInstr() &&
+ "Instruction uses invalid noreg register");
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
+ Info.UsesVCC = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ continue;
+
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
+ default:
+ break;
+ }
+
+ if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 5;
+ } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
+ } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 7;
+ } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 7;
+ } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 7;
+ } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 32;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ unsigned HWReg = TRI.getHWRegIndex(Reg);
+ int MaxUsed = HWReg + Width - 1;
+ if (IsSGPR) {
+ MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+ } else {
+ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+ }
+ }
+
+ if (MI.isCall()) {
+ // Pseudo used just to encode the underlying global. Is there a better
+ // way to track this?
+
+ const MachineOperand *CalleeOp =
+ TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+
+ const Function *Callee = getCalleeFunction(*CalleeOp);
+ DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+ CallGraphResourceInfo.end();
+
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
+ bool IsIndirect = !Callee || Callee->isDeclaration();
+ if (!IsIndirect)
+ I = CallGraphResourceInfo.find(Callee);
+
+ if (IsIndirect || I == CallGraphResourceInfo.end()) {
+ CalleeFrameSize =
+ std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
+ // Register usage of indirect calls gets handled later
+ Info.UsesVCC = true;
+ Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+ Info.HasDynamicallySizedStack = true;
+ Info.HasIndirectCall = true;
+ } else {
+ // We force CodeGen to run in SCC order, so the callee's register
+ // usage etc. should be the cumulative usage of all callees.
+ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+ MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
+ CalleeFrameSize =
+ std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+ Info.UsesVCC |= I->second.UsesVCC;
+ Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+ Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+ Info.HasRecursion |= I->second.HasRecursion;
+ Info.HasIndirectCall |= I->second.HasIndirectCall;
+ }
+
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse())
+ Info.HasRecursion = true;
+ }
+ }
+ }
+
+ Info.NumExplicitSGPR = MaxSGPR + 1;
+ Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
+ Info.PrivateSegmentSize += CalleeFrameSize;
+
+ return Info;
+}
+
+void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
+ // Collect the maximum number of registers from non-hardware-entrypoints.
+ // All these functions are potential targets for indirect calls.
+ int32_t NonKernelMaxSGPRs = 0;
+ int32_t NonKernelMaxVGPRs = 0;
+ int32_t NonKernelMaxAGPRs = 0;
+
+ for (const auto &I : CallGraphResourceInfo) {
+ if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
+ auto &Info = I.getSecond();
+ NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
+ NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
+ NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
+ }
+ }
+
+ // Add register usage for functions with indirect calls.
+ // For calls to unknown functions, we assume the maximum register usage of
+ // all non-hardware-entrypoints in the current module.
+ for (auto &I : CallGraphResourceInfo) {
+ auto &Info = I.getSecond();
+ if (Info.HasIndirectCall) {
+ Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
+ Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
+ Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+ }
+ }
+}
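
[Editorial illustration, not part of this commit: how a consumer pass might
read the results. A minimal sketch assuming the companion header
AMDGPUResourceUsageAnalysis.h exposes a getResourceInfo(const Function *)
accessor over CallGraphResourceInfo, as AMDGPUAsmPrinter does in this release;
treat the exact accessor signature and the pass name SomeConsumerPass as
assumptions.]

    #include "AMDGPUResourceUsageAnalysis.h"
    #include "GCNSubtarget.h"

    // Hypothetical consumer; its getAnalysisUsage must addRequired
    // AMDGPUResourceUsageAnalysis for getAnalysis<> to succeed.
    bool SomeConsumerPass::runOnMachineFunction(MachineFunction &MF) {
      const auto &RUA = getAnalysis<AMDGPUResourceUsageAnalysis>();
      const auto &Info = RUA.getResourceInfo(&MF.getFunction());

      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      int32_t SGPRs = Info.getTotalNumSGPRs(ST); // explicit SGPRs + VCC/flat_scratch/XNACK extras
      int32_t VGPRs = Info.getTotalNumVGPRs(ST); // folds AGPRs per the GFX90A rule above
      uint64_t StackBytes = Info.PrivateSegmentSize;
      (void)SGPRs; (void)VGPRs; (void)StackBytes;
      return false;
    }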