diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 172 |
1 files changed, 120 insertions, 52 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 213788ae0f67..f1a7d7463676 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,18 +13,21 @@ #include "AMDGPUSubtarget.h" #include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" #include "AMDGPUCallLowering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" +#include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/IR/MDBuilder.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/MC/MCSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -50,6 +53,15 @@ static cl::opt<bool> EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); +static cl::opt<bool> EnableFlatScratch( + "amdgpu-enable-flat-scratch", + cl::desc("Use flat scratch instructions"), + cl::init(false)); + +static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", + cl::desc("Enable the use of AA during codegen."), + cl::init(true)); + GCNSubtarget::~GCNSubtarget() = default; R600Subtarget & @@ -57,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { SmallString<256> FullFS("+promote-alloca,"); FullFS += FS; - ParseSubtargetFeatures(GPU, FullFS); + ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); HasMulU24 = getGeneration() >= EVERGREEN; HasMulI24 = hasCaymanISA(); @@ -77,11 +89,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // Similarly we want enable-prt-strict-null to be on by default and not to // unset everything else if it is disabled - // Assuming ECC is enabled is the conservative default. - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); - if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. - FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default + if (isAmdHsaOS()) + FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS @@ -97,17 +109,38 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += FS; - ParseSubtargetFeatures(GPU, FullFS); + ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); + + // Implement the "generic" processors, which acts as the default when no + // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to + // the first amdgcn target that supports flat addressing. Other OSes defaults + // to the first amdgcn target. + if (Gen == AMDGPUSubtarget::INVALID) { + Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS + : AMDGPUSubtarget::SOUTHERN_ISLANDS; + } // We don't support FP64 for EG/NI atm. assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); - // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es - // on VI and newer hardware to avoid assertion failures due to missing ADDR64 - // variants of MUBUF instructions. - if (!hasAddr64() && !FS.contains("flat-for-global")) { + // Targets must either support 64-bit offsets for MUBUF instructions, and/or + // support flat operations, otherwise they cannot access a 64-bit global + // address space + assert(hasAddr64() || hasFlat()); + // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets + // that do not support ADDR64 variants of MUBUF instructions. Such targets + // cannot use a 64 bit offset with a MUBUF instruction to access the global + // address space + if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { + ToggleFeature(AMDGPU::FeatureFlatForGlobal); FlatForGlobal = true; } + // Unless +-flat-for-global is specified, use MUBUF instructions for global + // address space access if flat operations are not available. + if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { + ToggleFeature(AMDGPU::FeatureFlatForGlobal); + FlatForGlobal = false; + } // Set defaults if needed. if (MaxPrivateElementSize == 0) @@ -131,20 +164,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - // Disable XNACK on targets where it is not enabled by default unless it is - // explicitly requested. - if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { - ToggleFeature(AMDGPU::FeatureXNACK); - EnableXNACK = false; - } + TargetID.setTargetIDFromFeaturesString(FS); - // ECC is on by default, but turn it off if the hardware doesn't support it - // anyway. This matters for the gfx9 targets with d16 loads, but don't support - // ECC. - if (DoesNotSupportSRAMECC && EnableSRAMECC) { - ToggleFeature(AMDGPU::FeatureSRAMECC); - EnableSRAMECC = false; - } + LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " + << TargetID.getXnackSetting() << '\n'); + LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " + << TargetID.getSramEccSetting() << '\n'); return *this; } @@ -170,10 +195,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM) : - AMDGPUGenSubtargetInfo(TT, GPU, FS), + AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), TargetTriple(TT), - Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS), + TargetID(*this), + Gen(INVALID), InstrItins(getInstrItineraryForCPU(GPU)), LDSBankCount(0), MaxPrivateElementSize(0), @@ -184,13 +210,12 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), - CodeObjectV3(false), UnalignedScratchAccess(false), - UnalignedBufferAccess(false), + UnalignedAccessMode(false), HasApertureRegs(false), + SupportsXNACK(false), EnableXNACK(false), - DoesNotSupportXNACK(false), EnableCuMode(false), TrapHandler(false), @@ -239,8 +264,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasMAIInsts(false), HasPkFmacF16Inst(false), HasAtomicFaddInsts(false), + SupportsSRAMECC(false), EnableSRAMECC(false), - DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), HasGetWaveIdInst(false), @@ -257,6 +282,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasUnpackedD16VMem(false), LDSMisalignedBug(false), HasMFMAInlineLiteralBug(false), + UnalignedBufferAccess(false), + UnalignedDSAccess(false), ScalarizeGlobal(false), @@ -269,6 +296,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasNSAtoVMEMBug(false), HasOffset3fBug(false), HasFlatSegmentOffsetBug(false), + HasImageStoreD16Bug(false), + HasImageGather4D16Bug(false), FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), @@ -283,20 +312,24 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } +bool GCNSubtarget::enableFlatScratch() const { + return EnableFlatScratch && hasFlatScratchInsts(); +} + unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { if (getGeneration() < GFX10) return 1; switch (Opcode) { - case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: - case AMDGPU::V_LSHL_B64: - case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHL_B64_e64: + case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: - case AMDGPU::V_LSHR_B64: - case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_LSHR_B64_e64: + case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: - case AMDGPU::V_ASHR_I64: + case AMDGPU::V_ASHR_I64_e64: return 1; } @@ -436,6 +469,25 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( return Requested; } +static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { + auto Node = Kernel.getMetadata("reqd_work_group_size"); + if (Node && Node->getNumOperands() == 3) + return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue(); + return std::numeric_limits<unsigned>::max(); +} + +bool AMDGPUSubtarget::isMesaKernel(const Function &F) const { + return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); +} + +unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel, + unsigned Dimension) const { + unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension); + if (ReqdSize != std::numeric_limits<unsigned>::max()) + return ReqdSize - 1; + return getFlatWorkGroupSizes(Kernel).second - 1; +} + bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { Function *Kernel = I->getParent()->getParent(); unsigned MinSize = 0; @@ -472,11 +524,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { default: break; } + if (Dim <= 3) { - if (auto Node = Kernel->getMetadata("reqd_work_group_size")) - if (Node->getNumOperands() == 3) - MinSize = MaxSize = mdconst::extract<ConstantInt>( - Node->getOperand(Dim))->getZExtValue(); + unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim); + if (ReqdSize != std::numeric_limits<unsigned>::max()) + MinSize = MaxSize = ReqdSize; } } } @@ -498,6 +550,12 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { return true; } +unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { + if (isMesaKernel(F)) + return 16; + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); +} + uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, Align &MaxAlign) const { assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || @@ -508,12 +566,15 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, MaxAlign = Align(1); for (const Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); + const bool IsByRef = Arg.hasByRefAttr(); + Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); + MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; + if (!Alignment) + Alignment = DL.getABITypeAlign(ArgTy); - const Align Alignment = DL.getABITypeAlign(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; - MaxAlign = std::max(MaxAlign, Alignment); + MaxAlign = max(MaxAlign, Alignment); } return ExplicitArgBytes; @@ -536,9 +597,14 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, return alignTo(TotalSize, 4); } +AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const { + return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32 + : AMDGPUDwarfFlavour::Wave64; +} + R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : - R600GenSubtargetInfo(TT, GPU, FS), + R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS), AMDGPUSubtarget(TT), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), @@ -571,13 +637,15 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, } bool GCNSubtarget::hasMadF16() const { - return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1; + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; } bool GCNSubtarget::useVGPRIndexMode() const { return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); } +bool GCNSubtarget::useAA() const { return UseAA; } + unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) return getMaxWavesPerEU(); @@ -787,7 +855,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { for (unsigned I = 0; I < Succs.size(); ++I) { for (const SDep &SI : Succs[I]->Succs) { const SUnit *SU = SI.getSUnit(); - if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end()) + if (SU != Succs[I] && !llvm::is_contained(Succs, SU)) Succs.push_back(SU); } } @@ -795,7 +863,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { SmallPtrSet<const SUnit*, 32> Visited; while (!Preds.empty()) { const SUnit *SU = Preds.pop_back_val(); - if (llvm::find(Succs, SU) != Succs.end()) + if (llvm::is_contained(Succs, SU)) return false; Visited.insert(SU); for (const SDep &SI : SU->Preds) @@ -859,8 +927,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { for (SUnit &SU : DAG->SUnits) { MachineInstr &MAI = *SU.getInstr(); if (!TII->isMAI(MAI) || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32) + MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) continue; unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; |