Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 172
1 file changed, 120 insertions(+), 52 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 213788ae0f67..f1a7d7463676 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,18 +13,21 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
@@ -50,6 +53,15 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
+static cl::opt<bool> EnableFlatScratch(
+ "amdgpu-enable-flat-scratch",
+ cl::desc("Use flat scratch instructions"),
+ cl::init(false));
+
+static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
+ cl::desc("Enable the use of AA during codegen."),
+ cl::init(true));
+
GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
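The EnableFlatScratch and UseAA options added above follow the usual llvm::cl::opt pattern: each is a static global whose value is filled in when the driver parses the command line and is then read like a plain bool (see enableFlatScratch() and useAA() further down in this diff). A minimal standalone sketch of that pattern, using a hypothetical flag name rather than anything from this patch:

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Hypothetical flag, for illustration only.
    static cl::opt<bool> DemoFlag("demo-flag",
                                  cl::desc("Example boolean option"),
                                  cl::init(false));

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      return DemoFlag ? 0 : 1; // the parsed value reads like an ordinary bool
    }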
@@ -57,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
SmallString<256> FullFS("+promote-alloca,");
FullFS += FS;
- ParseSubtargetFeatures(GPU, FullFS);
+ ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
HasMulU24 = getGeneration() >= EVERGREEN;
HasMulI24 = hasCaymanISA();
@@ -77,11 +89,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
- // Assuming ECC is enabled is the conservative default.
- SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
- if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+ // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by default.
+ if (isAmdHsaOS())
+ FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
@@ -97,17 +109,38 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += FS;
- ParseSubtargetFeatures(GPU, FullFS);
+ ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
+
+ // Implement the "generic" processors, which act as the default when no
+ // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
+ // the first amdgcn target that supports flat addressing. Other OSes default
+ // to the first amdgcn target.
+ if (Gen == AMDGPUSubtarget::INVALID) {
+ Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
+ : AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
// We don't support FP64 for EG/NI atm.
assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
- // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
- // on VI and newer hardware to avoid assertion failures due to missing ADDR64
- // variants of MUBUF instructions.
- if (!hasAddr64() && !FS.contains("flat-for-global")) {
+ // Targets must support 64-bit offsets for MUBUF instructions and/or
+ // flat operations; otherwise they cannot access a 64-bit global
+ // address space.
+ assert(hasAddr64() || hasFlat());
+ // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
+ // that do not support ADDR64 variants of MUBUF instructions. Such targets
+ // cannot use a 64-bit offset with a MUBUF instruction to access the global
+ // address space.
+ if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
+ ToggleFeature(AMDGPU::FeatureFlatForGlobal);
FlatForGlobal = true;
}
+ // Unless +-flat-for-global is specified, use MUBUF instructions for global
+ // address space access if flat operations are not available.
+ if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
+ ToggleFeature(AMDGPU::FeatureFlatForGlobal);
+ FlatForGlobal = false;
+ }
// Set defaults if needed.
if (MaxPrivateElementSize == 0)
@@ -131,20 +164,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- // Disable XNACK on targets where it is not enabled by default unless it is
- // explicitly requested.
- if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
- ToggleFeature(AMDGPU::FeatureXNACK);
- EnableXNACK = false;
- }
+ TargetID.setTargetIDFromFeaturesString(FS);
- // ECC is on by default, but turn it off if the hardware doesn't support it
- // anyway. This matters for the gfx9 targets with d16 loads, but don't support
- // ECC.
- if (DoesNotSupportSRAMECC && EnableSRAMECC) {
- ToggleFeature(AMDGPU::FeatureSRAMECC);
- EnableSRAMECC = false;
- }
+ LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
+ << TargetID.getXnackSetting() << '\n');
+ LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
+ << TargetID.getSramEccSetting() << '\n');
return *this;
}
@@ -170,10 +195,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM) :
- AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
- Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
+ TargetID(*this),
+ Gen(INVALID),
InstrItins(getInstrItineraryForCPU(GPU)),
LDSBankCount(0),
MaxPrivateElementSize(0),
@@ -184,13 +210,12 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
- CodeObjectV3(false),
UnalignedScratchAccess(false),
- UnalignedBufferAccess(false),
+ UnalignedAccessMode(false),
HasApertureRegs(false),
+ SupportsXNACK(false),
EnableXNACK(false),
- DoesNotSupportXNACK(false),
EnableCuMode(false),
TrapHandler(false),
@@ -239,8 +264,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasMAIInsts(false),
HasPkFmacF16Inst(false),
HasAtomicFaddInsts(false),
+ SupportsSRAMECC(false),
EnableSRAMECC(false),
- DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
HasGetWaveIdInst(false),
@@ -257,6 +282,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
HasMFMAInlineLiteralBug(false),
+ UnalignedBufferAccess(false),
+ UnalignedDSAccess(false),
ScalarizeGlobal(false),
@@ -269,6 +296,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasNSAtoVMEMBug(false),
HasOffset3fBug(false),
HasFlatSegmentOffsetBug(false),
+ HasImageStoreD16Bug(false),
+ HasImageGather4D16Bug(false),
FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
@@ -283,20 +312,24 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
+bool GCNSubtarget::enableFlatScratch() const {
+ return EnableFlatScratch && hasFlatScratchInsts();
+}
+
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
if (getGeneration() < GFX10)
return 1;
switch (Opcode) {
- case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
- case AMDGPU::V_LSHL_B64:
- case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHL_B64_e64:
+ case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
- case AMDGPU::V_LSHR_B64:
- case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_LSHR_B64_e64:
+ case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
- case AMDGPU::V_ASHR_I64:
+ case AMDGPU::V_ASHR_I64_e64:
return 1;
}
@@ -436,6 +469,25 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
return Requested;
}
+static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+ auto Node = Kernel.getMetadata("reqd_work_group_size");
+ if (Node && Node->getNumOperands() == 3)
+ return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
+ return std::numeric_limits<unsigned>::max();
+}
+
+bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
+ return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
+}
+
+unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
+ unsigned Dimension) const {
+ unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+ if (ReqdSize != std::numeric_limits<unsigned>::max())
+ return ReqdSize - 1;
+ return getFlatWorkGroupSizes(Kernel).second - 1;
+}
+
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
Function *Kernel = I->getParent()->getParent();
unsigned MinSize = 0;
@@ -472,11 +524,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
default:
break;
}
+
if (Dim <= 3) {
- if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
- if (Node->getNumOperands() == 3)
- MinSize = MaxSize = mdconst::extract<ConstantInt>(
- Node->getOperand(Dim))->getZExtValue();
+ unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+ if (ReqdSize != std::numeric_limits<unsigned>::max())
+ MinSize = MaxSize = ReqdSize;
}
}
}
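The getReqdWorkGroupSize() helper introduced above reads the standard three-operand reqd_work_group_size function metadata; getMaxWorkitemID() and makeLIDRangeMetadata() then use it to tighten work-item ID ranges. A rough sketch, under the assumption that a frontend attaches the metadata itself, of how such a node could be built with the generic Metadata API (illustrative only, not part of this patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Tag a kernel with a fixed 256x1x1 work-group size so that
    // getReqdWorkGroupSize(F, 0) would return 256 and
    // getMaxWorkitemID(F, 0) would return 255.
    static void tagReqdWorkGroupSize(Function &F) {
      LLVMContext &Ctx = F.getContext();
      Type *I32 = Type::getInt32Ty(Ctx);
      Metadata *Ops[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32, 256)),
          ConstantAsMetadata::get(ConstantInt::get(I32, 1)),
          ConstantAsMetadata::get(ConstantInt::get(I32, 1))};
      F.setMetadata("reqd_work_group_size", MDNode::get(Ctx, Ops));
    }

Without the metadata, the new helpers fall back to the flat work-group size bounds, as before.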
@@ -498,6 +550,12 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
return true;
}
+unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
+ if (isMesaKernel(F))
+ return 16;
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+}
+
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
Align &MaxAlign) const {
assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
@@ -508,12 +566,15 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
MaxAlign = Align(1);
for (const Argument &Arg : F.args()) {
- Type *ArgTy = Arg.getType();
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
+ if (!Alignment)
+ Alignment = DL.getABITypeAlign(ArgTy);
- const Align Alignment = DL.getABITypeAlign(ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
- MaxAlign = std::max(MaxAlign, Alignment);
+ MaxAlign = max(MaxAlign, Alignment);
}
return ExplicitArgBytes;
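As a worked example of the new byref handling in getExplicitKernArgSize() (hypothetical kernel signature, not taken from this patch): for a kernel taking an i32 followed by a byref(%struct.S) align 16 argument where %struct.S occupies 16 bytes, the i32 contributes alignTo(0, 4) + 4 = 4 bytes, the byref argument contributes alignTo(4, 16) + 16 bytes for a running total of 32, and MaxAlign ends up as 16. Before this change the pointer type of the byref argument, rather than the pointee type and its explicit alignment, would have been measured.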
@@ -536,9 +597,14 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
return alignTo(TotalSize, 4);
}
+AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
+ return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
+ : AMDGPUDwarfFlavour::Wave64;
+}
+
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
- R600GenSubtargetInfo(TT, GPU, FS),
+ R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
AMDGPUSubtarget(TT),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
@@ -571,13 +637,15 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
}
bool GCNSubtarget::hasMadF16() const {
- return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
+ return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
bool GCNSubtarget::useVGPRIndexMode() const {
return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}
+bool GCNSubtarget::useAA() const { return UseAA; }
+
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return getMaxWavesPerEU();
@@ -787,7 +855,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
for (unsigned I = 0; I < Succs.size(); ++I) {
for (const SDep &SI : Succs[I]->Succs) {
const SUnit *SU = SI.getSUnit();
- if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
+ if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
Succs.push_back(SU);
}
}
@@ -795,7 +863,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
SmallPtrSet<const SUnit*, 32> Visited;
while (!Preds.empty()) {
const SUnit *SU = Preds.pop_back_val();
- if (llvm::find(Succs, SU) != Succs.end())
+ if (llvm::is_contained(Succs, SU))
return false;
Visited.insert(SU);
for (const SDep &SI : SU->Preds)
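The two hunks above only swap the explicit find-against-end idiom for the equivalent llvm::is_contained helper; roughly (illustrative, not part of the patch):

    // These two expressions compute the same thing; the second states the
    // intent more directly.
    bool FoundViaFind      = llvm::find(Succs, SU) != Succs.end();
    bool FoundViaContained = llvm::is_contained(Succs, SU);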
@@ -859,8 +927,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
for (SUnit &SU : DAG->SUnits) {
MachineInstr &MAI = *SU.getInstr();
if (!TII->isMAI(MAI) ||
- MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
- MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
continue;
unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;