Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 181
1 file changed, 70 insertions(+), 111 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 445e91092499..213788ae0f67 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -59,13 +59,6 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
- // FIXME: I don't think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- FP32Denormals = false;
- }
-
HasMulU24 = getGeneration() >= EVERGREEN;
HasMulI24 = hasCaymanISA();
@@ -76,9 +69,6 @@ GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
- // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
- // enabled, but some instructions do not respect them and they run at the
- // double precision rate, so don't enable by default.
//
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
@@ -88,20 +78,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// unset everything else if it is disabled
// Assuming ECC is enabled is the conservative default.
- SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
- // FIXME: I don't think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- FullFS += "+fp64-fp16-denormals,";
- } else {
- FullFS += "-fp32-denormals,";
- }
-
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Disable mutually exclusive bits.
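
The hunk above relies on LLVM's left-to-right feature resolution: ParseSubtargetFeatures applies "+x"/"-x" entries in order, so defaults placed at the front of FullFS can be overridden by the user string appended after them (the "+enable-prt-strict-null" comment depends on exactly this). A minimal sketch of that ordering, assuming this version's two-argument ParseSubtargetFeatures and using purely illustrative feature values:

    // Sketch: later entries win, so user-supplied features must come last.
    SmallString<256> FullFS("+load-store-opt,+xnack,"); // conservative defaults
    if (isAmdHsaOS())
      FullFS += "+flat-for-global,";                    // OS-dependent defaults
    FullFS += FS;              // e.g. a "-xnack" here disables the default above
    ParseSubtargetFeatures(GPU, FullFS);                // resolved left to right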
@@ -145,12 +126,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
}
// Don't crash on invalid devices.
- if (WavefrontSize == 0)
- WavefrontSize = 64;
+ if (WavefrontSizeLog2 == 0)
+ WavefrontSizeLog2 = 5;
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- if (DoesNotSupportXNACK && EnableXNACK) {
+ // Disable XNACK on targets where it is not enabled by default unless it is
+ // explicitly requested.
+ if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
ToggleFeature(AMDGPU::FeatureXNACK);
EnableXNACK = false;
}
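
Note on the wavefront-size hunk above: the size is now stored as a log2 value, so the fallback of 5 for invalid devices corresponds to a 32-lane wave (1 << 5). A one-line sketch of the presumed accessor relationship (the getter body is an assumption, not shown in this diff):

    unsigned getWavefrontSize() const { return 1u << WavefrontSizeLog2; } // 5 -> 32, 6 -> 64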
@@ -170,8 +153,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
TargetTriple(TT),
Has16BitInsts(false),
HasMadMixInsts(false),
- FP32Denormals(false),
- FPExceptions(false),
+ HasMadMacF32Insts(false),
+ HasDsSrc2Insts(false),
HasSDWA(false),
HasVOP3PInsts(false),
HasMulI24(true),
@@ -182,7 +165,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasTrigReducedRange(false),
MaxWavesPerEU(10),
LocalMemorySize(0),
- WavefrontSize(0)
+ WavefrontSizeLog2(0)
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
@@ -196,9 +179,9 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
MaxPrivateElementSize(0),
FastFMAF32(false),
+ FastDenormalF32(false),
HalfRate64Ops(false),
- FP64FP16Denormals(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
CodeObjectV3(false),
@@ -224,6 +207,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
GFX8Insts(false),
GFX9Insts(false),
GFX10Insts(false),
+ GFX10_3Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
@@ -241,7 +225,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasDPP(false),
HasDPP8(false),
HasR128A16(false),
+ HasGFX10A16(false),
+ HasG16(false),
HasNSAEncoding(false),
+ GFX10_BEncoding(false),
HasDLInsts(false),
HasDot1Insts(false),
HasDot2Insts(false),
@@ -256,6 +243,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
+ HasGetWaveIdInst(false),
+ HasSMemTimeInst(false),
HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
@@ -287,6 +276,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
InstSelector.reset(new AMDGPUInstructionSelector(
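
The constructor hunk above adds an InlineAsmLowering object alongside the other GlobalISel pieces (call lowering, legalizer, register bank info, instruction selector). Presumably it is exposed through the usual subtarget accessor pattern; a sketch only, since the accessor itself is not part of this diff:

    const InlineAsmLowering *getInlineAsmLowering() const override {
      return InlineAsmLoweringInfo.get(); // hypothetical accessor, not in this hunk
    }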
@@ -325,18 +315,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
+// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
- unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
- unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
- if (!WorkGroupsPerCu)
+ const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
+ const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
+ if (!MaxWorkGroupsPerCu)
return 0;
- unsigned MaxWaves = getMaxWavesPerEU();
- unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
- unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
- NumWaves = std::min(NumWaves, MaxWaves);
- NumWaves = std::max(NumWaves, 1u);
- return NumWaves;
+
+ const unsigned WaveSize = getWavefrontSize();
+
+ // FIXME: Do we need to account for alignment requirement of LDS rounding the
+ // size up?
+ // Compute restriction based on LDS usage
+ unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
+
+ // This can be queried with more LDS than is possible, so just assume the
+ // worst.
+ if (NumGroups == 0)
+ return 1;
+
+ NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
+
+ // Round to the number of waves.
+ const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+ unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
+
+ // Clamp to the maximum possible number of waves.
+ MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+
+ // FIXME: Needs to be a multiple of the group size?
+ //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+
+ assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
+ "computed invalid occupancy");
+ return MaxWaves;
}
unsigned
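
A worked example of the rewritten getOccupancyWithLocalMemSize above, under assumed numbers (64 KiB of LDS, wave size 64, getMaxWavesPerEU() == 10, MaxWorkGroupsPerCu large enough not to bind):

    // Bytes = 16384, MaxWorkGroupSize = 256:
    unsigned NumGroups = 65536 / 16384;               // 4 groups fit in LDS
    unsigned MaxGroupNumWaves = (256 + 64 - 1) / 64;  // 4 waves per group
    unsigned MaxWaves = NumGroups * MaxGroupNumWaves; // 16 waves
    MaxWaves = std::min(MaxWaves, 10u);               // clamped to 10 waves per EU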
@@ -396,13 +409,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// number of waves per execution unit to values implied by requested
// minimum/maximum flat work group sizes.
unsigned MinImpliedByFlatWorkGroupSize =
- getMaxWavesPerEU(FlatWorkGroupSizes.second);
- bool RequestedFlatWorkGroupSize = false;
-
- if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
- Default.first = MinImpliedByFlatWorkGroupSize;
- RequestedFlatWorkGroupSize = true;
- }
+ getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
+ Default.first = MinImpliedByFlatWorkGroupSize;
+ bool RequestedFlatWorkGroupSize =
+ F.hasFnAttribute("amdgpu-flat-work-group-size");
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -414,9 +424,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values do not violate subtarget's specifications.
if (Requested.first < getMinWavesPerEU() ||
- Requested.first > getMaxWavesPerEU())
- return Default;
- if (Requested.second > getMaxWavesPerEU())
+ Requested.second > getMaxWavesPerEU())
return Default;
// Make sure requested values are compatible with values implied by requested
@@ -497,12 +505,12 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
const DataLayout &DL = F.getParent()->getDataLayout();
uint64_t ExplicitArgBytes = 0;
- MaxAlign = Align::None();
+ MaxAlign = Align(1);
for (const Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- const Align Alignment(DL.getABITypeAlignment(ArgTy));
+ const Align Alignment = DL.getABITypeAlign(ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
MaxAlign = std::max(MaxAlign, Alignment);
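
To make the alignTo accumulation in getExplicitKernArgSize concrete, a hedged example for a hypothetical kernel taking (i32, double, <4 x float>), assuming the usual size/alignment pairs of 4/4, 8/8, and 16/16:

    uint64_t Bytes = 0;                     // MaxAlign starts at Align(1)
    Bytes = alignTo(Bytes, Align(4)) + 4;   // i32:         offset 0,  total 4
    Bytes = alignTo(Bytes, Align(8)) + 8;   // double:      offset 8,  total 16
    Bytes = alignTo(Bytes, Align(16)) + 16; // <4 x float>: offset 16, total 32
                                            // MaxAlign ends at Align(16)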
@@ -622,13 +630,12 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
-unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
- unsigned LDSSize,
+unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
unsigned Occupancy =
std::min(getMaxWavesPerEU(),
- getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
+ getOccupancyWithLocalMemSize(LDSSize, F));
if (NumSGPRs)
Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
if (NumVGPRs)
@@ -716,20 +723,20 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return MaxNumVGPRs;
}
-void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
- SDep &Dep) const {
+void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
+ int UseOpIdx, SDep &Dep) const {
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
- !Src->isInstr() || !Dst->isInstr())
+ !Def->isInstr() || !Use->isInstr())
return;
- MachineInstr *SrcI = Src->getInstr();
- MachineInstr *DstI = Dst->getInstr();
+ MachineInstr *DefI = Def->getInstr();
+ MachineInstr *UseI = Use->getInstr();
- if (SrcI->isBundle()) {
+ if (DefI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
- MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
- MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
+ MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
+ MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
unsigned Lat = 0;
for (++I; I != E && I->isBundledWithPred(); ++I) {
if (I->modifiesRegister(Reg, TRI))
@@ -738,12 +745,12 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
--Lat;
}
Dep.setLatency(Lat);
- } else if (DstI->isBundle()) {
+ } else if (UseI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
- MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
- MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
- unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
+ MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
+ MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
+ unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
if (I->readsRegister(Reg, TRI))
break;
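
The use-side branch of the renamed adjustSchedDependency above starts from DefI's full latency and decrements it once per bundled instruction that precedes the first reader of the register, modelling that the reader issues later within the bundle. A worked trace under assumed numbers:

    // Lat = getInstrLatency(DefI) = 4; Reg is first read by the third
    // instruction inside the use bundle:
    //   insn 1: no read of Reg -> --Lat; Lat = 3
    //   insn 2: no read of Reg -> --Lat; Lat = 2
    //   insn 3: reads Reg      -> break
    // Dep.setLatency(2)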
@@ -754,53 +761,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
}
namespace {
-struct MemOpClusterMutation : ScheduleDAGMutation {
- const SIInstrInfo *TII;
-
- MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
-
- void apply(ScheduleDAGInstrs *DAG) override {
- SUnit *SUa = nullptr;
- // Search for two consequent memory operations and link them
- // to prevent scheduler from moving them apart.
- // In DAG pre-process SUnits are in the original order of
- // the instructions before scheduling.
- for (SUnit &SU : DAG->SUnits) {
- MachineInstr &MI2 = *SU.getInstr();
- if (!MI2.mayLoad() && !MI2.mayStore()) {
- SUa = nullptr;
- continue;
- }
- if (!SUa) {
- SUa = &SU;
- continue;
- }
-
- MachineInstr &MI1 = *SUa->getInstr();
- if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
- (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
- (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
- (TII->isDS(MI1) && TII->isDS(MI2))) {
- SU.addPredBarrier(SUa);
-
- for (const SDep &SI : SU.Preds) {
- if (SI.getSUnit() != SUa)
- SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
- }
-
- if (&SU != &DAG->ExitSU) {
- for (const SDep &SI : SUa->Succs) {
- if (SI.getSUnit() != &SU)
- SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
- }
- }
- }
-
- SUa = &SU;
- }
- }
-};
-
struct FillMFMAShadowMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;
@@ -927,7 +887,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}