diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 181 |
1 files changed, 70 insertions, 111 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 445e91092499..213788ae0f67 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -59,13 +59,6 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP32Denormals = false; - } - HasMulU24 = getGeneration() >= EVERGREEN; HasMulI24 = hasCaymanISA(); @@ -76,9 +69,6 @@ GCNSubtarget & GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics - // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be - // enabled, but some instructions do not respect them and they run at the - // double precision rate, so don't enable by default. // // We want to be able to turn these off, but making this a subtarget feature // for SI has the unhelpful behavior that it unsets everything else if you @@ -88,20 +78,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // unset everything else if it is disabled // Assuming ECC is enabled is the conservative default. - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - FullFS += "+fp64-fp16-denormals,"; - } else { - FullFS += "-fp32-denormals,"; - } - FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS // Disable mutually exclusive bits. @@ -145,12 +126,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, } // Don't crash on invalid devices. - if (WavefrontSize == 0) - WavefrontSize = 64; + if (WavefrontSizeLog2 == 0) + WavefrontSizeLog2 = 5; HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - if (DoesNotSupportXNACK && EnableXNACK) { + // Disable XNACK on targets where it is not enabled by default unless it is + // explicitly requested. + if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { ToggleFeature(AMDGPU::FeatureXNACK); EnableXNACK = false; } @@ -170,8 +153,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT), Has16BitInsts(false), HasMadMixInsts(false), - FP32Denormals(false), - FPExceptions(false), + HasMadMacF32Insts(false), + HasDsSrc2Insts(false), HasSDWA(false), HasVOP3PInsts(false), HasMulI24(true), @@ -182,7 +165,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasTrigReducedRange(false), MaxWavesPerEU(10), LocalMemorySize(0), - WavefrontSize(0) + WavefrontSizeLog2(0) { } GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, @@ -196,9 +179,9 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, MaxPrivateElementSize(0), FastFMAF32(false), + FastDenormalF32(false), HalfRate64Ops(false), - FP64FP16Denormals(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), CodeObjectV3(false), @@ -224,6 +207,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, GFX8Insts(false), GFX9Insts(false), GFX10Insts(false), + GFX10_3Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), @@ -241,7 +225,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasDPP(false), HasDPP8(false), HasR128A16(false), + HasGFX10A16(false), + HasG16(false), HasNSAEncoding(false), + GFX10_BEncoding(false), HasDLInsts(false), HasDot1Insts(false), HasDot2Insts(false), @@ -256,6 +243,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), + HasGetWaveIdInst(false), + HasSMemTimeInst(false), HasRegisterBanking(false), HasVOP3Literal(false), HasNoDataDepHazard(false), @@ -287,6 +276,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); RegBankInfo.reset(new AMDGPURegisterBankInfo(*this)); InstSelector.reset(new AMDGPUInstructionSelector( @@ -325,18 +315,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } +// FIXME: Should return min,max range. unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { - unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; - unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); - if (!WorkGroupsPerCu) + const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second; + const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize); + if (!MaxWorkGroupsPerCu) return 0; - unsigned MaxWaves = getMaxWavesPerEU(); - unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; - unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); - NumWaves = std::min(NumWaves, MaxWaves); - NumWaves = std::max(NumWaves, 1u); - return NumWaves; + + const unsigned WaveSize = getWavefrontSize(); + + // FIXME: Do we need to account for alignment requirement of LDS rounding the + // size up? + // Compute restriction based on LDS usage + unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u); + + // This can be queried with more LDS than is possible, so just assume the + // worst. + if (NumGroups == 0) + return 1; + + NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups); + + // Round to the number of waves. + const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize; + unsigned MaxWaves = NumGroups * MaxGroupNumWaves; + + // Clamp to the maximum possible number of waves. + MaxWaves = std::min(MaxWaves, getMaxWavesPerEU()); + + // FIXME: Needs to be a multiple of the group size? + //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves); + + assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() && + "computed invalid occupancy"); + return MaxWaves; } unsigned @@ -396,13 +409,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // number of waves per execution unit to values implied by requested // minimum/maximum flat work group sizes. unsigned MinImpliedByFlatWorkGroupSize = - getMaxWavesPerEU(FlatWorkGroupSizes.second); - bool RequestedFlatWorkGroupSize = false; - - if (F.hasFnAttribute("amdgpu-flat-work-group-size")) { - Default.first = MinImpliedByFlatWorkGroupSize; - RequestedFlatWorkGroupSize = true; - } + getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); + Default.first = MinImpliedByFlatWorkGroupSize; + bool RequestedFlatWorkGroupSize = + F.hasFnAttribute("amdgpu-flat-work-group-size"); // Requested minimum/maximum number of waves per execution unit. std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( @@ -414,9 +424,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // Make sure requested values do not violate subtarget's specifications. if (Requested.first < getMinWavesPerEU() || - Requested.first > getMaxWavesPerEU()) - return Default; - if (Requested.second > getMaxWavesPerEU()) + Requested.second > getMaxWavesPerEU()) return Default; // Make sure requested values are compatible with values implied by requested @@ -497,12 +505,12 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, const DataLayout &DL = F.getParent()->getDataLayout(); uint64_t ExplicitArgBytes = 0; - MaxAlign = Align::None(); + MaxAlign = Align(1); for (const Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - const Align Alignment(DL.getABITypeAlignment(ArgTy)); + const Align Alignment = DL.getABITypeAlign(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; MaxAlign = std::max(MaxAlign, Alignment); @@ -622,13 +630,12 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } -unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, - unsigned LDSSize, +unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, unsigned NumSGPRs, unsigned NumVGPRs) const { unsigned Occupancy = std::min(getMaxWavesPerEU(), - getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + getOccupancyWithLocalMemSize(LDSSize, F)); if (NumSGPRs) Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); if (NumVGPRs) @@ -716,20 +723,20 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return MaxNumVGPRs; } -void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, - SDep &Dep) const { +void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, + int UseOpIdx, SDep &Dep) const { if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || - !Src->isInstr() || !Dst->isInstr()) + !Def->isInstr() || !Use->isInstr()) return; - MachineInstr *SrcI = Src->getInstr(); - MachineInstr *DstI = Dst->getInstr(); + MachineInstr *DefI = Def->getInstr(); + MachineInstr *UseI = Use->getInstr(); - if (SrcI->isBundle()) { + if (DefI->isBundle()) { const SIRegisterInfo *TRI = getRegisterInfo(); auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(SrcI->getIterator()); - MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end()); + MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); + MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); unsigned Lat = 0; for (++I; I != E && I->isBundledWithPred(); ++I) { if (I->modifiesRegister(Reg, TRI)) @@ -738,12 +745,12 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, --Lat; } Dep.setLatency(Lat); - } else if (DstI->isBundle()) { + } else if (UseI->isBundle()) { const SIRegisterInfo *TRI = getRegisterInfo(); auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(DstI->getIterator()); - MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end()); - unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI); + MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); + MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); + unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { if (I->readsRegister(Reg, TRI)) break; @@ -754,53 +761,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, } namespace { -struct MemOpClusterMutation : ScheduleDAGMutation { - const SIInstrInfo *TII; - - MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} - - void apply(ScheduleDAGInstrs *DAG) override { - SUnit *SUa = nullptr; - // Search for two consequent memory operations and link them - // to prevent scheduler from moving them apart. - // In DAG pre-process SUnits are in the original order of - // the instructions before scheduling. - for (SUnit &SU : DAG->SUnits) { - MachineInstr &MI2 = *SU.getInstr(); - if (!MI2.mayLoad() && !MI2.mayStore()) { - SUa = nullptr; - continue; - } - if (!SUa) { - SUa = &SU; - continue; - } - - MachineInstr &MI1 = *SUa->getInstr(); - if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || - (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || - (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || - (TII->isDS(MI1) && TII->isDS(MI2))) { - SU.addPredBarrier(SUa); - - for (const SDep &SI : SU.Preds) { - if (SI.getSUnit() != SUa) - SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); - } - - if (&SU != &DAG->ExitSU) { - for (const SDep &SI : SUa->Succs) { - if (SI.getSUnit() != &SU) - SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); - } - } - } - - SUa = &SU; - } - } -}; - struct FillMFMAShadowMutation : ScheduleDAGMutation { const SIInstrInfo *TII; @@ -927,7 +887,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo)); Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } |