Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
51 files changed, 1235 insertions, 1012 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index aab76d27ef11..d28f38e42430 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -173,14 +173,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { const AMDGPUFunctionArgInfo & AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { auto I = ArgInfoMap.find(&F); - if (I == ArgInfoMap.end()) { - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - return FixedABIFunctionInfo; - - // Without the fixed ABI, we assume no function has special inputs. - assert(F.isDeclaration()); - return ExternFunctionInfo; - } - + if (I == ArgInfoMap.end()) + return FixedABIFunctionInfo; return I->second; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index f0aadab3302f..b4ebc7d7d75f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -112,6 +112,17 @@ static bool isDSAddress(const Constant *C) { return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; } +/// Returns true if the function requires the implicit argument be passed +/// regardless of the function contents. +static bool funcRequiresImplicitArgPtr(const Function &F) { + // Sanitizers require the hostcall buffer passed in the implicit arguments. + return F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeThread) || + F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.hasFnAttribute(Attribute::SanitizeMemTag); +} + namespace { class AMDGPUInformationCache : public InformationCache { public: @@ -296,7 +307,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { bool AllCallSitesKnown = true; if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) - indicatePessimisticFixpoint(); + return indicatePessimisticFixpoint(); return Change; } @@ -339,7 +350,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); + + // If the function requires the implicit arg pointer due to sanitizers, + // assume it's needed even if explicitly marked as not requiring it. 
+ const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F); + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); + for (auto Attr : ImplicitAttrs) { + if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR) + continue; + if (F->hasFnAttribute(Attr.second)) addKnownBits(Attr.first); } @@ -500,6 +521,9 @@ struct AAAMDFlatWorkGroupSize std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F); intersectKnown( ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1))); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); } ChangeStatus updateImpl(Attributor &A) override { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 43928d7c2a09..2f1e7823f65c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -652,8 +652,8 @@ bool AMDGPUCallLowering::lowerFormalArguments( ++PSInputNum; if (SkipArg) { - for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) - B.buildUndef(VRegs[Idx][I]); + for (Register R : VRegs[Idx]) + B.buildUndef(R); ++Idx; continue; @@ -715,10 +715,9 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!MBB.empty()) B.setInstr(*MBB.begin()); - if (!IsEntryFunc) { + if (!IsEntryFunc && !IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } IncomingValueAssigner Assigner(AssignFn); @@ -731,11 +730,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( uint64_t StackOffset = Assigner.StackOffset; - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. if (IsEntryFunc) { TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics); @@ -829,9 +823,12 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArg) { LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); - } else { - assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); + } else { + // We may have proven the input wasn't needed, although the ABI is + // requiring it. We just need to allocate the register appropriately. + MIRBuilder.buildUndef(InputReg); } if (OutgoingArg->isRegister()) { @@ -1233,8 +1230,7 @@ bool AMDGPUCallLowering::lowerTailCall( // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. 
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1300,12 +1296,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const SITargetLowering &TLI = *getTLI<SITargetLowering>(); const DataLayout &DL = F.getParent()->getDataLayout(); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { - LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n"); - return false; - } - SmallVector<ArgInfo, 8> OutArgs; for (auto &OrigArg : Info.OrigArgs) splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv); @@ -1359,8 +1349,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c7c5ff7bcbe7..2415fdfecaae 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -64,6 +64,30 @@ def int_minmax_to_med3 : GICombineRule< [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; +def fp_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + +def fp_minmax_to_clamp : GICombineRule< + (defs root:$min_or_max, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>; + +def fmed3_intrinsic_to_clamp : GICombineRule< + (defs root:$fmed3, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_INTRINSIC):$fmed3, + [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; + def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< @@ -102,7 +126,9 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> { + "AMDGPUGenRegBankCombinerHelper", + [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index 301e6f6d6f42..e79ff9b597c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -378,5 +378,4 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, } MI.eraseFromParent(); - return; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 12cef2774aaf..7fd94a977be7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -172,6 +172,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; +def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; +def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b9c59f4c615a..699c6c479455 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) { } } -void MetadataStreamerV2::emitKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - emitHiddenKernelArgs(Func); + emitHiddenKernelArgs(Func, ST); } void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { @@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, } } -void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST) { + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF, HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); Kernel.mName = std::string(Func.getName()); Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); - emitKernelArgs(Func); + emitKernelArgs(Func, ST); HSAMetadata.mKernels.back().mCodeProps = CodeProps; HSAMetadata.mKernels.back().mDebugProps = DebugProps; } @@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func, } void MetadataStreamerV3::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST, msgpack::MapDocNode Kern) { unsigned Offset = 0; auto Args = HSAMetadataDoc->getArrayNode(); for (auto &Arg : Func.args()) emitKernelArg(Arg, Offset, Args); - emitHiddenKernelArgs(Func, Offset, Args); + emitHiddenKernelArgs(Func, ST, Offset, Args); Kern[".args"] = Args; } @@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg( } void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST, unsigned &Offset, msgpack::ArrayDocNode Args) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - + unsigned HiddenArgNumBytes = 
ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); auto Kern = getHSAKernelProps(MF, ProgramInfo); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || Func.getCallingConv() == CallingConv::SPIR_KERNEL); @@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); emitKernelAttrs(Func, Kern); - emitKernelArgs(Func, Kern); + emitKernelArgs(Func, ST, Kern); } Kernels.push_back(Kern); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index af5dae1cd8c0..54ed0afbba6d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -30,6 +30,7 @@ class MDNode; class Module; struct SIProgramInfo; class Type; +class GCNSubtarget; namespace AMDGPU { @@ -86,7 +87,8 @@ protected: void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST, + msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args); @@ -98,8 +100,8 @@ protected: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, - msgpack::ArrayDocNode Args); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST, + unsigned &Offset, msgpack::ArrayDocNode Args); msgpack::DocNode &getRootMetadata(StringRef Key) { return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key]; @@ -173,7 +175,7 @@ private: void emitKernelAttrs(const Function &Func); - void emitKernelArgs(const Function &Func); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST); void emitKernelArg(const Argument &Arg); @@ -183,7 +185,7 @@ private: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST); const Metadata &getHSAMetadata() const { return HSAMetadata; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 88b4ec53a2a0..db84b8766924 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -892,6 +892,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + if (isa<UndefValue>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + + if (isa<ConstantPointerNull>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); + break; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1f898f2ba8b3..5046daaed977 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -533,7 +533,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .minScalar(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .widenScalarToNextMultipleOf(0, 32) .maxScalar(0, S32) .scalarize(0); @@ -541,7 +541,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalarOrElt(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .widenScalarToNextPow2(0, 32) .lower(); @@ -712,7 +712,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } if (ST.hasVOP3PInsts()) - FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions.clampMaxNumElementsStrict(0, S16, 2); FPOpActions .scalarize(0) @@ -728,7 +728,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_FNEG, G_FABS}) .legalFor(FPTypesPK16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .clampScalar(0, S16, S64); @@ -965,7 +965,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({S16, S32, V2S16}) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) // FIXME: Fixing non-power-of-2 before clamp is workaround for // narrowScalar limitation. .widenScalarToNextPow2(0) @@ -1052,10 +1052,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Split vector extloads. unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - unsigned AlignBits = Query.MMODescrs[0].AlignInBits; - - if (MemSize < DstTy.getSizeInBits()) - MemSize = std::max(MemSize, AlignBits); if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; @@ -1077,12 +1073,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return true; } - if (AlignBits < MemSize) { - const SITargetLowering *TLI = ST.getTargetLowering(); - return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, - Align(AlignBits / 8)); - } - return false; }; @@ -1176,20 +1166,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::make_pair(0, LLT::scalar(MemSize)); - if (!isPowerOf2_32(DstSize)) { - // We're probably decomposing an odd sized store. Try to split - // to the widest type. TODO: Account for alignment. As-is it - // should be OK, since the new parts will be further legalized. - unsigned FloorSize = PowerOf2Floor(DstSize); - return std::make_pair(0, LLT::scalar(FloorSize)); - } - - if (DstSize > 32 && (DstSize % 32 != 0)) { - // FIXME: Need a way to specify non-extload of larger size if - // suitably aligned. - return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); - } - unsigned MaxSize = maxSizeForAddrSpace(ST, PtrTy.getAddressSpace(), Op == G_LOAD); @@ -1257,14 +1233,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, ElementCount::getFixed(FloorSize / EltSize), EltTy)); } - // Need to split because of alignment. 
- unsigned Align = Query.MMODescrs[0].AlignInBits; - if (EltSize > Align && - (EltSize / Align < DstTy.getNumElements())) { - return std::make_pair( - 0, LLT::fixed_vector(EltSize / Align, EltTy)); - } - // May need relegalization for the scalars. return std::make_pair(0, EltTy); }) @@ -1457,6 +1425,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. getActionDefinitionsBuilder(Op) .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + .lowerIf([=](const LegalityQuery &Query) { + // Sub-vector(or single element) insert and extract. + // TODO: verify immediate offset here since lower only works with + // whole elements. + const LLT BigTy = Query.Types[BigTyIdx]; + return BigTy.isVector(); + }) // FIXME: Multiples of 16 should not be legal. .legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; @@ -1615,7 +1590,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Prefer to reduce vector widths for 16-bit vectors before lowering, to // get more vector shift opportunities, since we'll get those when // expanded. - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); + .clampMaxNumElementsStrict(0, S16, 2); } else if (ST.has16BitInsts()) { SextInReg.lowerFor({{S32}, {S64}, {S16}}); } else { @@ -1637,14 +1612,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FSHR) .legalFor({{S32, S32}}) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_FSHL) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); } else { @@ -2567,10 +2542,8 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, } else { // For cases where the widened type isn't a nice register value, unmerge // from a widened register (e.g. <3 x s16> -> <4 x s16>) - B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); - B.setInsertPt(B.getMBB(), MI.getIterator()); - B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildDeleteTrailingVectorElements(ValReg, WideLoad); } } @@ -3843,6 +3816,10 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, llvm_unreachable("invalid data type"); } + if (StoreVT == LLT::fixed_vector(3, S16)) { + Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) + .getReg(0); + } return Reg; } @@ -4237,8 +4214,17 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || (I >= Intr->CoordStart && !IsA16)) { // Handle any gradient or coordinate operands that should not be packed - AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); - PackedAddrs.push_back(AddrReg); + if ((I < Intr->GradientStart) && IsA16 && + (B.getMRI()->getType(AddrReg) == S16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) + .getReg(0)); + } else { + AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); + PackedAddrs.push_back(AddrReg); + } } else { // Dz/dh, dz/dv and the last odd coord are packed with undef. 
Also, in 1D, // derivatives dx/dh and dx/dv are packed with undef. @@ -4676,9 +4662,23 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // Deal with the one annoying legal case. const LLT V3S16 = LLT::fixed_vector(3, 16); if (Ty == V3S16) { - padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); - auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); - B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + if (IsTFE) { + if (ResultRegs.size() == 1) { + NewResultReg = ResultRegs[0]; + } else if (ResultRegs.size() == 2) { + LLT V4S16 = LLT::fixed_vector(4, 16); + NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); + } else { + return false; + } + } + + if (MRI->getType(DstReg).getNumElements() < + MRI->getType(NewResultReg).getNumElements()) { + B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); + } else { + B.buildPadVectorWithUndefElements(DstReg, NewResultReg); + } return true; } @@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, } Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B] (Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); Ops.push_back(Unmerge.getReg(0)); Ops.push_back(Unmerge.getReg(1)); Ops.push_back(Unmerge.getReg(2)); @@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, packLanes(RayOrigin); if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); Register R1 = MRI.createGenericVirtualRegister(S32); Register R2 = MRI.createGenericVirtualRegister(S32); Register R3 = MRI.createGenericVirtualRegister(S32); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 12d6d35a6917..6e2b5dc471bc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,13 +24,6 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // -// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct -// type which would cause allocating huge memory for struct instance within -// every kernel. Hence, before running this pass, it is advisable to run the -// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within -// non-kernel functions by pointers and thereby minimizes the unnecessary per -// kernel allocation of LDS memory. 
-// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals( namespace { +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { + SmallPtrSet<GlobalValue *, 32> UsedList; + + SmallVector<GlobalValue *, 32> TmpVec; + collectUsedGlobalVariables(M, TmpVec, true); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + TmpVec.clear(); + collectUsedGlobalVariables(M, TmpVec, false); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + return UsedList; +} + class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass { removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { SmallPtrSet<Constant *, 32> LocalVarsSet; - for (size_t I = 0; I < LocalVars.size(); I++) { - if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) { + for (GlobalVariable *LocalVar : LocalVars) + if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) LocalVarsSet.insert(C); - } - } removeFromUsedList(M, "llvm.used", LocalVarsSet); removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); } @@ -158,9 +163,9 @@ public: } bool runOnModule(Module &M) override { - UsedList = AMDGPU::getUsedList(M); - - bool Changed = processUsedLDS(M); + UsedList = getUsedList(M); + bool Changed = superAlignLDSGlobals(M); + Changed |= processUsedLDS(M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -177,6 +182,50 @@ public: } private: + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. + static bool superAlignLDSGlobals(Module &M) { + const DataLayout &DL = M.getDataLayout(); + bool Changed = false; + if (!SuperAlignLDSGlobals) { + return Changed; + } + + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Only changing alignment of LDS variables + continue; + } + if (!GV.hasInitializer()) { + // cuda/hip extern __shared__ variable, leave alignment alone + continue; + } + + Align Alignment = AMDGPU::getAlign(DL, &GV); + TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + if (Alignment != AMDGPU::getAlign(DL, &GV)) { + Changed = true; + GV.setAlignment(Alignment); + } + } + return Changed; + } + bool processUsedLDS(Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -190,31 +239,6 @@ private: return false; } - // Increase the alignment of LDS globals if necessary to maximise the chance - // that we can use aligned LDS instructions to access them. 
- if (SuperAlignLDSGlobals) { - for (auto *GV : FoundLocalVars) { - Align Alignment = AMDGPU::getAlign(DL, GV); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); - } - } - SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(FoundLocalVars.size()); for (GlobalVariable *GV : FoundLocalVars) { @@ -343,20 +367,14 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - // Mark kernels with asm that reads the address of the allocated structure - // This is not necessary for lowering. This lets other passes, specifically - // PromoteAlloca, accurately calculate how much LDS will be used by the - // kernel after lowering. + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. if (!F) { IRBuilder<> Builder(Ctx); - SmallPtrSet<Function *, 32> Kernels; for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - - if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { markUsedByKernel(Builder, &Func, SGV); - Kernels.insert(&Func); } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 5d4b007f11e6..4e2f98d2a5db 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -2786,12 +2786,8 @@ AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned Selec // Fixme: Move linearization creation to the original spot createLinearizedRegion(Region, SelectOut); - for (auto CI = Region->getChildren()->begin(), - CE = Region->getChildren()->end(); - CI != CE; ++CI) { - InnerSelectOut = - initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); - } + for (auto *CI : *Region->getChildren()) + InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII); MRT->setBBSelectRegIn(InnerSelectOut); return InnerSelectOut; } else { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 2aa02299ecdc..8ad344816ad2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -119,31 +119,27 @@ private: bool isConstantAddr(const Value *V) const; }; -static const Value *getMemoryInstrPtr(const Instruction *Inst) { - if (auto LI = dyn_cast<LoadInst>(Inst)) { - return LI->getPointerOperand(); - } - if (auto SI = dyn_cast<StoreInst>(Inst)) { - return SI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { - return AI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) { - return AI->getPointerOperand(); - } - if 
(auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) { - return MI->getRawDest(); - } - - return nullptr; +static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( + const Instruction *Inst) { + if (auto LI = dyn_cast<LoadInst>(Inst)) + return {LI->getPointerOperand(), LI->getType()}; + if (auto SI = dyn_cast<StoreInst>(Inst)) + return {SI->getPointerOperand(), SI->getValueOperand()->getType()}; + if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) + return {AI->getPointerOperand(), AI->getCompareOperand()->getType()}; + if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) + return {AI->getPointerOperand(), AI->getValOperand()->getType()}; + if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) + return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())}; + + return {nullptr, nullptr}; } bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); SmallSet<const Value *, 32> WorkSet; SmallSet<const Value *, 32> Visited; - if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); } @@ -209,10 +205,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); for (auto &I : B) { - if (const Value *Ptr = getMemoryInstrPtr(&I)) { - unsigned Size = divideCeil( - Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), - 32); + if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { + unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); if (isIndirectAccess(&I)) FI.IAMInstCost += Size; if (isLargeStride(&I)) @@ -326,7 +320,7 @@ bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { AMDGPUPerfHint::MemAccessInfo AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { MemAccessInfo MAI; - const Value *MO = getMemoryInstrPtr(Inst); + const Value *MO = getMemoryInstrPtrAndType(Inst).first; LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); // Do not treat local-addr memory access as large stride. 
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 3ec5dd7e0eff..f9a9fe403ff6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -939,7 +939,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(MaybeAlign(I.getAlignment())); + GV->setAlignment(I.getAlign()); Value *TCntY, *TCntZ; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 12b5830ef930..3ce67a733c10 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -23,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-regbank-combiner" @@ -36,13 +38,15 @@ protected: MachineRegisterInfo &MRI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; + const SIInstrInfo &TII; CombinerHelper &Helper; public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + TRI(*MF.getSubtarget().getRegisterInfo()), + TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -63,7 +67,19 @@ public: Register &Val, CstTy &K0, CstTy &K1); bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); + bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyClamp(MachineInstr &MI, Register &Reg); + +private: + AMDGPU::SIModeRegisterDefaults getMode(); + bool getIEEE(); + bool getDX10Clamp(); + bool isFminnumIeee(const MachineInstr &MI); + bool isFCst(MachineInstr *MI); + bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); }; bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { @@ -98,6 +114,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM: + return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINNUM_IEEE: + return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, + AMDGPU::G_AMDGPU_FMED3}; } } @@ -148,6 +171,146 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( return true; } +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) +// ieee = true : min/max(SNaN, 
K) = QNaN, min/max(QNaN, K) = K +// ieee = false : min/max(NaN, K) = K +// clamp(NaN) = dx10_clamp ? 0.0 : NaN +// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. +// Other operand commutes (see matchMed) give same result since min and max are +// commutative. + +// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1 +// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. +// Val = SNaN only for ieee = true +// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 +// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 +// max(min(SNaN, K1), K0) = max(K1, K0) = K1 +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 +// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) +// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + return false; + + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (K0->Value > K1->Value) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to + // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner + // nodes(max/min) have same behavior when one input is NaN and other isn't. + // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, + // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). + if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { + // Don't fold single use constant that can't be inlined. + if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && + (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { + MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; + return true; + } + } + + return false; +} + +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, + Register &Reg) { + // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). + if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates + // to 0.0 requires dx10_clamp = true. + if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && + isKnownNeverSNaN(Val, MRI)) || + isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { + Reg = Val; + return true; + } + + return false; +} + +// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. +// Val = SNaN only for ieee = true. It is important which operand is NaN. 
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 +// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 +// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 +// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 +// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 +bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, + Register &Reg) { + if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) + return false; + + // In llvm-ir, clamp is often represented as an intrinsic call to + // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. + MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); + MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (isFCst(Src1) && !isFCst(Src2)) + std::swap(Src1, Src2); + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (!isClampZeroToOne(Src1, Src2)) + return false; + + Register Val = Src0->getOperand(0).getReg(); + + auto isOp3Zero = [&]() { + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) + return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); + return false; + }; + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold + // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. + if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || + (getIEEE() && getDX10Clamp() && + (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { + Reg = Val; + return true; + } + + return false; +} + +void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, + MI.getFlags()); + MI.eraseFromParent(); +} + void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); @@ -158,6 +321,33 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } +AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { + return MF.getInfo<SIMachineFunctionInfo>()->getMode(); +} + +bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } + +bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } + +bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; +} + +bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::G_FCONSTANT; +} + +bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, + MachineInstr *K1) { + if (isFCst(K0) && isFCst(K1)) { + const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); + const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); + return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || + (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); + } + return false; +} + class AMDGPURegBankCombinerHelperState { protected: CombinerHelper &Helper; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5988403c0a29..c60012bcfe2e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -707,9 +707,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( iterator_range<MachineBasicBlock::iterator> Range, SmallSet<Register, 4> &SGPROperandRegs, MachineRegisterInfo &MRI) const { - SmallVector<Register, 4> ResultRegs; - SmallVector<Register, 4> InitResultRegs; - SmallVector<Register, 4> PhiRegs; // Track use registers which have already been expanded with a readfirstlane // sequence. This may have multiple uses if moving a sequence. @@ -774,15 +771,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(NewExec) .addMBB(LoopBB); - for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { - B.buildInstr(TargetOpcode::G_PHI) - .addDef(std::get<2>(Result)) - .addReg(std::get<0>(Result)) // Initial value / implicit_def - .addMBB(&MBB) - .addReg(std::get<1>(Result)) // Mid-loop value. - .addMBB(LoopBB); - } - const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); @@ -1174,18 +1162,25 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, // 96-bit loads are only available for vector loads. We need to split this // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). if (MMO->getAlign() < Align(16)) { + MachineFunction *MF = MI.getParent()->getParent(); + ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); + MachineIRBuilder B(MI, ApplyBank); + LegalizerHelper Helper(*MF, ApplyBank, B); LLT Part64, Part32; std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); - auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); - auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); - - auto Undef = B.buildUndef(LoadTy); - auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); - B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != + LegalizerHelper::Legalized) + return false; + return true; } else { LLT WiderTy = widen96To128(LoadTy); auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + if (WiderTy.isScalar()) + B.buildTrunc(MI.getOperand(0), WideLoad); + else { + B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), + WideLoad); + } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index d55bf3917e9c..2475b44b42a3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -87,6 +87,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -110,6 +111,18 @@ using namespace llvm; namespace { +namespace AMDGPU { +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. 
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +} // namespace AMDGPU + class ReplaceLDSUseImpl { Module &M; LLVMContext &Ctx; @@ -127,7 +140,8 @@ class ReplaceLDSUseImpl { // Collect LDS which requires their uses to be replaced by pointer. std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { // Collect LDS which requires module lowering. - std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M); + std::vector<GlobalVariable *> LDSGlobals = + llvm::AMDGPU::findVariablesToLower(M); // Remove LDS which don't qualify for replacement. llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { @@ -172,7 +186,7 @@ class ReplaceLDSUseImpl { AMDGPUAS::LOCAL_ADDRESS); LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer)); // Mark that an associated LDS pointer is created for LDS. LDSToPointer[GV] = LDSPointer; @@ -245,10 +259,9 @@ class ReplaceLDSUseImpl { auto FunctionToInsts = AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); - for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); - FI != FE; ++FI) { - Function *F = FI->first; - auto &Insts = FI->second; + for (const auto &FunctionToInst : FunctionToInsts) { + Function *F = FunctionToInst.first; + auto &Insts = FunctionToInst.second; for (auto *I : Insts) { // If `U` is a constant expression, then we need to break the // associated instruction into a set of separate instructions by @@ -341,10 +354,9 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { // Traverse through each kernel K, check and if required, initialize the // LDS pointer to point to LDS within K. - for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; - ++KI) { - Function *K = KI->first; - SmallPtrSet<Function *, 8> Callees = KI->second; + for (const auto &KernelToCallee : KernelToCallees) { + Function *K = KernelToCallee.first; + SmallPtrSet<Function *, 8> Callees = KernelToCallee.second; // Compute reachable and LDS used callees for kernel K. set_intersect(Callees, LDSAccessors); @@ -378,6 +390,184 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { return true; } +namespace AMDGPU { + +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; + + // Collect all address taken functions within the module. + void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (const auto &GI : *ECNode) { + auto *CGN = GI.second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { + SmallPtrSet<Function *, 8> ReachableCallees; + + // Call graph node which represents this kernel. 
+ auto *KCGN = CG[K]; + + // Go through all call graph nodes reachable from the node representing this + // kernel, visit all their call sites, if the call site is direct, add + // corresponding callee to reachable callee set, if it is indirect, resolve + // the indirect call site to potential reachable callees, add them to + // reachable callee set, and repeat the process for the newly added + // potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. + // + SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN)); + SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + // Ignore call graph node which does not have associated function or + // associated function is not a definition. + if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) + continue; + + for (const auto &GI : *CGN) { + auto *RCB = cast<CallBase>(GI.first.getValue()); + auto *RCGN = GI.second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + CGNStack.append(df_begin(ACGN), df_end(ACGN)); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!llvm::AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. +void collectReachableCallees( + Module &M, + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet<Function *, 8> LDSAccessors; + SmallVector<User *, 8> UserStack(GV->users()); + SmallPtrSet<User *, 8> VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa<GlobalValue>(U)) + return SmallPtrSet<Function *, 8>(); + + // Recursively explore constant users. + if (isa<Constant>(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. 
+ Function *F = cast<Instruction>(U)->getFunction(); + if (!llvm::AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; + SmallVector<User *, 8> UserStack; + SmallPtrSet<User *, 8> VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa<GlobalValue>(UU)) + continue; + + if (isa<Constant>(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast<Instruction>(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + +} // namespace AMDGPU + // Entry-point function which interface ReplaceLDSUseImpl with outside of the // class. bool ReplaceLDSUseImpl::replaceLDSUse() { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0655b4342ba1..cd05797fdbdb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -413,21 +413,21 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { case AMDGPU::V_MAX_I16_e32: case AMDGPU::V_MIN_I16_e64: case AMDGPU::V_MIN_I16_e32: + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_DIV_FIXUP_F16_e64: // On gfx10, all 16-bit instructions preserve the high bits. return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MAD_F16_e64: case AMDGPU::V_MADAK_F16: case AMDGPU::V_MADMK_F16: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAMK_F16: case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_e32: - case AMDGPU::V_DIV_FIXUP_F16_e64: // In gfx9, the preferred handling of the unused high 16-bits changed. Most // instructions maintain the legacy behavior of 0ing. Some instructions // changed to preserving the high bits. @@ -648,9 +648,18 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { + assert(AMDGPU::isKernel(F.getCallingConv())); + + // We don't allocate the segment if we know the implicit arguments weren't + // used, even if the ABI implies we need them. 
+ if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) + return 0; + if (isMesaKernel(F)) return 16; - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + + // Assume all implicit inputs are used by default + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56); } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index de11676279f2..a2c61f9da8da 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -231,13 +231,6 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( - "amdgpu-fixed-function-abi", - cl::desc("Enable all implicit function arguments"), - cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), - cl::init(false), - cl::Hidden); - // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -505,7 +498,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; -bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 0ff2db2a52d9..226646a96953 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -37,7 +37,6 @@ protected: public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; - static bool EnableFixedFunctionABI; static bool EnableLowerModuleLDS; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index ecdbdf613a53..09c5eb192e1f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -519,57 +519,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { - EVT OrigTy = TLI->getValueType(DL, Ty); - if (!OrigTy.isSimple()) { - // FIXME: We're having to query the throughput cost so that the basic - // implementation tries to generate legalize and scalarization costs. Maybe - // we could hoist the scalarization code here? - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, - Opd2PropInfo, Args, CxtI); - // Scalarization - - // Check if any of the operands are vector operands. 
- int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - - bool IsFloat = Ty->isFPOrFPVectorTy(); - // Assume that floating point arithmetic operations cost twice as much as - // integer operations. - unsigned OpCost = (IsFloat ? 2 : 1); - - if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { - // The operation is legal. Assume it costs 1. - // TODO: Once we have extract/insert subvector cost we need to use them. - return LT.first * OpCost; - } - - if (!TLI->isOperationExpand(ISD, LT.second)) { - // If the operation is custom lowered, then assume that the code is twice - // as expensive. - return LT.first * 2 * OpCost; - } - - // Else, assume that we need to scalarize this op. - // TODO: If one of the types get legalized by splitting, handle this - // similarly to what getCastInstrCost() does. - if (auto *VTy = dyn_cast<VectorType>(Ty)) { - unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); - InstructionCost Cost = getArithmeticInstrCost( - Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args, CxtI); - // Return the cost of multiple scalar invocation plus the cost of - // inserting and extracting the values. - SmallVector<Type *> Tys(Args.size(), Ty); - return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; - } - - // We don't know anything about this scalar instruction. - return OpCost; - } // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -742,40 +691,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); Type *RetTy = ICA.getReturnType(); - EVT OrigTy = TLI->getValueType(DL, RetTy); - if (!OrigTy.isSimple()) { - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); - - // TODO: Combine these two logic paths. - if (ICA.isTypeBasedOnly()) - return getTypeBasedIntrinsicInstrCost(ICA, CostKind); - - unsigned RetVF = - (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() - : 1); - const IntrinsicInst *I = ICA.getInst(); - const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); - FastMathFlags FMF = ICA.getFlags(); - // Assume that we need to scalarize this intrinsic. - - // Compute the scalarization overhead based on Args for a vector - // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while - // CostModel will pass a vector RetTy and VF is 1. - InstructionCost ScalarizationCost = InstructionCost::getInvalid(); - if (RetVF > 1) { - ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += - getScalarizationOverhead(cast<VectorType>(RetTy), true, false); - ScalarizationCost += - getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); - } - - IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I, - ScalarizationCost); - return getIntrinsicInstrCost(Attrs, CostKind); - } // Legalize the type. 
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 712f6dece911..1736c078eb83 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -173,10 +173,8 @@ protected: } static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs()); - } + for (const MachineLoop *L : LoopInfo) + L->print(dbgs()); } // UTILITY FUNCTIONS @@ -691,9 +689,7 @@ bool AMDGPUCFGStructurizer::prepare() { SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); + for (MachineLoop *LoopRep : *MLI) { MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); @@ -827,14 +823,13 @@ bool AMDGPUCFGStructurizer::run() { wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";); - (*It).first->eraseFromParent(); //Remove from the parent Function. + for (auto &It : BlockInfoMap) { + if (It.second && It.second->IsRetired) { + assert((It.first)->getNumber() != -1); + LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";); + It.first->eraseFromParent(); // Remove from the parent Function. 
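
Many hunks in this and the following R600 files are mechanical conversions from explicit iterator loops to range-based for loops. A minimal standalone illustration of the pattern, using std::map in place of LLVM's DenseMap purely for the sake of a runnable example:

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<int, std::string> BlockInfo{{1, "retired"}, {2, "live"}};

  // Old style: explicit iterator pair.
  for (std::map<int, std::string>::iterator It = BlockInfo.begin(),
                                            E = BlockInfo.end();
       It != E; ++It)
    std::cout << It->first << " -> " << It->second << '\n';

  // New style: range-based for over the same container.
  for (const auto &Entry : BlockInfo)
    std::cout << Entry.first << " -> " << Entry.second << '\n';
}
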
} - delete (*It).second; + delete It.second; } BlockInfoMap.clear(); LLInfoMap.clear(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 4acd77a9d5d2..2bb59086f391 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -246,8 +246,12 @@ public: return isRegKind() && !hasModifiers(); } + bool isRegOrInline(unsigned RCID, MVT type) const { + return isRegClass(RCID) || isInlinableImm(type); + } + bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const { - return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type); + return isRegOrInline(RCID, type) || isLiteralImm(type); } bool isRegOrImmWithInt16InputMods() const { @@ -372,7 +376,7 @@ public: bool isInlineValue() const; bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { - return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + return isRegOrInline(RCID, type) && !hasModifiers(); } bool isSCSrcB16() const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td index d3644db7cf8b..a535c8cc0918 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; -def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">; +def MUBUFAddr64 : ComplexPattern<iPTR, 4, "SelectMUBUFAddr64">; +def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">; -def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; -def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; +def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; +def MUBUFScratchOffset : ComplexPattern<iPTR, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; def BUFAddrKind { int Offset = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td index bb0aa648ff90..c7ec5308e6d0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; -def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; -def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; +def FlatOffset : ComplexPattern<iPTR, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; +def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; +def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; -def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; -def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; +def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; +def ScratchSAddr : ComplexPattern<iPTR, 2, 
"SelectScratchSAddr", [], [SDNPWantRoot], -10>; //===----------------------------------------------------------------------===// // FLAT classes diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index f3f664f7972a..912bcc792e4d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -120,8 +120,7 @@ unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst, // We will now look at each of the currently executing instructions // to find out if this wait instruction still needs to wait. - for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { - const InstRef &PrevIR = *I; + for (const InstRef &PrevIR : IssuedInst) { const Instruction &PrevInst = *PrevIR.getInstruction(); const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 29c37c706138..8a48a67b829c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -440,9 +440,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += 2 * Clause.second.size(); } @@ -452,9 +451,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += Clause.second.size(); } @@ -635,10 +633,10 @@ public: CfCount++; } MI->eraseFromParent(); - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, DL, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, DL, AluClauses[i], CfCount); + for (ClauseFile &CF : FetchClauses) + EmitFetchClause(I, DL, CF, CfCount); + for (ClauseFile &CF : AluClauses) + EmitALUClause(I, DL, CF, CfCount); break; } default: @@ -649,8 +647,7 @@ public: break; } } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { - MachineInstr *Alu = ToPopAfter[i]; + for (MachineInstr *Alu : ToPopAfter) { BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index a7ebf72315cb..aec8b1ae4837 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -268,17 +268,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = - 
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); + for (const auto &Op : OpTable) { + MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[0])); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - } return Result; } @@ -289,15 +287,14 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src2, R600::OpName::src2_sel}, }; - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]); + for (const auto &Op : OpTable) { + int SrcIdx = getOperandIdx(MI.getOpcode(), Op[0]); if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { - MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } @@ -521,12 +518,11 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ValidSwizzle.clear(); unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; - for (unsigned i = 0, e = IG.size(); i < e; ++i) { - IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); - unsigned Op = getOperandIdx(IG[i]->getOpcode(), - R600::OpName::bank_swizzle); - ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) - IG[i]->getOperand(Op).getImm()); + for (MachineInstr *MI : IG) { + IGSrcs.push_back(ExtractSrcs(*MI, PV, ConstCount)); + unsigned Op = getOperandIdx(MI->getOpcode(), R600::OpName::bank_swizzle); + ValidSwizzle.push_back( + (R600InstrInfo::BankSwizzle)MI->getOperand(Op).getImm()); } std::vector<std::pair<int, unsigned>> TransOps; if (!isLastAluTrans) @@ -542,8 +538,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ALU_VEC_120_SCL_212, ALU_VEC_102_SCL_221 }; - for (unsigned i = 0; i < 4; i++) { - TransBS = TransSwz[i]; + for (R600InstrInfo::BankSwizzle TransBS : TransSwz) { if (!isConstCompatible(TransBS, TransOps, ConstCount)) continue; bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, @@ -562,9 +557,9 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { assert (Consts.size() <= 12 && "Too many operands in instructions group"); unsigned Pair1 = 0, Pair2 = 0; - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - unsigned ReadConstHalf = Consts[i] & 2; - unsigned ReadConstIndex = Consts[i] & (~3); + for (unsigned Const : Consts) { + unsigned ReadConstHalf = Const & 2; + unsigned ReadConstIndex = Const & (~3); unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; if (!Pair1) { Pair1 = ReadHalfConst; @@ -587,12 +582,11 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) const { std::vector<unsigned> Consts; SmallSet<int64_t, 4> Literals; - for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr &MI = *MIs[i]; - if (!isALUInstr(MI.getOpcode())) + for (MachineInstr *MI : MIs) { + if (!isALUInstr(MI->getOpcode())) continue; - for (const auto &Src : getSrcs(MI)) { + for (const auto &Src : getSrcs(*MI)) { if (Src.first->getReg() == R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -1330,11 +1324,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MIB->getOperand(getOperandIdx(Opcode, 
R600::OpName::pred_sel)) .setReg(MO.getReg()); - for (unsigned i = 0; i < 14; i++) { + for (unsigned Operand : Operands) { MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(Operand, Slot))); assert (MO.isImm()); - setImmOperand(*MIB, Operands[i], MO.getImm()); + setImmOperand(*MIB, Operand, MO.getImm()); } MIB->getOperand(20).setImm(0); return MIB; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index 6aee2f591b56..d26879ed8d60 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -328,9 +328,9 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { void R600SchedStrategy::LoadAlu() { std::vector<SUnit *> &QSrc = Pending[IDAlu]; - for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { - AluKind AK = getAluKind(QSrc[i]); - AvailableAlus[AK].push_back(QSrc[i]); + for (SUnit *SU : QSrc) { + AluKind AK = getAluKind(SU); + AvailableAlus[AK].push_back(SU); } QSrc.clear(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index ac6a3581e255..aa156190b7ae 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -307,8 +307,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { // Build new MDNode. SmallVector<Metadata *, 6> KernelMDArgs; KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); - for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) - KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + for (const MDVector &MDV : NewArgMDs.ArgVector) + KernelMDArgs.push_back(MDNode::get(*Context, MDV)); MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); return std::make_tuple(NewF, NewMDNode); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 72cf48c04e7f..795bc898a7bf 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -150,19 +150,18 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap) const { unsigned CurrentUndexIdx = 0; - for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { + for (auto &It : ToMerge->RegToChan) { DenseMap<Register, unsigned>::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); + Untouched->RegToChan.find(It.first); if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, (*PosInUntouched).second)); + Remap.push_back( + std::pair<unsigned, unsigned>(It.second, (*PosInUntouched).second)); continue; } if (CurrentUndexIdx >= Untouched->UndefReg.size()) return false; - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + Remap.push_back(std::pair<unsigned, unsigned>( + It.second, Untouched->UndefReg[CurrentUndexIdx++])); } return true; @@ 
-172,9 +171,9 @@ static unsigned getReassignedChan( const std::vector<std::pair<unsigned, unsigned>> &RemapChan, unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return RemapChan[j].second; + for (const auto &J : RemapChan) { + if (J.first == Chan) + return J.second; } llvm_unreachable("Chan wasn't reassigned"); } @@ -190,11 +189,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector( Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; std::vector<Register> UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { + for (const auto &It : RSI->RegToChan) { Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; + unsigned SubReg = It.first; + unsigned Swizzle = It.second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), @@ -234,14 +232,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector( } void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByReg) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByUndefCount) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } } @@ -255,9 +251,9 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, Offset = 3; for (unsigned i = 0; i < 4; i++) { unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + for (const auto &J : RemapChan) { + if (J.first == Swizzle) { + MI.getOperand(i + Offset).setImm(J.second - 1); break; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index beb0aad86e89..fbe2a1cd9fba 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -127,8 +127,8 @@ private: R600::OpName::src1, R600::OpName::src2 }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); + for (unsigned Op : Ops) { + int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Op); if (OperandIdx < 0) continue; Register Src = MI.getOperand(OperandIdx).getReg(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp index 99a1a8e9871a..c329bae50f92 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -54,10 +54,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); - 
for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(), - E = R600::R600_AddrRegClass.end(); I != E; ++I) { - reserveRegisterTuples(Reserved, *I); - } + for (MCPhysReg R : R600::R600_AddrRegClass) + reserveRegisterTuples(Reserved, R); TII->reserveIndirectRegisters(Reserved, MF, *this); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 200e00ee5521..1f93284fc7ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1620,7 +1620,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users, // in which case we can erase them all later in runOnMachineFunction. if (MRI->use_nodbg_empty(MI.getOperand(0).getReg())) - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); return true; } @@ -1821,7 +1821,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register(); - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1831,7 +1831,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { } if (InstToErase && InstToErase->isRegSequence() && MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); } } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 4706c74be721..d4fe74ecb96e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1167,11 +1167,13 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (SpillVGPRToAGPR) { // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); bool SeenDbgInstr = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + int FrameIndex; if (MI.isDebugInstr()) SeenDbgInstr = true; @@ -1191,10 +1193,18 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SpillFIs.set(FI); continue; } - } + } else if (TII->isStoreToStackSlot(MI, FrameIndex) || + TII->isLoadFromStackSlot(MI, FrameIndex)) + NonVGPRSpillFIs.set(FrameIndex); } } + // Stack slot coloring may assign different objets to the same stack slot. + // If not, then the VGPR to AGPR spill slot is dead. 
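
The new NonVGPRSpillFIs bookkeeping implements a simple set difference: a frame index that is only ever touched by VGPR-to-AGPR spills, and never by an ordinary stack load or store (for example after stack slot coloring merges objects into one slot), can be marked dead. A self-contained sketch of that check with plain bitsets; the sizes and names are illustrative, not the pass's own:

#include <bitset>
#include <cstddef>
#include <iostream>

int main() {
  constexpr std::size_t NumFrameIndexes = 8;
  std::bitset<NumFrameIndexes> SpillFIs;        // FIs handled as VGPR->AGPR spills
  std::bitset<NumFrameIndexes> NonVGPRSpillFIs; // FIs still used by real stack ops

  SpillFIs.set(2);
  SpillFIs.set(5);
  NonVGPRSpillFIs.set(5); // stack slot coloring reused FI 5 for another object

  for (std::size_t FI = 0; FI < NumFrameIndexes; ++FI)
    if (SpillFIs.test(FI) && !NonVGPRSpillFIs.test(FI))
      std::cout << "frame index " << FI << " is a dead VGPR->AGPR spill slot\n";
}
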
+ for (unsigned FI : SpillFIs.set_bits()) + if (!NonVGPRSpillFIs.test(FI)) + FuncInfo->setVGPRToAGPRSpillDead(FI); + for (MachineBasicBlock &MBB : MF) { for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) MBB.addLiveIn(Reg); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 35b72f5d201b..9f138136e6e9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -2062,33 +2063,30 @@ void SITargetLowering::allocateSpecialInputSGPRs( SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); - // We need to allocate these in place regardless of their use. - const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI; - // TODO: Unify handling with private memory pointers. - if (IsFixed || Info.hasDispatchPtr()) + if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (IsFixed || Info.hasQueuePtr()) + if (Info.hasQueuePtr()) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. - if (IsFixed || Info.hasImplicitArgPtr()) + if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (IsFixed || Info.hasDispatchID()) + if (Info.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. - if (IsFixed || Info.hasWorkGroupIDX()) + if (Info.hasWorkGroupIDX()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); - if (IsFixed || Info.hasWorkGroupIDY()) + if (Info.hasWorkGroupIDY()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); - if (IsFixed || Info.hasWorkGroupIDZ()) + if (Info.hasWorkGroupIDZ()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } @@ -2419,10 +2417,9 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); - } else { + } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2549,17 +2546,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. 
if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); + if (!IsGraphics) + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } auto &ArgUsageInfo = @@ -3123,8 +3116,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3263,12 +3255,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { - // Copy special input registers after user input arguments. - passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); - } - if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -6282,10 +6268,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Push back extra arguments. - for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) - VAddrs.push_back(Op.getOperand(ArgOffset + I)); - // Check for 16 bit addresses or derivatives and pack if true. MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); @@ -6298,6 +6280,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + // Push back extra arguments. + for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { + if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. + SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); + VAddrs.push_back(bias); + } else + VAddrs.push_back(Op.getOperand(ArgOffset + I)); + } + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { // 16 bit gradients are supported, but are tied to the A16 control // so both gradients and addresses must be 16 bit @@ -7502,8 +7495,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(NodePtr.getValueType() == MVT::i32 || NodePtr.getValueType() == MVT::i64); - assert(RayDir.getValueType() == MVT::v4f16 || - RayDir.getValueType() == MVT::v4f32); + assert(RayDir.getValueType() == MVT::v3f16 || + RayDir.getValueType() == MVT::v3f32); if (!Subtarget->hasGFX10_AEncoding()) { emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); @@ -9837,11 +9830,13 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (Opcode == AMDGPU::G_FCANONICALIZE) return true; - if (Opcode == AMDGPU::G_FCONSTANT) { - auto F = MI->getOperand(1).getFPImm()->getValueAPF(); - if (F.isNaN() && F.isSignaling()) + Optional<FPValueAndVReg> FCR; + // Constant splat (can be padded with undef) or scalar constant. 
+ if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { + if (FCR->Value.isSignaling()) return false; - return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF); + return !FCR->Value.isDenormal() || + denormalsEnabledForType(MRI.getType(FCR->VReg), MF); } if (MaxDepth == 0) @@ -11514,7 +11509,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Prefer VGPRs over AGPRs in mAI instructions where possible. // This saves a chain-copy of registers and better ballance register // use between vgpr and agpr as agpr tuples tend to be big. - if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), @@ -12477,6 +12472,6 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, if (Size <= 256) return Cost; - Cost.first = (Size + 255) / 256; + Cost.first += (Size + 255) / 256; return Cost; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c9d9dd1fb82c..6fbe5d45ce0a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -30,6 +30,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" @@ -51,26 +52,6 @@ static cl::opt<bool> ForceEmitZeroFlag( cl::init(false), cl::Hidden); namespace { - -template <typename EnumT> -class enum_iterator - : public iterator_facade_base<enum_iterator<EnumT>, - std::forward_iterator_tag, const EnumT> { - EnumT Value; -public: - enum_iterator() = default; - enum_iterator(EnumT Value) : Value(Value) {} - - enum_iterator &operator++() { - Value = static_cast<EnumT>(Value + 1); - return *this; - } - - bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } - - EnumT operator*() const { return Value; } -}; - // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. @@ -78,27 +59,32 @@ public: #define CNT_MASK(t) (1u << (t)) enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; +} // namespace -iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { - return make_range(enum_iterator<InstCounterType>(VM_CNT), - enum_iterator<InstCounterType>(NUM_INST_CNTS)); -} +namespace llvm { +template <> struct enum_iteration_traits<InstCounterType> { + static constexpr bool is_iterable = true; +}; +} // namespace llvm + +namespace { +auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } using RegInterval = std::pair<int, int>; -struct { +struct HardwareLimits { unsigned VmcntMax; unsigned ExpcntMax; unsigned LgkmcntMax; unsigned VscntMax; -} HardwareLimits; +}; -struct { +struct RegisterEncoding { unsigned VGPR0; unsigned VGPRL; unsigned SGPR0; unsigned SGPRL; -} RegisterEncoding; +}; enum WaitEventType { VMEM_ACCESS, // vector-memory read & write @@ -194,18 +180,20 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. 
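
The waitcnt changes above drop a hand-rolled enum_iterator in favour of llvm::enum_seq from llvm/ADT/Sequence.h, which only asks the enum to opt in through an enum_iteration_traits specialization. A rough standalone approximation of what such an enum range provides (illustrative only; in-tree code should simply use the LLVM helper):

#include <iostream>
#include <vector>

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };

// Build a half-open range [Begin, End) of enum values; enum_seq does this
// lazily with iterators, but a vector is good enough for illustration.
static std::vector<InstCounterType> instCounterTypes() {
  std::vector<InstCounterType> Result;
  for (int V = VM_CNT; V < NUM_INST_CNTS; ++V)
    Result.push_back(static_cast<InstCounterType>(V));
  return Result;
}

int main() {
  for (InstCounterType T : instCounterTypes())
    std::cout << "counter " << static_cast<int>(T) << '\n';
}
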
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} + WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, + RegisterEncoding Encoding) + : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} - static unsigned getWaitCountMax(InstCounterType T) { + unsigned getWaitCountMax(InstCounterType T) const { switch (T) { case VM_CNT: - return HardwareLimits.VmcntMax; + return Limits.VmcntMax; case LGKM_CNT: - return HardwareLimits.LgkmcntMax; + return Limits.LgkmcntMax; case EXP_CNT: - return HardwareLimits.ExpcntMax; + return Limits.ExpcntMax; case VS_CNT: - return HardwareLimits.VscntMax; + return Limits.VscntMax; default: break; } @@ -338,6 +326,8 @@ private: unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; + HardwareLimits Limits = {}; + RegisterEncoding Encoding = {}; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -471,14 +461,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); if (TRI->isVectorRegister(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); - Result.first = Reg - RegisterEncoding.VGPR0; + assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); + Result.first = Reg - Encoding.VGPR0; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); - Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; + assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); + Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; assert(Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); } @@ -1589,20 +1579,22 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); - HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); - HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; + HardwareLimits Limits = {}; + Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + Limits.VscntMax = ST->hasVscnt() ? 
63 : 0; unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); - RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); - RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1; - RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); - RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1; + RegisterEncoding Encoding = {}; + Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; + Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; TrackedWaitcntSet.clear(); BlockInfos.clear(); @@ -1652,9 +1644,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = std::make_unique<WaitcntBrackets>(ST); + Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); else - *Brackets = WaitcntBrackets(ST); + *Brackets = WaitcntBrackets(ST, Limits, Encoding); } Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); @@ -1686,45 +1678,47 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } while (Repeat); - SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; - - bool HaveScalarStores = false; + if (ST->hasScalarStores()) { + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + bool HaveScalarStores = false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (!HaveScalarStores && TII->isScalarStore(MI)) - HaveScalarStores = true; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!HaveScalarStores && TII->isScalarStore(MI)) + HaveScalarStores = true; - if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) - EndPgmBlocks.push_back(&MBB); + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) + EndPgmBlocks.push_back(&MBB); + } } - } - if (HaveScalarStores) { - // If scalar writes are used, the cache must be flushed or else the next - // wave to reuse the same scratch memory can be clobbered. - // - // Insert s_dcache_wb at wave termination points if there were any scalar - // stores, and only if the cache hasn't already been flushed. This could be - // improved by looking across blocks for flushes in postdominating blocks - // from the stores but an explicitly requested flush is probably very rare. - for (MachineBasicBlock *MBB : EndPgmBlocks) { - bool SeenDCacheWB = false; - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) - SeenDCacheWB = true; - else if (TII->isScalarStore(*I)) - SeenDCacheWB = false; - - // FIXME: It would be better to insert this before a waitcnt if any. - if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && - !SeenDCacheWB) { - Modified = true; - BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. 
This could + // be improved by looking across blocks for flushes in postdominating + // blocks from the stores but an explicitly requested flush is probably + // very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && + !SeenDCacheWB) { + Modified = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 92f5322b8ad2..1755b93538ce 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -899,8 +899,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isAGPRClass(RC)) { - Opcode = (RI.hasVGPRs(SrcRC)) ? - AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; + if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_MOV_B32; + else if (RI.hasVGPRs(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; + else + Opcode = AMDGPU::INSTRUCTION_LIST_END; } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && @@ -1417,6 +1421,33 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { } } +static unsigned getAVSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_SAVE; + case 8: + return AMDGPU::SI_SPILL_AV64_SAVE; + case 12: + return AMDGPU::SI_SPILL_AV96_SAVE; + case 16: + return AMDGPU::SI_SPILL_AV128_SAVE; + case 20: + return AMDGPU::SI_SPILL_AV160_SAVE; + case 24: + return AMDGPU::SI_SPILL_AV192_SAVE; + case 28: + return AMDGPU::SI_SPILL_AV224_SAVE; + case 32: + return AMDGPU::SI_SPILL_AV256_SAVE; + case 64: + return AMDGPU::SI_SPILL_AV512_SAVE; + case 128: + return AMDGPU::SI_SPILL_AV1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, @@ -1463,21 +1494,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize) + : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - if (RI.isVectorSuperClass(RC)) { - // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an - // equivalent VGPR register beforehand. Regalloc might want to introduce - // AV spills only to be relevant until rewriter at which they become - // either spills of VGPRs or AGPRs. 
- Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) - .addReg(SrcReg, RegState::Kill); - SrcReg = TmpVReg; - } - BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1567,6 +1588,33 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { } } +static unsigned getAVSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_AV64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_AV96_RESTORE; + case 16: + return AMDGPU::SI_SPILL_AV128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_AV160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_AV192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_AV224_RESTORE; + case 32: + return AMDGPU::SI_SPILL_AV256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_AV512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_AV1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1609,26 +1657,15 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); - - bool IsVectorSuperClass = RI.isVectorSuperClass(RC); - Register TmpReg = DestReg; - if (IsVectorSuperClass) { - // For AV classes, insert the spill restore to a VGPR followed by a copy - // into an equivalent AV register. - MachineRegisterInfo &MRI = MF->getRegInfo(); - DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - } + unsigned Opcode = RI.isVectorSuperClass(RC) + ? getAVSpillRestoreOpcode(SpillSize) + : RI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); - - if (IsVectorSuperClass) - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) - .addReg(DestReg, RegState::Kill); + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -2358,8 +2395,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); auto *ShAmt = MCConstantExpr::create(32, MCCtx); OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); - - return; } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { @@ -3106,23 +3141,26 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, } static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, - int64_t &Imm) { + int64_t &Imm, MachineInstr **DefMI = nullptr) { if (Reg.isPhysical()) return false; auto *Def = MRI.getUniqueVRegDef(Reg); if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { Imm = Def->getOperand(1).getImm(); + if (DefMI) + *DefMI = Def; return true; } return false; } -static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) { +static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, + MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; const MachineFunction *MF = MO->getParent()->getParent()->getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); - return getFoldableImm(MO->getReg(), MRI, Imm); + return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, @@ -3195,8 +3233,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { + MachineInstr *DefMI; + const auto killDef = [&DefMI, &MBB, this]() -> void { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + return; + // We cannot just remove the DefMI here, calling pass will crash. + DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); + for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) + DefMI->RemoveOperand(I); + }; + int64_t Imm; - if (getFoldableImm(Src2, Imm)) { + if (getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3209,13 +3259,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (getFoldableImm(Src1, Imm)) { + if (getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3225,10 +3276,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } - if (getFoldableImm(Src0, Imm)) { + if (getFoldableImm(Src0, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -3241,12 +3293,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 : IsF64 ? AMDGPU::V_FMA_F64_e64 : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); @@ -3605,12 +3658,6 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const { const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc, and src0_modifiers and src1_modifiers are not set. - // We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. if (Src2) { switch (MI.getOpcode()) { default: return false; @@ -4563,8 +4610,9 @@ static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, unsigned RCID, bool IsAllocatable) { if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && - (TID.mayLoad() || TID.mayStore() || - (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + (((TID.mayLoad() || TID.mayStore()) && + !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || + (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { switch (RCID) { case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; @@ -5001,8 +5049,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, --ConstantBusLimit; } - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; + for (int Idx : VOP3Idx) { if (Idx == -1) break; MachineOperand &MO = MI.getOperand(Idx); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 47ee83eb9351..dda92d3d25ff 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1350,11 +1350,11 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>; // Complex patterns //===----------------------------------------------------------------------===// -def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; -def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; -def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">; +def DS1Addr1Offset : ComplexPattern<iPTR, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<iPTR, 3, "SelectDS64Bit4ByteAligned">; +def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, 
"SelectDS128Bit8ByteAligned">; -def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; +def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index d55d8da8699a..636337ede000 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -761,6 +761,17 @@ defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), @@ -2106,6 +2117,19 @@ def : GCNPat < } // end isWave32 def : GCNPat < + (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), + (V_NOT_B32_e32 $src0) +>; + +def : GCNPat < + (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), + (REG_SEQUENCE VReg_64, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 + ) +>; + +def : GCNPat < (f16 (sint_to_fp i1:$src)), (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -2188,18 +2212,18 @@ def : GCNPat < >; def : GCNPat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i32:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; def : GCNPat < - (i1 (trunc i16:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; def : GCNPat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), + (i1 (DivergentUnaryFrag<trunc> i64:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; @@ -2405,21 +2429,37 @@ def : GCNPat < // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. 
def : GCNPat < - (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))), + (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1)) +>; + + +def : GCNPat < + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; @@ -2435,42 +2475,74 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; def : GCNPat < - (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) +>; + + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; +def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), + (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) +>; + let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1)) +>; + def : GCNPat < - (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) >; -// TODO: Should source modifiers be matched to v_pack_b32_f16? 
def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0)))) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0)))) +>; + + +def : GCNPat < (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) @@ -2866,6 +2938,18 @@ def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_FMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c4007f56f350..3ce368ef4db9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -62,11 +62,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // calls. const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't - // have any calls. - const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && - CC != CallingConv::AMDGPU_Gfx && - (!isEntryFunction() || HasCalls); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; @@ -80,7 +75,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } if (!isEntryFunction()) { - if (UseFixedABI) + if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; // TODO: Pick a high register, and shift down, similar to a kernel. @@ -110,20 +105,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; - if (UseFixedABI) { - DispatchPtr = true; - QueuePtr = true; - ImplicitArgPtr = true; - WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - - // FIXME: We don't need this? 
- DispatchID = true; - } else if (!AMDGPU::isGraphics(CC)) { + if (!AMDGPU::isGraphics(CC)) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; @@ -462,7 +444,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { - if (R.second.FullyAllocated) + if (R.second.IsDead) MFI.RemoveStackObject(R.first); } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index c305bc20e40d..8accbf611c5f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -465,6 +465,7 @@ public: struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; + bool IsDead = false; }; // Map WWM VGPR to a stack slot that is used to save/restore it in the @@ -546,6 +547,12 @@ public: : I->second.Lanes[Lane]; } + void setVGPRToAGPRSpillDead(int FrameIndex) { + auto I = VGPRToAGPRSpills.find(FrameIndex); + if (I != VGPRToAGPRSpills.end()) + I->second.IsDead = true; + } + bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 5590d84cc3ab..81db66a98ddf 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -869,29 +869,27 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { } void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { - unsigned DAGSize = DAG->SUnits.size(); std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; // Every combination of colors given by the top down // and bottom up Reserved node dependency - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; + for (const SUnit &SU : DAG->SUnits) { std::pair<unsigned, unsigned> SUColors; // High latency instructions: already given. - if (CurrentColoring[SU->NodeNum]) + if (CurrentColoring[SU.NodeNum]) continue; - SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; - SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum]; std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = ColorCombinations.find(SUColors); if (Pos != ColorCombinations.end()) { - CurrentColoring[SU->NodeNum] = Pos->second; + CurrentColoring[SU.NodeNum] = Pos->second; } else { - CurrentColoring[SU->NodeNum] = NextNonReservedID; + CurrentColoring[SU.NodeNum] = NextNonReservedID; ColorCombinations[SUColors] = NextNonReservedID++; } } @@ -1232,15 +1230,13 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria } // Free root and leafs of all blocks to enable scheduling inside them. 
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock *Block : CurrentBlocks) Block->finalizeUnits(); - } - LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - }); + LLVM_DEBUG({ + dbgs() << "Blocks created:\n\n"; + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); + }); } // Two functions taken from Codegen/MachineScheduler.cpp @@ -1379,9 +1375,9 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { } } - LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); + LLVM_DEBUG({ + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); }); } @@ -1437,8 +1433,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // found for several parents, we increment the usage of the one with the // highest topological index. LiveOutRegsNumUsages.resize(Blocks.size()); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; int topoInd = -1; @@ -1502,8 +1497,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // Fill LiveRegsConsumers for regs that were already // defined before scheduling. - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; for (SIScheduleBlock* Pred: Block->getPreds()) { @@ -1700,10 +1694,7 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); addLiveRegs(Block->getOutRegs()); releaseBlockSuccs(Block); - for (std::map<unsigned, unsigned>::iterator RegI = - LiveOutRegsNumUsages[Block->getID()].begin(), - E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { - std::pair<unsigned, unsigned> RegP = *RegI; + for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. 
assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0); @@ -1759,8 +1750,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, ScheduledBlocks = Scheduler.getBlocks(); - for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { - SIScheduleBlock *Block = ScheduledBlocks[b]; + for (SIScheduleBlock *Block : ScheduledBlocks) { std::vector<SUnit*> SUs = Block->getScheduledUnits(); for (SUnit* SU : SUs) @@ -2000,9 +1990,8 @@ void SIScheduleDAGMI::schedule() assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); TopRPTracker.setPos(CurrentTop); - for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), - E = ScheduledSUnits.end(); I != E; ++I) { - SUnit *SU = &SUnits[*I]; + for (unsigned I : ScheduledSUnits) { + SUnit *SU = &SUnits[I]; scheduleMI(SU, true); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a1d9a23a5084..21aed4ececb5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -210,6 +210,7 @@ struct SGPRSpillBuilder { auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitDefine); + I->getOperand(2).setIsDead(true); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); } } @@ -242,9 +243,10 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, /*IsKill*/ false); auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); - if (!TmpVGPRLive) { + if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitKill); - } + I->getOperand(2).setIsDead(true); // Mark SCC as dead. + // Restore active lanes if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); @@ -267,9 +269,11 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, /*IsKill*/ false); // Spill inactive lanes - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not0->getOperand(2).setIsDead(); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 
} } @@ -908,6 +912,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V1024_RESTORE: case AMDGPU::SI_SPILL_A1024_SAVE: case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: @@ -915,6 +921,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V512_RESTORE: case AMDGPU::SI_SPILL_A512_SAVE: case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -922,6 +930,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_A256_SAVE: case AMDGPU::SI_SPILL_A256_RESTORE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV256_RESTORE: return 8; case AMDGPU::SI_SPILL_S224_SAVE: case AMDGPU::SI_SPILL_S224_RESTORE: @@ -929,6 +939,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V224_RESTORE: case AMDGPU::SI_SPILL_A224_SAVE: case AMDGPU::SI_SPILL_A224_RESTORE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV224_RESTORE: return 7; case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S192_RESTORE: @@ -936,6 +948,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V192_RESTORE: case AMDGPU::SI_SPILL_A192_SAVE: case AMDGPU::SI_SPILL_A192_RESTORE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV192_RESTORE: return 6; case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S160_RESTORE: @@ -943,6 +957,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_A160_SAVE: case AMDGPU::SI_SPILL_A160_RESTORE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV160_RESTORE: return 5; case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: @@ -950,6 +966,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV128_RESTORE: return 4; case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -957,6 +975,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A96_RESTORE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV96_RESTORE: return 3; case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: @@ -964,6 +984,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_A64_SAVE: case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: @@ -971,6 +993,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_A32_SAVE: case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_AV32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1240,9 +1264,10 @@ void SIRegisterInfo::buildSpillLoadStore( if (ScratchOffsetReg == AMDGPU::NoRegister) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { - BuildMI(MBB, MI, DL, 
TII->get(AMDGPU::S_ADD_I32), SOffset) + auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); + Add->getOperand(3).setIsDead(); // Mark SCC as dead. } Offset = 0; @@ -1810,7 +1835,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A64_SAVE: - case AMDGPU::SI_SPILL_A32_SAVE: { + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -1846,7 +1881,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A224_RESTORE: case AMDGPU::SI_SPILL_A256_RESTORE: case AMDGPU::SI_SPILL_A512_RESTORE: - case AMDGPU::SI_SPILL_A1024_RESTORE: { + case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_AV64_RESTORE: + case AMDGPU::SI_SPILL_AV96_RESTORE: + case AMDGPU::SI_SPILL_AV128_RESTORE: + case AMDGPU::SI_SPILL_AV160_RESTORE: + case AMDGPU::SI_SPILL_AV192_RESTORE: + case AMDGPU::SI_SPILL_AV224_RESTORE: + case AMDGPU::SI_SPILL_AV256_RESTORE: + case AMDGPU::SI_SPILL_AV512_RESTORE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 3a372d4519fb..c8f1daf26de9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -731,11 +731,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } - // getVOPe32 could be -1 here if we started with an instruction that had - // a 32-bit encoding and then commuted it to an instruction that did not. - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { @@ -776,10 +771,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - // Check the carry-in operand for v_addc_u32_e64. - const MachineOperand *Src2 = TII->getNamedOperand(MI, - AMDGPU::OpName::src2); - if (SDst) { bool Next = false; @@ -791,6 +782,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // All of the instructions with carry outs also have an SGPR input in // src2. 
+ const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46012e5d7d97..77ee3c0ff0e4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -495,11 +495,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // instruction as needing e.g. WQM before visiting it and realizing it needs // WQM disabled. ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); - for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = **BI; - BlockInfo &BBI = Blocks[&MBB]; + for (MachineBasicBlock *MBB : RPOT) { + BlockInfo &BBI = Blocks[MBB]; - for (MachineInstr &MI : MBB) { + for (MachineInstr &MI : *MBB) { InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; @@ -561,7 +560,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { BBI.InNeeds |= StateExact; - Worklist.push_back(&MBB); + Worklist.push_back(MBB); } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td index 8502ed61b366..184c871db775 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -181,15 +181,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - // FIXME: This should be definitively mayStore = 0. TableGen - // brokenly tries to infer these based on the intrinsic properties - // corresponding to the IR attributes. The target intrinsics are - // considered as writing to memory for IR dependency purposes, but - // those can be modeled with hasSideEffects here. These also end up - // inferring differently for llvm.readcyclecounter and the amdgcn - // intrinsics. 
- let mayStore = ?; - let mayLoad = 1; + let mayStore = 0; + let mayLoad = 0; let has_sbase = 0; let has_offset = 0; } @@ -765,11 +758,11 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } -def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; -def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; -def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; -def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; multiclass SMRD_Pattern <string Instr, ValueType vt> { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td index 61ecc13620a1..1713586dcf5b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -157,6 +157,42 @@ class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : let has_sdst = 0; } +class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class UniformBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class DivergentBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. 
+ let GISelPredicateCode = [{return true;}]; +} + let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -172,11 +208,11 @@ let isMoveImm = 1 in { let Defs = [SCC] in { def S_NOT_B32 : SOP1_32 <"s_not_b32", - [(set i32:$sdst, (not i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))] >; def S_NOT_B64 : SOP1_64 <"s_not_b64", - [(set i64:$sdst, (not i64:$src0))] + [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; @@ -221,22 +257,22 @@ let isReMaterializable = 1 in { def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64", - [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))] >; def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))] >; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", - [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))] >; def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64", - [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))] >; def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", - [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))] >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", @@ -426,41 +462,6 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < "$sdst, $src0, $src1", pattern >; -class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0), - (Op $src0), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class UniformBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class DivergentBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. 
- let GISelPredicateCode = [{return true;}]; -} let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { @@ -485,19 +486,18 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32", [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; } // End Uses = [SCC] - let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set i32:$sdst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -870,7 +870,7 @@ def S_GETREG_B32 : SOPK_Pseudo < } } // End mayLoad = 1 -let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in { +let Defs = [MODE], Uses = [MODE] in { // FIXME: Need to truncate immediate to 16-bits. class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo < @@ -914,7 +914,7 @@ def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo { let hasSideEffects = 0; } -} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] +} // End Defs = [MODE], Uses = [MODE] class SOPK_WAITCNT<string opName, list<dag> pat=[]> : SOPK_Pseudo< @@ -1264,7 +1264,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let mayStore = 1; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +let hasSideEffects = 1 in def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", @@ -1278,8 +1278,6 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; @@ -1305,14 +1303,10 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> { def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> { let simm16 = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp index 2e4d83fbbc39..a83ff6667956 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -15,7 +15,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ReplaceConstant.h" @@ -25,175 +24,6 @@ namespace llvm { namespace AMDGPU { -// An helper class for collecting all reachable callees for each kernel defined -// within the 
module. -class CollectReachableCallees { - Module &M; - CallGraph CG; - SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; - - // Collect all address taken functions within the module. - void collectAddressTakenFunctions() { - auto *ECNode = CG.getExternalCallingNode(); - - for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { - auto *CGN = GI->second; - auto *F = CGN->getFunction(); - if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) - continue; - AddressTakenFunctions.insert(CGN); - } - } - - // For given kernel, collect all its reachable non-kernel functions. - SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { - SmallPtrSet<Function *, 8> ReachableCallees; - - // Call graph node which represents this kernel. - auto *KCGN = CG[K]; - - // Go through all call graph nodes reachable from the node representing this - // kernel, visit all their call sites, if the call site is direct, add - // corresponding callee to reachable callee set, if it is indirect, resolve - // the indirect call site to potential reachable callees, add them to - // reachable callee set, and repeat the process for the newly added - // potential callee nodes. - // - // FIXME: Need to handle bit-casted function pointers. - // - SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN)); - SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; - while (!CGNStack.empty()) { - auto *CGN = CGNStack.pop_back_val(); - - if (!VisitedCGNodes.insert(CGN).second) - continue; - - // Ignore call graph node which does not have associated function or - // associated function is not a definition. - if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) - continue; - - for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { - auto *RCB = cast<CallBase>(GI->first.getValue()); - auto *RCGN = GI->second; - - if (auto *DCallee = RCGN->getFunction()) { - ReachableCallees.insert(DCallee); - } else if (RCB->isIndirectCall()) { - auto *RCBFTy = RCB->getFunctionType(); - for (auto *ACGN : AddressTakenFunctions) { - auto *ACallee = ACGN->getFunction(); - if (ACallee->getFunctionType() == RCBFTy) { - ReachableCallees.insert(ACallee); - CGNStack.append(df_begin(ACGN), df_end(ACGN)); - } - } - } - } - } - - return ReachableCallees; - } - -public: - explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { - // Collect address taken functions. - collectAddressTakenFunctions(); - } - - void collectReachableCallees( - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - // Collect reachable callee set for each kernel defined in the module. - for (Function &F : M.functions()) { - if (!AMDGPU::isKernelCC(&F)) - continue; - Function *K = &F; - KernelToCallees[K] = collectReachableCallees(K); - } - } -}; - -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - CollectReachableCallees CRC{M}; - CRC.collectReachableCallees(KernelToCallees); -} - -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { - SmallPtrSet<Function *, 8> LDSAccessors; - SmallVector<User *, 8> UserStack(GV->users()); - SmallPtrSet<User *, 8> VisitedUsers; - - while (!UserStack.empty()) { - auto *U = UserStack.pop_back_val(); - - // `U` is already visited? continue to next one. - if (!VisitedUsers.insert(U).second) - continue; - - // `U` is a global variable which is initialized with LDS. Ignore LDS. - if (isa<GlobalValue>(U)) - return SmallPtrSet<Function *, 8>(); - - // Recursively explore constant users. 
- if (isa<Constant>(U)) { - append_range(UserStack, U->users()); - continue; - } - - // `U` should be an instruction, if it belongs to a non-kernel function F, - // then collect F. - Function *F = cast<Instruction>(U)->getFunction(); - if (!AMDGPU::isKernelCC(F)) - LDSAccessors.insert(F); - } - - return LDSAccessors; -} - -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts) { - DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; - SmallVector<User *, 8> UserStack; - SmallPtrSet<User *, 8> VisitedUsers; - - UserStack.push_back(U); - - while (!UserStack.empty()) { - auto *UU = UserStack.pop_back_val(); - - if (!VisitedUsers.insert(UU).second) - continue; - - if (isa<GlobalValue>(UU)) - continue; - - if (isa<Constant>(UU)) { - append_range(UserStack, UU->users()); - continue; - } - - auto *I = cast<Instruction>(UU); - Function *F = I->getFunction(); - if (CollectKernelInsts) { - if (!AMDGPU::isKernelCC(F)) { - continue; - } - } else { - if (AMDGPU::isKernelCC(F)) { - continue; - } - } - - FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); - FunctionToInsts[F].insert(I); - } - - return FunctionToInsts; -} - bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } @@ -232,26 +62,8 @@ void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) { } } -bool hasUserInstruction(const GlobalValue *GV) { - SmallPtrSet<const User *, 8> Visited; - SmallVector<const User *, 16> Stack(GV->users()); - - while (!Stack.empty()) { - const User *U = Stack.pop_back_val(); - - if (!Visited.insert(U).second) - continue; - - if (isa<Instruction>(U)) - return true; - - append_range(Stack, U->users()); - } - - return false; -} - -bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { +static bool shouldLowerLDSToStruct(const GlobalVariable &GV, + const Function *F) { // We are not interested in kernel LDS lowering for module LDS itself. if (F && GV.getName() == "llvm.amdgcn.module.lds") return false; @@ -259,7 +71,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { bool Ret = false; SmallPtrSet<const User *, 8> Visited; SmallVector<const User *, 16> Stack(GV.users()); - SmallPtrSet<const GlobalValue *, 8> GlobalUsers; assert(!F || isKernelCC(F)); @@ -267,15 +78,10 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { const User *V = Stack.pop_back_val(); Visited.insert(V); - if (auto *G = dyn_cast<GlobalValue>(V)) { - StringRef GName = G->getName(); - if (F && GName != "llvm.used" && GName != "llvm.compiler.used") { - // For kernel LDS lowering, if G is not a compiler.used list, then we - // cannot lower the lds GV since we cannot replace the use of GV within - // G. - return false; - } - GlobalUsers.insert(G); + if (isa<GlobalValue>(V)) { + // This use of the LDS variable is the initializer of a global variable. + // This is ill formed. The address of an LDS variable is kernel dependent + // and unknown until runtime. It can't be written to a global variable. continue; } @@ -297,15 +103,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { append_range(Stack, V->users()); } - if (!F && !Ret) { - // For module LDS lowering, we have not yet decided if we should lower GV or - // not. 
Explore all global users of GV, and check if atleast one of these - // global users appear as an use within an instruction (possibly nested use - // via constant expression), if so, then conservately lower LDS. - for (auto *G : GlobalUsers) - Ret |= hasUserInstruction(G); - } - return Ret; } @@ -324,7 +121,7 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, continue; } if (!isa<UndefValue>(GV.getInitializer())) { - // Initializers are unimplemented for local address space. + // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. continue; } @@ -342,20 +139,6 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, return LocalVars; } -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { - SmallPtrSet<GlobalValue *, 32> UsedList; - - SmallVector<GlobalValue *, 32> TmpVec; - collectUsedGlobalVariables(M, TmpVec, true); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - TmpVec.clear(); - collectUsedGlobalVariables(M, TmpVec, false); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - return UsedList; -} - } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h index d1c9229bc336..83ef68cc3f60 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -22,44 +22,13 @@ class ConstantExpr; namespace AMDGPU { -/// Collect reachable callees for each kernel defined in the module \p M and -/// return collected callees at \p KernelToCallees. -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees); - -/// For the given LDS global \p GV, visit all its users and collect all -/// non-kernel functions within which \p GV is used and return collected list of -/// such non-kernel functions. -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); - -/// Collect all the instructions where user \p U belongs to. \p U could be -/// instruction itself or it could be a constant expression which is used within -/// an instruction. If \p CollectKernelInsts is true, collect instructions only -/// from kernels, otherwise collect instructions only from non-kernel functions. -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts); - bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); -/// \returns true if a given global variable \p GV (or its global users) appear -/// as an use within some instruction (either from kernel or from non-kernel). -bool hasUserInstruction(const GlobalValue *GV); - -/// \returns true if an LDS global requires lowering to a module LDS structure -/// if \p F is not given. If \p F is given it must be a kernel and function -/// \returns true if an LDS global is directly used from that kernel and it -/// is safe to replace its uses with a kernel LDS structure member. -bool shouldLowerLDSToStruct(const GlobalVariable &GV, - const Function *F = nullptr); - std::vector<GlobalVariable *> findVariablesToLower(Module &M, const Function *F = nullptr); -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M); - /// Replace all uses of constant \p C with instructions in \p F. 
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); } // end namespace AMDGPU diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td index a3eccf13cd71..a8368892c565 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> { list<dag> ret = [!con(Outs, (set Ins))]; } +class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> { list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []); } |
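Editor's note (not part of the imported diff above): many of the pattern changes in this import wrap existing selection patterns in UniformUnaryFrag/UniformBinFrag/DivergentBinFrag/DivergentUnaryFrag so that uniform SDNodes keep selecting scalar (S_*) instructions while divergent SDNodes select the vector (V_*) forms, matching the SGPR/VGPR register-bank split. The TableGen predicate in each fragment reduces to a single divergence query on the node; a minimal C++ sketch of that check, assuming only SelectionDAG's SDNode::isDivergent() and using a hypothetical helper name, is:

  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Sketch only (hypothetical helper, not from the tree): mirrors the
  // [{ return N->isDivergent(); }] predicate used by the Divergent*Frag classes
  // above. Uniform nodes (false here) stay on the scalar unit; divergent nodes
  // must be selected to VALU instructions.
  static bool wantsVALUForm(const llvm::SDNode *N) {
    return N->isDivergent();
  }

The GISelPredicateCode in those fragments is deliberately left as "return true;" because, as the in-tree comments note, GlobalISel already captures the same constraint through register-bank selection.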