Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 11
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 26
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 33
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 28
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 1
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 2
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 26
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 12
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 106
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 116
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 8
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 42
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 192
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 33
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 210
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 21
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 85
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 23
 contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8
 contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td | 8
 contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td | 10
 contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp | 3
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 21
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 48
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp | 6
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 4
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 42
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 4
 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp | 6
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 12
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 73
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 160
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 153
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 8
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td | 116
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 24
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 7
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 53
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 59
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 11
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 9
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td | 21
 contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td | 106
 contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp | 231
 contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h | 31
 contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td | 12
 51 files changed, 1235 insertions, 1012 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index aab76d27ef11..d28f38e42430 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -173,14 +173,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
const AMDGPUFunctionArgInfo &
AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
auto I = ArgInfoMap.find(&F);
- if (I == ArgInfoMap.end()) {
- if (AMDGPUTargetMachine::EnableFixedFunctionABI)
- return FixedABIFunctionInfo;
-
- // Without the fixed ABI, we assume no function has special inputs.
- assert(F.isDeclaration());
- return ExternFunctionInfo;
- }
-
+ if (I == ArgInfoMap.end())
+ return FixedABIFunctionInfo;
return I->second;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index f0aadab3302f..b4ebc7d7d75f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -112,6 +112,17 @@ static bool isDSAddress(const Constant *C) {
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}
+/// Returns true if the function requires the implicit argument be passed
+/// regardless of the function contents.
+static bool funcRequiresImplicitArgPtr(const Function &F) {
+ // Sanitizers require the hostcall buffer passed in the implicit arguments.
+ return F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeThread) ||
+ F.hasFnAttribute(Attribute::SanitizeMemory) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeMemTag);
+}
+
namespace {
class AMDGPUInformationCache : public InformationCache {
public:
@@ -296,7 +307,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
bool AllCallSitesKnown = true;
if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
- indicatePessimisticFixpoint();
+ return indicatePessimisticFixpoint();
return Change;
}
@@ -339,7 +350,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
+
+ // If the function requires the implicit arg pointer due to sanitizers,
+ // assume it's needed even if explicitly marked as not requiring it.
+ const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F);
+ if (NeedsImplicit)
+ removeAssumedBits(IMPLICIT_ARG_PTR);
+
for (auto Attr : ImplicitAttrs) {
+ if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR)
+ continue;
+
if (F->hasFnAttribute(Attr.second))
addKnownBits(Attr.first);
}
@@ -500,6 +521,9 @@ struct AAAMDFlatWorkGroupSize
std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
intersectKnown(
ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+
+ if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+ indicatePessimisticFixpoint();
}
ChangeStatus updateImpl(Attributor &A) override {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 43928d7c2a09..2f1e7823f65c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -652,8 +652,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
++PSInputNum;
if (SkipArg) {
- for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
- B.buildUndef(VRegs[Idx][I]);
+ for (Register R : VRegs[Idx])
+ B.buildUndef(R);
++Idx;
continue;
@@ -715,10 +715,9 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!MBB.empty())
B.setInstr(*MBB.begin());
- if (!IsEntryFunc) {
+ if (!IsEntryFunc && !IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
- if (AMDGPUTargetMachine::EnableFixedFunctionABI)
- TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+ TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
IncomingValueAssigner Assigner(AssignFn);
@@ -731,11 +730,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
uint64_t StackOffset = Assigner.StackOffset;
- if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
- // Special inputs come after user arguments.
- TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
- }
-
// Start adding system SGPRs.
if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
@@ -829,9 +823,12 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
if (IncomingArg) {
LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
- } else {
- assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
+ } else {
+ // We may have proven the input wasn't needed, although the ABI is
+ // requiring it. We just need to allocate the register appropriately.
+ MIRBuilder.buildUndef(InputReg);
}
if (OutgoingArg->isRegister()) {
@@ -1233,8 +1230,7 @@ bool AMDGPUCallLowering::lowerTailCall(
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
- Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1300,12 +1296,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- Info.CallConv != CallingConv::AMDGPU_Gfx) {
- LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
- return false;
- }
-
SmallVector<ArgInfo, 8> OutArgs;
for (auto &OrigArg : Info.OrigArgs)
splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
@@ -1359,8 +1349,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
- Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index c7c5ff7bcbe7..2415fdfecaae 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -64,6 +64,30 @@ def int_minmax_to_med3 : GICombineRule<
[{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+def fp_minmax_to_med3 : GICombineRule<
+ (defs root:$min_or_max, med3_matchdata:$matchinfo),
+ (match (wip_match_opcode G_FMAXNUM,
+ G_FMINNUM,
+ G_FMAXNUM_IEEE,
+ G_FMINNUM_IEEE):$min_or_max,
+ [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+
+def fp_minmax_to_clamp : GICombineRule<
+ (defs root:$min_or_max, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FMAXNUM,
+ G_FMINNUM,
+ G_FMAXNUM_IEEE,
+ G_FMINNUM_IEEE):$min_or_max,
+ [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>;
+
+def fmed3_intrinsic_to_clamp : GICombineRule<
+ (defs root:$fmed3, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_INTRINSIC):$fmed3,
+ [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
+ (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
+
def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
def remove_fcanonicalize : GICombineRule<
@@ -102,7 +126,9 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
}
def AMDGPURegBankCombinerHelper : GICombinerHelper<
- "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> {
+ "AMDGPUGenRegBankCombinerHelper",
+ [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
+ fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
let StateClass = "AMDGPURegBankCombinerHelperState";
let AdditionalArguments = [];
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 301e6f6d6f42..e79ff9b597c9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -378,5 +378,4 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
}
MI.eraseFromParent();
- return;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 12cef2774aaf..7fd94a977be7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -172,6 +172,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
+def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>;
+def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b9c59f4c615a..699c6c479455 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
}
}
-void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitKernelArgs(const Function &Func,
+ const GCNSubtarget &ST) {
for (auto &Arg : Func.args())
emitKernelArg(Arg);
- emitHiddenKernelArgs(Func);
+ emitHiddenKernelArgs(Func, ST);
}
void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
@@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
}
}
-void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
- int HiddenArgNumBytes =
- getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
-
+void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func,
+ const GCNSubtarget &ST) {
+ unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
if (!HiddenArgNumBytes)
return;
@@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Kernel.mName = std::string(Func.getName());
Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
emitKernelLanguage(Func);
emitKernelAttrs(Func);
- emitKernelArgs(Func);
+ emitKernelArgs(Func, ST);
HSAMetadata.mKernels.back().mCodeProps = CodeProps;
HSAMetadata.mKernels.back().mDebugProps = DebugProps;
}
@@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
}
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
+ const GCNSubtarget &ST,
msgpack::MapDocNode Kern) {
unsigned Offset = 0;
auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
emitKernelArg(Arg, Offset, Args);
- emitHiddenKernelArgs(Func, Offset, Args);
+ emitHiddenKernelArgs(Func, ST, Offset, Args);
Kern[".args"] = Args;
}
@@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg(
}
void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
+ const GCNSubtarget &ST,
unsigned &Offset,
msgpack::ArrayDocNode Args) {
- int HiddenArgNumBytes =
- getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
-
+ unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
if (!HiddenArgNumBytes)
return;
@@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
auto Kern = getHSAKernelProps(MF, ProgramInfo);
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
Func.getCallingConv() == CallingConv::SPIR_KERNEL);
@@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
(Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
emitKernelLanguage(Func, Kern);
emitKernelAttrs(Func, Kern);
- emitKernelArgs(Func, Kern);
+ emitKernelArgs(Func, ST, Kern);
}
Kernels.push_back(Kern);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index af5dae1cd8c0..54ed0afbba6d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -30,6 +30,7 @@ class MDNode;
class Module;
struct SIProgramInfo;
class Type;
+class GCNSubtarget;
namespace AMDGPU {
@@ -86,7 +87,8 @@ protected:
void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern);
+ void emitKernelArgs(const Function &Func, const GCNSubtarget &ST,
+ msgpack::MapDocNode Kern);
void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args);
@@ -98,8 +100,8 @@ protected:
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
- void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
- msgpack::ArrayDocNode Args);
+ void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST,
+ unsigned &Offset, msgpack::ArrayDocNode Args);
msgpack::DocNode &getRootMetadata(StringRef Key) {
return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
@@ -173,7 +175,7 @@ private:
void emitKernelAttrs(const Function &Func);
- void emitKernelArgs(const Function &Func);
+ void emitKernelArgs(const Function &Func, const GCNSubtarget &ST);
void emitKernelArg(const Argument &Arg);
@@ -183,7 +185,7 @@ private:
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
- void emitHiddenKernelArgs(const Function &Func);
+ void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST);
const Metadata &getHSAMetadata() const {
return HSAMetadata;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 88b4ec53a2a0..db84b8766924 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -892,6 +892,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
break;
}
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private: {
+ if (isa<UndefValue>(II.getArgOperand(0)))
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+
+ if (isa<ConstantPointerNull>(II.getArgOperand(0)))
+ return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
+ break;
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1f898f2ba8b3..5046daaed977 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -533,7 +533,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16, V2S16})
.minScalar(0, S16)
- .clampMaxNumElements(0, S16, 2)
+ .clampMaxNumElementsStrict(0, S16, 2)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32)
.scalarize(0);
@@ -541,7 +541,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
.legalFor({S32, S16, V2S16}) // Clamp modifier
.minScalarOrElt(0, S16)
- .clampMaxNumElements(0, S16, 2)
+ .clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.lower();
@@ -712,7 +712,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
if (ST.hasVOP3PInsts())
- FPOpActions.clampMaxNumElements(0, S16, 2);
+ FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
FPOpActions
.scalarize(0)
@@ -728,7 +728,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_FNEG, G_FABS})
.legalFor(FPTypesPK16)
- .clampMaxNumElements(0, S16, 2)
+ .clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.clampScalar(0, S16, S64);
@@ -965,7 +965,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({S16, S32, V2S16})
- .clampMaxNumElements(0, S16, 2)
+ .clampMaxNumElementsStrict(0, S16, 2)
// FIXME: Fixing non-power-of-2 before clamp is workaround for
// narrowScalar limitation.
.widenScalarToNextPow2(0)
@@ -1052,10 +1052,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Split vector extloads.
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
- unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
-
- if (MemSize < DstTy.getSizeInBits())
- MemSize = std::max(MemSize, AlignBits);
if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
return true;
@@ -1077,12 +1073,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return true;
}
- if (AlignBits < MemSize) {
- const SITargetLowering *TLI = ST.getTargetLowering();
- return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
- Align(AlignBits / 8));
- }
-
return false;
};
@@ -1176,20 +1166,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (DstSize > MemSize)
return std::make_pair(0, LLT::scalar(MemSize));
- if (!isPowerOf2_32(DstSize)) {
- // We're probably decomposing an odd sized store. Try to split
- // to the widest type. TODO: Account for alignment. As-is it
- // should be OK, since the new parts will be further legalized.
- unsigned FloorSize = PowerOf2Floor(DstSize);
- return std::make_pair(0, LLT::scalar(FloorSize));
- }
-
- if (DstSize > 32 && (DstSize % 32 != 0)) {
- // FIXME: Need a way to specify non-extload of larger size if
- // suitably aligned.
- return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
- }
-
unsigned MaxSize = maxSizeForAddrSpace(ST,
PtrTy.getAddressSpace(),
Op == G_LOAD);
@@ -1257,14 +1233,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
ElementCount::getFixed(FloorSize / EltSize), EltTy));
}
- // Need to split because of alignment.
- unsigned Align = Query.MMODescrs[0].AlignInBits;
- if (EltSize > Align &&
- (EltSize / Align < DstTy.getNumElements())) {
- return std::make_pair(
- 0, LLT::fixed_vector(EltSize / Align, EltTy));
- }
-
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
@@ -1457,6 +1425,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Doesn't handle extract of illegal sizes.
getActionDefinitionsBuilder(Op)
.lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
+ .lowerIf([=](const LegalityQuery &Query) {
+ // Sub-vector(or single element) insert and extract.
+ // TODO: verify immediate offset here since lower only works with
+ // whole elements.
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return BigTy.isVector();
+ })
// FIXME: Multiples of 16 should not be legal.
.legalIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
@@ -1615,7 +1590,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Prefer to reduce vector widths for 16-bit vectors before lowering, to
// get more vector shift opportunities, since we'll get those when
// expanded.
- .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
+ .clampMaxNumElementsStrict(0, S16, 2);
} else if (ST.has16BitInsts()) {
SextInReg.lowerFor({{S32}, {S64}, {S16}});
} else {
@@ -1637,14 +1612,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_FSHR)
.legalFor({{S32, S32}})
.lowerFor({{V2S16, V2S16}})
- .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
+ .clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.lower();
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_FSHL)
.lowerFor({{V2S16, V2S16}})
- .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
+ .clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.lower();
} else {
@@ -2567,10 +2542,8 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
} else {
// For cases where the widened type isn't a nice register value, unmerge
// from a widened register (e.g. <3 x s16> -> <4 x s16>)
- B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
- B.setInsertPt(B.getMBB(), MI.getIterator());
- B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
+ WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+ B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
}
}
@@ -3843,6 +3816,10 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
llvm_unreachable("invalid data type");
}
+ if (StoreVT == LLT::fixed_vector(3, S16)) {
+ Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
+ .getReg(0);
+ }
return Reg;
}
@@ -4237,8 +4214,17 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
(I >= Intr->CoordStart && !IsA16)) {
// Handle any gradient or coordinate operands that should not be packed
- AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
- PackedAddrs.push_back(AddrReg);
+ if ((I < Intr->GradientStart) && IsA16 &&
+ (B.getMRI()->getType(AddrReg) == S16)) {
+ // Special handling of bias when A16 is on. Bias is of type half but
+ // occupies full 32-bit.
+ PackedAddrs.push_back(
+ B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ .getReg(0));
+ } else {
+ AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+ PackedAddrs.push_back(AddrReg);
+ }
} else {
// Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
// derivatives dx/dh and dx/dv are packed with undef.
@@ -4676,9 +4662,23 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// Deal with the one annoying legal case.
const LLT V3S16 = LLT::fixed_vector(3, 16);
if (Ty == V3S16) {
- padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
- auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
- B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
+ if (IsTFE) {
+ if (ResultRegs.size() == 1) {
+ NewResultReg = ResultRegs[0];
+ } else if (ResultRegs.size() == 2) {
+ LLT V4S16 = LLT::fixed_vector(4, 16);
+ NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
+ } else {
+ return false;
+ }
+ }
+
+ if (MRI->getType(DstReg).getNumElements() <
+ MRI->getType(NewResultReg).getNumElements()) {
+ B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
+ } else {
+ B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
+ }
return true;
}
@@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
}
Ops.push_back(RayExtent);
- auto packLanes = [&Ops, &S32, &B] (Register Src) {
- auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
+ auto packLanes = [&Ops, &S32, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
Ops.push_back(Unmerge.getReg(2));
@@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
+ auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
Register R1 = MRI.createGenericVirtualRegister(S32);
Register R2 = MRI.createGenericVirtualRegister(S32);
Register R3 = MRI.createGenericVirtualRegister(S32);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 12d6d35a6917..6e2b5dc471bc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -24,13 +24,6 @@
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
-// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct
-// type which would cause allocating huge memory for struct instance within
-// every kernel. Hence, before running this pass, it is advisable to run the
-// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within
-// non-kernel functions by pointers and thereby minimizes the unnecessary per
-// kernel allocation of LDS memory.
-//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals(
namespace {
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+ SmallVector<GlobalValue *, 32> TmpVec;
+ collectUsedGlobalVariables(M, TmpVec, true);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ TmpVec.clear();
+ collectUsedGlobalVariables(M, TmpVec, false);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ return UsedList;
+}
+
class AMDGPULowerModuleLDS : public ModulePass {
static void removeFromUsedList(Module &M, StringRef Name,
@@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
removeFromUsedLists(Module &M,
const std::vector<GlobalVariable *> &LocalVars) {
SmallPtrSet<Constant *, 32> LocalVarsSet;
- for (size_t I = 0; I < LocalVars.size(); I++) {
- if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
+ for (GlobalVariable *LocalVar : LocalVars)
+ if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
LocalVarsSet.insert(C);
- }
- }
removeFromUsedList(M, "llvm.used", LocalVarsSet);
removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
}
@@ -158,9 +163,9 @@ public:
}
bool runOnModule(Module &M) override {
- UsedList = AMDGPU::getUsedList(M);
-
- bool Changed = processUsedLDS(M);
+ UsedList = getUsedList(M);
+ bool Changed = superAlignLDSGlobals(M);
+ Changed |= processUsedLDS(M);
for (Function &F : M.functions()) {
if (F.isDeclaration())
@@ -177,6 +182,50 @@ public:
}
private:
+ // Increase the alignment of LDS globals if necessary to maximise the chance
+ // that we can use aligned LDS instructions to access them.
+ static bool superAlignLDSGlobals(Module &M) {
+ const DataLayout &DL = M.getDataLayout();
+ bool Changed = false;
+ if (!SuperAlignLDSGlobals) {
+ return Changed;
+ }
+
+ for (auto &GV : M.globals()) {
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+ // Only changing alignment of LDS variables
+ continue;
+ }
+ if (!GV.hasInitializer()) {
+ // cuda/hip extern __shared__ variable, leave alignment alone
+ continue;
+ }
+
+ Align Alignment = AMDGPU::getAlign(DL, &GV);
+ TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
+
+ if (GVSize > 8) {
+ // We might want to use a b96 or b128 load/store
+ Alignment = std::max(Alignment, Align(16));
+ } else if (GVSize > 4) {
+ // We might want to use a b64 load/store
+ Alignment = std::max(Alignment, Align(8));
+ } else if (GVSize > 2) {
+ // We might want to use a b32 load/store
+ Alignment = std::max(Alignment, Align(4));
+ } else if (GVSize > 1) {
+ // We might want to use a b16 load/store
+ Alignment = std::max(Alignment, Align(2));
+ }
+
+ if (Alignment != AMDGPU::getAlign(DL, &GV)) {
+ Changed = true;
+ GV.setAlignment(Alignment);
+ }
+ }
+ return Changed;
+ }
+
bool processUsedLDS(Module &M, Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
@@ -190,31 +239,6 @@ private:
return false;
}
- // Increase the alignment of LDS globals if necessary to maximise the chance
- // that we can use aligned LDS instructions to access them.
- if (SuperAlignLDSGlobals) {
- for (auto *GV : FoundLocalVars) {
- Align Alignment = AMDGPU::getAlign(DL, GV);
- TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
-
- if (GVSize > 8) {
- // We might want to use a b96 or b128 load/store
- Alignment = std::max(Alignment, Align(16));
- } else if (GVSize > 4) {
- // We might want to use a b64 load/store
- Alignment = std::max(Alignment, Align(8));
- } else if (GVSize > 2) {
- // We might want to use a b32 load/store
- Alignment = std::max(Alignment, Align(4));
- } else if (GVSize > 1) {
- // We might want to use a b16 load/store
- Alignment = std::max(Alignment, Align(2));
- }
-
- GV->setAlignment(Alignment);
- }
- }
-
SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
LayoutFields.reserve(FoundLocalVars.size());
for (GlobalVariable *GV : FoundLocalVars) {
@@ -343,20 +367,14 @@ private:
refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
- // Mark kernels with asm that reads the address of the allocated structure
- // This is not necessary for lowering. This lets other passes, specifically
- // PromoteAlloca, accurately calculate how much LDS will be used by the
- // kernel after lowering.
+ // This ensures the variable is allocated when called functions access it.
+ // It also lets other passes, specifically PromoteAlloca, accurately
+ // calculate how much LDS will be used by the kernel after lowering.
if (!F) {
IRBuilder<> Builder(Ctx);
- SmallPtrSet<Function *, 32> Kernels;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration())
- continue;
-
- if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) {
+ if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
markUsedByKernel(Builder, &Func, SGV);
- Kernels.insert(&Func);
}
}
}
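
As a quick reference for the superAlignLDSGlobals() hunk earlier in this file, here is a minimal sketch of the same size-to-alignment rule as a free function. It is not part of the patch; the name superAlignFor is chosen here only for illustration, and it assumes the usual LLVM support headers.

#include "llvm/Support/Alignment.h"
#include <algorithm>
#include <cstdint>

// Minimal sketch: the bump that superAlignLDSGlobals() applies, restated so
// the thresholds are easy to read. SizeInBytes is the type's alloc size.
static llvm::Align superAlignFor(uint64_t SizeInBytes, llvm::Align Current) {
  if (SizeInBytes > 8)
    return std::max(Current, llvm::Align(16)); // allow b96/b128 access
  if (SizeInBytes > 4)
    return std::max(Current, llvm::Align(8));  // allow b64 access
  if (SizeInBytes > 2)
    return std::max(Current, llvm::Align(4));  // allow b32 access
  if (SizeInBytes > 1)
    return std::max(Current, llvm::Align(2));  // allow b16 access
  return Current;                              // 1-byte globals are untouched
}
// Example: a 12-byte LDS array with the default align of 4 comes out as
// align 16; variables without an initializer (HIP/CUDA extern __shared__)
// are skipped before this point, as the hunk above shows.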
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 5d4b007f11e6..4e2f98d2a5db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -2786,12 +2786,8 @@ AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned Selec
// Fixme: Move linearization creation to the original spot
createLinearizedRegion(Region, SelectOut);
- for (auto CI = Region->getChildren()->begin(),
- CE = Region->getChildren()->end();
- CI != CE; ++CI) {
- InnerSelectOut =
- initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII);
- }
+ for (auto *CI : *Region->getChildren())
+ InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII);
MRT->setBBSelectRegIn(InnerSelectOut);
return InnerSelectOut;
} else {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 2aa02299ecdc..8ad344816ad2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -119,31 +119,27 @@ private:
bool isConstantAddr(const Value *V) const;
};
-static const Value *getMemoryInstrPtr(const Instruction *Inst) {
- if (auto LI = dyn_cast<LoadInst>(Inst)) {
- return LI->getPointerOperand();
- }
- if (auto SI = dyn_cast<StoreInst>(Inst)) {
- return SI->getPointerOperand();
- }
- if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
- return AI->getPointerOperand();
- }
- if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
- return AI->getPointerOperand();
- }
- if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
- return MI->getRawDest();
- }
-
- return nullptr;
+static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
+ const Instruction *Inst) {
+ if (auto LI = dyn_cast<LoadInst>(Inst))
+ return {LI->getPointerOperand(), LI->getType()};
+ if (auto SI = dyn_cast<StoreInst>(Inst))
+ return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
+ if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
+ if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
+ return {AI->getPointerOperand(), AI->getValOperand()->getType()};
+ if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
+ return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};
+
+ return {nullptr, nullptr};
}
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
SmallSet<const Value *, 32> WorkSet;
SmallSet<const Value *, 32> Visited;
- if (const Value *MO = getMemoryInstrPtr(Inst)) {
+ if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
if (isGlobalAddr(MO))
WorkSet.insert(MO);
}
@@ -209,10 +205,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
for (auto &B : F) {
LastAccess = MemAccessInfo();
for (auto &I : B) {
- if (const Value *Ptr = getMemoryInstrPtr(&I)) {
- unsigned Size = divideCeil(
- Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
- 32);
+ if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
+ unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
if (isIndirectAccess(&I))
FI.IAMInstCost += Size;
if (isLargeStride(&I))
@@ -326,7 +320,7 @@ bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
MemAccessInfo MAI;
- const Value *MO = getMemoryInstrPtr(Inst);
+ const Value *MO = getMemoryInstrPtrAndType(Inst).first;
LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
// Do not treat local-addr memory access as large stride.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 3ec5dd7e0eff..f9a9fe403ff6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -939,7 +939,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
GlobalVariable::NotThreadLocal,
AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(MaybeAlign(I.getAlignment()));
+ GV->setAlignment(I.getAlign());
Value *TCntY, *TCntZ;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 12b5830ef930..3ce67a733c10 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -16,6 +16,7 @@
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -23,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-regbank-combiner"
@@ -36,13 +38,15 @@ protected:
MachineRegisterInfo &MRI;
const RegisterBankInfo &RBI;
const TargetRegisterInfo &TRI;
+ const SIInstrInfo &TII;
CombinerHelper &Helper;
public:
AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
: B(B), MF(B.getMF()), MRI(*B.getMRI()),
RBI(*MF.getSubtarget().getRegBankInfo()),
- TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){};
+ TRI(*MF.getSubtarget().getRegisterInfo()),
+ TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){};
bool isVgprRegBank(Register Reg);
Register getAsVgpr(Register Reg);
@@ -63,7 +67,19 @@ public:
Register &Val, CstTy &K0, CstTy &K1);
bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+ bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+ bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg);
+ bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg);
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+ void applyClamp(MachineInstr &MI, Register &Reg);
+
+private:
+ AMDGPU::SIModeRegisterDefaults getMode();
+ bool getIEEE();
+ bool getDX10Clamp();
+ bool isFminnumIeee(const MachineInstr &MI);
+ bool isFCst(MachineInstr *MI);
+ bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1);
};
bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) {
@@ -98,6 +114,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
case AMDGPU::G_UMAX:
case AMDGPU::G_UMIN:
return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3};
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM:
+ return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3};
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINNUM_IEEE:
+ return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE,
+ AMDGPU::G_AMDGPU_FMED3};
}
}
@@ -148,6 +171,146 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
return true;
}
+// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
+// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
+// ieee = false : min/max(NaN, K) = K
+// clamp(NaN) = dx10_clamp ? 0.0 : NaN
+// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
+// Other operand commutes (see matchMed) give same result since min and max are
+// commutative.
+
+// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1
+// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
+// Val = SNaN only for ieee = true
+// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
+// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
+// max(min(SNaN, K1), K0) = max(K1, K0) = K1
+// Val = NaN,ieee = false or Val = QNaN,ieee = true
+// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
+// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
+// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
+bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
+ MachineInstr &MI, Med3MatchInfo &MatchInfo) {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32))
+ return false;
+
+ auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
+
+ Register Val;
+ Optional<FPValueAndVReg> K0, K1;
+ // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
+ if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
+ return false;
+
+ if (K0->Value > K1->Value)
+ return false;
+
+ // For IEEE=false perform combine only when it's safe to assume that there are
+ // no NaN inputs. Most often MI is marked with nnan fast math flag.
+ // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
+ // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
+ // nodes(max/min) have same behavior when one input is NaN and other isn't.
+ // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
+ // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
+ if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) {
+ // Don't fold single use constant that can't be inlined.
+ if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) &&
+ (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) {
+ MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI,
+ Register &Reg) {
+ // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
+ auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
+ Register Val;
+ Optional<FPValueAndVReg> K0, K1;
+ // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
+ if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
+ return false;
+
+ if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0))
+ return false;
+
+ // For IEEE=false perform combine only when it's safe to assume that there are
+ // no NaN inputs. Most often MI is marked with nnan fast math flag.
+ // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
+ // to 0.0 requires dx10_clamp = true.
+ if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
+ isKnownNeverSNaN(Val, MRI)) ||
+ isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) {
+ Reg = Val;
+ return true;
+ }
+
+ return false;
+}
+
+// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
+// Val = SNaN only for ieee = true. It is important which operand is NaN.
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
+// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
+// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
+// Val = NaN,ieee = false or Val = QNaN,ieee = true
+// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
+// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
+// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
+bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI,
+ Register &Reg) {
+ if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3)
+ return false;
+
+ // In llvm-ir, clamp is often represented as an intrinsic call to
+ // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
+ MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+ MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
+ MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
+
+ if (isFCst(Src0) && !isFCst(Src1))
+ std::swap(Src0, Src1);
+ if (isFCst(Src1) && !isFCst(Src2))
+ std::swap(Src1, Src2);
+ if (isFCst(Src0) && !isFCst(Src1))
+ std::swap(Src0, Src1);
+ if (!isClampZeroToOne(Src1, Src2))
+ return false;
+
+ Register Val = Src0->getOperand(0).getReg();
+
+ auto isOp3Zero = [&]() {
+ MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
+ if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
+ return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0);
+ return false;
+ };
+ // For IEEE=false perform combine only when it's safe to assume that there are
+ // no NaN inputs. Most often MI is marked with nnan fast math flag.
+ // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
+ // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
+ if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) ||
+ (getIEEE() && getDX10Clamp() &&
+ (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) {
+ Reg = Val;
+ return true;
+ }
+
+ return false;
+}
+
+void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) {
+ B.setInstrAndDebugLoc(MI);
+ B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
+ MI.getFlags());
+ MI.eraseFromParent();
+}
+
void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
Med3MatchInfo &MatchInfo) {
B.setInstrAndDebugLoc(MI);
@@ -158,6 +321,33 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
MI.eraseFromParent();
}
+AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() {
+ return MF.getInfo<SIMachineFunctionInfo>()->getMode();
+}
+
+bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; }
+
+bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; }
+
+bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) {
+ return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
+}
+
+bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) {
+ return MI->getOpcode() == AMDGPU::G_FCONSTANT;
+}
+
+bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0,
+ MachineInstr *K1) {
+ if (isFCst(K0) && isFCst(K1)) {
+ const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm();
+ const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm();
+ return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) ||
+ (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0));
+ }
+ return false;
+}
+
class AMDGPURegBankCombinerHelperState {
protected:
CombinerHelper &Helper;
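
To make the shape of the new reg-bank combines concrete, here is a minimal sketch of the MIR pattern matchFPMinMaxToClamp() looks for, built with MachineIRBuilder. It is not part of the patch: buildClampCandidate is an illustrative name, and the builder B and value register Val are assumed to come from surrounding GlobalISel code with the usual headers available.

// Minimal sketch: builds min(max(Val, 0.0), 1.0) on an s32 value, the shape
// the fp_minmax_to_clamp rule matches; the expected rewrites are in comments.
static Register buildClampCandidate(MachineIRBuilder &B, Register Val) {
  LLT S32 = LLT::scalar(32);
  auto Zero = B.buildFConstant(S32, 0.0);
  auto One = B.buildFConstant(S32, 1.0);
  // Use the IEEE variants, which post-regbankselect MIR normally carries
  // when the IEEE mode bit is set.
  auto Max = B.buildInstr(AMDGPU::G_FMAXNUM_IEEE, {S32}, {Val, Zero});
  auto Min = B.buildInstr(AMDGPU::G_FMINNUM_IEEE, {S32}, {Max, One});
  // With IEEE=1, DX10Clamp=1 and Val not a signaling NaN (or the result known
  // never to be NaN), the combiner rewrites Min into: G_AMDGPU_CLAMP %Val.
  // With two inline constants K0 <= K1 instead of 0.0/1.0, the
  // fp_minmax_to_med3 rule produces G_AMDGPU_FMED3 %Val, %K0, %K1 instead.
  return Min.getReg(0);
}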
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5988403c0a29..c60012bcfe2e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -707,9 +707,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs,
MachineRegisterInfo &MRI) const {
- SmallVector<Register, 4> ResultRegs;
- SmallVector<Register, 4> InitResultRegs;
- SmallVector<Register, 4> PhiRegs;
// Track use registers which have already been expanded with a readfirstlane
// sequence. This may have multiple uses if moving a sequence.
@@ -774,15 +771,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addReg(NewExec)
.addMBB(LoopBB);
- for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
- B.buildInstr(TargetOpcode::G_PHI)
- .addDef(std::get<2>(Result))
- .addReg(std::get<0>(Result)) // Initial value / implicit_def
- .addMBB(&MBB)
- .addReg(std::get<1>(Result)) // Mid-loop value.
- .addMBB(LoopBB);
- }
-
const DebugLoc &DL = B.getDL();
MachineInstr &FirstInst = *Range.begin();
@@ -1174,18 +1162,25 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
if (MMO->getAlign() < Align(16)) {
+ MachineFunction *MF = MI.getParent()->getParent();
+ ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
+ MachineIRBuilder B(MI, ApplyBank);
+ LegalizerHelper Helper(*MF, ApplyBank, B);
LLT Part64, Part32;
std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
- auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
- auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
-
- auto Undef = B.buildUndef(LoadTy);
- auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
- B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
+ LegalizerHelper::Legalized)
+ return false;
+ return true;
} else {
LLT WiderTy = widen96To128(LoadTy);
auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
- B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ if (WiderTy.isScalar())
+ B.buildTrunc(MI.getOperand(0), WideLoad);
+ else {
+ B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
+ WideLoad);
+ }
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
index d55bf3917e9c..2475b44b42a3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -87,6 +87,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -110,6 +111,18 @@ using namespace llvm;
namespace {
+namespace AMDGPU {
+/// Collect all the instructions where user \p U belongs to. \p U could be
+/// instruction itself or it could be a constant expression which is used within
+/// an instruction. If \p CollectKernelInsts is true, collect instructions only
+/// from kernels, otherwise collect instructions only from non-kernel functions.
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts);
+
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
+
+} // namespace AMDGPU
+
class ReplaceLDSUseImpl {
Module &M;
LLVMContext &Ctx;
@@ -127,7 +140,8 @@ class ReplaceLDSUseImpl {
// Collect LDS which requires their uses to be replaced by pointer.
std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
// Collect LDS which requires module lowering.
- std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M);
+ std::vector<GlobalVariable *> LDSGlobals =
+ llvm::AMDGPU::findVariablesToLower(M);
// Remove LDS which don't qualify for replacement.
llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) {
@@ -172,7 +186,7 @@ class ReplaceLDSUseImpl {
AMDGPUAS::LOCAL_ADDRESS);
LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer));
+ LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer));
// Mark that an associated LDS pointer is created for LDS.
LDSToPointer[GV] = LDSPointer;
@@ -245,10 +259,9 @@ class ReplaceLDSUseImpl {
auto FunctionToInsts =
AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/);
- for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end();
- FI != FE; ++FI) {
- Function *F = FI->first;
- auto &Insts = FI->second;
+ for (const auto &FunctionToInst : FunctionToInsts) {
+ Function *F = FunctionToInst.first;
+ auto &Insts = FunctionToInst.second;
for (auto *I : Insts) {
// If `U` is a constant expression, then we need to break the
// associated instruction into a set of separate instructions by
@@ -341,10 +354,9 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
// Traverse through each kernel K, check and if required, initialize the
// LDS pointer to point to LDS within K.
- for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE;
- ++KI) {
- Function *K = KI->first;
- SmallPtrSet<Function *, 8> Callees = KI->second;
+ for (const auto &KernelToCallee : KernelToCallees) {
+ Function *K = KernelToCallee.first;
+ SmallPtrSet<Function *, 8> Callees = KernelToCallee.second;
// Compute reachable and LDS used callees for kernel K.
set_intersect(Callees, LDSAccessors);
@@ -378,6 +390,184 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
return true;
}
+namespace AMDGPU {
+
+// An helper class for collecting all reachable callees for each kernel defined
+// within the module.
+class CollectReachableCallees {
+ Module &M;
+ CallGraph CG;
+ SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
+
+ // Collect all address taken functions within the module.
+ void collectAddressTakenFunctions() {
+ auto *ECNode = CG.getExternalCallingNode();
+
+ for (const auto &GI : *ECNode) {
+ auto *CGN = GI.second;
+ auto *F = CGN->getFunction();
+ if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F))
+ continue;
+ AddressTakenFunctions.insert(CGN);
+ }
+ }
+
+ // For given kernel, collect all its reachable non-kernel functions.
+ SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
+ SmallPtrSet<Function *, 8> ReachableCallees;
+
+ // Call graph node which represents this kernel.
+ auto *KCGN = CG[K];
+
+    // Walk all call graph nodes reachable from the node representing this
+    // kernel and visit their call sites. For a direct call site, add the
+    // callee to the reachable set. For an indirect call site, conservatively
+    // add every address-taken function whose type matches the call site, and
+    // repeat the traversal from those newly added callee nodes.
+ //
+ // FIXME: Need to handle bit-casted function pointers.
+ //
+ SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN));
+ SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+ while (!CGNStack.empty()) {
+ auto *CGN = CGNStack.pop_back_val();
+
+ if (!VisitedCGNodes.insert(CGN).second)
+ continue;
+
+      // Ignore call graph nodes that have no associated function, or whose
+      // associated function is only a declaration.
+ if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
+ continue;
+
+ for (const auto &GI : *CGN) {
+ auto *RCB = cast<CallBase>(GI.first.getValue());
+ auto *RCGN = GI.second;
+
+ if (auto *DCallee = RCGN->getFunction()) {
+ ReachableCallees.insert(DCallee);
+ } else if (RCB->isIndirectCall()) {
+ auto *RCBFTy = RCB->getFunctionType();
+ for (auto *ACGN : AddressTakenFunctions) {
+ auto *ACallee = ACGN->getFunction();
+ if (ACallee->getFunctionType() == RCBFTy) {
+ ReachableCallees.insert(ACallee);
+ CGNStack.append(df_begin(ACGN), df_end(ACGN));
+ }
+ }
+ }
+ }
+ }
+
+ return ReachableCallees;
+ }
+
+public:
+ explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
+ // Collect address taken functions.
+ collectAddressTakenFunctions();
+ }
+
+ void collectReachableCallees(
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ // Collect reachable callee set for each kernel defined in the module.
+ for (Function &F : M.functions()) {
+ if (!llvm::AMDGPU::isKernelCC(&F))
+ continue;
+ Function *K = &F;
+ KernelToCallees[K] = collectReachableCallees(K);
+ }
+ }
+};
+
+/// Collect reachable callees for each kernel defined in the module \p M and
+/// record them in \p KernelToCallees.
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ CollectReachableCallees CRC{M};
+ CRC.collectReachableCallees(KernelToCallees);
+}
+
+/// For the given LDS global \p GV, visit all its users, collect all non-kernel
+/// functions in which \p GV is used, and return that set.
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
+ SmallPtrSet<Function *, 8> LDSAccessors;
+ SmallVector<User *, 8> UserStack(GV->users());
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ while (!UserStack.empty()) {
+ auto *U = UserStack.pop_back_val();
+
+    // Skip `U` if it has already been visited.
+ if (!VisitedUsers.insert(U).second)
+ continue;
+
+    // `U` is a global variable that is initialized with the LDS; give up on
+    // replacing this LDS.
+ if (isa<GlobalValue>(U))
+ return SmallPtrSet<Function *, 8>();
+
+ // Recursively explore constant users.
+ if (isa<Constant>(U)) {
+ append_range(UserStack, U->users());
+ continue;
+ }
+
+    // `U` should be an instruction; if it belongs to a non-kernel function F,
+    // then collect F.
+ Function *F = cast<Instruction>(U)->getFunction();
+ if (!llvm::AMDGPU::isKernelCC(F))
+ LDSAccessors.insert(F);
+ }
+
+ return LDSAccessors;
+}
+
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
+ DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
+ SmallVector<User *, 8> UserStack;
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ UserStack.push_back(U);
+
+ while (!UserStack.empty()) {
+ auto *UU = UserStack.pop_back_val();
+
+ if (!VisitedUsers.insert(UU).second)
+ continue;
+
+ if (isa<GlobalValue>(UU))
+ continue;
+
+ if (isa<Constant>(UU)) {
+ append_range(UserStack, UU->users());
+ continue;
+ }
+
+ auto *I = cast<Instruction>(UU);
+ Function *F = I->getFunction();
+ if (CollectKernelInsts) {
+ if (!llvm::AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ } else {
+ if (llvm::AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ }
+
+ FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
+ FunctionToInsts[F].insert(I);
+ }
+
+ return FunctionToInsts;
+}
+
+} // namespace AMDGPU
+
// Entry-point function which interface ReplaceLDSUseImpl with outside of the
// class.
bool ReplaceLDSUseImpl::replaceLDSUse() {
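The reachable-callee collection added above boils down to a worklist walk that treats direct and indirect call sites differently. A minimal standalone sketch of that idea, using a simplified call-graph representation rather than LLVM's CallGraph API (the Node struct, Signature string, and AddressTaken flag are illustrative stand-ins):

    #include <string>
    #include <unordered_set>
    #include <vector>

    struct Node {
      std::string Signature;                 // stand-in for a FunctionType
      bool AddressTaken = false;
      std::vector<Node *> DirectCallees;     // callees of direct call sites
      std::vector<std::string> IndirectSigs; // types of indirect call sites
    };

    // Collect every node reachable from Kernel. Direct callees are added as
    // found; an indirect call site is conservatively resolved to every
    // address-taken node whose signature matches, and the walk continues from
    // those nodes, mirroring the FunctionType check in the pass above.
    std::unordered_set<Node *>
    collectReachable(Node *Kernel, const std::vector<Node *> &AllNodes) {
      std::unordered_set<Node *> Reachable, Visited;
      std::vector<Node *> Worklist{Kernel};
      while (!Worklist.empty()) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(N).second)
          continue; // already processed
        for (Node *Callee : N->DirectCallees) {
          Reachable.insert(Callee);
          Worklist.push_back(Callee);
        }
        for (const std::string &Sig : N->IndirectSigs) {
          for (Node *Candidate : AllNodes) {
            if (Candidate->AddressTaken && Candidate->Signature == Sig) {
              Reachable.insert(Candidate);
              Worklist.push_back(Candidate);
            }
          }
        }
      }
      return Reachable;
    }
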
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0655b4342ba1..cd05797fdbdb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -413,21 +413,21 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
case AMDGPU::V_MAX_I16_e32:
case AMDGPU::V_MIN_I16_e64:
case AMDGPU::V_MIN_I16_e32:
+ case AMDGPU::V_MAD_F16_e64:
+ case AMDGPU::V_MAD_U16_e64:
+ case AMDGPU::V_MAD_I16_e64:
+ case AMDGPU::V_FMA_F16_e64:
+ case AMDGPU::V_DIV_FIXUP_F16_e64:
// On gfx10, all 16-bit instructions preserve the high bits.
return getGeneration() <= AMDGPUSubtarget::GFX9;
- case AMDGPU::V_MAD_F16_e64:
case AMDGPU::V_MADAK_F16:
case AMDGPU::V_MADMK_F16:
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_MAC_F16_e32:
case AMDGPU::V_FMAMK_F16:
case AMDGPU::V_FMAAK_F16:
- case AMDGPU::V_MAD_U16_e64:
- case AMDGPU::V_MAD_I16_e64:
- case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_e32:
- case AMDGPU::V_DIV_FIXUP_F16_e64:
// In gfx9, the preferred handling of the unused high 16-bits changed. Most
// instructions maintain the legacy behavior of 0ing. Some instructions
// changed to preserving the high bits.
@@ -648,9 +648,18 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
+ assert(AMDGPU::isKernel(F.getCallingConv()));
+
+ // We don't allocate the segment if we know the implicit arguments weren't
+ // used, even if the ABI implies we need them.
+ if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
+ return 0;
+
if (isMesaKernel(F))
return 16;
- return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+
+  // Assume all implicit inputs are used by default.
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
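The new getImplicitArgNumBytes behaviour above reduces to a short decision tree. A hedged restatement in plain C++, with the attribute and Mesa queries abstracted into boolean parameters (these parameter names are illustrative, not LLVM API):

    #include <cstdint>

    // Sketch of the sizing logic above: no segment at all when the kernel is
    // marked "amdgpu-no-implicitarg-ptr", the 16-byte convention for Mesa
    // kernels, and otherwise the attribute value with a default of 56 bytes
    // (all implicit inputs assumed used).
    uint64_t implicitArgNumBytes(bool HasNoImplicitArgPtr, bool IsMesaKernel,
                                 uint64_t AttrValueOr56) {
      if (HasNoImplicitArgPtr)
        return 0;
      if (IsMesaKernel)
        return 16;
      return AttrValueOr56; // "amdgpu-implicitarg-num-bytes", default 56
    }
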
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index de11676279f2..a2c61f9da8da 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -231,13 +231,6 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
- "amdgpu-fixed-function-abi",
- cl::desc("Enable all implicit function arguments"),
- cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
- cl::init(false),
- cl::Hidden);
-
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
@@ -505,7 +498,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
-bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 0ff2db2a52d9..226646a96953 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -37,7 +37,6 @@ protected:
public:
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
- static bool EnableFixedFunctionABI;
static bool EnableLowerModuleLDS;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ecdbdf613a53..09c5eb192e1f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -519,57 +519,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
- EVT OrigTy = TLI->getValueType(DL, Ty);
- if (!OrigTy.isSimple()) {
- // FIXME: We're having to query the throughput cost so that the basic
- // implementation tries to generate legalize and scalarization costs. Maybe
- // we could hoist the scalarization code here?
- if (CostKind != TTI::TCK_CodeSize)
- return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
- Opd1Info, Opd2Info, Opd1PropInfo,
- Opd2PropInfo, Args, CxtI);
- // Scalarization
-
- // Check if any of the operands are vector operands.
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
- std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
-
- bool IsFloat = Ty->isFPOrFPVectorTy();
- // Assume that floating point arithmetic operations cost twice as much as
- // integer operations.
- unsigned OpCost = (IsFloat ? 2 : 1);
-
- if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
- // The operation is legal. Assume it costs 1.
- // TODO: Once we have extract/insert subvector cost we need to use them.
- return LT.first * OpCost;
- }
-
- if (!TLI->isOperationExpand(ISD, LT.second)) {
- // If the operation is custom lowered, then assume that the code is twice
- // as expensive.
- return LT.first * 2 * OpCost;
- }
-
- // Else, assume that we need to scalarize this op.
- // TODO: If one of the types get legalized by splitting, handle this
- // similarly to what getCastInstrCost() does.
- if (auto *VTy = dyn_cast<VectorType>(Ty)) {
- unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
- InstructionCost Cost = getArithmeticInstrCost(
- Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo, Args, CxtI);
- // Return the cost of multiple scalar invocation plus the cost of
- // inserting and extracting the values.
- SmallVector<Type *> Tys(Args.size(), Ty);
- return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
- }
-
- // We don't know anything about this scalar instruction.
- return OpCost;
- }
// Legalize the type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -742,40 +691,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
Type *RetTy = ICA.getReturnType();
- EVT OrigTy = TLI->getValueType(DL, RetTy);
- if (!OrigTy.isSimple()) {
- if (CostKind != TTI::TCK_CodeSize)
- return BaseT::getIntrinsicInstrCost(ICA, CostKind);
-
- // TODO: Combine these two logic paths.
- if (ICA.isTypeBasedOnly())
- return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
-
- unsigned RetVF =
- (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
- : 1);
- const IntrinsicInst *I = ICA.getInst();
- const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
- FastMathFlags FMF = ICA.getFlags();
- // Assume that we need to scalarize this intrinsic.
-
- // Compute the scalarization overhead based on Args for a vector
- // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
- // CostModel will pass a vector RetTy and VF is 1.
- InstructionCost ScalarizationCost = InstructionCost::getInvalid();
- if (RetVF > 1) {
- ScalarizationCost = 0;
- if (!RetTy->isVoidTy())
- ScalarizationCost +=
- getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
- ScalarizationCost +=
- getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
- }
-
- IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
- ScalarizationCost);
- return getIntrinsicInstrCost(Attrs, CostKind);
- }
// Legalize the type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 712f6dece911..1736c078eb83 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -173,10 +173,8 @@ protected:
}
static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
- for (MachineLoop::iterator iter = LoopInfo.begin(),
- iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
- (*iter)->print(dbgs());
- }
+ for (const MachineLoop *L : LoopInfo)
+ L->print(dbgs());
}
// UTILITY FUNCTIONS
@@ -691,9 +689,7 @@ bool AMDGPUCFGStructurizer::prepare() {
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
// Add an ExitBlk to loops that don't have one
- for (MachineLoopInfo::iterator It = MLI->begin(),
- E = MLI->end(); It != E; ++It) {
- MachineLoop *LoopRep = (*It);
+ for (MachineLoop *LoopRep : *MLI) {
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
@@ -827,14 +823,13 @@ bool AMDGPUCFGStructurizer::run() {
wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
// Detach retired Block, release memory.
- for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
- It != E; ++It) {
- if ((*It).second && (*It).second->IsRetired) {
- assert(((*It).first)->getNumber() != -1);
- LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";);
- (*It).first->eraseFromParent(); //Remove from the parent Function.
+ for (auto &It : BlockInfoMap) {
+ if (It.second && It.second->IsRetired) {
+ assert((It.first)->getNumber() != -1);
+ LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";);
+ It.first->eraseFromParent(); // Remove from the parent Function.
}
- delete (*It).second;
+ delete It.second;
}
BlockInfoMap.clear();
LLInfoMap.clear();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 4acd77a9d5d2..2bb59086f391 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -246,8 +246,12 @@ public:
return isRegKind() && !hasModifiers();
}
+ bool isRegOrInline(unsigned RCID, MVT type) const {
+ return isRegClass(RCID) || isInlinableImm(type);
+ }
+
bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const {
- return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type);
+ return isRegOrInline(RCID, type) || isLiteralImm(type);
}
bool isRegOrImmWithInt16InputMods() const {
@@ -372,7 +376,7 @@ public:
bool isInlineValue() const;
bool isRegOrInlineNoMods(unsigned RCID, MVT type) const {
- return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers();
+ return isRegOrInline(RCID, type) && !hasModifiers();
}
bool isSCSrcB16() const {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
index d3644db7cf8b..a535c8cc0918 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -6,11 +6,11 @@
//
//===----------------------------------------------------------------------===//
-def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
-def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
+def MUBUFAddr64 : ComplexPattern<iPTR, 4, "SelectMUBUFAddr64">;
+def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">;
-def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
-def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
+def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
+def MUBUFScratchOffset : ComplexPattern<iPTR, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
def BUFAddrKind {
int Offset = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
index bb0aa648ff90..c7ec5308e6d0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -6,12 +6,12 @@
//
//===----------------------------------------------------------------------===//
-def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>;
-def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>;
-def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>;
+def FlatOffset : ComplexPattern<iPTR, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>;
+def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>;
-def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
-def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
+def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index f3f664f7972a..912bcc792e4d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -120,8 +120,7 @@ unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
// We will now look at each of the currently executing instructions
// to find out if this wait instruction still needs to wait.
- for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
- const InstRef &PrevIR = *I;
+ for (const InstRef &PrevIR : IssuedInst) {
const Instruction &PrevInst = *PrevIR.getInstruction();
const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 29c37c706138..8a48a67b829c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -440,9 +440,8 @@ private:
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount);
- for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
- BB->splice(InsertPos, BB, Clause.second[i]);
- }
+ for (MachineInstr *MI : Clause.second)
+ BB->splice(InsertPos, BB, MI);
CfCount += 2 * Clause.second.size();
}
@@ -452,9 +451,8 @@ private:
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount);
- for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
- BB->splice(InsertPos, BB, Clause.second[i]);
- }
+ for (MachineInstr *MI : Clause.second)
+ BB->splice(InsertPos, BB, MI);
CfCount += Clause.second.size();
}
@@ -635,10 +633,10 @@ public:
CfCount++;
}
MI->eraseFromParent();
- for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
- EmitFetchClause(I, DL, FetchClauses[i], CfCount);
- for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
- EmitALUClause(I, DL, AluClauses[i], CfCount);
+ for (ClauseFile &CF : FetchClauses)
+ EmitFetchClause(I, DL, CF, CfCount);
+ for (ClauseFile &CF : AluClauses)
+ EmitALUClause(I, DL, CF, CfCount);
break;
}
default:
@@ -649,8 +647,7 @@ public:
break;
}
}
- for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
- MachineInstr *Alu = ToPopAfter[i];
+ for (MachineInstr *Alu : ToPopAfter) {
BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
TII->get(R600::CF_ALU_POP_AFTER))
.addImm(Alu->getOperand(0).getImm())
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index a7ebf72315cb..aec8b1ae4837 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -268,17 +268,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
{R600::OpName::src1_W, R600::OpName::src1_sel_W},
};
- for (unsigned j = 0; j < 8; j++) {
- MachineOperand &MO =
- MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
+ for (const auto &Op : OpTable) {
+ MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[0]));
Register Reg = MO.getReg();
if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
- MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+ MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
-
}
return Result;
}
@@ -289,15 +287,14 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
{R600::OpName::src2, R600::OpName::src2_sel},
};
- for (unsigned j = 0; j < 3; j++) {
- int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]);
+ for (const auto &Op : OpTable) {
+ int SrcIdx = getOperandIdx(MI.getOpcode(), Op[0]);
if (SrcIdx < 0)
break;
MachineOperand &MO = MI.getOperand(SrcIdx);
Register Reg = MO.getReg();
if (Reg == R600::ALU_CONST) {
- MachineOperand &Sel =
- MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+ MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1]));
Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
@@ -521,12 +518,11 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
ValidSwizzle.clear();
unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
- for (unsigned i = 0, e = IG.size(); i < e; ++i) {
- IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
- unsigned Op = getOperandIdx(IG[i]->getOpcode(),
- R600::OpName::bank_swizzle);
- ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
- IG[i]->getOperand(Op).getImm());
+ for (MachineInstr *MI : IG) {
+ IGSrcs.push_back(ExtractSrcs(*MI, PV, ConstCount));
+ unsigned Op = getOperandIdx(MI->getOpcode(), R600::OpName::bank_swizzle);
+ ValidSwizzle.push_back(
+ (R600InstrInfo::BankSwizzle)MI->getOperand(Op).getImm());
}
std::vector<std::pair<int, unsigned>> TransOps;
if (!isLastAluTrans)
@@ -542,8 +538,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
ALU_VEC_120_SCL_212,
ALU_VEC_102_SCL_221
};
- for (unsigned i = 0; i < 4; i++) {
- TransBS = TransSwz[i];
+ for (R600InstrInfo::BankSwizzle TransBS : TransSwz) {
if (!isConstCompatible(TransBS, TransOps, ConstCount))
continue;
bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
@@ -562,9 +557,9 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
const {
assert (Consts.size() <= 12 && "Too many operands in instructions group");
unsigned Pair1 = 0, Pair2 = 0;
- for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
- unsigned ReadConstHalf = Consts[i] & 2;
- unsigned ReadConstIndex = Consts[i] & (~3);
+ for (unsigned Const : Consts) {
+ unsigned ReadConstHalf = Const & 2;
+ unsigned ReadConstIndex = Const & (~3);
unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
if (!Pair1) {
Pair1 = ReadHalfConst;
@@ -587,12 +582,11 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
const {
std::vector<unsigned> Consts;
SmallSet<int64_t, 4> Literals;
- for (unsigned i = 0, n = MIs.size(); i < n; i++) {
- MachineInstr &MI = *MIs[i];
- if (!isALUInstr(MI.getOpcode()))
+ for (MachineInstr *MI : MIs) {
+ if (!isALUInstr(MI->getOpcode()))
continue;
- for (const auto &Src : getSrcs(MI)) {
+ for (const auto &Src : getSrcs(*MI)) {
if (Src.first->getReg() == R600::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
@@ -1330,11 +1324,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel))
.setReg(MO.getReg());
- for (unsigned i = 0; i < 14; i++) {
+ for (unsigned Operand : Operands) {
MachineOperand &MO = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
+ getOperandIdx(MI->getOpcode(), getSlotedOps(Operand, Slot)));
assert (MO.isImm());
- setImmOperand(*MIB, Operands[i], MO.getImm());
+ setImmOperand(*MIB, Operand, MO.getImm());
}
MIB->getOperand(20).setImm(0);
return MIB;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 6aee2f591b56..d26879ed8d60 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -328,9 +328,9 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
void R600SchedStrategy::LoadAlu() {
std::vector<SUnit *> &QSrc = Pending[IDAlu];
- for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
- AluKind AK = getAluKind(QSrc[i]);
- AvailableAlus[AK].push_back(QSrc[i]);
+ for (SUnit *SU : QSrc) {
+ AluKind AK = getAluKind(SU);
+ AvailableAlus[AK].push_back(SU);
}
QSrc.clear();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index ac6a3581e255..aa156190b7ae 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -307,8 +307,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass {
// Build new MDNode.
SmallVector<Metadata *, 6> KernelMDArgs;
KernelMDArgs.push_back(ConstantAsMetadata::get(NewF));
- for (unsigned i = 0; i < NumKernelArgMDNodes; ++i)
- KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i]));
+ for (const MDVector &MDV : NewArgMDs.ArgVector)
+ KernelMDArgs.push_back(MDNode::get(*Context, MDV));
MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs);
return std::make_tuple(NewF, NewMDNode);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 72cf48c04e7f..795bc898a7bf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -150,19 +150,18 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap)
const {
unsigned CurrentUndexIdx = 0;
- for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(),
- E = ToMerge->RegToChan.end(); It != E; ++It) {
+ for (auto &It : ToMerge->RegToChan) {
DenseMap<Register, unsigned>::const_iterator PosInUntouched =
- Untouched->RegToChan.find((*It).first);
+ Untouched->RegToChan.find(It.first);
if (PosInUntouched != Untouched->RegToChan.end()) {
- Remap.push_back(std::pair<unsigned, unsigned>
- ((*It).second, (*PosInUntouched).second));
+ Remap.push_back(
+ std::pair<unsigned, unsigned>(It.second, (*PosInUntouched).second));
continue;
}
if (CurrentUndexIdx >= Untouched->UndefReg.size())
return false;
- Remap.push_back(std::pair<unsigned, unsigned>
- ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
+ Remap.push_back(std::pair<unsigned, unsigned>(
+ It.second, Untouched->UndefReg[CurrentUndexIdx++]));
}
return true;
@@ -172,9 +171,9 @@ static
unsigned getReassignedChan(
const std::vector<std::pair<unsigned, unsigned>> &RemapChan,
unsigned Chan) {
- for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
- if (RemapChan[j].first == Chan)
- return RemapChan[j].second;
+ for (const auto &J : RemapChan) {
+ if (J.first == Chan)
+ return J.second;
}
llvm_unreachable("Chan wasn't reassigned");
}
@@ -190,11 +189,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
Register SrcVec = BaseRSI->Instr->getOperand(0).getReg();
DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
std::vector<Register> UpdatedUndef = BaseRSI->UndefReg;
- for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(),
- E = RSI->RegToChan.end(); It != E; ++It) {
+ for (const auto &It : RSI->RegToChan) {
Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
- unsigned SubReg = (*It).first;
- unsigned Swizzle = (*It).second;
+ unsigned SubReg = It.first;
+ unsigned Swizzle = It.second;
unsigned Chan = getReassignedChan(RemapChan, Swizzle);
MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG),
@@ -234,14 +232,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
}
void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
- for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
- E = PreviousRegSeqByReg.end(); It != E; ++It) {
- std::vector<MachineInstr *> &MIs = (*It).second;
+ for (auto &It : PreviousRegSeqByReg) {
+ std::vector<MachineInstr *> &MIs = It.second;
MIs.erase(llvm::find(MIs, MI), MIs.end());
}
- for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
- E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
- std::vector<MachineInstr *> &MIs = (*It).second;
+ for (auto &It : PreviousRegSeqByUndefCount) {
+ std::vector<MachineInstr *> &MIs = It.second;
MIs.erase(llvm::find(MIs, MI), MIs.end());
}
}
@@ -255,9 +251,9 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
Offset = 3;
for (unsigned i = 0; i < 4; i++) {
unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
- for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
- if (RemapChan[j].first == Swizzle) {
- MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
+ for (const auto &J : RemapChan) {
+ if (J.first == Swizzle) {
+ MI.getOperand(i + Offset).setImm(J.second - 1);
break;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index beb0aad86e89..fbe2a1cd9fba 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -127,8 +127,8 @@ private:
R600::OpName::src1,
R600::OpName::src2
};
- for (unsigned i = 0; i < 3; i++) {
- int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
+ for (unsigned Op : Ops) {
+ int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Op);
if (OperandIdx < 0)
continue;
Register Src = MI.getOperand(OperandIdx).getReg();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 99a1a8e9871a..c329bae50f92 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -54,10 +54,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE);
reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR);
- for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(),
- E = R600::R600_AddrRegClass.end(); I != E; ++I) {
- reserveRegisterTuples(Reserved, *I);
- }
+ for (MCPhysReg R : R600::R600_AddrRegClass)
+ reserveRegisterTuples(Reserved, R);
TII->reserveIndirectRegisters(Reserved, MF, *this);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 200e00ee5521..1f93284fc7ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1620,7 +1620,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
// Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
// in which case we can erase them all later in runOnMachineFunction.
if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
- MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ MI.eraseFromParent();
return true;
}
@@ -1821,7 +1821,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
auto &SrcOp = InstToErase->getOperand(1);
auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
- InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
+ InstToErase->eraseFromParent();
InstToErase = nullptr;
if (!SrcReg || SrcReg.isPhysical())
break;
@@ -1831,7 +1831,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
}
if (InstToErase && InstToErase->isRegSequence() &&
MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg()))
- InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
+ InstToErase->eraseFromParent();
}
}
return true;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 4706c74be721..d4fe74ecb96e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1167,11 +1167,13 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (SpillVGPRToAGPR) {
// To track the spill frame indices handled in this pass.
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
+ BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
bool SeenDbgInstr = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ int FrameIndex;
if (MI.isDebugInstr())
SeenDbgInstr = true;
@@ -1191,10 +1193,18 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
SpillFIs.set(FI);
continue;
}
- }
+ } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
+ TII->isLoadFromStackSlot(MI, FrameIndex))
+ NonVGPRSpillFIs.set(FrameIndex);
}
}
+  // Stack slot coloring may assign different objects to the same stack slot.
+  // If no other stack access shares a VGPR-to-AGPR spill slot, that slot is
+  // dead.
+ for (unsigned FI : SpillFIs.set_bits())
+ if (!NonVGPRSpillFIs.test(FI))
+ FuncInfo->setVGPRToAGPRSpillDead(FI);
+
for (MachineBasicBlock &MBB : MF) {
for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
MBB.addLiveIn(Reg);
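The two bit vectors introduced above exist only to answer one question per frame index: did anything other than a VGPR-to-AGPR spill touch this slot after stack slot coloring? A small self-contained sketch of that check, with std::vector<bool> standing in for llvm::BitVector and both vectors assumed to be sized to the number of frame objects:

    #include <cstddef>
    #include <vector>

    // Returns the frame indices whose only users were VGPR-to-AGPR spills,
    // i.e. the spill slots that can be marked dead.
    std::vector<std::size_t>
    deadSpillSlots(const std::vector<bool> &SpillFIs,
                   const std::vector<bool> &NonVGPRSpillFIs) {
      std::vector<std::size_t> Dead;
      for (std::size_t FI = 0; FI < SpillFIs.size(); ++FI)
        if (SpillFIs[FI] && !NonVGPRSpillFIs[FI])
          Dead.push_back(FI); // no other load/store shares this slot
      return Dead;
    }
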
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 35b72f5d201b..9f138136e6e9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -2062,33 +2063,30 @@ void SITargetLowering::allocateSpecialInputSGPRs(
SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
- // We need to allocate these in place regardless of their use.
- const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI;
-
// TODO: Unify handling with private memory pointers.
- if (IsFixed || Info.hasDispatchPtr())
+ if (Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (IsFixed || Info.hasQueuePtr())
+ if (Info.hasQueuePtr())
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
- if (IsFixed || Info.hasImplicitArgPtr())
+ if (Info.hasImplicitArgPtr())
allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
- if (IsFixed || Info.hasDispatchID())
+ if (Info.hasDispatchID())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
- if (IsFixed || Info.hasWorkGroupIDX())
+ if (Info.hasWorkGroupIDX())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
- if (IsFixed || Info.hasWorkGroupIDY())
+ if (Info.hasWorkGroupIDY())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
- if (IsFixed || Info.hasWorkGroupIDZ())
+ if (Info.hasWorkGroupIDZ())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
}
@@ -2419,10 +2417,9 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
- } else {
+ } else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
- if (AMDGPUTargetMachine::EnableFixedFunctionABI)
- allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+ allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
if (IsKernel) {
@@ -2549,17 +2546,13 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
- // Special inputs come after user arguments.
- allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
- }
-
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
- allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+ if (!IsGraphics)
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
auto &ArgUsageInfo =
@@ -3123,8 +3116,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
- CallConv != CallingConv::AMDGPU_Gfx) {
+ if (CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -3263,12 +3255,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- CallConv != CallingConv::AMDGPU_Gfx) {
- // Copy special input registers after user input arguments.
- passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
- }
-
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -6282,10 +6268,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
- // Push back extra arguments.
- for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++)
- VAddrs.push_back(Op.getOperand(ArgOffset + I));
-
// Check for 16 bit addresses or derivatives and pack if true.
MVT VAddrVT =
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
@@ -6298,6 +6280,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
+ // Push back extra arguments.
+ for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
+ if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
+      // Special handling of bias when A16 is on: the bias is of type half
+      // but occupies a full 32-bit vaddr slot, so pad it with an undef high
+      // half.
+      SDValue Bias = DAG.getBuildVector(
+          MVT::v2f16, DL,
+          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
+      VAddrs.push_back(Bias);
+ } else
+ VAddrs.push_back(Op.getOperand(ArgOffset + I));
+ }
+
if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
// 16 bit gradients are supported, but are tied to the A16 control
// so both gradients and addresses must be 16 bit
@@ -7502,8 +7495,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
assert(NodePtr.getValueType() == MVT::i32 ||
NodePtr.getValueType() == MVT::i64);
- assert(RayDir.getValueType() == MVT::v4f16 ||
- RayDir.getValueType() == MVT::v4f32);
+ assert(RayDir.getValueType() == MVT::v3f16 ||
+ RayDir.getValueType() == MVT::v3f32);
if (!Subtarget->hasGFX10_AEncoding()) {
emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
@@ -9837,11 +9830,13 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
if (Opcode == AMDGPU::G_FCANONICALIZE)
return true;
- if (Opcode == AMDGPU::G_FCONSTANT) {
- auto F = MI->getOperand(1).getFPImm()->getValueAPF();
- if (F.isNaN() && F.isSignaling())
+ Optional<FPValueAndVReg> FCR;
+ // Constant splat (can be padded with undef) or scalar constant.
+ if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
+ if (FCR->Value.isSignaling())
return false;
- return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+ return !FCR->Value.isDenormal() ||
+ denormalsEnabledForType(MRI.getType(FCR->VReg), MF);
}
if (MaxDepth == 0)
@@ -11514,7 +11509,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// Prefer VGPRs over AGPRs in mAI instructions where possible.
// This saves a chain-copy of registers and better balances register
// use between vgpr and agpr as agpr tuples tend to be big.
- if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
+ if (MI.getDesc().OpInfo) {
unsigned Opc = MI.getOpcode();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
@@ -12477,6 +12472,6 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
if (Size <= 256)
return Cost;
- Cost.first = (Size + 255) / 256;
+ Cost.first += (Size + 255) / 256;
return Cost;
}
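On the bias handling added in lowerImage above: when A16 addressing is enabled, the f16 bias still occupies a whole 32-bit vaddr dword, which is why it is widened to a v2f16 with an undef high lane. A rough bit-level illustration of that layout in plain C++ (the undef half is simply left as zero here, where the DAG form uses an explicit undef):

    #include <cstdint>

    // Element 0 of a packed v2f16 sits in bits [15:0] of the 32-bit slot; the
    // upper 16 bits carry no meaningful data for the bias operand.
    uint32_t packA16Bias(uint16_t BiasBits) {
      const uint32_t DontCareHigh = 0; // placeholder for the undefined lane
      return (DontCareHigh << 16) | BiasBits;
    }
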
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c9d9dd1fb82c..6fbe5d45ce0a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -30,6 +30,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
@@ -51,26 +52,6 @@ static cl::opt<bool> ForceEmitZeroFlag(
cl::init(false), cl::Hidden);
namespace {
-
-template <typename EnumT>
-class enum_iterator
- : public iterator_facade_base<enum_iterator<EnumT>,
- std::forward_iterator_tag, const EnumT> {
- EnumT Value;
-public:
- enum_iterator() = default;
- enum_iterator(EnumT Value) : Value(Value) {}
-
- enum_iterator &operator++() {
- Value = static_cast<EnumT>(Value + 1);
- return *this;
- }
-
- bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
-
- EnumT operator*() const { return Value; }
-};
-
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
@@ -78,27 +59,32 @@ public:
#define CNT_MASK(t) (1u << (t))
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
+} // namespace
-iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
- return make_range(enum_iterator<InstCounterType>(VM_CNT),
- enum_iterator<InstCounterType>(NUM_INST_CNTS));
-}
+namespace llvm {
+template <> struct enum_iteration_traits<InstCounterType> {
+ static constexpr bool is_iterable = true;
+};
+} // namespace llvm
+
+namespace {
+auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
using RegInterval = std::pair<int, int>;
-struct {
+struct HardwareLimits {
unsigned VmcntMax;
unsigned ExpcntMax;
unsigned LgkmcntMax;
unsigned VscntMax;
-} HardwareLimits;
+};
-struct {
+struct RegisterEncoding {
unsigned VGPR0;
unsigned VGPRL;
unsigned SGPR0;
unsigned SGPRL;
-} RegisterEncoding;
+};
enum WaitEventType {
VMEM_ACCESS, // vector-memory read & write
@@ -194,18 +180,20 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
+ WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
+ RegisterEncoding Encoding)
+ : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
- static unsigned getWaitCountMax(InstCounterType T) {
+ unsigned getWaitCountMax(InstCounterType T) const {
switch (T) {
case VM_CNT:
- return HardwareLimits.VmcntMax;
+ return Limits.VmcntMax;
case LGKM_CNT:
- return HardwareLimits.LgkmcntMax;
+ return Limits.LgkmcntMax;
case EXP_CNT:
- return HardwareLimits.ExpcntMax;
+ return Limits.ExpcntMax;
case VS_CNT:
- return HardwareLimits.VscntMax;
+ return Limits.VscntMax;
default:
break;
}
@@ -338,6 +326,8 @@ private:
unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
+ HardwareLimits Limits = {};
+ RegisterEncoding Encoding = {};
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
unsigned PendingEvents = 0;
@@ -471,14 +461,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
- assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
- Result.first = Reg - RegisterEncoding.VGPR0;
+ assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
+ Result.first = Reg - Encoding.VGPR0;
if (TRI->isAGPR(*MRI, Op.getReg()))
Result.first += AGPR_OFFSET;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
} else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
- Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
+ assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
+ Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
assert(Result.first >= NUM_ALL_VGPRS &&
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
}
@@ -1589,20 +1579,22 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
- HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
- HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
- HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
- HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
+ HardwareLimits Limits = {};
+ Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+ Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
+ Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+ Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
- RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
- RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
- RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
- RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
+ RegisterEncoding Encoding = {};
+ Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
+ Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
+ Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
+ Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
TrackedWaitcntSet.clear();
BlockInfos.clear();
@@ -1652,9 +1644,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
*Brackets = *BI.Incoming;
} else {
if (!Brackets)
- Brackets = std::make_unique<WaitcntBrackets>(ST);
+ Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
else
- *Brackets = WaitcntBrackets(ST);
+ *Brackets = WaitcntBrackets(ST, Limits, Encoding);
}
Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
@@ -1686,45 +1678,47 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
} while (Repeat);
- SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-
- bool HaveScalarStores = false;
+ if (ST->hasScalarStores()) {
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+ bool HaveScalarStores = false;
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (!HaveScalarStores && TII->isScalarStore(MI))
- HaveScalarStores = true;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!HaveScalarStores && TII->isScalarStore(MI))
+ HaveScalarStores = true;
- if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
- EndPgmBlocks.push_back(&MBB);
+ if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+ EndPgmBlocks.push_back(&MBB);
+ }
}
- }
- if (HaveScalarStores) {
- // If scalar writes are used, the cache must be flushed or else the next
- // wave to reuse the same scratch memory can be clobbered.
- //
- // Insert s_dcache_wb at wave termination points if there were any scalar
- // stores, and only if the cache hasn't already been flushed. This could be
- // improved by looking across blocks for flushes in postdominating blocks
- // from the stores but an explicitly requested flush is probably very rare.
- for (MachineBasicBlock *MBB : EndPgmBlocks) {
- bool SeenDCacheWB = false;
-
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
- ++I) {
- if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
- SeenDCacheWB = true;
- else if (TII->isScalarStore(*I))
- SeenDCacheWB = false;
-
- // FIXME: It would be better to insert this before a waitcnt if any.
- if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
- !SeenDCacheWB) {
- Modified = true;
- BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could
+ // be improved by looking across blocks for flushes in postdominating
+ // blocks from the stores but an explicitly requested flush is probably
+ // very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+ !SeenDCacheWB) {
+ Modified = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
}
}
}
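The custom enum_iterator removed above is superseded by llvm::enum_seq, which only needs the enum to be opted into iteration via an enum_iteration_traits specialization, exactly as done for InstCounterType. A minimal sketch of that pattern, assuming the same llvm/ADT/Sequence.h facility included in the hunk above (the Counter enum and countEnabled helper are illustrative):

    #include "llvm/ADT/Sequence.h"

    enum Counter { C_FIRST = 0, C_SECOND, C_THIRD, C_NUM };

    namespace llvm {
    // Opt the plain enum into enum_seq iteration.
    template <> struct enum_iteration_traits<Counter> {
      static constexpr bool is_iterable = true;
    };
    } // namespace llvm

    // Iterate over [C_FIRST, C_NUM) without a hand-rolled iterator.
    inline unsigned countEnabled(const bool (&Enabled)[C_NUM]) {
      unsigned N = 0;
      for (Counter C : llvm::enum_seq(C_FIRST, C_NUM))
        if (Enabled[C])
          ++N;
      return N;
    }
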
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92f5322b8ad2..1755b93538ce 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -899,8 +899,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isAGPRClass(RC)) {
- Opcode = (RI.hasVGPRs(SrcRC)) ?
- AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
+ if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
+ Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
+ else if (RI.hasVGPRs(SrcRC))
+ Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
+ else
+ Opcode = AMDGPU::INSTRUCTION_LIST_END;
} else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
} else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
@@ -1417,6 +1421,33 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
}
}
+static unsigned getAVSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_AV32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_AV64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_AV96_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_AV128_SAVE;
+ case 20:
+ return AMDGPU::SI_SPILL_AV160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_AV192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_AV224_SAVE;
+ case 32:
+ return AMDGPU::SI_SPILL_AV256_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_AV512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_AV1024_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register SrcReg, bool isKill,
@@ -1463,21 +1494,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
return;
}
- unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize)
- : getVGPRSpillSaveOpcode(SpillSize);
+ unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize)
+ : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize)
+ : getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
- if (RI.isVectorSuperClass(RC)) {
- // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an
- // equivalent VGPR register beforehand. Regalloc might want to introduce
- // AV spills only to be relevant until rewriter at which they become
- // either spills of VGPRs or AGPRs.
- Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
- BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg)
- .addReg(SrcReg, RegState::Kill);
- SrcReg = TmpVReg;
- }
-
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
@@ -1567,6 +1588,33 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
}
}
+static unsigned getAVSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_AV32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_AV64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_AV96_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_AV128_RESTORE;
+ case 20:
+ return AMDGPU::SI_SPILL_AV160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_AV192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_AV224_RESTORE;
+ case 32:
+ return AMDGPU::SI_SPILL_AV256_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_AV512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_AV1024_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
@@ -1609,26 +1657,15 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
- : getVGPRSpillRestoreOpcode(SpillSize);
-
- bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
- Register TmpReg = DestReg;
- if (IsVectorSuperClass) {
- // For AV classes, insert the spill restore to a VGPR followed by a copy
- // into an equivalent AV register.
- MachineRegisterInfo &MRI = MF->getRegInfo();
- DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
- }
+ unsigned Opcode = RI.isVectorSuperClass(RC)
+ ? getAVSpillRestoreOpcode(SpillSize)
+ : RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
+ : getVGPRSpillRestoreOpcode(SpillSize);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
-
- if (IsVectorSuperClass)
- BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg)
- .addReg(DestReg, RegState::Kill);
+ .addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
@@ -2358,8 +2395,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
auto *ShAmt = MCConstantExpr::create(32, MCCtx);
OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
-
- return;
}
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
@@ -3106,23 +3141,26 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
}
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
- int64_t &Imm) {
+ int64_t &Imm, MachineInstr **DefMI = nullptr) {
if (Reg.isPhysical())
return false;
auto *Def = MRI.getUniqueVRegDef(Reg);
if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
Imm = Def->getOperand(1).getImm();
+ if (DefMI)
+ *DefMI = Def;
return true;
}
return false;
}
-static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) {
+static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
+ MachineInstr **DefMI = nullptr) {
if (!MO->isReg())
return false;
const MachineFunction *MF = MO->getParent()->getParent()->getParent();
const MachineRegisterInfo &MRI = MF->getRegInfo();
- return getFoldableImm(MO->getReg(), MRI, Imm);
+ return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
}
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
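Editorial note: getFoldableImm now optionally reports the instruction that defined the immediate, so convertToThreeAddress can clean it up after folding. A sketch of the out-parameter idiom with a hypothetical simplified type:

struct SimpleDef { long Imm; };
bool getImmFromDef(const SimpleDef *Def, long &Imm,
                   const SimpleDef **DefOut = nullptr) {
  if (!Def)
    return false;
  Imm = Def->Imm;
  if (DefOut)
    *DefOut = Def; // report the def so the caller can kill it once folded
  return true;
}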
@@ -3195,8 +3233,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
+ MachineInstr *DefMI;
+ const auto killDef = [&DefMI, &MBB, this]() -> void {
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ // The only user is the instruction which will be killed.
+ if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg()))
+ return;
+      // We cannot just remove the DefMI here; the calling pass will crash.
+ DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
+ for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
+ DefMI->RemoveOperand(I);
+ };
+
int64_t Imm;
- if (getFoldableImm(Src2, Imm)) {
+ if (getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
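Editorial note: the killDef lambda above neuters the now single-use immediate def instead of erasing it, because the calling pass still holds iterators into the block. A simplified sketch of that trick with hypothetical types:

#include <vector>
struct MiniInstr {
  unsigned Opcode;
  std::vector<int> Operands; // Operands[0] is the result register
};
void neuterToImplicitDef(MiniInstr &Def, unsigned ImplicitDefOpc) {
  Def.Opcode = ImplicitDefOpc;       // setDesc(get(AMDGPU::IMPLICIT_DEF))
  while (Def.Operands.size() > 1)
    Def.Operands.pop_back();         // drop everything except the def operand
}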
@@ -3209,13 +3259,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ killDef();
return MIB;
}
}
unsigned NewOpc = IsFMA
? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
- if (getFoldableImm(Src1, Imm)) {
+ if (getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
@@ -3225,10 +3276,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ killDef();
return MIB;
}
}
- if (getFoldableImm(Src0, Imm)) {
+ if (getFoldableImm(Src0, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1 &&
isOperandLegal(
MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
@@ -3241,12 +3293,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ killDef();
return MIB;
}
}
}
- unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
: IsF64 ? AMDGPU::V_FMA_F64_e64
: AMDGPU::V_FMA_F32_e64)
: (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
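Editorial note: for reference, the folded two-register-plus-literal forms chosen in the preceding hunks compute the following (a sketch that ignores f16 vs. f32, rounding mode, and source modifiers):

float madak(float S0, float S1, float K) { return S0 * S1 + K; } // *MADAK / *FMAAK: literal addend
float madmk(float S0, float K, float S1) { return S0 * K + S1; } // *MADMK / *FMAMK: literal multiplicand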
@@ -3605,12 +3658,6 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
const MachineRegisterInfo &MRI) const {
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
// Can't shrink instruction with three operands.
- // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
- // a special case for it. It can only be shrunk if the third operand
- // is vcc, and src0_modifiers and src1_modifiers are not set.
- // We should handle this the same way we handle vopc, by addding
- // a register allocation hint pre-regalloc and then do the shrinking
- // post-regalloc.
if (Src2) {
switch (MI.getOpcode()) {
default: return false;
@@ -4563,8 +4610,9 @@ static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
unsigned RCID,
bool IsAllocatable) {
if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
- (TID.mayLoad() || TID.mayStore() ||
- (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+ (((TID.mayLoad() || TID.mayStore()) &&
+ !(TID.TSFlags & SIInstrFlags::VGPRSpill)) ||
+ (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
switch (RCID) {
case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
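Editorial note: the widened condition above still narrows AV_* operand classes to VGPR classes for ordinary memory instructions and for DS/MIMG, but now exempts the VGPR spill pseudos, which can take either an AGPR or a VGPR. A boolean sketch of just that predicate, with hypothetical parameters standing in for the TSFlags queries and the surrounding allocatability/gfx90a guard omitted:

bool narrowsAVToVGPR(bool MayLoadOrStore, bool IsVGPRSpill, bool IsDSOrMIMG) {
  return (MayLoadOrStore && !IsVGPRSpill) || IsDSOrMIMG;
}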
@@ -5001,8 +5049,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
--ConstantBusLimit;
}
- for (unsigned i = 0; i < 3; ++i) {
- int Idx = VOP3Idx[i];
+ for (int Idx : VOP3Idx) {
if (Idx == -1)
break;
MachineOperand &MO = MI.getOperand(Idx);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 47ee83eb9351..dda92d3d25ff 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1350,11 +1350,11 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
// Complex patterns
//===----------------------------------------------------------------------===//
-def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
-def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
-def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">;
+def DS1Addr1Offset : ComplexPattern<iPTR, 2, "SelectDS1Addr1Offset">;
+def DS64Bit4ByteAligned : ComplexPattern<iPTR, 3, "SelectDS64Bit4ByteAligned">;
+def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, "SelectDS128Bit8ByteAligned">;
-def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
+def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index d55d8da8699a..636337ede000 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -761,6 +761,17 @@ defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
+defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
+defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
+defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
+defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
+defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
+defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
+defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
+defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
+defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
+defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
+
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
@@ -2106,6 +2117,19 @@ def : GCNPat <
} // end isWave32
def : GCNPat <
+ (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
+ (V_NOT_B32_e32 $src0)
+>;
+
+def : GCNPat <
+ (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))),
+ (REG_SEQUENCE VReg_64,
+ (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0,
+ (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1
+ )
+>;
+
+def : GCNPat <
(f16 (sint_to_fp i1:$src)),
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
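Editorial note: the two patterns added above select v_not_b32 for a divergent xor with -1, splitting the 64-bit case into per-half NOTs glued by a REG_SEQUENCE. A quick standalone check of the identities they rely on (plain C++, not selection code):

#include <cassert>
#include <cstdint>
int main() {
  uint64_t X = 0x123456789abcdef0ull;
  assert((X ^ ~0ull) == ~X);                 // xor with -1 is NOT
  uint64_t Lo = ~uint32_t(X), Hi = ~uint32_t(X >> 32);
  assert(((Hi << 32) | Lo) == ~X);           // 64-bit NOT == NOT of each 32-bit half
  return 0;
}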
@@ -2188,18 +2212,18 @@ def : GCNPat <
>;
def : GCNPat <
- (i1 (trunc i32:$a)),
- (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+ (i1 (DivergentUnaryFrag<trunc> i32:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
def : GCNPat <
- (i1 (trunc i16:$a)),
- (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
def : GCNPat <
- (i1 (trunc i64:$a)),
- (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
+ (i1 (DivergentUnaryFrag<trunc> i64:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
@@ -2405,21 +2429,37 @@ def : GCNPat <
// COPY is workaround tablegen bug from multiple outputs
// from S_LSHL_B32's multiple outputs from implicit scc def.
def : GCNPat <
- (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
def : GCNPat <
- (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))),
+ (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
+ (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1))
+>;
+
+
+def : GCNPat <
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
- (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
+ (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1))
+>;
+
+def : GCNPat <
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
+ (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1))
+>;
+
+def : GCNPat <
(v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
@@ -2435,42 +2475,74 @@ def : GCNPat <
>;
def : GCNPat <
- (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
def : GCNPat <
- (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))),
+ (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
+ (v2i16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1))
+>;
+
+
+def : GCNPat <
+ (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
+def : GCNPat <
+ (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
+ (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1))
+>;
+
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
- (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+def : GCNPat <
+ (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
+ (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
+>;
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
- (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
(v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
+def : GCNPat <
+ (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
+ (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1))
+>;
+
def : GCNPat <
- (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
(i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
(S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
-// TODO: Should source modifiers be matched to v_pack_b32_f16?
def : GCNPat <
- (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
+ (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
+ (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
+ (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0))))
+>;
+
+def : GCNPat <
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
def : GCNPat <
+ (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
+ (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0))))
+>;
+
+
+def : GCNPat <
(v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
(f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
(V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
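Editorial note: the new divergent pack patterns (V_LSHL_OR_B32 / V_AND_B32 / V_BFI_B32 forms) and their uniform S_PACK/S_LSHL counterparts all reduce to the same bit-level packing. A standalone reference of what a two-element 16-bit build_vector produces:

#include <cstdint>
uint32_t packV2x16(uint16_t Elt0, uint16_t Elt1) {
  return (uint32_t(Elt1) << 16) | uint32_t(Elt0); // element 1 in the high half, element 0 in the low half
}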
@@ -2866,6 +2938,18 @@ def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
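Editorial note: the two new generic opcodes carry only operand lists here; for orientation, a reference sketch of the semantics they are assumed to model (median of three and clamp to [0.0, 1.0], ignoring the NaN and denormal handling done by the real lowering):

#include <algorithm>
float refFMed3(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C)); // median of three
}
float refClamp(float X) { return std::min(std::max(X, 0.0f), 1.0f); }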
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c4007f56f350..3ce368ef4db9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -62,11 +62,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// calls.
const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
- // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
- // have any calls.
- const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
- CC != CallingConv::AMDGPU_Gfx &&
- (!isEntryFunction() || HasCalls);
const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
CC == CallingConv::SPIR_KERNEL;
@@ -80,7 +75,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
if (!isEntryFunction()) {
- if (UseFixedABI)
+ if (CC != CallingConv::AMDGPU_Gfx)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
// TODO: Pick a high register, and shift down, similar to a kernel.
@@ -110,20 +105,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
else if (ST.isMesaGfxShader(F))
ImplicitBufferPtr = true;
- if (UseFixedABI) {
- DispatchPtr = true;
- QueuePtr = true;
- ImplicitArgPtr = true;
- WorkGroupIDX = true;
- WorkGroupIDY = true;
- WorkGroupIDZ = true;
- WorkItemIDX = true;
- WorkItemIDY = true;
- WorkItemIDZ = true;
-
- // FIXME: We don't need this?
- DispatchID = true;
- } else if (!AMDGPU::isGraphics(CC)) {
+ if (!AMDGPU::isGraphics(CC)) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
WorkGroupIDX = true;
@@ -462,7 +444,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
MFI.setStackID(i, TargetStackID::Default);
for (auto &R : VGPRToAGPRSpills) {
- if (R.second.FullyAllocated)
+ if (R.second.IsDead)
MFI.RemoveStackObject(R.first);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index c305bc20e40d..8accbf611c5f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -465,6 +465,7 @@ public:
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
+ bool IsDead = false;
};
// Map WWM VGPR to a stack slot that is used to save/restore it in the
@@ -546,6 +547,12 @@ public:
: I->second.Lanes[Lane];
}
+ void setVGPRToAGPRSpillDead(int FrameIndex) {
+ auto I = VGPRToAGPRSpills.find(FrameIndex);
+ if (I != VGPRToAGPRSpills.end())
+ I->second.IsDead = true;
+ }
+
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
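Editorial note: together with the removeDeadFrameIndices change in SIMachineFunctionInfo.cpp above, the new IsDead flag means a VGPR-to-AGPR spill slot is only deleted once something explicitly marks it dead, rather than whenever its lanes happen to be fully allocated. A minimal sketch of that bookkeeping with hypothetical container types:

#include <map>
struct SpillRecord { bool FullyAllocated = false; bool IsDead = false; };
struct SpillTable {
  std::map<int, SpillRecord> Slots; // frame index -> record
  void markDead(int FrameIndex) {
    auto It = Slots.find(FrameIndex);
    if (It != Slots.end())
      It->second.IsDead = true;
  }
  template <typename RemoveFn> void removeDead(RemoveFn Remove) {
    for (auto &Entry : Slots)
      if (Entry.second.IsDead)
        Remove(Entry.first); // mirrors MFI.RemoveStackObject(R.first)
  }
};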
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 5590d84cc3ab..81db66a98ddf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -869,29 +869,27 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() {
}
void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
- unsigned DAGSize = DAG->SUnits.size();
std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
// Every combination of colors given by the top down
// and bottom up Reserved node dependency
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[i];
+ for (const SUnit &SU : DAG->SUnits) {
std::pair<unsigned, unsigned> SUColors;
// High latency instructions: already given.
- if (CurrentColoring[SU->NodeNum])
+ if (CurrentColoring[SU.NodeNum])
continue;
- SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
- SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+ SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
+ SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];
std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
ColorCombinations.find(SUColors);
if (Pos != ColorCombinations.end()) {
- CurrentColoring[SU->NodeNum] = Pos->second;
+ CurrentColoring[SU.NodeNum] = Pos->second;
} else {
- CurrentColoring[SU->NodeNum] = NextNonReservedID;
+ CurrentColoring[SU.NodeNum] = NextNonReservedID;
ColorCombinations[SUColors] = NextNonReservedID++;
}
}
@@ -1232,15 +1230,13 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
}
// Free root and leafs of all blocks to enable scheduling inside them.
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
+ for (SIScheduleBlock *Block : CurrentBlocks)
Block->finalizeUnits();
- }
- LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
- });
+ LLVM_DEBUG({
+ dbgs() << "Blocks created:\n\n";
+ for (SIScheduleBlock *Block : CurrentBlocks)
+ Block->printDebug(true);
+ });
}
// Two functions taken from Codegen/MachineScheduler.cpp
@@ -1379,9 +1375,9 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
}
}
- LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = CurrentBlocks[i];
- Block->printDebug(true);
+ LLVM_DEBUG({
+ for (SIScheduleBlock *Block : CurrentBlocks)
+ Block->printDebug(true);
});
}
@@ -1437,8 +1433,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
// found for several parents, we increment the usage of the one with the
// highest topological index.
LiveOutRegsNumUsages.resize(Blocks.size());
- for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = Blocks[i];
+ for (SIScheduleBlock *Block : Blocks) {
for (unsigned Reg : Block->getInRegs()) {
bool Found = false;
int topoInd = -1;
@@ -1502,8 +1497,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
// Fill LiveRegsConsumers for regs that were already
// defined before scheduling.
- for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
- SIScheduleBlock *Block = Blocks[i];
+ for (SIScheduleBlock *Block : Blocks) {
for (unsigned Reg : Block->getInRegs()) {
bool Found = false;
for (SIScheduleBlock* Pred: Block->getPreds()) {
@@ -1700,10 +1694,7 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
decreaseLiveRegs(Block, Block->getInRegs());
addLiveRegs(Block->getOutRegs());
releaseBlockSuccs(Block);
- for (std::map<unsigned, unsigned>::iterator RegI =
- LiveOutRegsNumUsages[Block->getID()].begin(),
- E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
- std::pair<unsigned, unsigned> RegP = *RegI;
+ for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
// We produce this register, thus it must not be previously alive.
assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
LiveRegsConsumers[RegP.first] == 0);
@@ -1759,8 +1750,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
ScheduledBlocks = Scheduler.getBlocks();
- for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
- SIScheduleBlock *Block = ScheduledBlocks[b];
+ for (SIScheduleBlock *Block : ScheduledBlocks) {
std::vector<SUnit*> SUs = Block->getScheduledUnits();
for (SUnit* SU : SUs)
@@ -2000,9 +1990,8 @@ void SIScheduleDAGMI::schedule()
assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
TopRPTracker.setPos(CurrentTop);
- for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
- E = ScheduledSUnits.end(); I != E; ++I) {
- SUnit *SU = &SUnits[*I];
+ for (unsigned I : ScheduledSUnits) {
+ SUnit *SU = &SUnits[I];
scheduleMI(SU, true);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a1d9a23a5084..21aed4ececb5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -210,6 +210,7 @@ struct SGPRSpillBuilder {
auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
if (!TmpVGPRLive)
I.addReg(TmpVGPR, RegState::ImplicitDefine);
+ I->getOperand(2).setIsDead(true); // Mark SCC as dead.
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
}
}
@@ -242,9 +243,10 @@ struct SGPRSpillBuilder {
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
/*IsKill*/ false);
auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
- if (!TmpVGPRLive) {
+ if (!TmpVGPRLive)
I.addReg(TmpVGPR, RegState::ImplicitKill);
- }
+ I->getOperand(2).setIsDead(true); // Mark SCC as dead.
+
// Restore active lanes
if (TmpVGPRLive)
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
@@ -267,9 +269,11 @@ struct SGPRSpillBuilder {
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
/*IsKill*/ false);
// Spill inactive lanes
- BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
- BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
}
}
@@ -908,6 +912,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V1024_RESTORE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A1024_RESTORE:
+ case AMDGPU::SI_SPILL_AV1024_SAVE:
+ case AMDGPU::SI_SPILL_AV1024_RESTORE:
return 32;
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
@@ -915,6 +921,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A512_RESTORE:
+ case AMDGPU::SI_SPILL_AV512_SAVE:
+ case AMDGPU::SI_SPILL_AV512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
@@ -922,6 +930,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_A256_SAVE:
case AMDGPU::SI_SPILL_A256_RESTORE:
+ case AMDGPU::SI_SPILL_AV256_SAVE:
+ case AMDGPU::SI_SPILL_AV256_RESTORE:
return 8;
case AMDGPU::SI_SPILL_S224_SAVE:
case AMDGPU::SI_SPILL_S224_RESTORE:
@@ -929,6 +939,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V224_RESTORE:
case AMDGPU::SI_SPILL_A224_SAVE:
case AMDGPU::SI_SPILL_A224_RESTORE:
+ case AMDGPU::SI_SPILL_AV224_SAVE:
+ case AMDGPU::SI_SPILL_AV224_RESTORE:
return 7;
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S192_RESTORE:
@@ -936,6 +948,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V192_RESTORE:
case AMDGPU::SI_SPILL_A192_SAVE:
case AMDGPU::SI_SPILL_A192_RESTORE:
+ case AMDGPU::SI_SPILL_AV192_SAVE:
+ case AMDGPU::SI_SPILL_AV192_RESTORE:
return 6;
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S160_RESTORE:
@@ -943,6 +957,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_A160_SAVE:
case AMDGPU::SI_SPILL_A160_RESTORE:
+ case AMDGPU::SI_SPILL_AV160_SAVE:
+ case AMDGPU::SI_SPILL_AV160_RESTORE:
return 5;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
@@ -950,6 +966,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_A128_SAVE:
case AMDGPU::SI_SPILL_A128_RESTORE:
+ case AMDGPU::SI_SPILL_AV128_SAVE:
+ case AMDGPU::SI_SPILL_AV128_RESTORE:
return 4;
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -957,6 +975,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_A96_SAVE:
case AMDGPU::SI_SPILL_A96_RESTORE:
+ case AMDGPU::SI_SPILL_AV96_SAVE:
+ case AMDGPU::SI_SPILL_AV96_RESTORE:
return 3;
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
@@ -964,6 +984,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_A64_SAVE:
case AMDGPU::SI_SPILL_A64_RESTORE:
+ case AMDGPU::SI_SPILL_AV64_SAVE:
+ case AMDGPU::SI_SPILL_AV64_RESTORE:
return 2;
case AMDGPU::SI_SPILL_S32_SAVE:
case AMDGPU::SI_SPILL_S32_RESTORE:
@@ -971,6 +993,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_A32_SAVE:
case AMDGPU::SI_SPILL_A32_RESTORE:
+ case AMDGPU::SI_SPILL_AV32_SAVE:
+ case AMDGPU::SI_SPILL_AV32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -1240,9 +1264,10 @@ void SIRegisterInfo::buildSpillLoadStore(
if (ScratchOffsetReg == AMDGPU::NoRegister) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
} else {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
+ auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
.addReg(ScratchOffsetReg)
.addImm(Offset);
+ Add->getOperand(3).setIsDead(); // Mark SCC as dead.
}
Offset = 0;
@@ -1810,7 +1835,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_A128_SAVE:
case AMDGPU::SI_SPILL_A96_SAVE:
case AMDGPU::SI_SPILL_A64_SAVE:
- case AMDGPU::SI_SPILL_A32_SAVE: {
+ case AMDGPU::SI_SPILL_A32_SAVE:
+ case AMDGPU::SI_SPILL_AV1024_SAVE:
+ case AMDGPU::SI_SPILL_AV512_SAVE:
+ case AMDGPU::SI_SPILL_AV256_SAVE:
+ case AMDGPU::SI_SPILL_AV224_SAVE:
+ case AMDGPU::SI_SPILL_AV192_SAVE:
+ case AMDGPU::SI_SPILL_AV160_SAVE:
+ case AMDGPU::SI_SPILL_AV128_SAVE:
+ case AMDGPU::SI_SPILL_AV96_SAVE:
+ case AMDGPU::SI_SPILL_AV64_SAVE:
+ case AMDGPU::SI_SPILL_AV32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -1846,7 +1881,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_A224_RESTORE:
case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_A512_RESTORE:
- case AMDGPU::SI_SPILL_A1024_RESTORE: {
+ case AMDGPU::SI_SPILL_A1024_RESTORE:
+ case AMDGPU::SI_SPILL_AV32_RESTORE:
+ case AMDGPU::SI_SPILL_AV64_RESTORE:
+ case AMDGPU::SI_SPILL_AV96_RESTORE:
+ case AMDGPU::SI_SPILL_AV128_RESTORE:
+ case AMDGPU::SI_SPILL_AV160_RESTORE:
+ case AMDGPU::SI_SPILL_AV192_RESTORE:
+ case AMDGPU::SI_SPILL_AV224_RESTORE:
+ case AMDGPU::SI_SPILL_AV256_RESTORE:
+ case AMDGPU::SI_SPILL_AV512_RESTORE:
+ case AMDGPU::SI_SPILL_AV1024_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 3a372d4519fb..c8f1daf26de9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -731,11 +731,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- // getVOPe32 could be -1 here if we started with an instruction that had
- // a 32-bit encoding and then commuted it to an instruction that did not.
- if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
- continue;
-
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
@@ -776,10 +771,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
const MachineOperand *SDst = TII->getNamedOperand(MI,
AMDGPU::OpName::sdst);
- // Check the carry-in operand for v_addc_u32_e64.
- const MachineOperand *Src2 = TII->getNamedOperand(MI,
- AMDGPU::OpName::src2);
-
if (SDst) {
bool Next = false;
@@ -791,6 +782,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// All of the instructions with carry outs also have an SGPR input in
// src2.
+ const MachineOperand *Src2 = TII->getNamedOperand(MI,
+ AMDGPU::OpName::src2);
if (Src2 && Src2->getReg() != VCCReg) {
if (Src2->getReg().isVirtual())
MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 46012e5d7d97..77ee3c0ff0e4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -495,11 +495,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// instruction as needing e.g. WQM before visiting it and realizing it needs
// WQM disabled.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
- for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
- MachineBasicBlock &MBB = **BI;
- BlockInfo &BBI = Blocks[&MBB];
+ for (MachineBasicBlock *MBB : RPOT) {
+ BlockInfo &BBI = Blocks[MBB];
- for (MachineInstr &MI : MBB) {
+ for (MachineInstr &MI : *MBB) {
InstrInfo &III = Instructions[&MI];
unsigned Opcode = MI.getOpcode();
char Flags = 0;
@@ -561,7 +560,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
BBI.InNeeds |= StateExact;
- Worklist.push_back(&MBB);
+ Worklist.push_back(MBB);
}
GlobalFlags |= StateExact;
III.Disabled = StateWQM | StateStrict;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
index 8502ed61b366..184c871db775 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -181,15 +181,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse
" $sdst", [(set i64:$sdst, (node))]> {
let hasSideEffects = 1;
- // FIXME: This should be definitively mayStore = 0. TableGen
- // brokenly tries to infer these based on the intrinsic properties
- // corresponding to the IR attributes. The target intrinsics are
- // considered as writing to memory for IR dependency purposes, but
- // those can be modeled with hasSideEffects here. These also end up
- // inferring differently for llvm.readcyclecounter and the amdgcn
- // intrinsics.
- let mayStore = ?;
- let mayLoad = 1;
+ let mayStore = 0;
+ let mayLoad = 0;
let has_sbase = 0;
let has_offset = 0;
}
@@ -765,11 +758,11 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
}];
}
-def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
-def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
-def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
-def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
+def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
+def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
multiclass SMRD_Pattern <string Instr, ValueType vt> {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 61ecc13620a1..1713586dcf5b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -157,6 +157,42 @@ class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> :
let has_sdst = 0;
}
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
+class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
let isMoveImm = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -172,11 +208,11 @@ let isMoveImm = 1 in {
let Defs = [SCC] in {
def S_NOT_B32 : SOP1_32 <"s_not_b32",
- [(set i32:$sdst, (not i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))]
>;
def S_NOT_B64 : SOP1_64 <"s_not_b64",
- [(set i64:$sdst, (not i64:$src0))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))]
>;
def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
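Editorial note: the Uniform*/Divergent* fragment classes are moved ahead of the SOP1 definitions so s_not_b32/s_not_b64 can be restricted to uniform nodes, while the divergent case is picked up by the V_NOT_B32 patterns added in SIInstructions.td above. A conceptual sketch of the decision those fragments encode:

enum class NotSel { S_NOT_B32, V_NOT_B32_e32 };
NotSel selectNot32(bool IsDivergent) {
  // Uniform values go to the scalar unit (which also defines SCC);
  // divergent values go to the vector unit.
  return IsDivergent ? NotSel::V_NOT_B32_e32 : NotSel::S_NOT_B32;
}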
@@ -221,22 +257,22 @@ let isReMaterializable = 1 in {
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
- [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))]
>;
def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
- [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))]
>;
def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
- [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))]
>;
def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
- [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))]
>;
def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
- [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))]
>;
def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
@@ -426,41 +462,6 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
"$sdst, $src0, $src1", pattern
>;
-class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
- (ops node:$src0),
- (Op $src0),
- [{ return !N->isDivergent(); }]> {
- // This check is unnecessary as it's captured by the result register
- // bank constraint.
- //
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
-}
-
-class UniformBinFrag<SDPatternOperator Op> : PatFrag <
- (ops node:$src0, node:$src1),
- (Op $src0, $src1),
- [{ return !N->isDivergent(); }]> {
- // This check is unnecessary as it's captured by the result register
- // bank constraint.
- //
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
-}
-
-class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
- (ops node:$src0, node:$src1),
- (Op $src0, $src1),
- [{ return N->isDivergent(); }]> {
- // This check is unnecessary as it's captured by the result register
- // bank constraint.
- //
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
-}
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
@@ -485,19 +486,18 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
[(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
} // End Uses = [SCC]
-
let isCommutable = 1 in {
def S_MIN_I32 : SOP2_32 <"s_min_i32",
- [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
>;
def S_MIN_U32 : SOP2_32 <"s_min_u32",
- [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
>;
def S_MAX_I32 : SOP2_32 <"s_max_i32",
- [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
>;
def S_MAX_U32 : SOP2_32 <"s_max_u32",
- [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
>;
} // End isCommutable = 1
} // End Defs = [SCC]
@@ -870,7 +870,7 @@ def S_GETREG_B32 : SOPK_Pseudo <
}
} // End mayLoad = 1
-let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in {
+let Defs = [MODE], Uses = [MODE] in {
// FIXME: Need to truncate immediate to 16-bits.
class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo <
@@ -914,7 +914,7 @@ def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo {
let hasSideEffects = 0;
}
-} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE]
+} // End Defs = [MODE], Uses = [MODE]
class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
SOPK_Pseudo<
@@ -1264,7 +1264,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let mayStore = 1;
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+let hasSideEffects = 1 in
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
@@ -1278,8 +1278,6 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16),
"$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 0;
- let mayStore = 0;
}
def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">;
@@ -1305,14 +1303,10 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> {
def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_incperflevel timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 0;
- let mayStore = 0;
}
def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_decperflevel timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 0;
- let mayStore = 0;
}
def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> {
let simm16 = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
index 2e4d83fbbc39..a83ff6667956 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -15,7 +15,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/ReplaceConstant.h"
@@ -25,175 +24,6 @@ namespace llvm {
namespace AMDGPU {
-// An helper class for collecting all reachable callees for each kernel defined
-// within the module.
-class CollectReachableCallees {
- Module &M;
- CallGraph CG;
- SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
-
- // Collect all address taken functions within the module.
- void collectAddressTakenFunctions() {
- auto *ECNode = CG.getExternalCallingNode();
-
- for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
- auto *CGN = GI->second;
- auto *F = CGN->getFunction();
- if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
- continue;
- AddressTakenFunctions.insert(CGN);
- }
- }
-
- // For given kernel, collect all its reachable non-kernel functions.
- SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
- SmallPtrSet<Function *, 8> ReachableCallees;
-
- // Call graph node which represents this kernel.
- auto *KCGN = CG[K];
-
- // Go through all call graph nodes reachable from the node representing this
- // kernel, visit all their call sites, if the call site is direct, add
- // corresponding callee to reachable callee set, if it is indirect, resolve
- // the indirect call site to potential reachable callees, add them to
- // reachable callee set, and repeat the process for the newly added
- // potential callee nodes.
- //
- // FIXME: Need to handle bit-casted function pointers.
- //
- SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
- SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
- while (!CGNStack.empty()) {
- auto *CGN = CGNStack.pop_back_val();
-
- if (!VisitedCGNodes.insert(CGN).second)
- continue;
-
- // Ignore call graph node which does not have associated function or
- // associated function is not a definition.
- if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
- continue;
-
- for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
- auto *RCB = cast<CallBase>(GI->first.getValue());
- auto *RCGN = GI->second;
-
- if (auto *DCallee = RCGN->getFunction()) {
- ReachableCallees.insert(DCallee);
- } else if (RCB->isIndirectCall()) {
- auto *RCBFTy = RCB->getFunctionType();
- for (auto *ACGN : AddressTakenFunctions) {
- auto *ACallee = ACGN->getFunction();
- if (ACallee->getFunctionType() == RCBFTy) {
- ReachableCallees.insert(ACallee);
- CGNStack.append(df_begin(ACGN), df_end(ACGN));
- }
- }
- }
- }
- }
-
- return ReachableCallees;
- }
-
-public:
- explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
- // Collect address taken functions.
- collectAddressTakenFunctions();
- }
-
- void collectReachableCallees(
- DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
- // Collect reachable callee set for each kernel defined in the module.
- for (Function &F : M.functions()) {
- if (!AMDGPU::isKernelCC(&F))
- continue;
- Function *K = &F;
- KernelToCallees[K] = collectReachableCallees(K);
- }
- }
-};
-
-void collectReachableCallees(
- Module &M,
- DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
- CollectReachableCallees CRC{M};
- CRC.collectReachableCallees(KernelToCallees);
-}
-
-SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
- SmallPtrSet<Function *, 8> LDSAccessors;
- SmallVector<User *, 8> UserStack(GV->users());
- SmallPtrSet<User *, 8> VisitedUsers;
-
- while (!UserStack.empty()) {
- auto *U = UserStack.pop_back_val();
-
- // `U` is already visited? continue to next one.
- if (!VisitedUsers.insert(U).second)
- continue;
-
- // `U` is a global variable which is initialized with LDS. Ignore LDS.
- if (isa<GlobalValue>(U))
- return SmallPtrSet<Function *, 8>();
-
- // Recursively explore constant users.
- if (isa<Constant>(U)) {
- append_range(UserStack, U->users());
- continue;
- }
-
- // `U` should be an instruction, if it belongs to a non-kernel function F,
- // then collect F.
- Function *F = cast<Instruction>(U)->getFunction();
- if (!AMDGPU::isKernelCC(F))
- LDSAccessors.insert(F);
- }
-
- return LDSAccessors;
-}
-
-DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
-getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
- DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
- SmallVector<User *, 8> UserStack;
- SmallPtrSet<User *, 8> VisitedUsers;
-
- UserStack.push_back(U);
-
- while (!UserStack.empty()) {
- auto *UU = UserStack.pop_back_val();
-
- if (!VisitedUsers.insert(UU).second)
- continue;
-
- if (isa<GlobalValue>(UU))
- continue;
-
- if (isa<Constant>(UU)) {
- append_range(UserStack, UU->users());
- continue;
- }
-
- auto *I = cast<Instruction>(UU);
- Function *F = I->getFunction();
- if (CollectKernelInsts) {
- if (!AMDGPU::isKernelCC(F)) {
- continue;
- }
- } else {
- if (AMDGPU::isKernelCC(F)) {
- continue;
- }
- }
-
- FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
- FunctionToInsts[F].insert(I);
- }
-
- return FunctionToInsts;
-}
-
bool isKernelCC(const Function *Func) {
return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}
@@ -232,26 +62,8 @@ void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
}
}
-bool hasUserInstruction(const GlobalValue *GV) {
- SmallPtrSet<const User *, 8> Visited;
- SmallVector<const User *, 16> Stack(GV->users());
-
- while (!Stack.empty()) {
- const User *U = Stack.pop_back_val();
-
- if (!Visited.insert(U).second)
- continue;
-
- if (isa<Instruction>(U))
- return true;
-
- append_range(Stack, U->users());
- }
-
- return false;
-}
-
-bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+ const Function *F) {
// We are not interested in kernel LDS lowering for module LDS itself.
if (F && GV.getName() == "llvm.amdgcn.module.lds")
return false;
@@ -259,7 +71,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
bool Ret = false;
SmallPtrSet<const User *, 8> Visited;
SmallVector<const User *, 16> Stack(GV.users());
- SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
assert(!F || isKernelCC(F));
@@ -267,15 +78,10 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
const User *V = Stack.pop_back_val();
Visited.insert(V);
- if (auto *G = dyn_cast<GlobalValue>(V)) {
- StringRef GName = G->getName();
- if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
- // For kernel LDS lowering, if G is not a compiler.used list, then we
- // cannot lower the lds GV since we cannot replace the use of GV within
- // G.
- return false;
- }
- GlobalUsers.insert(G);
+ if (isa<GlobalValue>(V)) {
+ // This use of the LDS variable is the initializer of a global variable.
+ // This is ill formed. The address of an LDS variable is kernel dependent
+ // and unknown until runtime. It can't be written to a global variable.
continue;
}
@@ -297,15 +103,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
append_range(Stack, V->users());
}
- if (!F && !Ret) {
- // For module LDS lowering, we have not yet decided if we should lower GV or
- // not. Explore all global users of GV, and check if atleast one of these
- // global users appear as an use within an instruction (possibly nested use
- // via constant expression), if so, then conservately lower LDS.
- for (auto *G : GlobalUsers)
- Ret |= hasUserInstruction(G);
- }
-
return Ret;
}
@@ -324,7 +121,7 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M,
continue;
}
if (!isa<UndefValue>(GV.getInitializer())) {
- // Initializers are unimplemented for local address space.
+ // Initializers are unimplemented for LDS address space.
// Leave such variables in place for consistent error reporting.
continue;
}
@@ -342,20 +139,6 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M,
return LocalVars;
}
-SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
- SmallPtrSet<GlobalValue *, 32> UsedList;
-
- SmallVector<GlobalValue *, 32> TmpVec;
- collectUsedGlobalVariables(M, TmpVec, true);
- UsedList.insert(TmpVec.begin(), TmpVec.end());
-
- TmpVec.clear();
- collectUsedGlobalVariables(M, TmpVec, false);
- UsedList.insert(TmpVec.begin(), TmpVec.end());
-
- return UsedList;
-}
-
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
index d1c9229bc336..83ef68cc3f60 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -22,44 +22,13 @@ class ConstantExpr;
namespace AMDGPU {
-/// Collect reachable callees for each kernel defined in the module \p M and
-/// return collected callees at \p KernelToCallees.
-void collectReachableCallees(
- Module &M,
- DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
-
-/// For the given LDS global \p GV, visit all its users and collect all
-/// non-kernel functions within which \p GV is used and return collected list of
-/// such non-kernel functions.
-SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
-
-/// Collect all the instructions where user \p U belongs to. \p U could be
-/// instruction itself or it could be a constant expression which is used within
-/// an instruction. If \p CollectKernelInsts is true, collect instructions only
-/// from kernels, otherwise collect instructions only from non-kernel functions.
-DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
-getFunctionToInstsMap(User *U, bool CollectKernelInsts);
-
bool isKernelCC(const Function *Func);
Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
-/// \returns true if a given global variable \p GV (or its global users) appear
-/// as an use within some instruction (either from kernel or from non-kernel).
-bool hasUserInstruction(const GlobalValue *GV);
-
-/// \returns true if an LDS global requires lowering to a module LDS structure
-/// if \p F is not given. If \p F is given it must be a kernel and function
-/// \returns true if an LDS global is directly used from that kernel and it
-/// is safe to replace its uses with a kernel LDS structure member.
-bool shouldLowerLDSToStruct(const GlobalVariable &GV,
- const Function *F = nullptr);
-
std::vector<GlobalVariable *> findVariablesToLower(Module &M,
const Function *F = nullptr);
-SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
-
/// Replace all uses of constant \p C with instructions in \p F.
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
} // end namespace AMDGPU
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a3eccf13cd71..a8368892c565 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
list<dag> ret = [!con(Outs, (set Ins))];
}
+class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> {
list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []);
}